Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                       guest_x86_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2011 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates x86 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 32-bit value is being written.
     42 
     43    FUCOMI(P): what happens to A and S flags?  Currently are forced
     44       to zero.
     45 
     46    x87 FP Limitations:
     47 
     48    * all arithmetic done at 64 bits
     49 
     50    * no FP exceptions, except for handling stack over/underflow
     51 
     52    * FP rounding mode observed only for float->int conversions
     53      and int->float conversions which could lose accuracy, and
     54      for float-to-float rounding.  For all other operations,
     55      round-to-nearest is used, regardless.
     56 
     57    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     58      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     59      even when it isn't.
     60 
     61    * some of the FCOM cases could do with testing -- not convinced
     62      that the args are the right way round.
     63 
     64    * FSAVE does not re-initialise the FPU; it should do
     65 
     66    * FINIT not only initialises the FPU environment, it also
     67      zeroes all the FP registers.  It should leave the registers
     68      unchanged.
     69 
     70    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     71    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     72    only way to observe eflags[1], a proper fix would be to make that
     73    bit be set by PUSHF.
     74 
     75    The state of %eflags.AC (alignment check, bit 18) is recorded by
     76    the simulation (viz, if you set it with popf then a pushf produces
     77    the value you set it to), but it is otherwise ignored.  In
     78    particular, setting it to 1 does NOT cause alignment checking to
     79    happen.  Programs that set it to 1 and then rely on the resulting
     80    SIGBUSs to inform them of misaligned accesses will not work.
     81 
     82    Implementation of sysenter is necessarily partial.  sysenter is a
     83    kind of system call entry.  When doing a sysenter, the return
     84    address is not known -- that is something that is beyond Vex's
     85    knowledge.  So the generated IR forces a return to the scheduler,
   which can do what it likes to simulate the sysenter, but it MUST
     87    set this thread's guest_EIP field with the continuation address
     88    before resuming execution.  If that doesn't happen, the thread will
     89    jump to address zero, which is probably fatal.
     90 
     91    This module uses global variables and so is not MT-safe (if that
     92    should ever become relevant).
     93 
     94    The delta values are 32-bit ints, not 64-bit ints.  That means
     95    this module may not work right if run on a 64-bit host.  That should
     96    be fixed properly, really -- if anyone ever wants to use Vex to
     97    translate x86 code for execution on a 64-bit host.
     98 
     99    casLE (implementation of lock-prefixed insns) and rep-prefixed
    100    insns: the side-exit back to the start of the insn is done with
    101    Ijk_Boring.  This is quite wrong, it should be done with
    102    Ijk_NoRedir, since otherwise the side exit, which is intended to
    103    restart the instruction for whatever reason, could go somewhere
    104    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    105    no-redir jumps performance critical, at least for rep-prefixed
    106    instructions, since all iterations thereof would involve such a
    107    jump.  It's not such a big deal with casLE since the side exit is
    108    only taken if the CAS fails, that is, the location is contended,
    109    which is relatively unlikely.
    110 
    111    XXXX: Nov 2009: handling of SWP on ARM suffers from the same
    112    problem.
    113 
    114    Note also, the test for CAS success vs failure is done using
    115    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    116    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    117    shouldn't definedness-check these comparisons.  See
    118    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    119    background/rationale.
    120 */
    121 
    122 /* Performance holes:
    123 
    124    - fcom ; fstsw %ax ; sahf
    125      sahf does not update the O flag (sigh) and so O needs to
    126      be computed.  This is done expensively; it would be better
    127      to have a calculate_eflags_o helper.
    128 
    129    - emwarns; some FP codes can generate huge numbers of these
    130      if the fpucw is changed in an inner loop.  It would be
    131      better for the guest state to have an emwarn-enable reg
    132      which can be set zero or nonzero.  If it is zero, emwarns
    133      are not flagged, and instead control just flows all the
    134      way through bbs as usual.
    135 */
    136 
    137 /* "Special" instructions.
    138 
    139    This instruction decoder can decode three special instructions
    140    which mean nothing natively (are no-ops as far as regs/mem are
    141    concerned) but have meaning for supporting Valgrind.  A special
    142    instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
    143    C1C713 (in the standard interpretation, that means: roll $3, %edi;
    144    roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
    145    one of the following 3 are allowed (standard interpretation in
    146    parentheses):
    147 
    148       87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
    149       87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
    150       87D2 (xchgl %edx,%edx)   call-noredir *%EAX
    151 
    152    Any other bytes following the 12-byte preamble are illegal and
    153    constitute a failure in instruction decoding.  This all assumes
    154    that the preamble will never occur except in specific code
    155    fragments designed for Valgrind to catch.
    156 
    157    No prefixes may precede a "Special" instruction.
    158 */
    159 
    160 /* LOCK prefixed instructions.  These are translated using IR-level
    161    CAS statements (IRCAS) and are believed to preserve atomicity, even
    162    from the point of view of some other process racing against a
    163    simulated one (presumably they communicate via a shared memory
    164    segment).
    165 
    166    Handlers which are aware of LOCK prefixes are:
    167       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    168       dis_cmpxchg_G_E  (cmpxchg)
    169       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    170       dis_Grp3         (not, neg)
    171       dis_Grp4         (inc, dec)
    172       dis_Grp5         (inc, dec)
    173       dis_Grp8_Imm     (bts, btc, btr)
    174       dis_bt_G_E       (bts, btc, btr)
    175       dis_xadd_G_E     (xadd)
    176 */
    177 
    178 
    179 #include "libvex_basictypes.h"
    180 #include "libvex_ir.h"
    181 #include "libvex.h"
    182 #include "libvex_guest_x86.h"
    183 
    184 #include "main_util.h"
    185 #include "main_globals.h"
    186 #include "guest_generic_bb_to_IR.h"
    187 #include "guest_generic_x87.h"
    188 #include "guest_x86_defs.h"
    189 
    190 
    191 /*------------------------------------------------------------*/
    192 /*--- Globals                                              ---*/
    193 /*------------------------------------------------------------*/
    194 
/* These are set at the start of the translation of an insn, right
   down in disInstr_X86, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* We need to know this to do sub-register accesses correctly.  The
   register-offset helpers in this file (integerGuestRegOffset and
   the xmmGuestRegLane*offset functions) assert that it is False. */
static Bool host_is_bigendian;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed).  The instruction-stream fetch helpers
   (getIByte, getUDisp*, getSDisp*) index this array with their
   'delta' argument. */
static UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr32 guest_EIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr32 guest_EIP_curr_instr;

/* The IRSB* into which we're generating code.  stmt() appends to it
   and newTemp() allocates temporaries in its type environment. */
static IRSB* irsb;
    216 
    217 
    218 /*------------------------------------------------------------*/
    219 /*--- Debugging output                                     ---*/
    220 /*------------------------------------------------------------*/
    221 
    222 #define DIP(format, args...)           \
    223    if (vex_traceflags & VEX_TRACE_FE)  \
    224       vex_printf(format, ## args)
    225 
    226 #define DIS(buf, format, args...)      \
    227    if (vex_traceflags & VEX_TRACE_FE)  \
    228       vex_sprintf(buf, format, ## args)
    229 
    230 
    231 /*------------------------------------------------------------*/
    232 /*--- Offsets of various parts of the x86 guest state.     ---*/
    233 /*------------------------------------------------------------*/
    234 
    235 #define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
    236 #define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
    237 #define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
    238 #define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
    239 #define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
    240 #define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
    241 #define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
    242 #define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
    243 
    244 #define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
    245 
    246 #define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
    247 #define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
    248 #define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
    249 #define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
    250 
    251 #define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
    252 #define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
    253 #define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
    254 #define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
    255 #define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
    256 #define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
    257 #define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
    258 #define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
    259 
    260 #define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
    261 #define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
    262 #define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
    263 #define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
    264 #define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
    265 #define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
    266 #define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
    267 #define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
    268 
    269 #define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
    270 #define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
    271 #define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
    272 #define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
    273 #define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
    274 #define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
    275 #define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
    276 #define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
    277 #define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
    278 
    279 #define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)
    280 
    281 #define OFFB_TISTART   offsetof(VexGuestX86State,guest_TISTART)
    282 #define OFFB_TILEN     offsetof(VexGuestX86State,guest_TILEN)
    283 #define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
    284 
    285 #define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
    286 
    287 
    288 /*------------------------------------------------------------*/
    289 /*--- Helper bits and pieces for deconstructing the        ---*/
    290 /*--- x86 insn stream.                                     ---*/
    291 /*------------------------------------------------------------*/
    292 
    293 /* This is the Intel register encoding -- integer regs. */
    294 #define R_EAX 0
    295 #define R_ECX 1
    296 #define R_EDX 2
    297 #define R_EBX 3
    298 #define R_ESP 4
    299 #define R_EBP 5
    300 #define R_ESI 6
    301 #define R_EDI 7
    302 
    303 #define R_AL (0+R_EAX)
    304 #define R_AH (4+R_EAX)
    305 
    306 /* This is the Intel register encoding -- segment regs. */
    307 #define R_ES 0
    308 #define R_CS 1
    309 #define R_SS 2
    310 #define R_DS 3
    311 #define R_FS 4
    312 #define R_GS 5
    313 
    314 
/* Append statement 'st' to the IRSB currently being generated (the
   file-scope 'irsb'). */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}
    320 
    321 /* Generate a new temporary of the given type. */
    322 static IRTemp newTemp ( IRType ty )
    323 {
    324    vassert(isPlausibleIRType(ty));
    325    return newIRTemp( irsb->tyenv, ty );
    326 }
    327 
    328 /* Various simple conversions */
    329 
    330 static UInt extend_s_8to32( UInt x )
    331 {
    332    return (UInt)((((Int)x) << 24) >> 24);
    333 }
    334 
    335 static UInt extend_s_16to32 ( UInt x )
    336 {
    337    return (UInt)((((Int)x) << 16) >> 16);
    338 }
    339 
    340 /* Fetch a byte from the guest insn stream. */
    341 static UChar getIByte ( Int delta )
    342 {
    343    return guest_code[delta];
    344 }
    345 
    346 /* Extract the reg field from a modRM byte. */
    347 static Int gregOfRM ( UChar mod_reg_rm )
    348 {
    349    return (Int)( (mod_reg_rm >> 3) & 7 );
    350 }
    351 
    352 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    353    register or memory.  If so, the byte will have the form 11XXXYYY,
    354    where YYY is the register number. */
    355 static Bool epartIsReg ( UChar mod_reg_rm )
    356 {
    357    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    358 }
    359 
    360 /* ... and extract the register number ... */
    361 static Int eregOfRM ( UChar mod_reg_rm )
    362 {
    363    return (Int)(mod_reg_rm & 0x7);
    364 }
    365 
    366 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    367 
    368 static UChar getUChar ( Int delta )
    369 {
    370    UChar v = guest_code[delta+0];
    371    return toUChar(v);
    372 }
    373 
    374 static UInt getUDisp16 ( Int delta )
    375 {
    376    UInt v = guest_code[delta+1]; v <<= 8;
    377    v |= guest_code[delta+0];
    378    return v & 0xFFFF;
    379 }
    380 
    381 static UInt getUDisp32 ( Int delta )
    382 {
    383    UInt v = guest_code[delta+3]; v <<= 8;
    384    v |= guest_code[delta+2]; v <<= 8;
    385    v |= guest_code[delta+1]; v <<= 8;
    386    v |= guest_code[delta+0];
    387    return v;
    388 }
    389 
    390 static UInt getUDisp ( Int size, Int delta )
    391 {
    392    switch (size) {
    393       case 4: return getUDisp32(delta);
    394       case 2: return getUDisp16(delta);
    395       case 1: return (UInt)getUChar(delta);
    396       default: vpanic("getUDisp(x86)");
    397    }
    398    return 0; /*notreached*/
    399 }
    400 
    401 
    402 /* Get a byte value out of the insn stream and sign-extend to 32
    403    bits. */
    404 static UInt getSDisp8 ( Int delta )
    405 {
    406    return extend_s_8to32( (UInt) (guest_code[delta]) );
    407 }
    408 
    409 static UInt getSDisp16 ( Int delta0 )
    410 {
    411    UChar* eip = (UChar*)(&guest_code[delta0]);
    412    UInt d = *eip++;
    413    d |= ((*eip++) << 8);
    414    return extend_s_16to32(d);
    415 }
    416 
    417 static UInt getSDisp ( Int size, Int delta )
    418 {
    419    switch (size) {
    420       case 4: return getUDisp32(delta);
    421       case 2: return getSDisp16(delta);
    422       case 1: return getSDisp8(delta);
    423       default: vpanic("getSDisp(x86)");
    424   }
    425   return 0; /*notreached*/
    426 }
    427 
    428 
    429 /*------------------------------------------------------------*/
    430 /*--- Helpers for constructing IR.                         ---*/
    431 /*------------------------------------------------------------*/
    432 
/* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
   register references, we need to take the host endianness into
   account.  Supplied value is 0 .. 7 and in the Intel instruction
   encoding. */
    437 
    438 static IRType szToITy ( Int n )
    439 {
    440    switch (n) {
    441       case 1: return Ity_I8;
    442       case 2: return Ity_I16;
    443       case 4: return Ity_I32;
    444       default: vpanic("szToITy(x86)");
    445    }
    446 }
    447 
    448 /* On a little-endian host, less significant bits of the guest
    449    registers are at lower addresses.  Therefore, if a reference to a
    450    register low half has the safe guest state offset as a reference to
    451    the full register.
    452 */
    453 static Int integerGuestRegOffset ( Int sz, UInt archreg )
    454 {
    455    vassert(archreg < 8);
    456 
    457    /* Correct for little-endian host only. */
    458    vassert(!host_is_bigendian);
    459 
    460    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
    461       switch (archreg) {
    462          case R_EAX: return OFFB_EAX;
    463          case R_EBX: return OFFB_EBX;
    464          case R_ECX: return OFFB_ECX;
    465          case R_EDX: return OFFB_EDX;
    466          case R_ESI: return OFFB_ESI;
    467          case R_EDI: return OFFB_EDI;
    468          case R_ESP: return OFFB_ESP;
    469          case R_EBP: return OFFB_EBP;
    470          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
    471       }
    472    }
    473 
    474    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    475    switch (archreg-4) {
    476       case R_EAX: return 1+ OFFB_EAX;
    477       case R_EBX: return 1+ OFFB_EBX;
    478       case R_ECX: return 1+ OFFB_ECX;
    479       case R_EDX: return 1+ OFFB_EDX;
    480       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    481    }
    482 
    483    /* NOTREACHED */
    484    vpanic("integerGuestRegOffset(x86,le)");
    485 }
    486 
    487 static Int segmentGuestRegOffset ( UInt sreg )
    488 {
    489    switch (sreg) {
    490       case R_ES: return OFFB_ES;
    491       case R_CS: return OFFB_CS;
    492       case R_SS: return OFFB_SS;
    493       case R_DS: return OFFB_DS;
    494       case R_FS: return OFFB_FS;
    495       case R_GS: return OFFB_GS;
    496       default: vpanic("segmentGuestRegOffset(x86)");
    497    }
    498 }
    499 
    500 static Int xmmGuestRegOffset ( UInt xmmreg )
    501 {
    502    switch (xmmreg) {
    503       case 0: return OFFB_XMM0;
    504       case 1: return OFFB_XMM1;
    505       case 2: return OFFB_XMM2;
    506       case 3: return OFFB_XMM3;
    507       case 4: return OFFB_XMM4;
    508       case 5: return OFFB_XMM5;
    509       case 6: return OFFB_XMM6;
    510       case 7: return OFFB_XMM7;
    511       default: vpanic("xmmGuestRegOffset");
    512    }
    513 }
    514 
    515 /* Lanes of vector registers are always numbered from zero being the
    516    least significant lane (rightmost in the register).  */
    517 
    518 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
    519 {
    520    /* Correct for little-endian host only. */
    521    vassert(!host_is_bigendian);
    522    vassert(laneno >= 0 && laneno < 8);
    523    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
    524 }
    525 
    526 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
    527 {
    528    /* Correct for little-endian host only. */
    529    vassert(!host_is_bigendian);
    530    vassert(laneno >= 0 && laneno < 4);
    531    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
    532 }
    533 
    534 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
    535 {
    536    /* Correct for little-endian host only. */
    537    vassert(!host_is_bigendian);
    538    vassert(laneno >= 0 && laneno < 2);
    539    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
    540 }
    541 
    542 static IRExpr* getIReg ( Int sz, UInt archreg )
    543 {
    544    vassert(sz == 1 || sz == 2 || sz == 4);
    545    vassert(archreg < 8);
    546    return IRExpr_Get( integerGuestRegOffset(sz,archreg),
    547                       szToITy(sz) );
    548 }
    549 
    550 /* Ditto, but write to a reg instead. */
    551 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
    552 {
    553    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    554    switch (sz) {
    555       case 1: vassert(ty == Ity_I8); break;
    556       case 2: vassert(ty == Ity_I16); break;
    557       case 4: vassert(ty == Ity_I32); break;
    558       default: vpanic("putIReg(x86)");
    559    }
    560    vassert(archreg < 8);
    561    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
    562 }
    563 
    564 static IRExpr* getSReg ( UInt sreg )
    565 {
    566    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
    567 }
    568 
    569 static void putSReg ( UInt sreg, IRExpr* e )
    570 {
    571    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    572    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
    573 }
    574 
    575 static IRExpr* getXMMReg ( UInt xmmreg )
    576 {
    577    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
    578 }
    579 
    580 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
    581 {
    582    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
    583 }
    584 
    585 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
    586 {
    587    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
    588 }
    589 
    590 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
    591 {
    592    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
    593 }
    594 
    595 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
    596 {
    597    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
    598 }
    599 
    600 static void putXMMReg ( UInt xmmreg, IRExpr* e )
    601 {
    602    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
    603    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
    604 }
    605 
    606 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
    607 {
    608    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
    609    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    610 }
    611 
    612 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
    613 {
    614    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
    615    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    616 }
    617 
    618 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
    619 {
    620    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
    621    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    622 }
    623 
    624 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
    625 {
    626    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
    627    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    628 }
    629 
    630 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
    631 {
    632    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    633    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
    634 }
    635 
/* Emit a statement that assigns expression 'e' to temporary
   'dst'. */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}
    640 
/* Emit a little-endian store of 'data' to the address given by
   'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}
    645 
    646 static IRExpr* unop ( IROp op, IRExpr* a )
    647 {
    648    return IRExpr_Unop(op, a);
    649 }
    650 
    651 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    652 {
    653    return IRExpr_Binop(op, a1, a2);
    654 }
    655 
    656 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    657 {
    658    return IRExpr_Triop(op, a1, a2, a3);
    659 }
    660 
    661 static IRExpr* mkexpr ( IRTemp tmp )
    662 {
    663    return IRExpr_RdTmp(tmp);
    664 }
    665 
    666 static IRExpr* mkU8 ( UInt i )
    667 {
    668    vassert(i < 256);
    669    return IRExpr_Const(IRConst_U8( (UChar)i ));
    670 }
    671 
    672 static IRExpr* mkU16 ( UInt i )
    673 {
    674    vassert(i < 65536);
    675    return IRExpr_Const(IRConst_U16( (UShort)i ));
    676 }
    677 
    678 static IRExpr* mkU32 ( UInt i )
    679 {
    680    return IRExpr_Const(IRConst_U32(i));
    681 }
    682 
    683 static IRExpr* mkU64 ( ULong i )
    684 {
    685    return IRExpr_Const(IRConst_U64(i));
    686 }
    687 
    688 static IRExpr* mkU ( IRType ty, UInt i )
    689 {
    690    if (ty == Ity_I8)  return mkU8(i);
    691    if (ty == Ity_I16) return mkU16(i);
    692    if (ty == Ity_I32) return mkU32(i);
    693    /* If this panics, it usually means you passed a size (1,2,4)
    694       value as the IRType, rather than a real IRType. */
    695    vpanic("mkU(x86)");
    696 }
    697 
    698 static IRExpr* mkV128 ( UShort mask )
    699 {
    700    return IRExpr_Const(IRConst_V128(mask));
    701 }
    702 
    703 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    704 {
    705    return IRExpr_Load(Iend_LE, ty, addr);
    706 }
    707 
    708 static IROp mkSizedOp ( IRType ty, IROp op8 )
    709 {
    710    Int adj;
    711    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    712    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    713            || op8 == Iop_Mul8
    714            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    715            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    716            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    717            || op8 == Iop_CasCmpNE8
    718            || op8 == Iop_Not8);
    719    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    720    return adj + op8;
    721 }
    722 
    723 static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
    724 {
    725    if (szSmall == 1 && szBig == 4) {
    726       return signd ? Iop_8Sto32 : Iop_8Uto32;
    727    }
    728    if (szSmall == 1 && szBig == 2) {
    729       return signd ? Iop_8Sto16 : Iop_8Uto16;
    730    }
    731    if (szSmall == 2 && szBig == 4) {
    732       return signd ? Iop_16Sto32 : Iop_16Uto32;
    733    }
    734    vpanic("mkWidenOp(x86,guest)");
    735 }
    736 
    737 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
    738 {
    739    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
    740    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
    741    return unop(Iop_32to1,
    742                binop(Iop_And32,
    743                      unop(Iop_1Uto32,x),
    744                      unop(Iop_1Uto32,y)));
    745 }
    746 
    747 /* Generate a compare-and-swap operation, operating on memory at
    748    'addr'.  The expected value is 'expVal' and the new value is
    749    'newVal'.  If the operation fails, then transfer control (with a
    750    no-redir jump (XXX no -- see comment at top of this file)) to
    751    'restart_point', which is presumably the address of the guest
    752    instruction again -- retrying, essentially. */
    753 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
    754                     Addr32 restart_point )
    755 {
    756    IRCAS* cas;
    757    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
    758    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
    759    IRTemp oldTmp = newTemp(tyE);
    760    IRTemp expTmp = newTemp(tyE);
    761    vassert(tyE == tyN);
    762    vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
    763    assign(expTmp, expVal);
    764    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
    765                   NULL, mkexpr(expTmp), NULL, newVal );
    766    stmt( IRStmt_CAS(cas) );
    767    stmt( IRStmt_Exit(
    768             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
    769                    mkexpr(oldTmp), mkexpr(expTmp) ),
    770             Ijk_Boring, /*Ijk_NoRedir*/
    771             IRConst_U32( restart_point )
    772          ));
    773 }
    774 
    775 
    776 /*------------------------------------------------------------*/
    777 /*--- Helpers for %eflags.                                 ---*/
    778 /*------------------------------------------------------------*/
    779 
    780 /* -------------- Evaluating the flags-thunk. -------------- */
    781 
    782 /* Build IR to calculate all the eflags from stored
    783    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    784    Ity_I32. */
    785 static IRExpr* mk_x86g_calculate_eflags_all ( void )
    786 {
    787    IRExpr** args
    788       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    789                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    790                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    791                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    792    IRExpr* call
    793       = mkIRExprCCall(
    794            Ity_I32,
    795            0/*regparm*/,
    796            "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
    797            args
    798         );
    799    /* Exclude OP and NDEP from definedness checking.  We're only
    800       interested in DEP1 and DEP2. */
    801    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    802    return call;
    803 }
    804 
    805 /* Build IR to calculate some particular condition from stored
    806    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    807    Ity_Bit. */
    808 static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
    809 {
    810    IRExpr** args
    811       = mkIRExprVec_5( mkU32(cond),
    812                        IRExpr_Get(OFFB_CC_OP,  Ity_I32),
    813                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    814                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    815                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    816    IRExpr* call
    817       = mkIRExprCCall(
    818            Ity_I32,
    819            0/*regparm*/,
    820            "x86g_calculate_condition", &x86g_calculate_condition,
    821            args
    822         );
    823    /* Exclude the requested condition, OP and NDEP from definedness
    824       checking.  We're only interested in DEP1 and DEP2. */
    825    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
    826    return unop(Iop_32to1, call);
    827 }
    828 
    829 /* Build IR to calculate just the carry flag from stored
    830    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
    831 static IRExpr* mk_x86g_calculate_eflags_c ( void )
    832 {
    833    IRExpr** args
    834       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    835                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    836                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    837                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    838    IRExpr* call
    839       = mkIRExprCCall(
    840            Ity_I32,
    841            3/*regparm*/,
    842            "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
    843            args
    844         );
    845    /* Exclude OP and NDEP from definedness checking.  We're only
    846       interested in DEP1 and DEP2. */
    847    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    848    return call;
    849 }
    850 
    851 
    852 /* -------------- Building the flags-thunk. -------------- */
    853 
    854 /* The machinery in this section builds the flag-thunk following a
    855    flag-setting operation.  Hence the various setFlags_* functions.
    856 */
    857 
    858 static Bool isAddSub ( IROp op8 )
    859 {
    860    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
    861 }
    862 
    863 static Bool isLogic ( IROp op8 )
    864 {
    865    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
    866 }
    867 
    868 /* U-widen 8/16/32 bit int expr to 32. */
    869 static IRExpr* widenUto32 ( IRExpr* e )
    870 {
    871    switch (typeOfIRExpr(irsb->tyenv,e)) {
    872       case Ity_I32: return e;
    873       case Ity_I16: return unop(Iop_16Uto32,e);
    874       case Ity_I8:  return unop(Iop_8Uto32,e);
    875       default: vpanic("widenUto32");
    876    }
    877 }
    878 
    879 /* S-widen 8/16/32 bit int expr to 32. */
    880 static IRExpr* widenSto32 ( IRExpr* e )
    881 {
    882    switch (typeOfIRExpr(irsb->tyenv,e)) {
    883       case Ity_I32: return e;
    884       case Ity_I16: return unop(Iop_16Sto32,e);
    885       case Ity_I8:  return unop(Iop_8Sto32,e);
    886       default: vpanic("widenSto32");
    887    }
    888 }
    889 
    890 /* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
    891    of these combinations make sense. */
    892 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
    893 {
    894    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
    895    if (src_ty == dst_ty)
    896       return e;
    897    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
    898       return unop(Iop_32to16, e);
    899    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
    900       return unop(Iop_32to8, e);
    901 
    902    vex_printf("\nsrc, dst tys are: ");
    903    ppIRType(src_ty);
    904    vex_printf(", ");
    905    ppIRType(dst_ty);
    906    vex_printf("\n");
    907    vpanic("narrowTo(x86)");
    908 }
    909 
    910 
    911 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
    912    auto-sized up to the real op. */
    913 
    914 static
    915 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
    916 {
    917    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    918 
    919    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    920 
    921    switch (op8) {
    922       case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
    923       case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
    924       default:       ppIROp(op8);
    925                      vpanic("setFlags_DEP1_DEP2(x86)");
    926    }
    927    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    928    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    929    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
    930    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    931       elimination of previous stores to this field work better. */
    932    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    933 }
    934 
    935 
    936 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
    937 
    938 static
    939 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
    940 {
    941    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    942 
    943    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    944 
    945    switch (op8) {
    946       case Iop_Or8:
    947       case Iop_And8:
    948       case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
    949       default:       ppIROp(op8);
    950                      vpanic("setFlags_DEP1(x86)");
    951    }
    952    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    953    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    954    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
    955    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    956       elimination of previous stores to this field work better. */
    957    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    958 }
    959 
    960 
    961 /* For shift operations, we put in the result and the undershifted
    962    result.  Except if the shift amount is zero, the thunk is left
    963    unchanged. */
    964 
    965 static void setFlags_DEP1_DEP2_shift ( IROp    op32,
    966                                        IRTemp  res,
    967                                        IRTemp  resUS,
    968                                        IRType  ty,
    969                                        IRTemp  guard )
    970 {
    971    Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
    972 
    973    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    974    vassert(guard);
    975 
    976    /* Both kinds of right shifts are handled by the same thunk
    977       operation. */
    978    switch (op32) {
    979       case Iop_Shr32:
    980       case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
    981       case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
    982       default:        ppIROp(op32);
    983                       vpanic("setFlags_DEP1_DEP2_shift(x86)");
    984    }
    985 
    986    /* DEP1 contains the result, DEP2 contains the undershifted value. */
    987    stmt( IRStmt_Put( OFFB_CC_OP,
    988                      IRExpr_Mux0X( mkexpr(guard),
    989                                    IRExpr_Get(OFFB_CC_OP,Ity_I32),
    990                                    mkU32(ccOp))) );
    991    stmt( IRStmt_Put( OFFB_CC_DEP1,
    992                      IRExpr_Mux0X( mkexpr(guard),
    993                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
    994                                    widenUto32(mkexpr(res)))) );
    995    stmt( IRStmt_Put( OFFB_CC_DEP2,
    996                      IRExpr_Mux0X( mkexpr(guard),
    997                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
    998                                    widenUto32(mkexpr(resUS)))) );
    999    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1000       elimination of previous stores to this field work better. */
   1001    stmt( IRStmt_Put( OFFB_CC_NDEP,
   1002                      IRExpr_Mux0X( mkexpr(guard),
   1003                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
   1004 				   mkU32(0) )));
   1005 }
   1006 
   1007 
   1008 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1009    the former value of the carry flag, which unfortunately we have to
   1010    compute. */
   1011 
   1012 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1013 {
   1014    Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
   1015 
   1016    ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   1017    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   1018 
   1019    /* This has to come first, because calculating the C flag
   1020       may require reading all four thunk fields. */
   1021    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   1022    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   1023    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   1024    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   1025 }
   1026 
   1027 
   1028 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1029    two arguments. */
   1030 
   1031 static
   1032 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
   1033 {
   1034    switch (ty) {
   1035       case Ity_I8:
   1036          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
   1037          break;
   1038       case Ity_I16:
   1039          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
   1040          break;
   1041       case Ity_I32:
   1042          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
   1043          break;
   1044       default:
   1045          vpanic("setFlags_MUL(x86)");
   1046    }
   1047    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   1048    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   1049    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1050       elimination of previous stores to this field work better. */
   1051    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1052 }
   1053 
   1054 
   1055 /* -------------- Condition codes. -------------- */
   1056 
   1057 /* Condition codes, using the Intel encoding.  */
   1058 
   1059 static HChar* name_X86Condcode ( X86Condcode cond )
   1060 {
   1061    switch (cond) {
   1062       case X86CondO:      return "o";
   1063       case X86CondNO:     return "no";
   1064       case X86CondB:      return "b";
   1065       case X86CondNB:     return "nb";
   1066       case X86CondZ:      return "z";
   1067       case X86CondNZ:     return "nz";
   1068       case X86CondBE:     return "be";
   1069       case X86CondNBE:    return "nbe";
   1070       case X86CondS:      return "s";
   1071       case X86CondNS:     return "ns";
   1072       case X86CondP:      return "p";
   1073       case X86CondNP:     return "np";
   1074       case X86CondL:      return "l";
   1075       case X86CondNL:     return "nl";
   1076       case X86CondLE:     return "le";
   1077       case X86CondNLE:    return "nle";
   1078       case X86CondAlways: return "ALWAYS";
   1079       default: vpanic("name_X86Condcode");
   1080    }
   1081 }
   1082 
   1083 static
   1084 X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
   1085                                       Bool*        needInvert )
   1086 {
   1087    vassert(cond >= X86CondO && cond <= X86CondNLE);
   1088    if (cond & 1) {
   1089       *needInvert = True;
   1090       return cond-1;
   1091    } else {
   1092       *needInvert = False;
   1093       return cond;
   1094    }
   1095 }
   1096 
   1097 
   1098 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1099 
   1100 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1101    appropriately.
   1102 
   1103    Optionally, generate a store for the 'tres' value.  This can either
   1104    be a normal store, or it can be a cas-with-possible-failure style
   1105    store:
   1106 
   1107    if taddr is IRTemp_INVALID, then no store is generated.
   1108 
   1109    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1110    the address) is generated:
   1111 
   1112      if texpVal is IRTemp_INVALID then a normal store is
   1113      generated, and restart_point must be zero (it is irrelevant).
   1114 
   1115      if texpVal is not IRTemp_INVALID then a cas-style store is
   1116      generated.  texpVal is the expected value, restart_point
   1117      is the restart point if the store fails, and texpVal must
   1118      have the same type as tres.
   1119 */
   1120 static void helper_ADC ( Int sz,
   1121                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1122                          /* info about optional store: */
   1123                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1124 {
   1125    UInt    thunkOp;
   1126    IRType  ty    = szToITy(sz);
   1127    IRTemp  oldc  = newTemp(Ity_I32);
   1128    IRTemp  oldcn = newTemp(ty);
   1129    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   1130    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1131 
   1132    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1133    vassert(sz == 1 || sz == 2 || sz == 4);
   1134    thunkOp = sz==4 ? X86G_CC_OP_ADCL
   1135                    : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
   1136 
   1137    /* oldc = old carry flag, 0 or 1 */
   1138    assign( oldc,  binop(Iop_And32,
   1139                         mk_x86g_calculate_eflags_c(),
   1140                         mkU32(1)) );
   1141 
   1142    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1143 
   1144    assign( tres, binop(plus,
   1145                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   1146                        mkexpr(oldcn)) );
   1147 
   1148    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1149       start of this function. */
   1150    if (taddr != IRTemp_INVALID) {
   1151       if (texpVal == IRTemp_INVALID) {
   1152          vassert(restart_point == 0);
   1153          storeLE( mkexpr(taddr), mkexpr(tres) );
   1154       } else {
   1155          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1156          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1157          casLE( mkexpr(taddr),
   1158                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1159       }
   1160    }
   1161 
   1162    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1163    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   1164    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1165                                                          mkexpr(oldcn)) )) );
   1166    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1167 }
   1168 
   1169 
   1170 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1171    appropriately.  As with helper_ADC, possibly generate a store of
   1172    the result -- see comments on helper_ADC for details.
   1173 */
   1174 static void helper_SBB ( Int sz,
   1175                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1176                          /* info about optional store: */
   1177                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1178 {
   1179    UInt    thunkOp;
   1180    IRType  ty    = szToITy(sz);
   1181    IRTemp  oldc  = newTemp(Ity_I32);
   1182    IRTemp  oldcn = newTemp(ty);
   1183    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   1184    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1185 
   1186    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1187    vassert(sz == 1 || sz == 2 || sz == 4);
   1188    thunkOp = sz==4 ? X86G_CC_OP_SBBL
   1189                    : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
   1190 
   1191    /* oldc = old carry flag, 0 or 1 */
   1192    assign( oldc, binop(Iop_And32,
   1193                        mk_x86g_calculate_eflags_c(),
   1194                        mkU32(1)) );
   1195 
   1196    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1197 
   1198    assign( tres, binop(minus,
   1199                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   1200                        mkexpr(oldcn)) );
   1201 
   1202    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1203       start of this function. */
   1204    if (taddr != IRTemp_INVALID) {
   1205       if (texpVal == IRTemp_INVALID) {
   1206          vassert(restart_point == 0);
   1207          storeLE( mkexpr(taddr), mkexpr(tres) );
   1208       } else {
   1209          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1210          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1211          casLE( mkexpr(taddr),
   1212                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1213       }
   1214    }
   1215 
   1216    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1217    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   1218    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1219                                                          mkexpr(oldcn)) )) );
   1220    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1221 }
   1222 
   1223 
   1224 /* -------------- Helpers for disassembly printing. -------------- */
   1225 
   1226 static HChar* nameGrp1 ( Int opc_aux )
   1227 {
   1228    static HChar* grp1_names[8]
   1229      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1230    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   1231    return grp1_names[opc_aux];
   1232 }
   1233 
   1234 static HChar* nameGrp2 ( Int opc_aux )
   1235 {
   1236    static HChar* grp2_names[8]
   1237      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1238    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   1239    return grp2_names[opc_aux];
   1240 }
   1241 
   1242 static HChar* nameGrp4 ( Int opc_aux )
   1243 {
   1244    static HChar* grp4_names[8]
   1245      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1246    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   1247    return grp4_names[opc_aux];
   1248 }
   1249 
   1250 static HChar* nameGrp5 ( Int opc_aux )
   1251 {
   1252    static HChar* grp5_names[8]
   1253      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1254    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   1255    return grp5_names[opc_aux];
   1256 }
   1257 
   1258 static HChar* nameGrp8 ( Int opc_aux )
   1259 {
   1260    static HChar* grp8_names[8]
   1261      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   1262    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   1263    return grp8_names[opc_aux];
   1264 }
   1265 
   1266 static HChar* nameIReg ( Int size, Int reg )
   1267 {
   1268    static HChar* ireg32_names[8]
   1269      = { "%eax", "%ecx", "%edx", "%ebx",
   1270          "%esp", "%ebp", "%esi", "%edi" };
   1271    static HChar* ireg16_names[8]
   1272      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   1273    static HChar* ireg8_names[8]
   1274      = { "%al", "%cl", "%dl", "%bl",
   1275          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   1276    if (reg < 0 || reg > 7) goto bad;
   1277    switch (size) {
   1278       case 4: return ireg32_names[reg];
   1279       case 2: return ireg16_names[reg];
   1280       case 1: return ireg8_names[reg];
   1281    }
   1282   bad:
   1283    vpanic("nameIReg(X86)");
   1284    return NULL; /*notreached*/
   1285 }
   1286 
   1287 static HChar* nameSReg ( UInt sreg )
   1288 {
   1289    switch (sreg) {
   1290       case R_ES: return "%es";
   1291       case R_CS: return "%cs";
   1292       case R_SS: return "%ss";
   1293       case R_DS: return "%ds";
   1294       case R_FS: return "%fs";
   1295       case R_GS: return "%gs";
   1296       default: vpanic("nameSReg(x86)");
   1297    }
   1298 }
   1299 
   1300 static HChar* nameMMXReg ( Int mmxreg )
   1301 {
   1302    static HChar* mmx_names[8]
   1303      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   1304    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   1305    return mmx_names[mmxreg];
   1306 }
   1307 
   1308 static HChar* nameXMMReg ( Int xmmreg )
   1309 {
   1310    static HChar* xmm_names[8]
   1311      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
   1312          "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   1313    if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   1314    return xmm_names[xmmreg];
   1315 }
   1316 
   1317 static HChar* nameMMXGran ( Int gran )
   1318 {
   1319    switch (gran) {
   1320       case 0: return "b";
   1321       case 1: return "w";
   1322       case 2: return "d";
   1323       case 3: return "q";
   1324       default: vpanic("nameMMXGran(x86,guest)");
   1325    }
   1326 }
   1327 
   1328 static HChar nameISize ( Int size )
   1329 {
   1330    switch (size) {
   1331       case 4: return 'l';
   1332       case 2: return 'w';
   1333       case 1: return 'b';
   1334       default: vpanic("nameISize(x86)");
   1335    }
   1336 }
   1337 
   1338 
   1339 /*------------------------------------------------------------*/
   1340 /*--- JMP helpers                                          ---*/
   1341 /*------------------------------------------------------------*/
   1342 
   1343 static void jmp_lit( IRJumpKind kind, Addr32 d32 )
   1344 {
   1345    irsb->next     = mkU32(d32);
   1346    irsb->jumpkind = kind;
   1347 }
   1348 
   1349 static void jmp_treg( IRJumpKind kind, IRTemp t )
   1350 {
   1351    irsb->next = mkexpr(t);
   1352    irsb->jumpkind = kind;
   1353 }
   1354 
   1355 static
   1356 void jcc_01( X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
   1357 {
   1358    Bool        invert;
   1359    X86Condcode condPos;
   1360    condPos = positiveIse_X86Condcode ( cond, &invert );
   1361    if (invert) {
   1362       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1363                          Ijk_Boring,
   1364                          IRConst_U32(d32_false) ) );
   1365       irsb->next     = mkU32(d32_true);
   1366       irsb->jumpkind = Ijk_Boring;
   1367    } else {
   1368       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1369                          Ijk_Boring,
   1370                          IRConst_U32(d32_true) ) );
   1371       irsb->next     = mkU32(d32_false);
   1372       irsb->jumpkind = Ijk_Boring;
   1373    }
   1374 }
   1375 
   1376 
   1377 /*------------------------------------------------------------*/
   1378 /*--- Disassembling addressing modes                       ---*/
   1379 /*------------------------------------------------------------*/
   1380 
   1381 static
   1382 HChar* sorbTxt ( UChar sorb )
   1383 {
   1384    switch (sorb) {
   1385       case 0:    return ""; /* no override */
   1386       case 0x3E: return "%ds";
   1387       case 0x26: return "%es:";
   1388       case 0x64: return "%fs:";
   1389       case 0x65: return "%gs:";
   1390       default: vpanic("sorbTxt(x86,guest)");
   1391    }
   1392 }
   1393 
   1394 
   1395 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   1396    linear address by adding any required segment override as indicated
   1397    by sorb. */
   1398 static
   1399 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
   1400 {
   1401    Int    sreg;
   1402    IRType hWordTy;
   1403    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   1404 
   1405    if (sorb == 0)
   1406       /* the common case - no override */
   1407       return virtual;
   1408 
   1409    switch (sorb) {
   1410       case 0x3E: sreg = R_DS; break;
   1411       case 0x26: sreg = R_ES; break;
   1412       case 0x64: sreg = R_FS; break;
   1413       case 0x65: sreg = R_GS; break;
   1414       default: vpanic("handleSegOverride(x86,guest)");
   1415    }
   1416 
   1417    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   1418 
   1419    seg_selector = newTemp(Ity_I32);
   1420    ldt_ptr      = newTemp(hWordTy);
   1421    gdt_ptr      = newTemp(hWordTy);
   1422    r64          = newTemp(Ity_I64);
   1423 
   1424    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   1425    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   1426    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   1427 
   1428    /*
   1429    Call this to do the translation and limit checks:
   1430    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   1431                                  UInt seg_selector, UInt virtual_addr )
   1432    */
   1433    assign(
   1434       r64,
   1435       mkIRExprCCall(
   1436          Ity_I64,
   1437          0/*regparms*/,
   1438          "x86g_use_seg_selector",
   1439          &x86g_use_seg_selector,
   1440          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   1441                         mkexpr(seg_selector), virtual)
   1442       )
   1443    );
   1444 
   1445    /* If the high 32 of the result are non-zero, there was a
   1446       failure in address translation.  In which case, make a
   1447       quick exit.
   1448    */
   1449    stmt(
   1450       IRStmt_Exit(
   1451          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   1452          Ijk_MapFail,
   1453          IRConst_U32( guest_EIP_curr_instr )
   1454       )
   1455    );
   1456 
   1457    /* otherwise, here's the translated result. */
   1458    return unop(Iop_64to32, mkexpr(r64));
   1459 }
   1460 
   1461 
   1462 /* Generate IR to calculate an address indicated by a ModRM and
   1463    following SIB bytes.  The expression, and the number of bytes in
   1464    the address mode, are returned.  Note that this fn should not be
   1465    called if the R/M part of the address denotes a register instead of
   1466    memory.  If print_codegen is true, text of the addressing mode is
   1467    placed in buf.
   1468 
   1469    The computed address is stored in a new tempreg, and the
   1470    identity of the tempreg is returned.  */
   1471 
   1472 static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
   1473 {
   1474    IRTemp tmp = newTemp(Ity_I32);
   1475    assign( tmp, addr32 );
   1476    return tmp;
   1477 }
   1478 
   1479 static
   1480 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
   1481 {
   1482    UChar mod_reg_rm = getIByte(delta);
   1483    delta++;
   1484 
   1485    buf[0] = (UChar)0;
   1486 
   1487    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1488       jump table seems a bit excessive.
   1489    */
   1490    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   1491    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1492                                             /* is now XX0XXYYY */
   1493    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   1494    switch (mod_reg_rm) {
   1495 
   1496       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
   1497          --> GET %reg, t
   1498       */
   1499       case 0x00: case 0x01: case 0x02: case 0x03:
   1500       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1501          { UChar rm = mod_reg_rm;
   1502            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
   1503            *len = 1;
   1504            return disAMode_copy2tmp(
   1505                   handleSegOverride(sorb, getIReg(4,rm)));
   1506          }
   1507 
   1508       /* d8(%eax) ... d8(%edi), not including d8(%esp)
   1509          --> GET %reg, t ; ADDL d8, t
   1510       */
   1511       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1512       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1513          { UChar rm = toUChar(mod_reg_rm & 7);
   1514            UInt  d  = getSDisp8(delta);
   1515            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1516            *len = 2;
   1517            return disAMode_copy2tmp(
   1518                   handleSegOverride(sorb,
   1519                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1520          }
   1521 
   1522       /* d32(%eax) ... d32(%edi), not including d32(%esp)
   1523          --> GET %reg, t ; ADDL d8, t
   1524       */
   1525       case 0x10: case 0x11: case 0x12: case 0x13:
   1526       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1527          { UChar rm = toUChar(mod_reg_rm & 7);
   1528            UInt  d  = getUDisp32(delta);
   1529            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1530            *len = 5;
   1531            return disAMode_copy2tmp(
   1532                   handleSegOverride(sorb,
   1533                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1534          }
   1535 
   1536       /* a register, %eax .. %edi.  This shouldn't happen. */
   1537       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1538       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1539          vpanic("disAMode(x86): not an addr!");
   1540 
   1541       /* a 32-bit literal address
   1542          --> MOV d32, tmp
   1543       */
   1544       case 0x05:
   1545          { UInt d = getUDisp32(delta);
   1546            *len = 5;
   1547            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
   1548            return disAMode_copy2tmp(
   1549                      handleSegOverride(sorb, mkU32(d)));
   1550          }
   1551 
   1552       case 0x04: {
   1553          /* SIB, with no displacement.  Special cases:
   1554             -- %esp cannot act as an index value.
   1555                If index_r indicates %esp, zero is used for the index.
   1556             -- when mod is zero and base indicates EBP, base is instead
   1557                a 32-bit literal.
   1558             It's all madness, I tell you.  Extract %index, %base and
   1559             scale from the SIB byte.  The value denoted is then:
   1560                | %index == %ESP && %base == %EBP
   1561                = d32 following SIB byte
   1562                | %index == %ESP && %base != %EBP
   1563                = %base
   1564                | %index != %ESP && %base == %EBP
   1565                = d32 following SIB byte + (%index << scale)
   1566                | %index != %ESP && %base != %ESP
   1567                = %base + (%index << scale)
   1568 
   1569             What happens to the souls of CPU architects who dream up such
   1570             horrendous schemes, do you suppose?
   1571          */
   1572          UChar sib     = getIByte(delta);
   1573          UChar scale   = toUChar((sib >> 6) & 3);
   1574          UChar index_r = toUChar((sib >> 3) & 7);
   1575          UChar base_r  = toUChar(sib & 7);
   1576          delta++;
   1577 
   1578          if (index_r != R_ESP && base_r != R_EBP) {
   1579             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
   1580                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1581             *len = 2;
   1582             return
   1583                disAMode_copy2tmp(
   1584                handleSegOverride(sorb,
   1585                   binop(Iop_Add32,
   1586                         getIReg(4,base_r),
   1587                         binop(Iop_Shl32, getIReg(4,index_r),
   1588                               mkU8(scale)))));
   1589          }
   1590 
   1591          if (index_r != R_ESP && base_r == R_EBP) {
   1592             UInt d = getUDisp32(delta);
   1593             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
   1594                       nameIReg(4,index_r), 1<<scale);
   1595             *len = 6;
   1596             return
   1597                disAMode_copy2tmp(
   1598                handleSegOverride(sorb,
   1599                   binop(Iop_Add32,
   1600                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
   1601                         mkU32(d))));
   1602          }
   1603 
   1604          if (index_r == R_ESP && base_r != R_EBP) {
   1605             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
   1606             *len = 2;
   1607             return disAMode_copy2tmp(
   1608                    handleSegOverride(sorb, getIReg(4,base_r)));
   1609          }
   1610 
   1611          if (index_r == R_ESP && base_r == R_EBP) {
   1612             UInt d = getUDisp32(delta);
   1613             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
   1614             *len = 6;
   1615             return disAMode_copy2tmp(
   1616                    handleSegOverride(sorb, mkU32(d)));
   1617          }
   1618          /*NOTREACHED*/
   1619          vassert(0);
   1620       }
   1621 
   1622       /* SIB, with 8-bit displacement.  Special cases:
   1623          -- %esp cannot act as an index value.
   1624             If index_r indicates %esp, zero is used for the index.
   1625          Denoted value is:
   1626             | %index == %ESP
   1627             = d8 + %base
   1628             | %index != %ESP
   1629             = d8 + %base + (%index << scale)
   1630       */
   1631       case 0x0C: {
   1632          UChar sib     = getIByte(delta);
   1633          UChar scale   = toUChar((sib >> 6) & 3);
   1634          UChar index_r = toUChar((sib >> 3) & 7);
   1635          UChar base_r  = toUChar(sib & 7);
   1636          UInt  d       = getSDisp8(delta+1);
   1637 
   1638          if (index_r == R_ESP) {
   1639             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1640                                    (Int)d, nameIReg(4,base_r));
   1641             *len = 3;
   1642             return disAMode_copy2tmp(
   1643                    handleSegOverride(sorb,
   1644                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1645          } else {
   1646             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1647                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1648             *len = 3;
   1649             return
   1650                 disAMode_copy2tmp(
   1651                 handleSegOverride(sorb,
   1652                   binop(Iop_Add32,
   1653                         binop(Iop_Add32,
   1654                               getIReg(4,base_r),
   1655                               binop(Iop_Shl32,
   1656                                     getIReg(4,index_r), mkU8(scale))),
   1657                         mkU32(d))));
   1658          }
   1659 	 /*NOTREACHED*/
   1660          vassert(0);
   1661       }
   1662 
   1663       /* SIB, with 32-bit displacement.  Special cases:
   1664          -- %esp cannot act as an index value.
   1665             If index_r indicates %esp, zero is used for the index.
   1666          Denoted value is:
   1667             | %index == %ESP
   1668             = d32 + %base
   1669             | %index != %ESP
   1670             = d32 + %base + (%index << scale)
   1671       */
   1672       case 0x14: {
   1673          UChar sib     = getIByte(delta);
   1674          UChar scale   = toUChar((sib >> 6) & 3);
   1675          UChar index_r = toUChar((sib >> 3) & 7);
   1676          UChar base_r  = toUChar(sib & 7);
   1677          UInt d        = getUDisp32(delta+1);
   1678 
   1679          if (index_r == R_ESP) {
   1680             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1681                                    (Int)d, nameIReg(4,base_r));
   1682             *len = 6;
   1683             return disAMode_copy2tmp(
   1684                    handleSegOverride(sorb,
   1685                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1686          } else {
   1687             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1688                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1689             *len = 6;
   1690             return
   1691                 disAMode_copy2tmp(
   1692                 handleSegOverride(sorb,
   1693                   binop(Iop_Add32,
   1694                         binop(Iop_Add32,
   1695                               getIReg(4,base_r),
   1696                               binop(Iop_Shl32,
   1697                                     getIReg(4,index_r), mkU8(scale))),
   1698                         mkU32(d))));
   1699          }
   1700 	 /*NOTREACHED*/
   1701          vassert(0);
   1702       }
   1703 
   1704       default:
   1705          vpanic("disAMode(x86)");
   1706          return 0; /*notreached*/
   1707    }
   1708 }
   1709 
   1710 
   1711 /* Figure out the number of (insn-stream) bytes constituting the amode
   1712    beginning at delta.  Is useful for getting hold of literals beyond
   1713    the end of the amode before it has been disassembled.  */
   1714 
   1715 static UInt lengthAMode ( Int delta )
   1716 {
   1717    UChar mod_reg_rm = getIByte(delta); delta++;
   1718 
   1719    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1720       jump table seems a bit excessive.
   1721    */
   1722    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
   1723    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1724                                      /* is now XX0XXYYY */
   1725    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
   1726    switch (mod_reg_rm) {
   1727 
   1728       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
   1729       case 0x00: case 0x01: case 0x02: case 0x03:
   1730       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1731          return 1;
   1732 
   1733       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
   1734       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1735       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1736          return 2;
   1737 
   1738       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
   1739       case 0x10: case 0x11: case 0x12: case 0x13:
   1740       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1741          return 5;
   1742 
   1743       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
   1744       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1745       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1746          return 1;
   1747 
   1748       /* a 32-bit literal address. */
   1749       case 0x05: return 5;
   1750 
   1751       /* SIB, no displacement.  */
   1752       case 0x04: {
   1753          UChar sib    = getIByte(delta);
   1754          UChar base_r = toUChar(sib & 7);
   1755          if (base_r == R_EBP) return 6; else return 2;
   1756       }
   1757       /* SIB, with 8-bit displacement.  */
   1758       case 0x0C: return 3;
   1759 
   1760       /* SIB, with 32-bit displacement.  */
   1761       case 0x14: return 6;
   1762 
   1763       default:
   1764          vpanic("lengthAMode");
   1765          return 0; /*notreached*/
   1766    }
   1767 }
   1768 
   1769 /*------------------------------------------------------------*/
   1770 /*--- Disassembling common idioms                          ---*/
   1771 /*------------------------------------------------------------*/
   1772 
   1773 /* Handle binary integer instructions of the form
   1774       op E, G  meaning
   1775       op reg-or-mem, reg
   1776    Is passed the a ptr to the modRM byte, the actual operation, and the
   1777    data size.  Returns the address advanced completely over this
   1778    instruction.
   1779 
   1780    E(src) is reg-or-mem
   1781    G(dst) is reg.
   1782 
   1783    If E is reg, -->    GET %G,  tmp
   1784                        OP %E,   tmp
   1785                        PUT tmp, %G
   1786 
   1787    If E is mem and OP is not reversible,
   1788                 -->    (getAddr E) -> tmpa
   1789                        LD (tmpa), tmpa
   1790                        GET %G, tmp2
   1791                        OP tmpa, tmp2
   1792                        PUT tmp2, %G
   1793 
   1794    If E is mem and OP is reversible
   1795                 -->    (getAddr E) -> tmpa
   1796                        LD (tmpa), tmpa
   1797                        OP %G, tmpa
   1798                        PUT tmpa, %G
   1799 */
   1800 static
   1801 UInt dis_op2_E_G ( UChar       sorb,
   1802                    Bool        addSubCarry,
   1803                    IROp        op8,
   1804                    Bool        keep,
   1805                    Int         size,
   1806                    Int         delta0,
   1807                    HChar*      t_x86opc )
   1808 {
   1809    HChar   dis_buf[50];
   1810    Int     len;
   1811    IRType  ty   = szToITy(size);
   1812    IRTemp  dst1 = newTemp(ty);
   1813    IRTemp  src  = newTemp(ty);
   1814    IRTemp  dst0 = newTemp(ty);
   1815    UChar   rm   = getUChar(delta0);
   1816    IRTemp  addr = IRTemp_INVALID;
   1817 
   1818    /* addSubCarry == True indicates the intended operation is
   1819       add-with-carry or subtract-with-borrow. */
   1820    if (addSubCarry) {
   1821       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1822       vassert(keep);
   1823    }
   1824 
   1825    if (epartIsReg(rm)) {
   1826       /* Specially handle XOR reg,reg, because that doesn't really
   1827          depend on reg, and doing the obvious thing potentially
   1828          generates a spurious value check failure due to the bogus
   1829          dependency.  Ditto SBB reg,reg. */
   1830       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1831           && gregOfRM(rm) == eregOfRM(rm)) {
   1832          putIReg(size, gregOfRM(rm), mkU(ty,0));
   1833       }
   1834       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1835       assign( src,  getIReg(size,eregOfRM(rm)) );
   1836 
   1837       if (addSubCarry && op8 == Iop_Add8) {
   1838          helper_ADC( size, dst1, dst0, src,
   1839                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1840          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1841       } else
   1842       if (addSubCarry && op8 == Iop_Sub8) {
   1843          helper_SBB( size, dst1, dst0, src,
   1844                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1845          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1846       } else {
   1847          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1848          if (isAddSub(op8))
   1849             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1850          else
   1851             setFlags_DEP1(op8, dst1, ty);
   1852          if (keep)
   1853             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1854       }
   1855 
   1856       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1857                           nameIReg(size,eregOfRM(rm)),
   1858                           nameIReg(size,gregOfRM(rm)));
   1859       return 1+delta0;
   1860    } else {
   1861       /* E refers to memory */
   1862       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1863       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1864       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   1865 
   1866       if (addSubCarry && op8 == Iop_Add8) {
   1867          helper_ADC( size, dst1, dst0, src,
   1868                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1869          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1870       } else
   1871       if (addSubCarry && op8 == Iop_Sub8) {
   1872          helper_SBB( size, dst1, dst0, src,
   1873                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1874          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1875       } else {
   1876          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1877          if (isAddSub(op8))
   1878             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1879          else
   1880             setFlags_DEP1(op8, dst1, ty);
   1881          if (keep)
   1882             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1883       }
   1884 
   1885       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1886                           dis_buf,nameIReg(size,gregOfRM(rm)));
   1887       return len+delta0;
   1888    }
   1889 }
   1890 
   1891 
   1892 
   1893 /* Handle binary integer instructions of the form
   1894       op G, E  meaning
   1895       op reg, reg-or-mem
   1896    Is passed the a ptr to the modRM byte, the actual operation, and the
   1897    data size.  Returns the address advanced completely over this
   1898    instruction.
   1899 
   1900    G(src) is reg.
   1901    E(dst) is reg-or-mem
   1902 
   1903    If E is reg, -->    GET %E,  tmp
   1904                        OP %G,   tmp
   1905                        PUT tmp, %E
   1906 
   1907    If E is mem, -->    (getAddr E) -> tmpa
   1908                        LD (tmpa), tmpv
   1909                        OP %G, tmpv
   1910                        ST tmpv, (tmpa)
   1911 */
   1912 static
   1913 UInt dis_op2_G_E ( UChar       sorb,
   1914                    Bool        locked,
   1915                    Bool        addSubCarry,
   1916                    IROp        op8,
   1917                    Bool        keep,
   1918                    Int         size,
   1919                    Int         delta0,
   1920                    HChar*      t_x86opc )
   1921 {
   1922    HChar   dis_buf[50];
   1923    Int     len;
   1924    IRType  ty   = szToITy(size);
   1925    IRTemp  dst1 = newTemp(ty);
   1926    IRTemp  src  = newTemp(ty);
   1927    IRTemp  dst0 = newTemp(ty);
   1928    UChar   rm   = getIByte(delta0);
   1929    IRTemp  addr = IRTemp_INVALID;
   1930 
   1931    /* addSubCarry == True indicates the intended operation is
   1932       add-with-carry or subtract-with-borrow. */
   1933    if (addSubCarry) {
   1934       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1935       vassert(keep);
   1936    }
   1937 
   1938    if (epartIsReg(rm)) {
   1939       /* Specially handle XOR reg,reg, because that doesn't really
   1940          depend on reg, and doing the obvious thing potentially
   1941          generates a spurious value check failure due to the bogus
   1942          dependency.  Ditto SBB reg,reg.*/
   1943       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1944           && gregOfRM(rm) == eregOfRM(rm)) {
   1945          putIReg(size, eregOfRM(rm), mkU(ty,0));
   1946       }
   1947       assign(dst0, getIReg(size,eregOfRM(rm)));
   1948       assign(src,  getIReg(size,gregOfRM(rm)));
   1949 
   1950       if (addSubCarry && op8 == Iop_Add8) {
   1951          helper_ADC( size, dst1, dst0, src,
   1952                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1953          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1954       } else
   1955       if (addSubCarry && op8 == Iop_Sub8) {
   1956          helper_SBB( size, dst1, dst0, src,
   1957                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1958          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1959       } else {
   1960          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   1961          if (isAddSub(op8))
   1962             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1963          else
   1964             setFlags_DEP1(op8, dst1, ty);
   1965          if (keep)
   1966             putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1967       }
   1968 
   1969       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1970                           nameIReg(size,gregOfRM(rm)),
   1971                           nameIReg(size,eregOfRM(rm)));
   1972       return 1+delta0;
   1973    }
   1974 
   1975    /* E refers to memory */
   1976    {
   1977       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1978       assign(dst0, loadLE(ty,mkexpr(addr)));
   1979       assign(src,  getIReg(size,gregOfRM(rm)));
   1980 
   1981       if (addSubCarry && op8 == Iop_Add8) {
   1982          if (locked) {
   1983             /* cas-style store */
   1984             helper_ADC( size, dst1, dst0, src,
   1985                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   1986          } else {
   1987             /* normal store */
   1988             helper_ADC( size, dst1, dst0, src,
   1989                         /*store*/addr, IRTemp_INVALID, 0 );
   1990          }
   1991       } else
   1992       if (addSubCarry && op8 == Iop_Sub8) {
   1993          if (locked) {
   1994             /* cas-style store */
   1995             helper_SBB( size, dst1, dst0, src,
   1996                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   1997          } else {
   1998             /* normal store */
   1999             helper_SBB( size, dst1, dst0, src,
   2000                         /*store*/addr, IRTemp_INVALID, 0 );
   2001          }
   2002       } else {
   2003          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2004          if (keep) {
   2005             if (locked) {
   2006                if (0) vex_printf("locked case\n" );
   2007                casLE( mkexpr(addr),
   2008                       mkexpr(dst0)/*expval*/,
   2009                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
   2010             } else {
   2011                if (0) vex_printf("nonlocked case\n");
   2012                storeLE(mkexpr(addr), mkexpr(dst1));
   2013             }
   2014          }
   2015          if (isAddSub(op8))
   2016             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2017          else
   2018             setFlags_DEP1(op8, dst1, ty);
   2019       }
   2020 
   2021       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   2022                           nameIReg(size,gregOfRM(rm)), dis_buf);
   2023       return len+delta0;
   2024    }
   2025 }
   2026 
   2027 
   2028 /* Handle move instructions of the form
   2029       mov E, G  meaning
   2030       mov reg-or-mem, reg
   2031    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2032    the address advanced completely over this instruction.
   2033 
   2034    E(src) is reg-or-mem
   2035    G(dst) is reg.
   2036 
   2037    If E is reg, -->    GET %E,  tmpv
   2038                        PUT tmpv, %G
   2039 
   2040    If E is mem  -->    (getAddr E) -> tmpa
   2041                        LD (tmpa), tmpb
   2042                        PUT tmpb, %G
   2043 */
   2044 static
   2045 UInt dis_mov_E_G ( UChar       sorb,
   2046                    Int         size,
   2047                    Int         delta0 )
   2048 {
   2049    Int len;
   2050    UChar rm = getIByte(delta0);
   2051    HChar dis_buf[50];
   2052 
   2053    if (epartIsReg(rm)) {
   2054       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
   2055       DIP("mov%c %s,%s\n", nameISize(size),
   2056                            nameIReg(size,eregOfRM(rm)),
   2057                            nameIReg(size,gregOfRM(rm)));
   2058       return 1+delta0;
   2059    }
   2060 
   2061    /* E refers to memory */
   2062    {
   2063       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   2064       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
   2065       DIP("mov%c %s,%s\n", nameISize(size),
   2066                            dis_buf,nameIReg(size,gregOfRM(rm)));
   2067       return delta0+len;
   2068    }
   2069 }
   2070 
   2071 
   2072 /* Handle move instructions of the form
   2073       mov G, E  meaning
   2074       mov reg, reg-or-mem
   2075    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2076    the address advanced completely over this instruction.
   2077 
   2078    G(src) is reg.
   2079    E(dst) is reg-or-mem
   2080 
   2081    If E is reg, -->    GET %G,  tmp
   2082                        PUT tmp, %E
   2083 
   2084    If E is mem, -->    (getAddr E) -> tmpa
   2085                        GET %G, tmpv
   2086                        ST tmpv, (tmpa)
   2087 */
   2088 static
   2089 UInt dis_mov_G_E ( UChar       sorb,
   2090                    Int         size,
   2091                    Int         delta0 )
   2092 {
   2093    Int len;
   2094    UChar rm = getIByte(delta0);
   2095    HChar dis_buf[50];
   2096 
   2097    if (epartIsReg(rm)) {
   2098       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
   2099       DIP("mov%c %s,%s\n", nameISize(size),
   2100                            nameIReg(size,gregOfRM(rm)),
   2101                            nameIReg(size,eregOfRM(rm)));
   2102       return 1+delta0;
   2103    }
   2104 
   2105    /* E refers to memory */
   2106    {
   2107       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
   2108       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
   2109       DIP("mov%c %s,%s\n", nameISize(size),
   2110                            nameIReg(size,gregOfRM(rm)), dis_buf);
   2111       return len+delta0;
   2112    }
   2113 }
   2114 
   2115 
   2116 /* op $immediate, AL/AX/EAX. */
   2117 static
   2118 UInt dis_op_imm_A ( Int    size,
   2119                     Bool   carrying,
   2120                     IROp   op8,
   2121                     Bool   keep,
   2122                     Int    delta,
   2123                     HChar* t_x86opc )
   2124 {
   2125    IRType ty   = szToITy(size);
   2126    IRTemp dst0 = newTemp(ty);
   2127    IRTemp src  = newTemp(ty);
   2128    IRTemp dst1 = newTemp(ty);
   2129    UInt lit    = getUDisp(size,delta);
   2130    assign(dst0, getIReg(size,R_EAX));
   2131    assign(src,  mkU(ty,lit));
   2132 
   2133    if (isAddSub(op8) && !carrying) {
   2134       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2135       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2136    }
   2137    else
   2138    if (isLogic(op8)) {
   2139       vassert(!carrying);
   2140       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2141       setFlags_DEP1(op8, dst1, ty);
   2142    }
   2143    else
   2144    if (op8 == Iop_Add8 && carrying) {
   2145       helper_ADC( size, dst1, dst0, src,
   2146                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2147    }
   2148    else
   2149    if (op8 == Iop_Sub8 && carrying) {
   2150       helper_SBB( size, dst1, dst0, src,
   2151                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2152    }
   2153    else
   2154       vpanic("dis_op_imm_A(x86,guest)");
   2155 
   2156    if (keep)
   2157       putIReg(size, R_EAX, mkexpr(dst1));
   2158 
   2159    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
   2160                            lit, nameIReg(size,R_EAX));
   2161    return delta+size;
   2162 }
   2163 
   2164 
   2165 /* Sign- and Zero-extending moves. */
   2166 static
   2167 UInt dis_movx_E_G ( UChar      sorb,
   2168                     Int delta, Int szs, Int szd, Bool sign_extend )
   2169 {
   2170    UChar rm = getIByte(delta);
   2171    if (epartIsReg(rm)) {
   2172       if (szd == szs) {
   2173          // mutant case.  See #250799
   2174          putIReg(szd, gregOfRM(rm),
   2175                            getIReg(szs,eregOfRM(rm)));
   2176       } else {
   2177          // normal case
   2178          putIReg(szd, gregOfRM(rm),
   2179                       unop(mkWidenOp(szs,szd,sign_extend),
   2180                            getIReg(szs,eregOfRM(rm))));
   2181       }
   2182       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2183                                nameISize(szs), nameISize(szd),
   2184                                nameIReg(szs,eregOfRM(rm)),
   2185                                nameIReg(szd,gregOfRM(rm)));
   2186       return 1+delta;
   2187    }
   2188 
   2189    /* E refers to memory */
   2190    {
   2191       Int    len;
   2192       HChar  dis_buf[50];
   2193       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
   2194       if (szd == szs) {
   2195          // mutant case.  See #250799
   2196          putIReg(szd, gregOfRM(rm),
   2197                            loadLE(szToITy(szs),mkexpr(addr)));
   2198       } else {
   2199          // normal case
   2200          putIReg(szd, gregOfRM(rm),
   2201                       unop(mkWidenOp(szs,szd,sign_extend),
   2202                            loadLE(szToITy(szs),mkexpr(addr))));
   2203       }
   2204       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2205                                nameISize(szs), nameISize(szd),
   2206                                dis_buf, nameIReg(szd,gregOfRM(rm)));
   2207       return len+delta;
   2208    }
   2209 }
   2210 
   2211 
   2212 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   2213    16 / 8 bit quantity in the given IRTemp.  */
   2214 static
   2215 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   2216 {
   2217    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   2218    IRTemp src64 = newTemp(Ity_I64);
   2219    IRTemp dst64 = newTemp(Ity_I64);
   2220    switch (sz) {
   2221       case 4:
   2222          assign( src64, binop(Iop_32HLto64,
   2223                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
   2224          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
   2225          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
   2226          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
   2227          break;
   2228       case 2: {
   2229          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2230          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2231          assign( src64, unop(widen3264,
   2232                              binop(Iop_16HLto32,
   2233                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
   2234          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   2235          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   2236          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   2237          break;
   2238       }
   2239       case 1: {
   2240          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2241          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2242          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   2243          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
   2244          assign( dst64,
   2245                  binop(op, mkexpr(src64),
   2246                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   2247          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
   2248                            unop(Iop_64to32,mkexpr(dst64)))) );
   2249          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
   2250                            unop(Iop_64HIto32,mkexpr(dst64)))) );
   2251          break;
   2252       }
   2253       default: vpanic("codegen_div(x86)");
   2254    }
   2255 }
   2256 
   2257 
   2258 static
   2259 UInt dis_Grp1 ( UChar sorb, Bool locked,
   2260                 Int delta, UChar modrm,
   2261                 Int am_sz, Int d_sz, Int sz, UInt d32 )
   2262 {
   2263    Int     len;
   2264    HChar   dis_buf[50];
   2265    IRType  ty   = szToITy(sz);
   2266    IRTemp  dst1 = newTemp(ty);
   2267    IRTemp  src  = newTemp(ty);
   2268    IRTemp  dst0 = newTemp(ty);
   2269    IRTemp  addr = IRTemp_INVALID;
   2270    IROp    op8  = Iop_INVALID;
   2271    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
   2272 
   2273    switch (gregOfRM(modrm)) {
   2274       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   2275       case 2: break;  // ADC
   2276       case 3: break;  // SBB
   2277       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   2278       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   2279       /*NOTREACHED*/
   2280       default: vpanic("dis_Grp1: unhandled case");
   2281    }
   2282 
   2283    if (epartIsReg(modrm)) {
   2284       vassert(am_sz == 1);
   2285 
   2286       assign(dst0, getIReg(sz,eregOfRM(modrm)));
   2287       assign(src,  mkU(ty,d32 & mask));
   2288 
   2289       if (gregOfRM(modrm) == 2 /* ADC */) {
   2290          helper_ADC( sz, dst1, dst0, src,
   2291                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2292       } else
   2293       if (gregOfRM(modrm) == 3 /* SBB */) {
   2294          helper_SBB( sz, dst1, dst0, src,
   2295                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2296       } else {
   2297          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2298          if (isAddSub(op8))
   2299             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2300          else
   2301             setFlags_DEP1(op8, dst1, ty);
   2302       }
   2303 
   2304       if (gregOfRM(modrm) < 7)
   2305          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2306 
   2307       delta += (am_sz + d_sz);
   2308       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
   2309                               nameIReg(sz,eregOfRM(modrm)));
   2310    } else {
   2311       addr = disAMode ( &len, sorb, delta, dis_buf);
   2312 
   2313       assign(dst0, loadLE(ty,mkexpr(addr)));
   2314       assign(src, mkU(ty,d32 & mask));
   2315 
   2316       if (gregOfRM(modrm) == 2 /* ADC */) {
   2317          if (locked) {
   2318             /* cas-style store */
   2319             helper_ADC( sz, dst1, dst0, src,
   2320                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2321          } else {
   2322             /* normal store */
   2323             helper_ADC( sz, dst1, dst0, src,
   2324                         /*store*/addr, IRTemp_INVALID, 0 );
   2325          }
   2326       } else
   2327       if (gregOfRM(modrm) == 3 /* SBB */) {
   2328          if (locked) {
   2329             /* cas-style store */
   2330             helper_SBB( sz, dst1, dst0, src,
   2331                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2332          } else {
   2333             /* normal store */
   2334             helper_SBB( sz, dst1, dst0, src,
   2335                         /*store*/addr, IRTemp_INVALID, 0 );
   2336          }
   2337       } else {
   2338          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2339          if (gregOfRM(modrm) < 7) {
   2340             if (locked) {
   2341                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   2342                                     mkexpr(dst1)/*newVal*/,
   2343                                     guest_EIP_curr_instr );
   2344             } else {
   2345                storeLE(mkexpr(addr), mkexpr(dst1));
   2346             }
   2347          }
   2348          if (isAddSub(op8))
   2349             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2350          else
   2351             setFlags_DEP1(op8, dst1, ty);
   2352       }
   2353 
   2354       delta += (len+d_sz);
   2355       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
   2356                               d32, dis_buf);
   2357    }
   2358    return delta;
   2359 }
   2360 
   2361 
   2362 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   2363    expression. */
   2364 
   2365 static
   2366 UInt dis_Grp2 ( UChar sorb,
   2367                 Int delta, UChar modrm,
   2368                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   2369                 HChar* shift_expr_txt, Bool* decode_OK )
   2370 {
   2371    /* delta on entry points at the modrm byte. */
   2372    HChar  dis_buf[50];
   2373    Int    len;
   2374    Bool   isShift, isRotate, isRotateC;
   2375    IRType ty    = szToITy(sz);
   2376    IRTemp dst0  = newTemp(ty);
   2377    IRTemp dst1  = newTemp(ty);
   2378    IRTemp addr  = IRTemp_INVALID;
   2379 
   2380    *decode_OK = True;
   2381 
   2382    vassert(sz == 1 || sz == 2 || sz == 4);
   2383 
   2384    /* Put value to shift/rotate in dst0. */
   2385    if (epartIsReg(modrm)) {
   2386       assign(dst0, getIReg(sz, eregOfRM(modrm)));
   2387       delta += (am_sz + d_sz);
   2388    } else {
   2389       addr = disAMode ( &len, sorb, delta, dis_buf);
   2390       assign(dst0, loadLE(ty,mkexpr(addr)));
   2391       delta += len + d_sz;
   2392    }
   2393 
   2394    isShift = False;
   2395    switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
   2396 
   2397    isRotate = False;
   2398    switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
   2399 
   2400    isRotateC = False;
   2401    switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
   2402 
   2403    if (!isShift && !isRotate && !isRotateC) {
   2404       /*NOTREACHED*/
   2405       vpanic("dis_Grp2(Reg): unhandled case(x86)");
   2406    }
   2407 
   2408    if (isRotateC) {
   2409       /* call a helper; these insns are so ridiculous they do not
   2410          deserve better */
   2411       Bool     left = toBool(gregOfRM(modrm) == 2);
   2412       IRTemp   r64  = newTemp(Ity_I64);
   2413       IRExpr** args
   2414          = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
   2415                           widenUto32(shift_expr),   /* rotate amount */
   2416                           widenUto32(mk_x86g_calculate_eflags_all()),
   2417                           mkU32(sz) );
   2418       assign( r64, mkIRExprCCall(
   2419                       Ity_I64,
   2420                       0/*regparm*/,
   2421                       left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
   2422                       left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
   2423                       args
   2424                    )
   2425             );
   2426       /* new eflags in hi half r64; new value in lo half r64 */
   2427       assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
   2428       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2429       stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
   2430       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2431       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2432          elimination of previous stores to this field work better. */
   2433       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2434    }
   2435 
   2436    if (isShift) {
   2437 
   2438       IRTemp pre32     = newTemp(Ity_I32);
   2439       IRTemp res32     = newTemp(Ity_I32);
   2440       IRTemp res32ss   = newTemp(Ity_I32);
   2441       IRTemp shift_amt = newTemp(Ity_I8);
   2442       IROp   op32;
   2443 
   2444       switch (gregOfRM(modrm)) {
   2445          case 4: op32 = Iop_Shl32; break;
   2446          case 5: op32 = Iop_Shr32; break;
   2447          case 6: op32 = Iop_Shl32; break;
   2448          case 7: op32 = Iop_Sar32; break;
   2449          /*NOTREACHED*/
   2450          default: vpanic("dis_Grp2:shift"); break;
   2451       }
   2452 
   2453       /* Widen the value to be shifted to 32 bits, do the shift, and
   2454          narrow back down.  This seems surprisingly long-winded, but
   2455          unfortunately the Intel semantics requires that 8/16-bit
   2456          shifts give defined results for shift values all the way up
   2457          to 31, and this seems the simplest way to do it.  It has the
   2458          advantage that the only IR level shifts generated are of 32
   2459          bit values, and the shift amount is guaranteed to be in the
   2460          range 0 .. 31, thereby observing the IR semantics requiring
   2461          all shift values to be in the range 0 .. 2^word_size-1. */
   2462 
   2463       /* shift_amt = shift_expr & 31, regardless of operation size */
   2464       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
   2465 
   2466       /* suitably widen the value to be shifted to 32 bits. */
   2467       assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
   2468                                      : widenUto32(mkexpr(dst0)) );
   2469 
   2470       /* res32 = pre32 `shift` shift_amt */
   2471       assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
   2472 
   2473       /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
   2474       assign( res32ss,
   2475               binop(op32,
   2476                     mkexpr(pre32),
   2477                     binop(Iop_And8,
   2478                           binop(Iop_Sub8,
   2479                                 mkexpr(shift_amt), mkU8(1)),
   2480                           mkU8(31))) );
   2481 
   2482       /* Build the flags thunk. */
   2483       setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
   2484 
   2485       /* Narrow the result back down. */
   2486       assign( dst1, narrowTo(ty, mkexpr(res32)) );
   2487 
   2488    } /* if (isShift) */
   2489 
   2490    else
   2491    if (isRotate) {
   2492       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   2493       Bool   left      = toBool(gregOfRM(modrm) == 0);
   2494       IRTemp rot_amt   = newTemp(Ity_I8);
   2495       IRTemp rot_amt32 = newTemp(Ity_I8);
   2496       IRTemp oldFlags  = newTemp(Ity_I32);
   2497 
   2498       /* rot_amt = shift_expr & mask */
   2499       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   2500          expressions never shift beyond the word size and thus remain
   2501          well defined. */
   2502       assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
   2503 
   2504       if (ty == Ity_I32)
   2505          assign(rot_amt, mkexpr(rot_amt32));
   2506       else
   2507          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
   2508 
   2509       if (left) {
   2510 
   2511          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   2512          assign(dst1,
   2513             binop( mkSizedOp(ty,Iop_Or8),
   2514                    binop( mkSizedOp(ty,Iop_Shl8),
   2515                           mkexpr(dst0),
   2516                           mkexpr(rot_amt)
   2517                    ),
   2518                    binop( mkSizedOp(ty,Iop_Shr8),
   2519                           mkexpr(dst0),
   2520                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2521                    )
   2522             )
   2523          );
   2524          ccOp += X86G_CC_OP_ROLB;
   2525 
   2526       } else { /* right */
   2527 
   2528          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   2529          assign(dst1,
   2530             binop( mkSizedOp(ty,Iop_Or8),
   2531                    binop( mkSizedOp(ty,Iop_Shr8),
   2532                           mkexpr(dst0),
   2533                           mkexpr(rot_amt)
   2534                    ),
   2535                    binop( mkSizedOp(ty,Iop_Shl8),
   2536                           mkexpr(dst0),
   2537                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2538                    )
   2539             )
   2540          );
   2541          ccOp += X86G_CC_OP_RORB;
   2542 
   2543       }
   2544 
   2545       /* dst1 now holds the rotated value.  Build flag thunk.  We
   2546          need the resulting value for this, and the previous flags.
   2547          Except don't set it if the rotate count is zero. */
   2548 
   2549       assign(oldFlags, mk_x86g_calculate_eflags_all());
   2550 
   2551       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   2552       stmt( IRStmt_Put( OFFB_CC_OP,
   2553                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2554                                       IRExpr_Get(OFFB_CC_OP,Ity_I32),
   2555                                       mkU32(ccOp))) );
   2556       stmt( IRStmt_Put( OFFB_CC_DEP1,
   2557                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2558                                       IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
   2559                                       widenUto32(mkexpr(dst1)))) );
   2560       stmt( IRStmt_Put( OFFB_CC_DEP2,
   2561                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2562                                       IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
   2563                                       mkU32(0))) );
   2564       stmt( IRStmt_Put( OFFB_CC_NDEP,
   2565                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2566                                       IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
   2567                                       mkexpr(oldFlags))) );
   2568    } /* if (isRotate) */
   2569 
   2570    /* Save result, and finish up. */
   2571    if (epartIsReg(modrm)) {
   2572       putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2573       if (vex_traceflags & VEX_TRACE_FE) {
   2574          vex_printf("%s%c ",
   2575                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2576          if (shift_expr_txt)
   2577             vex_printf("%s", shift_expr_txt);
   2578          else
   2579             ppIRExpr(shift_expr);
   2580          vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
   2581       }
   2582    } else {
   2583       storeLE(mkexpr(addr), mkexpr(dst1));
   2584       if (vex_traceflags & VEX_TRACE_FE) {
   2585          vex_printf("%s%c ",
   2586                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2587          if (shift_expr_txt)
   2588             vex_printf("%s", shift_expr_txt);
   2589          else
   2590             ppIRExpr(shift_expr);
   2591          vex_printf(", %s\n", dis_buf);
   2592       }
   2593    }
   2594    return delta;
   2595 }
   2596 
   2597 
   2598 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   2599 static
   2600 UInt dis_Grp8_Imm ( UChar sorb,
   2601                     Bool locked,
   2602                     Int delta, UChar modrm,
   2603                     Int am_sz, Int sz, UInt src_val,
   2604                     Bool* decode_OK )
   2605 {
   2606    /* src_val denotes a d8.
   2607       And delta on entry points at the modrm byte. */
   2608 
   2609    IRType ty     = szToITy(sz);
   2610    IRTemp t2     = newTemp(Ity_I32);
   2611    IRTemp t2m    = newTemp(Ity_I32);
   2612    IRTemp t_addr = IRTemp_INVALID;
   2613    HChar  dis_buf[50];
   2614    UInt   mask;
   2615 
   2616    /* we're optimists :-) */
   2617    *decode_OK = True;
   2618 
   2619    /* Limit src_val -- the bit offset -- to something within a word.
   2620       The Intel docs say that literal offsets larger than a word are
   2621       masked in this way. */
   2622    switch (sz) {
   2623       case 2:  src_val &= 15; break;
   2624       case 4:  src_val &= 31; break;
   2625       default: *decode_OK = False; return delta;
   2626    }
   2627 
   2628    /* Invent a mask suitable for the operation. */
   2629    switch (gregOfRM(modrm)) {
   2630       case 4: /* BT */  mask = 0;               break;
   2631       case 5: /* BTS */ mask = 1 << src_val;    break;
   2632       case 6: /* BTR */ mask = ~(1 << src_val); break;
   2633       case 7: /* BTC */ mask = 1 << src_val;    break;
   2634          /* If this needs to be extended, probably simplest to make a
   2635             new function to handle the other cases (0 .. 3).  The
   2636             Intel docs do however not indicate any use for 0 .. 3, so
   2637             we don't expect this to happen. */
   2638       default: *decode_OK = False; return delta;
   2639    }
   2640 
   2641    /* Fetch the value to be tested and modified into t2, which is
   2642       32-bits wide regardless of sz. */
   2643    if (epartIsReg(modrm)) {
   2644       vassert(am_sz == 1);
   2645       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
   2646       delta += (am_sz + 1);
   2647       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2648                               src_val, nameIReg(sz,eregOfRM(modrm)));
   2649    } else {
   2650       Int len;
   2651       t_addr = disAMode ( &len, sorb, delta, dis_buf);
   2652       delta  += (len+1);
   2653       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
   2654       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2655                               src_val, dis_buf);
   2656    }
   2657 
   2658    /* Compute the new value into t2m, if non-BT. */
   2659    switch (gregOfRM(modrm)) {
   2660       case 4: /* BT */
   2661          break;
   2662       case 5: /* BTS */
   2663          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
   2664          break;
   2665       case 6: /* BTR */
   2666          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
   2667          break;
   2668       case 7: /* BTC */
   2669          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
   2670          break;
   2671       default:
   2672          /*NOTREACHED*/ /*the previous switch guards this*/
   2673          vassert(0);
   2674    }
   2675 
   2676    /* Write the result back, if non-BT.  If the CAS fails then we
   2677       side-exit from the trace at this point, and so the flag state is
   2678       not affected.  This is of course as required. */
   2679    if (gregOfRM(modrm) != 4 /* BT */) {
   2680       if (epartIsReg(modrm)) {
   2681          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
   2682       } else {
   2683          if (locked) {
   2684             casLE( mkexpr(t_addr),
   2685                    narrowTo(ty, mkexpr(t2))/*expd*/,
   2686                    narrowTo(ty, mkexpr(t2m))/*new*/,
   2687                    guest_EIP_curr_instr );
   2688          } else {
   2689             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   2690          }
   2691       }
   2692    }
   2693 
   2694    /* Copy relevant bit from t2 into the carry flag. */
   2695    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   2696    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2697    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2698    stmt( IRStmt_Put(
   2699             OFFB_CC_DEP1,
   2700             binop(Iop_And32,
   2701                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
   2702                   mkU32(1))
   2703        ));
   2704    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2705       elimination of previous stores to this field work better. */
   2706    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2707 
   2708    return delta;
   2709 }
   2710 
   2711 
   2712 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   2713    value in EAX/AX/AL by the given IRTemp, and park the result in
   2714    EDX:EAX/DX:AX/AX.
   2715 */
   2716 static void codegen_mulL_A_D ( Int sz, Bool syned,
   2717                                IRTemp tmp, HChar* tmp_txt )
   2718 {
   2719    IRType ty = szToITy(sz);
   2720    IRTemp t1 = newTemp(ty);
   2721 
   2722    assign( t1, getIReg(sz, R_EAX) );
   2723 
   2724    switch (ty) {
   2725       case Ity_I32: {
   2726          IRTemp res64   = newTemp(Ity_I64);
   2727          IRTemp resHi   = newTemp(Ity_I32);
   2728          IRTemp resLo   = newTemp(Ity_I32);
   2729          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   2730          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2731          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   2732          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2733          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   2734          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   2735          putIReg(4, R_EDX, mkexpr(resHi));
   2736          putIReg(4, R_EAX, mkexpr(resLo));
   2737          break;
   2738       }
   2739       case Ity_I16: {
   2740          IRTemp res32   = newTemp(Ity_I32);
   2741          IRTemp resHi   = newTemp(Ity_I16);
   2742          IRTemp resLo   = newTemp(Ity_I16);
   2743          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   2744          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2745          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   2746          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2747          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   2748          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   2749          putIReg(2, R_EDX, mkexpr(resHi));
   2750          putIReg(2, R_EAX, mkexpr(resLo));
   2751          break;
   2752       }
   2753       case Ity_I8: {
   2754          IRTemp res16   = newTemp(Ity_I16);
   2755          IRTemp resHi   = newTemp(Ity_I8);
   2756          IRTemp resLo   = newTemp(Ity_I8);
   2757          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   2758          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2759          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   2760          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2761          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   2762          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   2763          putIReg(2, R_EAX, mkexpr(res16));
   2764          break;
   2765       }
   2766       default:
   2767          vpanic("codegen_mulL_A_D(x86)");
   2768    }
   2769    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   2770 }
   2771 
   2772 
   2773 /* Group 3 extended opcodes. */
   2774 static
   2775 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
   2776 {
   2777    UInt    d32;
   2778    UChar   modrm;
   2779    HChar   dis_buf[50];
   2780    Int     len;
   2781    IRTemp  addr;
   2782    IRType  ty = szToITy(sz);
   2783    IRTemp  t1 = newTemp(ty);
   2784    IRTemp dst1, src, dst0;
   2785 
   2786    *decode_OK = True; /* may change this later */
   2787 
   2788    modrm = getIByte(delta);
   2789 
   2790    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
   2791       /* LOCK prefix only allowed with not and neg subopcodes */
   2792       *decode_OK = False;
   2793       return delta;
   2794    }
   2795 
   2796    if (epartIsReg(modrm)) {
   2797       switch (gregOfRM(modrm)) {
   2798          case 0: { /* TEST */
   2799             delta++; d32 = getUDisp(sz, delta); delta += sz;
   2800             dst1 = newTemp(ty);
   2801             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2802                                getIReg(sz,eregOfRM(modrm)),
   2803                                mkU(ty,d32)));
   2804             setFlags_DEP1( Iop_And8, dst1, ty );
   2805             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
   2806                                       nameIReg(sz, eregOfRM(modrm)));
   2807             break;
   2808          }
   2809          case 1: /* UNDEFINED */
   2810            /* The Intel docs imply this insn is undefined and binutils
   2811               agrees.  Unfortunately Core 2 will run it (with who
   2812               knows what result?)  sandpile.org reckons it's an alias
   2813               for case 0.  We play safe. */
   2814            *decode_OK = False;
   2815            break;
   2816          case 2: /* NOT */
   2817             delta++;
   2818             putIReg(sz, eregOfRM(modrm),
   2819                         unop(mkSizedOp(ty,Iop_Not8),
   2820                              getIReg(sz, eregOfRM(modrm))));
   2821             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2822             break;
   2823          case 3: /* NEG */
   2824             delta++;
   2825             dst0 = newTemp(ty);
   2826             src  = newTemp(ty);
   2827             dst1 = newTemp(ty);
   2828             assign(dst0, mkU(ty,0));
   2829             assign(src,  getIReg(sz,eregOfRM(modrm)));
   2830             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
   2831             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2832             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2833             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2834             break;
   2835          case 4: /* MUL (unsigned widening) */
   2836             delta++;
   2837             src = newTemp(ty);
   2838             assign(src, getIReg(sz,eregOfRM(modrm)));
   2839             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
   2840             break;
   2841          case 5: /* IMUL (signed widening) */
   2842             delta++;
   2843             src = newTemp(ty);
   2844             assign(src, getIReg(sz,eregOfRM(modrm)));
   2845             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
   2846             break;
   2847          case 6: /* DIV */
   2848             delta++;
   2849             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2850             codegen_div ( sz, t1, False );
   2851             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2852             break;
   2853          case 7: /* IDIV */
   2854             delta++;
   2855             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2856             codegen_div ( sz, t1, True );
   2857             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2858             break;
   2859          default:
   2860             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2861             vpanic("Grp3(x86)");
   2862       }
   2863    } else {
   2864       addr = disAMode ( &len, sorb, delta, dis_buf );
   2865       t1   = newTemp(ty);
   2866       delta += len;
   2867       assign(t1, loadLE(ty,mkexpr(addr)));
   2868       switch (gregOfRM(modrm)) {
   2869          case 0: { /* TEST */
   2870             d32 = getUDisp(sz, delta); delta += sz;
   2871             dst1 = newTemp(ty);
   2872             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2873                                mkexpr(t1), mkU(ty,d32)));
   2874             setFlags_DEP1( Iop_And8, dst1, ty );
   2875             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   2876             break;
   2877          }
   2878          case 1: /* UNDEFINED */
   2879            /* See comment above on R case */
   2880            *decode_OK = False;
   2881            break;
   2882          case 2: /* NOT */
   2883             dst1 = newTemp(ty);
   2884             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   2885             if (locked) {
   2886                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2887                                     guest_EIP_curr_instr );
   2888             } else {
   2889                storeLE( mkexpr(addr), mkexpr(dst1) );
   2890             }
   2891             DIP("not%c %s\n", nameISize(sz), dis_buf);
   2892             break;
   2893          case 3: /* NEG */
   2894             dst0 = newTemp(ty);
   2895             src  = newTemp(ty);
   2896             dst1 = newTemp(ty);
   2897             assign(dst0, mkU(ty,0));
   2898             assign(src,  mkexpr(t1));
   2899             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
   2900                                mkexpr(dst0), mkexpr(src)));
   2901             if (locked) {
   2902                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2903                                     guest_EIP_curr_instr );
   2904             } else {
   2905                storeLE( mkexpr(addr), mkexpr(dst1) );
   2906             }
   2907             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2908             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   2909             break;
   2910          case 4: /* MUL */
   2911             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   2912             break;
   2913          case 5: /* IMUL */
   2914             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   2915             break;
   2916          case 6: /* DIV */
   2917             codegen_div ( sz, t1, False );
   2918             DIP("div%c %s\n", nameISize(sz), dis_buf);
   2919             break;
   2920          case 7: /* IDIV */
   2921             codegen_div ( sz, t1, True );
   2922             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   2923             break;
   2924          default:
   2925             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2926             vpanic("Grp3(x86)");
   2927       }
   2928    }
   2929    return delta;
   2930 }
   2931 
   2932 
   2933 /* Group 4 extended opcodes. */
   2934 static
   2935 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
   2936 {
   2937    Int   alen;
   2938    UChar modrm;
   2939    HChar dis_buf[50];
   2940    IRType ty = Ity_I8;
   2941    IRTemp t1 = newTemp(ty);
   2942    IRTemp t2 = newTemp(ty);
   2943 
   2944    *decode_OK = True;
   2945 
   2946    modrm = getIByte(delta);
   2947 
   2948    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   2949       /* LOCK prefix only allowed with inc and dec subopcodes */
   2950       *decode_OK = False;
   2951       return delta;
   2952    }
   2953 
   2954    if (epartIsReg(modrm)) {
   2955       assign(t1, getIReg(1, eregOfRM(modrm)));
   2956       switch (gregOfRM(modrm)) {
   2957          case 0: /* INC */
   2958             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2959             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2960             setFlags_INC_DEC( True, t2, ty );
   2961             break;
   2962          case 1: /* DEC */
   2963             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2964             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2965             setFlags_INC_DEC( False, t2, ty );
   2966             break;
   2967          default:
   2968             *decode_OK = False;
   2969             return delta;
   2970       }
   2971       delta++;
   2972       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
   2973                       nameIReg(1, eregOfRM(modrm)));
   2974    } else {
   2975       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
   2976       assign( t1, loadLE(ty, mkexpr(addr)) );
   2977       switch (gregOfRM(modrm)) {
   2978          case 0: /* INC */
   2979             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2980             if (locked) {
   2981                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   2982                       guest_EIP_curr_instr );
   2983             } else {
   2984                storeLE( mkexpr(addr), mkexpr(t2) );
   2985             }
   2986             setFlags_INC_DEC( True, t2, ty );
   2987             break;
   2988          case 1: /* DEC */
   2989             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2990             if (locked) {
   2991                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   2992                       guest_EIP_curr_instr );
   2993             } else {
   2994                storeLE( mkexpr(addr), mkexpr(t2) );
   2995             }
   2996             setFlags_INC_DEC( False, t2, ty );
   2997             break;
   2998          default:
   2999             *decode_OK = False;
   3000             return delta;
   3001       }
   3002       delta += alen;
   3003       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   3004    }
   3005    return delta;
   3006 }
   3007 
   3008 
   3009 /* Group 5 extended opcodes. */
   3010 static
   3011 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
   3012                 DisResult* dres, Bool* decode_OK )
   3013 {
   3014    Int     len;
   3015    UChar   modrm;
   3016    HChar   dis_buf[50];
   3017    IRTemp  addr = IRTemp_INVALID;
   3018    IRType  ty = szToITy(sz);
   3019    IRTemp  t1 = newTemp(ty);
   3020    IRTemp  t2 = IRTemp_INVALID;
   3021 
   3022    *decode_OK = True;
   3023 
   3024    modrm = getIByte(delta);
   3025 
   3026    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   3027       /* LOCK prefix only allowed with inc and dec subopcodes */
   3028       *decode_OK = False;
   3029       return delta;
   3030    }
   3031 
   3032    if (epartIsReg(modrm)) {
   3033       assign(t1, getIReg(sz,eregOfRM(modrm)));
   3034       switch (gregOfRM(modrm)) {
   3035          case 0: /* INC */
   3036             vassert(sz == 2 || sz == 4);
   3037             t2 = newTemp(ty);
   3038             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3039                              mkexpr(t1), mkU(ty,1)));
   3040             setFlags_INC_DEC( True, t2, ty );
   3041             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3042             break;
   3043          case 1: /* DEC */
   3044             vassert(sz == 2 || sz == 4);
   3045             t2 = newTemp(ty);
   3046             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3047                              mkexpr(t1), mkU(ty,1)));
   3048             setFlags_INC_DEC( False, t2, ty );
   3049             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3050             break;
   3051          case 2: /* call Ev */
   3052             vassert(sz == 4);
   3053             t2 = newTemp(Ity_I32);
   3054             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3055             putIReg(4, R_ESP, mkexpr(t2));
   3056             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
   3057             jmp_treg(Ijk_Call,t1);
   3058             dres->whatNext = Dis_StopHere;
   3059             break;
   3060          case 4: /* jmp Ev */
   3061             vassert(sz == 4);
   3062             jmp_treg(Ijk_Boring,t1);
   3063             dres->whatNext = Dis_StopHere;
   3064             break;
   3065          case 6: /* PUSH Ev */
   3066             vassert(sz == 4 || sz == 2);
   3067             t2 = newTemp(Ity_I32);
   3068             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3069             putIReg(4, R_ESP, mkexpr(t2) );
   3070             storeLE( mkexpr(t2), mkexpr(t1) );
   3071             break;
   3072          default:
   3073             *decode_OK = False;
   3074             return delta;
   3075       }
   3076       delta++;
   3077       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3078                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   3079    } else {
   3080       addr = disAMode ( &len, sorb, delta, dis_buf );
   3081       assign(t1, loadLE(ty,mkexpr(addr)));
   3082       switch (gregOfRM(modrm)) {
   3083          case 0: /* INC */
   3084             t2 = newTemp(ty);
   3085             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3086                              mkexpr(t1), mkU(ty,1)));
   3087             if (locked) {
   3088                casLE( mkexpr(addr),
   3089                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3090             } else {
   3091                storeLE(mkexpr(addr),mkexpr(t2));
   3092             }
   3093             setFlags_INC_DEC( True, t2, ty );
   3094             break;
   3095          case 1: /* DEC */
   3096             t2 = newTemp(ty);
   3097             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3098                              mkexpr(t1), mkU(ty,1)));
   3099             if (locked) {
   3100                casLE( mkexpr(addr),
   3101                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3102             } else {
   3103                storeLE(mkexpr(addr),mkexpr(t2));
   3104             }
   3105             setFlags_INC_DEC( False, t2, ty );
   3106             break;
   3107          case 2: /* call Ev */
   3108             vassert(sz == 4);
   3109             t2 = newTemp(Ity_I32);
   3110             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3111             putIReg(4, R_ESP, mkexpr(t2));
   3112             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
   3113             jmp_treg(Ijk_Call,t1);
   3114             dres->whatNext = Dis_StopHere;
   3115             break;
   3116          case 4: /* JMP Ev */
   3117             vassert(sz == 4);
   3118             jmp_treg(Ijk_Boring,t1);
   3119             dres->whatNext = Dis_StopHere;
   3120             break;
   3121          case 6: /* PUSH Ev */
   3122             vassert(sz == 4 || sz == 2);
   3123             t2 = newTemp(Ity_I32);
   3124             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3125             putIReg(4, R_ESP, mkexpr(t2) );
   3126             storeLE( mkexpr(t2), mkexpr(t1) );
   3127             break;
   3128          default:
   3129             *decode_OK = False;
   3130             return delta;
   3131       }
   3132       delta += len;
   3133       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3134                        nameISize(sz), dis_buf);
   3135    }
   3136    return delta;
   3137 }
   3138 
   3139 
   3140 /*------------------------------------------------------------*/
   3141 /*--- Disassembling string ops (including REP prefixes)    ---*/
   3142 /*------------------------------------------------------------*/
   3143 
   3144 /* Code shared by all the string ops */
   3145 static
   3146 void dis_string_op_increment(Int sz, Int t_inc)
   3147 {
   3148    if (sz == 4 || sz == 2) {
   3149       assign( t_inc,
   3150               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
   3151                                mkU8(sz/2) ) );
   3152    } else {
   3153       assign( t_inc,
   3154               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
   3155    }
   3156 }
   3157 
   3158 static
   3159 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   3160                     Int sz, HChar* name, UChar sorb )
   3161 {
   3162    IRTemp t_inc = newTemp(Ity_I32);
   3163    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
   3164    dis_string_op_increment(sz, t_inc);
   3165    dis_OP( sz, t_inc );
   3166    DIP("%s%c\n", name, nameISize(sz));
   3167 }
   3168 
   3169 static
   3170 void dis_MOVS ( Int sz, IRTemp t_inc )
   3171 {
   3172    IRType ty = szToITy(sz);
   3173    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3174    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3175 
   3176    assign( td, getIReg(4, R_EDI) );
   3177    assign( ts, getIReg(4, R_ESI) );
   3178 
   3179    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   3180 
   3181    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3182    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3183 }
   3184 
   3185 static
   3186 void dis_LODS ( Int sz, IRTemp t_inc )
   3187 {
   3188    IRType ty = szToITy(sz);
   3189    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3190 
   3191    assign( ts, getIReg(4, R_ESI) );
   3192 
   3193    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
   3194 
   3195    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3196 }
   3197 
   3198 static
   3199 void dis_STOS ( Int sz, IRTemp t_inc )
   3200 {
   3201    IRType ty = szToITy(sz);
   3202    IRTemp ta = newTemp(ty);        /* EAX */
   3203    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3204 
   3205    assign( ta, getIReg(sz, R_EAX) );
   3206    assign( td, getIReg(4, R_EDI) );
   3207 
   3208    storeLE( mkexpr(td), mkexpr(ta) );
   3209 
   3210    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3211 }
   3212 
   3213 static
   3214 void dis_CMPS ( Int sz, IRTemp t_inc )
   3215 {
   3216    IRType ty  = szToITy(sz);
   3217    IRTemp tdv = newTemp(ty);      /* (EDI) */
   3218    IRTemp tsv = newTemp(ty);      /* (ESI) */
   3219    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
   3220    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
   3221 
   3222    assign( td, getIReg(4, R_EDI) );
   3223    assign( ts, getIReg(4, R_ESI) );
   3224 
   3225    assign( tdv, loadLE(ty,mkexpr(td)) );
   3226    assign( tsv, loadLE(ty,mkexpr(ts)) );
   3227 
   3228    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   3229 
   3230    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3231    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3232 }
   3233 
   3234 static
   3235 void dis_SCAS ( Int sz, IRTemp t_inc )
   3236 {
   3237    IRType ty  = szToITy(sz);
   3238    IRTemp ta  = newTemp(ty);       /*  EAX  */
   3239    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
   3240    IRTemp tdv = newTemp(ty);       /* (EDI) */
   3241 
   3242    assign( ta, getIReg(sz, R_EAX) );
   3243    assign( td, getIReg(4, R_EDI) );
   3244 
   3245    assign( tdv, loadLE(ty,mkexpr(td)) );
   3246    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   3247 
   3248    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3249 }
   3250 
   3251 
   3252 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
   3253    We assume the insn is the last one in the basic block, and so emit a jump
   3254    to the next insn, rather than just falling through. */
   3255 static
   3256 void dis_REP_op ( X86Condcode cond,
   3257                   void (*dis_OP)(Int, IRTemp),
   3258                   Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
   3259 {
   3260    IRTemp t_inc = newTemp(Ity_I32);
   3261    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
   3262 
   3263    assign( tc, getIReg(4,R_ECX) );
   3264 
   3265    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
   3266                       Ijk_Boring,
   3267                       IRConst_U32(eip_next) ) );
   3268 
   3269    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   3270 
   3271    dis_string_op_increment(sz, t_inc);
   3272    dis_OP (sz, t_inc);
   3273 
   3274    if (cond == X86CondAlways) {
   3275       jmp_lit(Ijk_Boring,eip);
   3276    } else {
   3277       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
   3278                          Ijk_Boring,
   3279                          IRConst_U32(eip) ) );
   3280       jmp_lit(Ijk_Boring,eip_next);
   3281    }
   3282    DIP("%s%c\n", name, nameISize(sz));
   3283 }
   3284 
   3285 
   3286 /*------------------------------------------------------------*/
   3287 /*--- Arithmetic, etc.                                     ---*/
   3288 /*------------------------------------------------------------*/
   3289 
   3290 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   3291 static
   3292 UInt dis_mul_E_G ( UChar       sorb,
   3293                    Int         size,
   3294                    Int         delta0 )
   3295 {
   3296    Int    alen;
   3297    HChar  dis_buf[50];
   3298    UChar  rm = getIByte(delta0);
   3299    IRType ty = szToITy(size);
   3300    IRTemp te = newTemp(ty);
   3301    IRTemp tg = newTemp(ty);
   3302    IRTemp resLo = newTemp(ty);
   3303 
   3304    assign( tg, getIReg(size, gregOfRM(rm)) );
   3305    if (epartIsReg(rm)) {
   3306       assign( te, getIReg(size, eregOfRM(rm)) );
   3307    } else {
   3308       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
   3309       assign( te, loadLE(ty,mkexpr(addr)) );
   3310    }
   3311 
   3312    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
   3313 
   3314    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   3315 
   3316    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
   3317 
   3318    if (epartIsReg(rm)) {
   3319       DIP("imul%c %s, %s\n", nameISize(size),
   3320                              nameIReg(size,eregOfRM(rm)),
   3321                              nameIReg(size,gregOfRM(rm)));
   3322       return 1+delta0;
   3323    } else {
   3324       DIP("imul%c %s, %s\n", nameISize(size),
   3325                              dis_buf, nameIReg(size,gregOfRM(rm)));
   3326       return alen+delta0;
   3327    }
   3328 }
   3329 
   3330 
   3331 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
   3332 static
   3333 UInt dis_imul_I_E_G ( UChar       sorb,
   3334                       Int         size,
   3335                       Int         delta,
   3336                       Int         litsize )
   3337 {
   3338    Int    d32, alen;
   3339    HChar  dis_buf[50];
   3340    UChar  rm = getIByte(delta);
   3341    IRType ty = szToITy(size);
   3342    IRTemp te = newTemp(ty);
   3343    IRTemp tl = newTemp(ty);
   3344    IRTemp resLo = newTemp(ty);
   3345 
   3346    vassert(size == 1 || size == 2 || size == 4);
   3347 
   3348    if (epartIsReg(rm)) {
   3349       assign(te, getIReg(size, eregOfRM(rm)));
   3350       delta++;
   3351    } else {
   3352       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
   3353       assign(te, loadLE(ty, mkexpr(addr)));
   3354       delta += alen;
   3355    }
   3356    d32 = getSDisp(litsize,delta);
   3357    delta += litsize;
   3358 
   3359    if (size == 1) d32 &= 0xFF;
   3360    if (size == 2) d32 &= 0xFFFF;
   3361 
   3362    assign(tl, mkU(ty,d32));
   3363 
   3364    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   3365 
   3366    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
   3367 
   3368    putIReg(size, gregOfRM(rm), mkexpr(resLo));
   3369 
   3370    DIP("imul %d, %s, %s\n", d32,
   3371        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
   3372        nameIReg(size,gregOfRM(rm)) );
   3373    return delta;
   3374 }
   3375 
   3376 
   3377 /* Generate an IR sequence to do a count-leading-zeroes operation on
   3378    the supplied IRTemp, and return a new IRTemp holding the result.
   3379    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
   3380    argument is zero, return the number of bits in the word (the
   3381    natural semantics). */
   3382 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   3383 {
   3384    vassert(ty == Ity_I32 || ty == Ity_I16);
   3385 
   3386    IRTemp src32 = newTemp(Ity_I32);
   3387    assign(src32, widenUto32( mkexpr(src) ));
   3388 
   3389    IRTemp src32x = newTemp(Ity_I32);
   3390    assign(src32x,
   3391           binop(Iop_Shl32, mkexpr(src32),
   3392                            mkU8(32 - 8 * sizeofIRType(ty))));
   3393 
   3394    // Clz32 has undefined semantics when its input is zero, so
   3395    // special-case around that.
   3396    IRTemp res32 = newTemp(Ity_I32);
   3397    assign(res32,
   3398           IRExpr_Mux0X(
   3399              unop(Iop_1Uto8,
   3400                   binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
   3401              unop(Iop_Clz32, mkexpr(src32x)),
   3402              mkU32(8 * sizeofIRType(ty))
   3403    ));
   3404 
   3405    IRTemp res = newTemp(ty);
   3406    assign(res, narrowTo(ty, mkexpr(res32)));
   3407    return res;
   3408 }
   3409 
   3410 
   3411 /*------------------------------------------------------------*/
   3412 /*---                                                      ---*/
   3413 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   3414 /*---                                                      ---*/
   3415 /*------------------------------------------------------------*/
   3416 
   3417 /* --- Helper functions for dealing with the register stack. --- */
   3418 
   3419 /* --- Set the emulation-warning pseudo-register. --- */
   3420 
   3421 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   3422 {
   3423    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3424    stmt( IRStmt_Put( OFFB_EMWARN, e ) );
   3425 }
   3426 
   3427 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   3428 
   3429 static IRExpr* mkQNaN64 ( void )
   3430 {
   3431   /* QNaN is 0 2047 1 0(51times)
   3432      == 0b 11111111111b 1 0(51times)
   3433      == 0x7FF8 0000 0000 0000
   3434    */
   3435    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   3436 }
   3437 
   3438 /* --------- Get/put the top-of-stack pointer. --------- */
   3439 
   3440 static IRExpr* get_ftop ( void )
   3441 {
   3442    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   3443 }
   3444 
   3445 static void put_ftop ( IRExpr* e )
   3446 {
   3447    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3448    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   3449 }
   3450 
   3451 /* --------- Get/put the C3210 bits. --------- */
   3452 
   3453 static IRExpr* get_C3210 ( void )
   3454 {
   3455    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
   3456 }
   3457 
   3458 static void put_C3210 ( IRExpr* e )
   3459 {
   3460    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   3461 }
   3462 
   3463 /* --------- Get/put the FPU rounding mode. --------- */
   3464 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   3465 {
   3466    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
   3467 }
   3468 
   3469 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   3470 {
   3471    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
   3472 }
   3473 
   3474 
   3475 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   3476 /* Produces a value in 0 .. 3, which is encoded as per the type
   3477    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   3478    per IRRoundingMode, we merely need to get it and mask it for
   3479    safety.
   3480 */
   3481 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   3482 {
   3483    return binop( Iop_And32, get_fpround(), mkU32(3) );
   3484 }
   3485 
   3486 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   3487 {
   3488    return mkU32(Irrm_NEAREST);
   3489 }
   3490 
   3491 
   3492 /* --------- Get/set FP register tag bytes. --------- */
   3493 
   3494 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   3495 
   3496 static void put_ST_TAG ( Int i, IRExpr* value )
   3497 {
   3498    IRRegArray* descr;
   3499    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   3500    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3501    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   3502 }
   3503 
   3504 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   3505    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   3506 
   3507 static IRExpr* get_ST_TAG ( Int i )
   3508 {
   3509    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3510    return IRExpr_GetI( descr, get_ftop(), i );
   3511 }
   3512 
   3513 
   3514 /* --------- Get/set FP registers. --------- */
   3515 
   3516 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   3517    register's tag to indicate the register is full.  The previous
   3518    state of the register is not checked. */
   3519 
   3520 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   3521 {
   3522    IRRegArray* descr;
   3523    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   3524    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3525    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   3526    /* Mark the register as in-use. */
   3527    put_ST_TAG(i, mkU8(1));
   3528 }
   3529 
   3530 /* Given i, and some expression e, emit
   3531       ST(i) = is_full(i) ? NaN : e
   3532    and set the tag accordingly.
   3533 */
   3534 
   3535 static void put_ST ( Int i, IRExpr* value )
   3536 {
   3537    put_ST_UNCHECKED( i,
   3538                      IRExpr_Mux0X( get_ST_TAG(i),
   3539                                    /* 0 means empty */
   3540                                    value,
   3541                                    /* non-0 means full */
   3542                                    mkQNaN64()
   3543                    )
   3544    );
   3545 }
   3546 
   3547 
   3548 /* Given i, generate an expression yielding 'ST(i)'. */
   3549 
   3550 static IRExpr* get_ST_UNCHECKED ( Int i )
   3551 {
   3552    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3553    return IRExpr_GetI( descr, get_ftop(), i );
   3554 }
   3555 
   3556 
   3557 /* Given i, generate an expression yielding
   3558   is_full(i) ? ST(i) : NaN
   3559 */
   3560 
   3561 static IRExpr* get_ST ( Int i )
   3562 {
   3563    return
   3564       IRExpr_Mux0X( get_ST_TAG(i),
   3565                     /* 0 means empty */
   3566                     mkQNaN64(),
   3567                     /* non-0 means full */
   3568                     get_ST_UNCHECKED(i));
   3569 }
   3570 
   3571 
   3572 /* Adjust FTOP downwards by one register. */
   3573 
   3574 static void fp_push ( void )
   3575 {
   3576    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   3577 }
   3578 
   3579 /* Adjust FTOP upwards by one register, and mark the vacated register
   3580    as empty.  */
   3581 
   3582 static void fp_pop ( void )
   3583 {
   3584    put_ST_TAG(0, mkU8(0));
   3585    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   3586 }
   3587 
   3588 /* Clear the C2 bit of the FPU status register, for
   3589    sin/cos/tan/sincos. */
   3590 
   3591 static void clear_C2 ( void )
   3592 {
   3593    put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
   3594 }
   3595 
   3596 /* Invent a plausible-looking FPU status word value:
   3597       ((ftop & 7) << 11) | (c3210 & 0x4700)
   3598  */
   3599 static IRExpr* get_FPU_sw ( void )
   3600 {
   3601    return
   3602       unop(Iop_32to16,
   3603            binop(Iop_Or32,
   3604                  binop(Iop_Shl32,
   3605                        binop(Iop_And32, get_ftop(), mkU32(7)),
   3606                              mkU8(11)),
   3607                        binop(Iop_And32, get_C3210(), mkU32(0x4700))
   3608       ));
   3609 }
   3610 
   3611 
   3612 /* ------------------------------------------------------- */
   3613 /* Given all that stack-mangling junk, we can now go ahead
   3614    and describe FP instructions.
   3615 */
   3616 
   3617 /* ST(0) = ST(0) `op` mem64/32(addr)
   3618    Need to check ST(0)'s tag on read, but not on write.
   3619 */
   3620 static
   3621 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   3622                          IROp op, Bool dbl )
   3623 {
   3624    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3625    if (dbl) {
   3626       put_ST_UNCHECKED(0,
   3627          triop( op,
   3628                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3629                 get_ST(0),
   3630                 loadLE(Ity_F64,mkexpr(addr))
   3631          ));
   3632    } else {
   3633       put_ST_UNCHECKED(0,
   3634          triop( op,
   3635                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3636                 get_ST(0),
   3637                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   3638          ));
   3639    }
   3640 }
   3641 
   3642 
   3643 /* ST(0) = mem64/32(addr) `op` ST(0)
   3644    Need to check ST(0)'s tag on read, but not on write.
   3645 */
   3646 static
   3647 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   3648                             IROp op, Bool dbl )
   3649 {
   3650    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3651    if (dbl) {
   3652       put_ST_UNCHECKED(0,
   3653          triop( op,
   3654                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3655                 loadLE(Ity_F64,mkexpr(addr)),
   3656                 get_ST(0)
   3657          ));
   3658    } else {
   3659       put_ST_UNCHECKED(0,
   3660          triop( op,
   3661                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3662                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   3663                 get_ST(0)
   3664          ));
   3665    }
   3666 }
   3667 
   3668 
   3669 /* ST(dst) = ST(dst) `op` ST(src).
   3670    Check dst and src tags when reading but not on write.
   3671 */
   3672 static
   3673 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3674                       Bool pop_after )
   3675 {
   3676    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3677                                  (Int)st_src, (Int)st_dst );
   3678    put_ST_UNCHECKED(
   3679       st_dst,
   3680       triop( op,
   3681              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3682              get_ST(st_dst),
   3683              get_ST(st_src) )
   3684    );
   3685    if (pop_after)
   3686       fp_pop();
   3687 }
   3688 
   3689 /* ST(dst) = ST(src) `op` ST(dst).
   3690    Check dst and src tags when reading but not on write.
   3691 */
   3692 static
   3693 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3694                          Bool pop_after )
   3695 {
   3696    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3697                                  (Int)st_src, (Int)st_dst );
   3698    put_ST_UNCHECKED(
   3699       st_dst,
   3700       triop( op,
   3701              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3702              get_ST(st_src),
   3703              get_ST(st_dst) )
   3704    );
   3705    if (pop_after)
   3706       fp_pop();
   3707 }
   3708 
   3709 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   3710 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   3711 {
   3712    DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
   3713    /* This is a bit of a hack (and isn't really right).  It sets
   3714       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   3715       documentation implies A and S are unchanged.
   3716    */
   3717    /* It's also fishy in that it is used both for COMIP and
   3718       UCOMIP, and they aren't the same (although similar). */
   3719    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   3720    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   3721    stmt( IRStmt_Put( OFFB_CC_DEP1,
   3722                      binop( Iop_And32,
   3723                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
   3724                             mkU32(0x45)
   3725        )));
   3726    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3727       elimination of previous stores to this field work better. */
   3728    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   3729    if (pop_after)
   3730       fp_pop();
   3731 }
   3732 
   3733 
   3734 static
   3735 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
   3736 {
   3737    Int    len;
   3738    UInt   r_src, r_dst;
   3739    HChar  dis_buf[50];
   3740    IRTemp t1, t2;
   3741 
   3742    /* On entry, delta points at the second byte of the insn (the modrm
   3743       byte).*/
   3744    UChar first_opcode = getIByte(delta-1);
   3745    UChar modrm        = getIByte(delta+0);
   3746 
   3747    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   3748 
   3749    if (first_opcode == 0xD8) {
   3750       if (modrm < 0xC0) {
   3751 
   3752          /* bits 5,4,3 are an opcode extension, and the modRM also
   3753            specifies an address. */
   3754          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3755          delta += len;
   3756 
   3757          switch (gregOfRM(modrm)) {
   3758 
   3759             case 0: /* FADD single-real */
   3760                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   3761                break;
   3762 
   3763             case 1: /* FMUL single-real */
   3764                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   3765                break;
   3766 
   3767             case 2: /* FCOM single-real */
   3768                DIP("fcoms %s\n", dis_buf);
   3769                /* This forces C1 to zero, which isn't right. */
   3770                put_C3210(
   3771                    binop( Iop_And32,
   3772                           binop(Iop_Shl32,
   3773                                 binop(Iop_CmpF64,
   3774                                       get_ST(0),
   3775                                       unop(Iop_F32toF64,
   3776                                            loadLE(Ity_F32,mkexpr(addr)))),
   3777                                 mkU8(8)),
   3778                           mkU32(0x4500)
   3779                    ));
   3780                break;
   3781 
   3782             case 3: /* FCOMP single-real */
   3783                DIP("fcomps %s\n", dis_buf);
   3784                /* This forces C1 to zero, which isn't right. */
   3785                put_C3210(
   3786                    binop( Iop_And32,
   3787                           binop(Iop_Shl32,
   3788                                 binop(Iop_CmpF64,
   3789                                       get_ST(0),
   3790                                       unop(Iop_F32toF64,
   3791                                            loadLE(Ity_F32,mkexpr(addr)))),
   3792                                 mkU8(8)),
   3793                           mkU32(0x4500)
   3794                    ));
   3795                fp_pop();
   3796                break;
   3797 
   3798             case 4: /* FSUB single-real */
   3799                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   3800                break;
   3801 
   3802             case 5: /* FSUBR single-real */
   3803                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   3804                break;
   3805 
   3806             case 6: /* FDIV single-real */
   3807                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   3808                break;
   3809 
   3810             case 7: /* FDIVR single-real */
   3811                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   3812                break;
   3813 
   3814             default:
   3815                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   3816                vex_printf("first_opcode == 0xD8\n");
   3817                goto decode_fail;
   3818          }
   3819       } else {
   3820          delta++;
   3821          switch (modrm) {
   3822 
   3823             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   3824                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   3825                break;
   3826 
   3827             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   3828                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   3829                break;
   3830 
   3831             /* Dunno if this is right */
   3832             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   3833                r_dst = (UInt)modrm - 0xD0;
   3834                DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
   3835                /* This forces C1 to zero, which isn't right. */
   3836                put_C3210(
   3837                    binop( Iop_And32,
   3838                           binop(Iop_Shl32,
   3839                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3840                                 mkU8(8)),
   3841                           mkU32(0x4500)
   3842                    ));
   3843                break;
   3844 
   3845             /* Dunno if this is right */
   3846             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   3847                r_dst = (UInt)modrm - 0xD8;
   3848                DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
   3849                /* This forces C1 to zero, which isn't right. */
   3850                put_C3210(
   3851                    binop( Iop_And32,
   3852                           binop(Iop_Shl32,
   3853                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3854                                 mkU8(8)),
   3855                           mkU32(0x4500)
   3856                    ));
   3857                fp_pop();
   3858                break;
   3859 
   3860             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   3861                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   3862                break;
   3863 
   3864             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   3865                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   3866                break;
   3867 
   3868             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   3869                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   3870                break;
   3871 
   3872             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   3873                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   3874                break;
   3875 
   3876             default:
   3877                goto decode_fail;
   3878          }
   3879       }
   3880    }
   3881 
   3882    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   3883    else
   3884    if (first_opcode == 0xD9) {
   3885       if (modrm < 0xC0) {
   3886 
   3887          /* bits 5,4,3 are an opcode extension, and the modRM also
   3888             specifies an address. */
   3889          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3890          delta += len;
   3891 
   3892          switch (gregOfRM(modrm)) {
   3893 
   3894             case 0: /* FLD single-real */
   3895                DIP("flds %s\n", dis_buf);
   3896                fp_push();
   3897                put_ST(0, unop(Iop_F32toF64,
   3898                               loadLE(Ity_F32, mkexpr(addr))));
   3899                break;
   3900 
   3901             case 2: /* FST single-real */
   3902                DIP("fsts %s\n", dis_buf);
   3903                storeLE(mkexpr(addr),
   3904                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   3905                break;
   3906 
   3907             case 3: /* FSTP single-real */
   3908                DIP("fstps %s\n", dis_buf);
   3909                storeLE(mkexpr(addr),
   3910                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   3911                fp_pop();
   3912                break;
   3913 
   3914             case 4: { /* FLDENV m28 */
   3915                /* Uses dirty helper:
   3916                      VexEmWarn x86g_do_FLDENV ( VexGuestX86State*, HWord ) */
   3917                IRTemp   ew = newTemp(Ity_I32);
   3918                IRDirty* d  = unsafeIRDirty_0_N (
   3919                                 0/*regparms*/,
   3920                                 "x86g_dirtyhelper_FLDENV",
   3921                                 &x86g_dirtyhelper_FLDENV,
   3922                                 mkIRExprVec_1( mkexpr(addr) )
   3923                              );
   3924                d->needsBBP = True;
   3925                d->tmp      = ew;
   3926                /* declare we're reading memory */
   3927                d->mFx   = Ifx_Read;
   3928                d->mAddr = mkexpr(addr);
   3929                d->mSize = 28;
   3930 
   3931                /* declare we're writing guest state */
   3932                d->nFxState = 4;
   3933 
   3934                d->fxState[0].fx     = Ifx_Write;
   3935                d->fxState[0].offset = OFFB_FTOP;
   3936                d->fxState[0].size   = sizeof(UInt);
   3937 
   3938                d->fxState[1].fx     = Ifx_Write;
   3939                d->fxState[1].offset = OFFB_FPTAGS;
   3940                d->fxState[1].size   = 8 * sizeof(UChar);
   3941 
   3942                d->fxState[2].fx     = Ifx_Write;
   3943                d->fxState[2].offset = OFFB_FPROUND;
   3944                d->fxState[2].size   = sizeof(UInt);
   3945 
   3946                d->fxState[3].fx     = Ifx_Write;
   3947                d->fxState[3].offset = OFFB_FC3210;
   3948                d->fxState[3].size   = sizeof(UInt);
   3949 
   3950                stmt( IRStmt_Dirty(d) );
   3951 
   3952                /* ew contains any emulation warning we may need to
   3953                   issue.  If needed, side-exit to the next insn,
   3954                   reporting the warning, so that Valgrind's dispatcher
   3955                   sees the warning. */
   3956                put_emwarn( mkexpr(ew) );
   3957                stmt(
   3958                   IRStmt_Exit(
   3959                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   3960                      Ijk_EmWarn,
   3961                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   3962                   )
   3963                );
   3964 
   3965                DIP("fldenv %s\n", dis_buf);
   3966                break;
   3967             }
   3968 
   3969             case 5: {/* FLDCW */
   3970                /* The only thing we observe in the control word is the
   3971                   rounding mode.  Therefore, pass the 16-bit value
   3972                   (x87 native-format control word) to a clean helper,
   3973                   getting back a 64-bit value, the lower half of which
   3974                   is the FPROUND value to store, and the upper half of
   3975                   which is the emulation-warning token which may be
   3976                   generated.
   3977                */
   3978                /* ULong x86g_check_fldcw ( UInt ); */
   3979                IRTemp t64 = newTemp(Ity_I64);
   3980                IRTemp ew = newTemp(Ity_I32);
   3981                DIP("fldcw %s\n", dis_buf);
   3982                assign( t64, mkIRExprCCall(
   3983                                Ity_I64, 0/*regparms*/,
   3984                                "x86g_check_fldcw",
   3985                                &x86g_check_fldcw,
   3986                                mkIRExprVec_1(
   3987                                   unop( Iop_16Uto32,
   3988                                         loadLE(Ity_I16, mkexpr(addr)))
   3989                                )
   3990                             )
   3991                      );
   3992 
   3993                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   3994                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   3995                put_emwarn( mkexpr(ew) );
   3996                /* Finally, if an emulation warning was reported,
   3997                   side-exit to the next insn, reporting the warning,
   3998                   so that Valgrind's dispatcher sees the warning. */
   3999                stmt(
   4000                   IRStmt_Exit(
   4001                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4002                      Ijk_EmWarn,
   4003                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   4004                   )
   4005                );
   4006                break;
   4007             }
   4008 
   4009             case 6: { /* FNSTENV m28 */
   4010                /* Uses dirty helper:
   4011                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
   4012                IRDirty* d = unsafeIRDirty_0_N (
   4013                                0/*regparms*/,
   4014                                "x86g_dirtyhelper_FSTENV",
   4015                                &x86g_dirtyhelper_FSTENV,
   4016                                mkIRExprVec_1( mkexpr(addr) )
   4017                             );
   4018                d->needsBBP = True;
   4019                /* declare we're writing memory */
   4020                d->mFx   = Ifx_Write;
   4021                d->mAddr = mkexpr(addr);
   4022                d->mSize = 28;
   4023 
   4024                /* declare we're reading guest state */
   4025                d->nFxState = 4;
   4026 
   4027                d->fxState[0].fx     = Ifx_Read;
   4028                d->fxState[0].offset = OFFB_FTOP;
   4029                d->fxState[0].size   = sizeof(UInt);
   4030 
   4031                d->fxState[1].fx     = Ifx_Read;
   4032                d->fxState[1].offset = OFFB_FPTAGS;
   4033                d->fxState[1].size   = 8 * sizeof(UChar);
   4034 
   4035                d->fxState[2].fx     = Ifx_Read;
   4036                d->fxState[2].offset = OFFB_FPROUND;
   4037                d->fxState[2].size   = sizeof(UInt);
   4038 
   4039                d->fxState[3].fx     = Ifx_Read;
   4040                d->fxState[3].offset = OFFB_FC3210;
   4041                d->fxState[3].size   = sizeof(UInt);
   4042 
   4043                stmt( IRStmt_Dirty(d) );
   4044 
   4045                DIP("fnstenv %s\n", dis_buf);
   4046                break;
   4047             }
   4048 
   4049             case 7: /* FNSTCW */
   4050               /* Fake up a native x87 FPU control word.  The only
   4051                  thing it depends on is FPROUND[1:0], so call a clean
   4052                  helper to cook it up. */
   4053                /* UInt x86g_create_fpucw ( UInt fpround ) */
   4054                DIP("fnstcw %s\n", dis_buf);
   4055                storeLE(
   4056                   mkexpr(addr),
   4057                   unop( Iop_32to16,
   4058                         mkIRExprCCall(
   4059                            Ity_I32, 0/*regp*/,
   4060                            "x86g_create_fpucw", &x86g_create_fpucw,
   4061                            mkIRExprVec_1( get_fpround() )
   4062                         )
   4063                   )
   4064                );
   4065                break;
   4066 
   4067             default:
   4068                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4069                vex_printf("first_opcode == 0xD9\n");
   4070                goto decode_fail;
   4071          }
   4072 
   4073       } else {
   4074          delta++;
   4075          switch (modrm) {
   4076 
   4077             case 0xC0 ... 0xC7: /* FLD %st(?) */
   4078                r_src = (UInt)modrm - 0xC0;
   4079                DIP("fld %%st(%d)\n", (Int)r_src);
   4080                t1 = newTemp(Ity_F64);
   4081                assign(t1, get_ST(r_src));
   4082                fp_push();
   4083                put_ST(0, mkexpr(t1));
   4084                break;
   4085 
   4086             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   4087                r_src = (UInt)modrm - 0xC8;
   4088                DIP("fxch %%st(%d)\n", (Int)r_src);
   4089                t1 = newTemp(Ity_F64);
   4090                t2 = newTemp(Ity_F64);
   4091                assign(t1, get_ST(0));
   4092                assign(t2, get_ST(r_src));
   4093                put_ST_UNCHECKED(0, mkexpr(t2));
   4094                put_ST_UNCHECKED(r_src, mkexpr(t1));
   4095                break;
   4096 
   4097             case 0xE0: /* FCHS */
   4098                DIP("fchs\n");
   4099                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   4100                break;
   4101 
   4102             case 0xE1: /* FABS */
   4103                DIP("fabs\n");
   4104                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   4105                break;
   4106 
   4107             case 0xE4: /* FTST */
   4108                DIP("ftst\n");
   4109                /* This forces C1 to zero, which isn't right. */
   4110                /* Well, in fact the Intel docs say (bizarrely): "C1 is
   4111                   set to 0 if stack underflow occurred; otherwise, set
   4112                   to 0" which is pretty nonsensical.  I guess it's a
   4113                    typo. */
   4114                put_C3210(
   4115                    binop( Iop_And32,
   4116                           binop(Iop_Shl32,
   4117                                 binop(Iop_CmpF64,
   4118                                       get_ST(0),
   4119                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
   4120                                 mkU8(8)),
   4121                           mkU32(0x4500)
   4122                    ));
   4123                break;
   4124 
   4125             case 0xE5: { /* FXAM */
   4126                /* This is an interesting one.  It examines %st(0),
   4127                   regardless of whether the tag says it's empty or not.
   4128                   Here, just pass both the tag (in our format) and the
   4129                   value (as a double, actually a ULong) to a helper
   4130                   function. */
   4131                IRExpr** args
   4132                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
   4133                                    unop(Iop_ReinterpF64asI64,
   4134                                         get_ST_UNCHECKED(0)) );
   4135                put_C3210(mkIRExprCCall(
   4136                             Ity_I32,
   4137                             0/*regparm*/,
   4138                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
   4139                             args
   4140                         ));
   4141                DIP("fxam\n");
   4142                break;
   4143             }
   4144 
   4145             case 0xE8: /* FLD1 */
   4146                DIP("fld1\n");
   4147                fp_push();
   4148                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   4149                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   4150                break;
   4151 
   4152             case 0xE9: /* FLDL2T */
   4153                DIP("fldl2t\n");
   4154                fp_push();
   4155                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   4156                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   4157                break;
   4158 
   4159             case 0xEA: /* FLDL2E */
   4160                DIP("fldl2e\n");
   4161                fp_push();
   4162                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   4163                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   4164                break;
   4165 
   4166             case 0xEB: /* FLDPI */
   4167                DIP("fldpi\n");
   4168                fp_push();
   4169                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   4170                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   4171                break;
   4172 
   4173             case 0xEC: /* FLDLG2 */
   4174                DIP("fldlg2\n");
   4175                fp_push();
   4176                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   4177                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   4178                break;
   4179 
   4180             case 0xED: /* FLDLN2 */
   4181                DIP("fldln2\n");
   4182                fp_push();
   4183                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   4184                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   4185                break;
   4186 
   4187             case 0xEE: /* FLDZ */
   4188                DIP("fldz\n");
   4189                fp_push();
   4190                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   4191                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   4192                break;
   4193 
   4194             case 0xF0: /* F2XM1 */
   4195                DIP("f2xm1\n");
   4196                put_ST_UNCHECKED(0,
   4197                   binop(Iop_2xm1F64,
   4198                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4199                         get_ST(0)));
   4200                break;
   4201 
   4202             case 0xF1: /* FYL2X */
   4203                DIP("fyl2x\n");
   4204                put_ST_UNCHECKED(1,
   4205                   triop(Iop_Yl2xF64,
   4206                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4207                         get_ST(1),
   4208                         get_ST(0)));
   4209                fp_pop();
   4210                break;
   4211 
   4212             case 0xF2: /* FPTAN */
   4213                DIP("ftan\n");
   4214                put_ST_UNCHECKED(0,
   4215                   binop(Iop_TanF64,
   4216                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4217                         get_ST(0)));
   4218                fp_push();
   4219                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
   4220                clear_C2(); /* HACK */
   4221                break;
   4222 
   4223             case 0xF3: /* FPATAN */
   4224                DIP("fpatan\n");
   4225                put_ST_UNCHECKED(1,
   4226                   triop(Iop_AtanF64,
   4227                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4228                         get_ST(1),
   4229                         get_ST(0)));
   4230                fp_pop();
   4231                break;
   4232 
   4233             case 0xF4: { /* FXTRACT */
   4234                IRTemp argF = newTemp(Ity_F64);
   4235                IRTemp sigF = newTemp(Ity_F64);
   4236                IRTemp expF = newTemp(Ity_F64);
   4237                IRTemp argI = newTemp(Ity_I64);
   4238                IRTemp sigI = newTemp(Ity_I64);
   4239                IRTemp expI = newTemp(Ity_I64);
   4240                DIP("fxtract\n");
   4241                assign( argF, get_ST(0) );
   4242                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   4243                assign( sigI,
   4244                        mkIRExprCCall(
   4245                           Ity_I64, 0/*regparms*/,
   4246                           "x86amd64g_calculate_FXTRACT",
   4247                           &x86amd64g_calculate_FXTRACT,
   4248                           mkIRExprVec_2( mkexpr(argI),
   4249                                          mkIRExpr_HWord(0)/*sig*/ ))
   4250                );
   4251                assign( expI,
   4252                        mkIRExprCCall(
   4253                           Ity_I64, 0/*regparms*/,
   4254                           "x86amd64g_calculate_FXTRACT",
   4255                           &x86amd64g_calculate_FXTRACT,
   4256                           mkIRExprVec_2( mkexpr(argI),
   4257                                          mkIRExpr_HWord(1)/*exp*/ ))
   4258                );
   4259                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   4260                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   4261                /* exponent */
   4262                put_ST_UNCHECKED(0, mkexpr(expF) );
   4263                fp_push();
   4264                /* significand */
   4265                put_ST(0, mkexpr(sigF) );
   4266                break;
   4267             }
   4268 
   4269             case 0xF5: { /* FPREM1 -- IEEE compliant */
   4270                IRTemp a1 = newTemp(Ity_F64);
   4271                IRTemp a2 = newTemp(Ity_F64);
   4272                DIP("fprem1\n");
   4273                /* Do FPREM1 twice, once to get the remainder, and once
   4274                   to get the C3210 flag values. */
   4275                assign( a1, get_ST(0) );
   4276                assign( a2, get_ST(1) );
   4277                put_ST_UNCHECKED(0,
   4278                   triop(Iop_PRem1F64,
   4279                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4280                         mkexpr(a1),
   4281                         mkexpr(a2)));
   4282                put_C3210(
   4283                   triop(Iop_PRem1C3210F64,
   4284                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4285                         mkexpr(a1),
   4286                         mkexpr(a2)) );
   4287                break;
   4288             }
   4289 
   4290             case 0xF7: /* FINCSTP */
   4291                DIP("fprem\n");
   4292                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   4293                break;
   4294 
   4295             case 0xF8: { /* FPREM -- not IEEE compliant */
   4296                IRTemp a1 = newTemp(Ity_F64);
   4297                IRTemp a2 = newTemp(Ity_F64);
   4298                DIP("fprem\n");
   4299                /* Do FPREM twice, once to get the remainder, and once
   4300                   to get the C3210 flag values. */
   4301                assign( a1, get_ST(0) );
   4302                assign( a2, get_ST(1) );
   4303                put_ST_UNCHECKED(0,
   4304                   triop(Iop_PRemF64,
   4305                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4306                         mkexpr(a1),
   4307                         mkexpr(a2)));
   4308                put_C3210(
   4309                   triop(Iop_PRemC3210F64,
   4310                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4311                         mkexpr(a1),
   4312                         mkexpr(a2)) );
   4313                break;
   4314             }
   4315 
   4316             case 0xF9: /* FYL2XP1 */
   4317                DIP("fyl2xp1\n");
   4318                put_ST_UNCHECKED(1,
   4319                   triop(Iop_Yl2xp1F64,
   4320                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4321                         get_ST(1),
   4322                         get_ST(0)));
   4323                fp_pop();
   4324                break;
   4325 
   4326             case 0xFA: /* FSQRT */
   4327                DIP("fsqrt\n");
   4328                put_ST_UNCHECKED(0,
   4329                   binop(Iop_SqrtF64,
   4330                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4331                         get_ST(0)));
   4332                break;
   4333 
   4334             case 0xFB: { /* FSINCOS */
   4335                IRTemp a1 = newTemp(Ity_F64);
   4336                assign( a1, get_ST(0) );
   4337                DIP("fsincos\n");
   4338                put_ST_UNCHECKED(0,
   4339                   binop(Iop_SinF64,
   4340                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4341                         mkexpr(a1)));
   4342                fp_push();
   4343                put_ST(0,
   4344                   binop(Iop_CosF64,
   4345                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4346                         mkexpr(a1)));
   4347                clear_C2(); /* HACK */
   4348                break;
   4349             }
   4350 
   4351             case 0xFC: /* FRNDINT */
   4352                DIP("frndint\n");
   4353                put_ST_UNCHECKED(0,
   4354                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   4355                break;
   4356 
   4357             case 0xFD: /* FSCALE */
   4358                DIP("fscale\n");
   4359                put_ST_UNCHECKED(0,
   4360                   triop(Iop_ScaleF64,
   4361                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4362                         get_ST(0),
   4363                         get_ST(1)));
   4364                break;
   4365 
   4366             case 0xFE: /* FSIN */
   4367                DIP("fsin\n");
   4368                put_ST_UNCHECKED(0,
   4369                   binop(Iop_SinF64,
   4370                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4371                         get_ST(0)));
   4372                clear_C2(); /* HACK */
   4373                break;
   4374 
   4375             case 0xFF: /* FCOS */
   4376                DIP("fcos\n");
   4377                put_ST_UNCHECKED(0,
   4378                   binop(Iop_CosF64,
   4379                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4380                         get_ST(0)));
   4381                clear_C2(); /* HACK */
   4382                break;
   4383 
   4384             default:
   4385                goto decode_fail;
   4386          }
   4387       }
   4388    }
   4389 
   4390    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   4391    else
   4392    if (first_opcode == 0xDA) {
   4393 
   4394       if (modrm < 0xC0) {
   4395 
   4396          /* bits 5,4,3 are an opcode extension, and the modRM also
   4397             specifies an address. */
   4398          IROp   fop;
   4399          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4400          delta += len;
   4401          switch (gregOfRM(modrm)) {
   4402 
   4403             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   4404                DIP("fiaddl %s\n", dis_buf);
   4405                fop = Iop_AddF64;
   4406                goto do_fop_m32;
   4407 
   4408             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   4409                DIP("fimull %s\n", dis_buf);
   4410                fop = Iop_MulF64;
   4411                goto do_fop_m32;
   4412 
   4413             case 2: /* FICOM m32int */
   4414                DIP("ficoml %s\n", dis_buf);
   4415                /* This forces C1 to zero, which isn't right. */
   4416                put_C3210(
   4417                    binop( Iop_And32,
   4418                           binop(Iop_Shl32,
   4419                                 binop(Iop_CmpF64,
   4420                                       get_ST(0),
   4421                                       unop(Iop_I32StoF64,
   4422                                            loadLE(Ity_I32,mkexpr(addr)))),
   4423                                 mkU8(8)),
   4424                           mkU32(0x4500)
   4425                    ));
   4426                break;
   4427 
   4428             case 3: /* FICOMP m32int */
   4429                DIP("ficompl %s\n", dis_buf);
   4430                /* This forces C1 to zero, which isn't right. */
   4431                put_C3210(
   4432                    binop( Iop_And32,
   4433                           binop(Iop_Shl32,
   4434                                 binop(Iop_CmpF64,
   4435                                       get_ST(0),
   4436                                       unop(Iop_I32StoF64,
   4437                                            loadLE(Ity_I32,mkexpr(addr)))),
   4438                                 mkU8(8)),
   4439                           mkU32(0x4500)
   4440                    ));
   4441                fp_pop();
   4442                break;
   4443 
   4444             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   4445                DIP("fisubl %s\n", dis_buf);
   4446                fop = Iop_SubF64;
   4447                goto do_fop_m32;
   4448 
   4449             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   4450                DIP("fisubrl %s\n", dis_buf);
   4451                fop = Iop_SubF64;
   4452                goto do_foprev_m32;
   4453 
   4454             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   4455                DIP("fidivl %s\n", dis_buf);
   4456                fop = Iop_DivF64;
   4457                goto do_fop_m32;
   4458 
   4459             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   4460                DIP("fidivrl %s\n", dis_buf);
   4461                fop = Iop_DivF64;
   4462                goto do_foprev_m32;
   4463 
   4464             do_fop_m32:
   4465                put_ST_UNCHECKED(0,
   4466                   triop(fop,
   4467                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4468                         get_ST(0),
   4469                         unop(Iop_I32StoF64,
   4470                              loadLE(Ity_I32, mkexpr(addr)))));
   4471                break;
   4472 
   4473             do_foprev_m32:
   4474                put_ST_UNCHECKED(0,
   4475                   triop(fop,
   4476                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4477                         unop(Iop_I32StoF64,
   4478                              loadLE(Ity_I32, mkexpr(addr))),
   4479                         get_ST(0)));
   4480                break;
   4481 
   4482             default:
   4483                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4484                vex_printf("first_opcode == 0xDA\n");
   4485                goto decode_fail;
   4486          }
   4487 
   4488       } else {
   4489 
   4490          delta++;
   4491          switch (modrm) {
   4492 
   4493             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   4494                r_src = (UInt)modrm - 0xC0;
   4495                DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
   4496                put_ST_UNCHECKED(0,
   4497                                 IRExpr_Mux0X(
   4498                                     unop(Iop_1Uto8,
   4499                                          mk_x86g_calculate_condition(X86CondB)),
   4500                                     get_ST(0), get_ST(r_src)) );
   4501                break;
   4502 
   4503             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   4504                r_src = (UInt)modrm - 0xC8;
   4505                DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
   4506                put_ST_UNCHECKED(0,
   4507                                 IRExpr_Mux0X(
   4508                                     unop(Iop_1Uto8,
   4509                                          mk_x86g_calculate_condition(X86CondZ)),
   4510                                     get_ST(0), get_ST(r_src)) );
   4511                break;
   4512 
   4513             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   4514                r_src = (UInt)modrm - 0xD0;
   4515                DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
   4516                put_ST_UNCHECKED(0,
   4517                                 IRExpr_Mux0X(
   4518                                     unop(Iop_1Uto8,
   4519                                          mk_x86g_calculate_condition(X86CondBE)),
   4520                                     get_ST(0), get_ST(r_src)) );
   4521                break;
   4522 
   4523             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   4524                r_src = (UInt)modrm - 0xD8;
   4525                DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
   4526                put_ST_UNCHECKED(0,
   4527                                 IRExpr_Mux0X(
   4528                                     unop(Iop_1Uto8,
   4529                                          mk_x86g_calculate_condition(X86CondP)),
   4530                                     get_ST(0), get_ST(r_src)) );
   4531                break;
   4532 
   4533             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   4534                DIP("fucompp %%st(0),%%st(1)\n");
   4535                /* This forces C1 to zero, which isn't right. */
   4536                put_C3210(
   4537                    binop( Iop_And32,
   4538                           binop(Iop_Shl32,
   4539                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   4540                                 mkU8(8)),
   4541                           mkU32(0x4500)
   4542                    ));
   4543                fp_pop();
   4544                fp_pop();
   4545                break;
   4546 
   4547             default:
   4548                goto decode_fail;
   4549          }
   4550 
   4551       }
   4552    }
   4553 
   4554    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   4555    else
   4556    if (first_opcode == 0xDB) {
   4557       if (modrm < 0xC0) {
   4558 
   4559          /* bits 5,4,3 are an opcode extension, and the modRM also
   4560             specifies an address. */
   4561          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4562          delta += len;
   4563 
   4564          switch (gregOfRM(modrm)) {
   4565 
   4566             case 0: /* FILD m32int */
   4567                DIP("fildl %s\n", dis_buf);
   4568                fp_push();
   4569                put_ST(0, unop(Iop_I32StoF64,
   4570                               loadLE(Ity_I32, mkexpr(addr))));
   4571                break;
   4572 
   4573             case 1: /* FISTTPL m32 (SSE3) */
   4574                DIP("fisttpl %s\n", dis_buf);
   4575                storeLE( mkexpr(addr),
   4576                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   4577                fp_pop();
   4578                break;
   4579 
   4580             case 2: /* FIST m32 */
   4581                DIP("fistl %s\n", dis_buf);
   4582                storeLE( mkexpr(addr),
   4583                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4584                break;
   4585 
   4586             case 3: /* FISTP m32 */
   4587                DIP("fistpl %s\n", dis_buf);
   4588                storeLE( mkexpr(addr),
   4589                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4590                fp_pop();
   4591                break;
   4592 
   4593             case 5: { /* FLD extended-real */
   4594                /* Uses dirty helper:
   4595                      ULong x86g_dirtyhelper_loadF80le ( UInt )
   4596                   addr holds the address.  First, do a dirty call to
   4597                   get hold of the data. */
   4598                IRTemp   val  = newTemp(Ity_I64);
   4599                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   4600 
   4601                IRDirty* d = unsafeIRDirty_1_N (
   4602                                val,
   4603                                0/*regparms*/,
   4604                                "x86g_dirtyhelper_loadF80le",
   4605                                &x86g_dirtyhelper_loadF80le,
   4606                                args
   4607                             );
   4608                /* declare that we're reading memory */
   4609                d->mFx   = Ifx_Read;
   4610                d->mAddr = mkexpr(addr);
   4611                d->mSize = 10;
   4612 
   4613                /* execute the dirty call, dumping the result in val. */
   4614                stmt( IRStmt_Dirty(d) );
   4615                fp_push();
   4616                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   4617 
   4618                DIP("fldt %s\n", dis_buf);
   4619                break;
   4620             }
   4621 
   4622             case 7: { /* FSTP extended-real */
   4623                /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
   4624                IRExpr** args
   4625                   = mkIRExprVec_2( mkexpr(addr),
   4626                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   4627 
   4628                IRDirty* d = unsafeIRDirty_0_N (
   4629                                0/*regparms*/,
   4630                                "x86g_dirtyhelper_storeF80le",
   4631                                &x86g_dirtyhelper_storeF80le,
   4632                                args
   4633                             );
   4634                /* declare we're writing memory */
   4635                d->mFx   = Ifx_Write;
   4636                d->mAddr = mkexpr(addr);
   4637                d->mSize = 10;
   4638 
   4639                /* execute the dirty call. */
   4640                stmt( IRStmt_Dirty(d) );
   4641                fp_pop();
   4642 
   4643                DIP("fstpt\n %s", dis_buf);
   4644                break;
   4645             }
   4646 
   4647             default:
   4648                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4649                vex_printf("first_opcode == 0xDB\n");
   4650                goto decode_fail;
   4651          }
   4652 
   4653       } else {
   4654 
   4655          delta++;
   4656          switch (modrm) {
   4657 
   4658             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   4659                r_src = (UInt)modrm - 0xC0;
   4660                DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
   4661                put_ST_UNCHECKED(0,
   4662                                 IRExpr_Mux0X(
   4663                                     unop(Iop_1Uto8,
   4664                                          mk_x86g_calculate_condition(X86CondNB)),
   4665                                     get_ST(0), get_ST(r_src)) );
   4666                break;
   4667 
   4668             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   4669                r_src = (UInt)modrm - 0xC8;
   4670                DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
   4671                put_ST_UNCHECKED(0,
   4672                                 IRExpr_Mux0X(
   4673                                     unop(Iop_1Uto8,
   4674                                          mk_x86g_calculate_condition(X86CondNZ)),
   4675                                     get_ST(0), get_ST(r_src)) );
   4676                break;
   4677 
   4678             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   4679                r_src = (UInt)modrm - 0xD0;
   4680                DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
   4681                put_ST_UNCHECKED(0,
   4682                                 IRExpr_Mux0X(
   4683                                     unop(Iop_1Uto8,
   4684                                          mk_x86g_calculate_condition(X86CondNBE)),
   4685                                     get_ST(0), get_ST(r_src)) );
   4686                break;
   4687 
   4688             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   4689                r_src = (UInt)modrm - 0xD8;
   4690                DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
   4691                put_ST_UNCHECKED(0,
   4692                                 IRExpr_Mux0X(
   4693                                     unop(Iop_1Uto8,
   4694                                          mk_x86g_calculate_condition(X86CondNP)),
   4695                                     get_ST(0), get_ST(r_src)) );
   4696                break;
   4697 
   4698             case 0xE2:
   4699                DIP("fnclex\n");
   4700                break;
   4701 
   4702             case 0xE3: {
   4703                /* Uses dirty helper:
   4704                      void x86g_do_FINIT ( VexGuestX86State* ) */
   4705                IRDirty* d  = unsafeIRDirty_0_N (
   4706                                 0/*regparms*/,
   4707                                 "x86g_dirtyhelper_FINIT",
   4708                                 &x86g_dirtyhelper_FINIT,
   4709                                 mkIRExprVec_0()
   4710                              );
   4711                d->needsBBP = True;
   4712 
   4713                /* declare we're writing guest state */
   4714                d->nFxState = 5;
   4715 
   4716                d->fxState[0].fx     = Ifx_Write;
   4717                d->fxState[0].offset = OFFB_FTOP;
   4718                d->fxState[0].size   = sizeof(UInt);
   4719 
   4720                d->fxState[1].fx     = Ifx_Write;
   4721                d->fxState[1].offset = OFFB_FPREGS;
   4722                d->fxState[1].size   = 8 * sizeof(ULong);
   4723 
   4724                d->fxState[2].fx     = Ifx_Write;
   4725                d->fxState[2].offset = OFFB_FPTAGS;
   4726                d->fxState[2].size   = 8 * sizeof(UChar);
   4727 
   4728                d->fxState[3].fx     = Ifx_Write;
   4729                d->fxState[3].offset = OFFB_FPROUND;
   4730                d->fxState[3].size   = sizeof(UInt);
   4731 
   4732                d->fxState[4].fx     = Ifx_Write;
   4733                d->fxState[4].offset = OFFB_FC3210;
   4734                d->fxState[4].size   = sizeof(UInt);
   4735 
   4736                stmt( IRStmt_Dirty(d) );
   4737 
   4738                DIP("fninit\n");
   4739                break;
   4740             }
   4741 
   4742             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   4743                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   4744                break;
   4745 
   4746             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   4747                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   4748                break;
   4749 
   4750             default:
   4751                goto decode_fail;
   4752          }
   4753       }
   4754    }
   4755 
   4756    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   4757    else
   4758    if (first_opcode == 0xDC) {
   4759       if (modrm < 0xC0) {
   4760 
   4761          /* bits 5,4,3 are an opcode extension, and the modRM also
   4762             specifies an address. */
   4763          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4764          delta += len;
   4765 
   4766          switch (gregOfRM(modrm)) {
   4767 
   4768             case 0: /* FADD double-real */
   4769                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   4770                break;
   4771 
   4772             case 1: /* FMUL double-real */
   4773                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   4774                break;
   4775 
   4776             case 2: /* FCOM double-real */
   4777                DIP("fcoml %s\n", dis_buf);
   4778                /* This forces C1 to zero, which isn't right. */
   4779                put_C3210(
   4780                    binop( Iop_And32,
   4781                           binop(Iop_Shl32,
   4782                                 binop(Iop_CmpF64,
   4783                                       get_ST(0),
   4784                                       loadLE(Ity_F64,mkexpr(addr))),
   4785                                 mkU8(8)),
   4786                           mkU32(0x4500)
   4787                    ));
   4788                break;
   4789 
   4790             case 3: /* FCOMP double-real */
   4791                DIP("fcompl %s\n", dis_buf);
   4792                /* This forces C1 to zero, which isn't right. */
   4793                put_C3210(
   4794                    binop( Iop_And32,
   4795                           binop(Iop_Shl32,
   4796                                 binop(Iop_CmpF64,
   4797                                       get_ST(0),
   4798                                       loadLE(Ity_F64,mkexpr(addr))),
   4799                                 mkU8(8)),
   4800                           mkU32(0x4500)
   4801                    ));
   4802                fp_pop();
   4803                break;
   4804 
   4805             case 4: /* FSUB double-real */
   4806                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   4807                break;
   4808 
   4809             case 5: /* FSUBR double-real */
   4810                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   4811                break;
   4812 
   4813             case 6: /* FDIV double-real */
   4814                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   4815                break;
   4816 
   4817             case 7: /* FDIVR double-real */
   4818                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   4819                break;
   4820 
   4821             default:
   4822                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4823                vex_printf("first_opcode == 0xDC\n");
   4824                goto decode_fail;
   4825          }
   4826 
   4827       } else {
   4828 
   4829          delta++;
   4830          switch (modrm) {
   4831 
   4832             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   4833                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   4834                break;
   4835 
   4836             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   4837                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   4838                break;
   4839 
   4840             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   4841                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   4842                break;
   4843 
   4844             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   4845                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   4846                break;
   4847 
   4848             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   4849                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   4850                break;
   4851 
   4852             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   4853                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   4854                break;
   4855 
   4856             default:
   4857                goto decode_fail;
   4858          }
   4859 
   4860       }
   4861    }
   4862 
   4863    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   4864    else
   4865    if (first_opcode == 0xDD) {
   4866 
   4867       if (modrm < 0xC0) {
   4868 
   4869          /* bits 5,4,3 are an opcode extension, and the modRM also
   4870             specifies an address. */
   4871          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4872          delta += len;
   4873 
   4874          switch (gregOfRM(modrm)) {
   4875 
   4876             case 0: /* FLD double-real */
   4877                DIP("fldl %s\n", dis_buf);
   4878                fp_push();
   4879                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   4880                break;
   4881 
   4882             case 1: /* FISTTPQ m64 (SSE3) */
   4883                DIP("fistppll %s\n", dis_buf);
   4884                storeLE( mkexpr(addr),
   4885                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   4886                fp_pop();
   4887                break;
   4888 
   4889             case 2: /* FST double-real */
   4890                DIP("fstl %s\n", dis_buf);
   4891                storeLE(mkexpr(addr), get_ST(0));
   4892                break;
   4893 
   4894             case 3: /* FSTP double-real */
   4895                DIP("fstpl %s\n", dis_buf);
   4896                storeLE(mkexpr(addr), get_ST(0));
   4897                fp_pop();
   4898                break;
   4899 
   4900             case 4: { /* FRSTOR m108 */
   4901                /* Uses dirty helper:
   4902                      VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
   4903                IRTemp   ew = newTemp(Ity_I32);
   4904                IRDirty* d  = unsafeIRDirty_0_N (
   4905                                 0/*regparms*/,
   4906                                 "x86g_dirtyhelper_FRSTOR",
   4907                                 &x86g_dirtyhelper_FRSTOR,
   4908                                 mkIRExprVec_1( mkexpr(addr) )
   4909                              );
   4910                d->needsBBP = True;
   4911                d->tmp      = ew;
   4912                /* declare we're reading memory */
   4913                d->mFx   = Ifx_Read;
   4914                d->mAddr = mkexpr(addr);
   4915                d->mSize = 108;
   4916 
   4917                /* declare we're writing guest state */
   4918                d->nFxState = 5;
   4919 
   4920                d->fxState[0].fx     = Ifx_Write;
   4921                d->fxState[0].offset = OFFB_FTOP;
   4922                d->fxState[0].size   = sizeof(UInt);
   4923 
   4924                d->fxState[1].fx     = Ifx_Write;
   4925                d->fxState[1].offset = OFFB_FPREGS;
   4926                d->fxState[1].size   = 8 * sizeof(ULong);
   4927 
   4928                d->fxState[2].fx     = Ifx_Write;
   4929                d->fxState[2].offset = OFFB_FPTAGS;
   4930                d->fxState[2].size   = 8 * sizeof(UChar);
   4931 
   4932                d->fxState[3].fx     = Ifx_Write;
   4933                d->fxState[3].offset = OFFB_FPROUND;
   4934                d->fxState[3].size   = sizeof(UInt);
   4935 
   4936                d->fxState[4].fx     = Ifx_Write;
   4937                d->fxState[4].offset = OFFB_FC3210;
   4938                d->fxState[4].size   = sizeof(UInt);
   4939 
   4940                stmt( IRStmt_Dirty(d) );
   4941 
   4942                /* ew contains any emulation warning we may need to
   4943                   issue.  If needed, side-exit to the next insn,
   4944                   reporting the warning, so that Valgrind's dispatcher
   4945                   sees the warning. */
   4946                put_emwarn( mkexpr(ew) );
   4947                stmt(
   4948                   IRStmt_Exit(
   4949                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4950                      Ijk_EmWarn,
   4951                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   4952                   )
   4953                );
   4954 
   4955                DIP("frstor %s\n", dis_buf);
   4956                break;
   4957             }
   4958 
   4959             case 6: { /* FNSAVE m108 */
   4960                /* Uses dirty helper:
   4961                      void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
   4962                IRDirty* d = unsafeIRDirty_0_N (
   4963                                0/*regparms*/,
   4964                                "x86g_dirtyhelper_FSAVE",
   4965                                &x86g_dirtyhelper_FSAVE,
   4966                                mkIRExprVec_1( mkexpr(addr) )
   4967                             );
   4968                d->needsBBP = True;
   4969                /* declare we're writing memory */
   4970                d->mFx   = Ifx_Write;
   4971                d->mAddr = mkexpr(addr);
   4972                d->mSize = 108;
   4973 
   4974                /* declare we're reading guest state */
   4975                d->nFxState = 5;
   4976 
   4977                d->fxState[0].fx     = Ifx_Read;
   4978                d->fxState[0].offset = OFFB_FTOP;
   4979                d->fxState[0].size   = sizeof(UInt);
   4980 
   4981                d->fxState[1].fx     = Ifx_Read;
   4982                d->fxState[1].offset = OFFB_FPREGS;
   4983                d->fxState[1].size   = 8 * sizeof(ULong);
   4984 
   4985                d->fxState[2].fx     = Ifx_Read;
   4986                d->fxState[2].offset = OFFB_FPTAGS;
   4987                d->fxState[2].size   = 8 * sizeof(UChar);
   4988 
   4989                d->fxState[3].fx     = Ifx_Read;
   4990                d->fxState[3].offset = OFFB_FPROUND;
   4991                d->fxState[3].size   = sizeof(UInt);
   4992 
   4993                d->fxState[4].fx     = Ifx_Read;
   4994                d->fxState[4].offset = OFFB_FC3210;
   4995                d->fxState[4].size   = sizeof(UInt);
   4996 
   4997                stmt( IRStmt_Dirty(d) );
   4998 
   4999                DIP("fnsave %s\n", dis_buf);
   5000                break;
   5001             }
   5002 
   5003             case 7: { /* FNSTSW m16 */
   5004                IRExpr* sw = get_FPU_sw();
   5005                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   5006                storeLE( mkexpr(addr), sw );
   5007                DIP("fnstsw %s\n", dis_buf);
   5008                break;
   5009             }
   5010 
   5011             default:
   5012                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5013                vex_printf("first_opcode == 0xDD\n");
   5014                goto decode_fail;
   5015          }
   5016       } else {
   5017          delta++;
   5018          switch (modrm) {
   5019 
   5020             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   5021                r_dst = (UInt)modrm - 0xC0;
   5022                DIP("ffree %%st(%d)\n", (Int)r_dst);
   5023                put_ST_TAG ( r_dst, mkU8(0) );
   5024                break;
   5025 
   5026             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   5027                r_dst = (UInt)modrm - 0xD0;
   5028                DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
   5029                /* P4 manual says: "If the destination operand is a
   5030                   non-empty register, the invalid-operation exception
   5031                   is not generated."  Hence put_ST_UNCHECKED. */
   5032                put_ST_UNCHECKED(r_dst, get_ST(0));
   5033                break;
   5034 
   5035             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   5036                r_dst = (UInt)modrm - 0xD8;
   5037                DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
   5038                /* P4 manual says: "If the destination operand is a
   5039                   non-empty register, the invalid-operation exception
   5040                   is not generated."  Hence put_ST_UNCHECKED. */
   5041                put_ST_UNCHECKED(r_dst, get_ST(0));
   5042                fp_pop();
   5043                break;
   5044 
   5045             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   5046                r_dst = (UInt)modrm - 0xE0;
   5047                DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
   5048                /* This forces C1 to zero, which isn't right. */
   5049                put_C3210(
   5050                    binop( Iop_And32,
   5051                           binop(Iop_Shl32,
   5052                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5053                                 mkU8(8)),
   5054                           mkU32(0x4500)
   5055                    ));
   5056                break;
   5057 
   5058             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   5059                r_dst = (UInt)modrm - 0xE8;
   5060                DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
   5061                /* This forces C1 to zero, which isn't right. */
   5062                put_C3210(
   5063                    binop( Iop_And32,
   5064                           binop(Iop_Shl32,
   5065                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5066                                 mkU8(8)),
   5067                           mkU32(0x4500)
   5068                    ));
   5069                fp_pop();
   5070                break;
   5071 
   5072             default:
   5073                goto decode_fail;
   5074          }
   5075       }
   5076    }
   5077 
   5078    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   5079    else
   5080    if (first_opcode == 0xDE) {
   5081 
   5082       if (modrm < 0xC0) {
   5083 
   5084          /* bits 5,4,3 are an opcode extension, and the modRM also
   5085             specifies an address. */
   5086          IROp   fop;
   5087          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5088          delta += len;
   5089 
   5090          switch (gregOfRM(modrm)) {
   5091 
   5092             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   5093                DIP("fiaddw %s\n", dis_buf);
   5094                fop = Iop_AddF64;
   5095                goto do_fop_m16;
   5096 
   5097             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   5098                DIP("fimulw %s\n", dis_buf);
   5099                fop = Iop_MulF64;
   5100                goto do_fop_m16;
   5101 
   5102             case 2: /* FICOM m16int */
   5103                DIP("ficomw %s\n", dis_buf);
   5104                /* This forces C1 to zero, which isn't right. */
   5105                put_C3210(
   5106                    binop( Iop_And32,
   5107                           binop(Iop_Shl32,
   5108                                 binop(Iop_CmpF64,
   5109                                       get_ST(0),
   5110                                       unop(Iop_I32StoF64,
   5111                                          unop(Iop_16Sto32,
   5112                                            loadLE(Ity_I16,mkexpr(addr))))),
   5113                                 mkU8(8)),
   5114                           mkU32(0x4500)
   5115                    ));
   5116                break;
   5117 
   5118             case 3: /* FICOMP m16int */
   5119                DIP("ficompw %s\n", dis_buf);
   5120                /* This forces C1 to zero, which isn't right. */
   5121                put_C3210(
   5122                    binop( Iop_And32,
   5123                           binop(Iop_Shl32,
   5124                                 binop(Iop_CmpF64,
   5125                                       get_ST(0),
   5126                                       unop(Iop_I32StoF64,
   5127                                          unop(Iop_16Sto32,
   5128                                               loadLE(Ity_I16,mkexpr(addr))))),
   5129                                 mkU8(8)),
   5130                           mkU32(0x4500)
   5131                    ));
   5132                fp_pop();
   5133                break;
   5134 
   5135             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   5136                DIP("fisubw %s\n", dis_buf);
   5137                fop = Iop_SubF64;
   5138                goto do_fop_m16;
   5139 
   5140             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   5141                DIP("fisubrw %s\n", dis_buf);
   5142                fop = Iop_SubF64;
   5143                goto do_foprev_m16;
   5144 
   5145             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   5146                DIP("fisubw %s\n", dis_buf);
   5147                fop = Iop_DivF64;
   5148                goto do_fop_m16;
   5149 
   5150             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   5151                DIP("fidivrw %s\n", dis_buf);
   5152                fop = Iop_DivF64;
   5153                goto do_foprev_m16;
   5154 
   5155             do_fop_m16:
   5156                put_ST_UNCHECKED(0,
   5157                   triop(fop,
   5158                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5159                         get_ST(0),
   5160                         unop(Iop_I32StoF64,
   5161                              unop(Iop_16Sto32,
   5162                                   loadLE(Ity_I16, mkexpr(addr))))));
   5163                break;
   5164 
   5165             do_foprev_m16:
   5166                put_ST_UNCHECKED(0,
   5167                   triop(fop,
   5168                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5169                         unop(Iop_I32StoF64,
   5170                              unop(Iop_16Sto32,
   5171                                   loadLE(Ity_I16, mkexpr(addr)))),
   5172                         get_ST(0)));
   5173                break;
   5174 
   5175             default:
   5176                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5177                vex_printf("first_opcode == 0xDE\n");
   5178                goto decode_fail;
   5179          }
   5180 
   5181       } else {
   5182 
   5183          delta++;
   5184          switch (modrm) {
   5185 
   5186             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   5187                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   5188                break;
   5189 
   5190             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   5191                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   5192                break;
   5193 
   5194             case 0xD9: /* FCOMPP %st(0),%st(1) */
   5195                DIP("fuompp %%st(0),%%st(1)\n");
   5196                /* This forces C1 to zero, which isn't right. */
   5197                put_C3210(
   5198                    binop( Iop_And32,
   5199                           binop(Iop_Shl32,
   5200                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5201                                 mkU8(8)),
   5202                           mkU32(0x4500)
   5203                    ));
   5204                fp_pop();
   5205                fp_pop();
   5206                break;
   5207 
   5208             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   5209                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   5210                break;
   5211 
   5212             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   5213                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   5214                break;
   5215 
   5216             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   5217                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   5218                break;
   5219 
   5220             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   5221                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   5222                break;
   5223 
   5224             default:
   5225                goto decode_fail;
   5226          }
   5227 
   5228       }
   5229    }
   5230 
   5231    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   5232    else
   5233    if (first_opcode == 0xDF) {
   5234 
   5235       if (modrm < 0xC0) {
   5236 
   5237          /* bits 5,4,3 are an opcode extension, and the modRM also
   5238             specifies an address. */
   5239          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5240          delta += len;
   5241 
   5242          switch (gregOfRM(modrm)) {
   5243 
   5244             case 0: /* FILD m16int */
   5245                DIP("fildw %s\n", dis_buf);
   5246                fp_push();
   5247                put_ST(0, unop(Iop_I32StoF64,
   5248                               unop(Iop_16Sto32,
   5249                                    loadLE(Ity_I16, mkexpr(addr)))));
   5250                break;
   5251 
   5252             case 1: /* FISTTPS m16 (SSE3) */
   5253                DIP("fisttps %s\n", dis_buf);
   5254                storeLE( mkexpr(addr),
   5255                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
   5256                fp_pop();
   5257                break;
   5258 
   5259             case 2: /* FIST m16 */
   5260                DIP("fistp %s\n", dis_buf);
   5261                storeLE( mkexpr(addr),
   5262                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5263                break;
   5264 
   5265             case 3: /* FISTP m16 */
   5266                DIP("fistps %s\n", dis_buf);
   5267                storeLE( mkexpr(addr),
   5268                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5269                fp_pop();
   5270                break;
   5271 
   5272             case 5: /* FILD m64 */
   5273                DIP("fildll %s\n", dis_buf);
   5274                fp_push();
   5275                put_ST(0, binop(Iop_I64StoF64,
   5276                                get_roundingmode(),
   5277                                loadLE(Ity_I64, mkexpr(addr))));
   5278                break;
   5279 
   5280             case 7: /* FISTP m64 */
   5281                DIP("fistpll %s\n", dis_buf);
   5282                storeLE( mkexpr(addr),
   5283                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   5284                fp_pop();
   5285                break;
   5286 
   5287             default:
   5288                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5289                vex_printf("first_opcode == 0xDF\n");
   5290                goto decode_fail;
   5291          }
   5292 
   5293       } else {
   5294 
   5295          delta++;
   5296          switch (modrm) {
   5297 
   5298             case 0xC0: /* FFREEP %st(0) */
   5299                DIP("ffreep %%st(%d)\n", 0);
   5300                put_ST_TAG ( 0, mkU8(0) );
   5301                fp_pop();
   5302                break;
   5303 
   5304             case 0xE0: /* FNSTSW %ax */
   5305                DIP("fnstsw %%ax\n");
   5306                /* Get the FPU status word value and dump it in %AX. */
   5307                if (0) {
   5308                   /* The obvious thing to do is simply dump the 16-bit
   5309                      status word value in %AX.  However, due to a
   5310                      limitation in Memcheck's origin tracking
   5311                      machinery, this causes Memcheck not to track the
   5312                      origin of any undefinedness into %AH (only into
   5313                      %AL/%AX/%EAX), which means origins are lost in
   5314                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
   5315                   putIReg(2, R_EAX, get_FPU_sw());
   5316                } else {
   5317                   /* So a somewhat lame kludge is to make it very
   5318                      clear to Memcheck that the value is written to
   5319                      both %AH and %AL.  This generates marginally
   5320                      worse code, but I don't think it matters much. */
   5321                   IRTemp t16 = newTemp(Ity_I16);
   5322                   assign(t16, get_FPU_sw());
   5323                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
   5324                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
   5325                }
   5326                break;
   5327 
   5328             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   5329                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   5330                break;
   5331 
   5332             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   5333                /* not really right since COMIP != UCOMIP */
   5334                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   5335                break;
   5336 
   5337             default:
   5338                goto decode_fail;
   5339          }
   5340       }
   5341 
   5342    }
   5343 
   5344    else
   5345    vpanic("dis_FPU(x86): invalid primary opcode");
   5346 
   5347    *decode_ok = True;
   5348    return delta;
   5349 
   5350   decode_fail:
   5351    *decode_ok = False;
   5352    return delta;
   5353 }
   5354 
   5355 
   5356 /*------------------------------------------------------------*/
   5357 /*---                                                      ---*/
   5358 /*--- MMX INSTRUCTIONS                                     ---*/
   5359 /*---                                                      ---*/
   5360 /*------------------------------------------------------------*/
   5361 
   5362 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   5363    IA32 arch manual, volume 3):
   5364 
   5365    Read from, or write to MMX register (viz, any insn except EMMS):
   5366    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   5367    * FP stack pointer set to zero
   5368 
   5369    EMMS:
   5370    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   5371    * FP stack pointer set to zero
   5372 */
   5373 
   5374 static void do_MMX_preamble ( void )
   5375 {
   5376    Int         i;
   5377    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5378    IRExpr*     zero  = mkU32(0);
   5379    IRExpr*     tag1  = mkU8(1);
   5380    put_ftop(zero);
   5381    for (i = 0; i < 8; i++)
   5382       stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
   5383 }
   5384 
   5385 static void do_EMMS_preamble ( void )
   5386 {
   5387    Int         i;
   5388    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5389    IRExpr*     zero  = mkU32(0);
   5390    IRExpr*     tag0  = mkU8(0);
   5391    put_ftop(zero);
   5392    for (i = 0; i < 8; i++)
   5393       stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
   5394 }
   5395 
   5396 
   5397 static IRExpr* getMMXReg ( UInt archreg )
   5398 {
   5399    vassert(archreg < 8);
   5400    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   5401 }
   5402 
   5403 
   5404 static void putMMXReg ( UInt archreg, IRExpr* e )
   5405 {
   5406    vassert(archreg < 8);
   5407    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   5408    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   5409 }
   5410 
   5411 
   5412 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   5413    sense that it does not first call do_MMX_preamble() -- that is the
   5414    responsibility of its caller. */
   5415 
   5416 static
   5417 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
   5418                                Int    delta,
   5419                                UChar  opc,
   5420                                HChar* name,
   5421                                Bool   show_granularity )
   5422 {
   5423    HChar   dis_buf[50];
   5424    UChar   modrm = getIByte(delta);
   5425    Bool    isReg = epartIsReg(modrm);
   5426    IRExpr* argL  = NULL;
   5427    IRExpr* argR  = NULL;
   5428    IRExpr* argG  = NULL;
   5429    IRExpr* argE  = NULL;
   5430    IRTemp  res   = newTemp(Ity_I64);
   5431 
   5432    Bool    invG  = False;
   5433    IROp    op    = Iop_INVALID;
   5434    void*   hAddr = NULL;
   5435    HChar*  hName = NULL;
   5436    Bool    eLeft = False;
   5437 
   5438 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   5439 
   5440    switch (opc) {
   5441       /* Original MMX ones */
   5442       case 0xFC: op = Iop_Add8x8; break;
   5443       case 0xFD: op = Iop_Add16x4; break;
   5444       case 0xFE: op = Iop_Add32x2; break;
   5445 
   5446       case 0xEC: op = Iop_QAdd8Sx8; break;
   5447       case 0xED: op = Iop_QAdd16Sx4; break;
   5448 
   5449       case 0xDC: op = Iop_QAdd8Ux8; break;
   5450       case 0xDD: op = Iop_QAdd16Ux4; break;
   5451 
   5452       case 0xF8: op = Iop_Sub8x8;  break;
   5453       case 0xF9: op = Iop_Sub16x4; break;
   5454       case 0xFA: op = Iop_Sub32x2; break;
   5455 
   5456       case 0xE8: op = Iop_QSub8Sx8; break;
   5457       case 0xE9: op = Iop_QSub16Sx4; break;
   5458 
   5459       case 0xD8: op = Iop_QSub8Ux8; break;
   5460       case 0xD9: op = Iop_QSub16Ux4; break;
   5461 
   5462       case 0xE5: op = Iop_MulHi16Sx4; break;
   5463       case 0xD5: op = Iop_Mul16x4; break;
   5464       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
   5465 
   5466       case 0x74: op = Iop_CmpEQ8x8; break;
   5467       case 0x75: op = Iop_CmpEQ16x4; break;
   5468       case 0x76: op = Iop_CmpEQ32x2; break;
   5469 
   5470       case 0x64: op = Iop_CmpGT8Sx8; break;
   5471       case 0x65: op = Iop_CmpGT16Sx4; break;
   5472       case 0x66: op = Iop_CmpGT32Sx2; break;
   5473 
   5474       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   5475       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   5476       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   5477 
   5478       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   5479       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   5480       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   5481 
   5482       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   5483       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   5484       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   5485 
   5486       case 0xDB: op = Iop_And64; break;
   5487       case 0xDF: op = Iop_And64; invG = True; break;
   5488       case 0xEB: op = Iop_Or64; break;
   5489       case 0xEF: /* Possibly do better here if argL and argR are the
   5490                     same reg */
   5491                  op = Iop_Xor64; break;
   5492 
   5493       /* Introduced in SSE1 */
   5494       case 0xE0: op = Iop_Avg8Ux8;    break;
   5495       case 0xE3: op = Iop_Avg16Ux4;   break;
   5496       case 0xEE: op = Iop_Max16Sx4;   break;
   5497       case 0xDE: op = Iop_Max8Ux8;    break;
   5498       case 0xEA: op = Iop_Min16Sx4;   break;
   5499       case 0xDA: op = Iop_Min8Ux8;    break;
   5500       case 0xE4: op = Iop_MulHi16Ux4; break;
   5501       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
   5502 
   5503       /* Introduced in SSE2 */
   5504       case 0xD4: op = Iop_Add64; break;
   5505       case 0xFB: op = Iop_Sub64; break;
   5506 
   5507       default:
   5508          vex_printf("\n0x%x\n", (Int)opc);
   5509          vpanic("dis_MMXop_regmem_to_reg");
   5510    }
   5511 
   5512 #  undef XXX
   5513 
   5514    argG = getMMXReg(gregOfRM(modrm));
   5515    if (invG)
   5516       argG = unop(Iop_Not64, argG);
   5517 
   5518    if (isReg) {
   5519       delta++;
   5520       argE = getMMXReg(eregOfRM(modrm));
   5521    } else {
   5522       Int    len;
   5523       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5524       delta += len;
   5525       argE = loadLE(Ity_I64, mkexpr(addr));
   5526    }
   5527 
   5528    if (eLeft) {
   5529       argL = argE;
   5530       argR = argG;
   5531    } else {
   5532       argL = argG;
   5533       argR = argE;
   5534    }
   5535 
   5536    if (op != Iop_INVALID) {
   5537       vassert(hName == NULL);
   5538       vassert(hAddr == NULL);
   5539       assign(res, binop(op, argL, argR));
   5540    } else {
   5541       vassert(hName != NULL);
   5542       vassert(hAddr != NULL);
   5543       assign( res,
   5544               mkIRExprCCall(
   5545                  Ity_I64,
   5546                  0/*regparms*/, hName, hAddr,
   5547                  mkIRExprVec_2( argL, argR )
   5548               )
   5549             );
   5550    }
   5551 
   5552    putMMXReg( gregOfRM(modrm), mkexpr(res) );
   5553 
   5554    DIP("%s%s %s, %s\n",
   5555        name, show_granularity ? nameMMXGran(opc & 3) : "",
   5556        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
   5557        nameMMXReg(gregOfRM(modrm)) );
   5558 
   5559    return delta;
   5560 }
   5561 
   5562 
   5563 /* Vector by scalar shift of G by the amount specified at the bottom
   5564    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   5565 
   5566 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
   5567                                  HChar* opname, IROp op )
   5568 {
   5569    HChar   dis_buf[50];
   5570    Int     alen, size;
   5571    IRTemp  addr;
   5572    Bool    shl, shr, sar;
   5573    UChar   rm   = getIByte(delta);
   5574    IRTemp  g0   = newTemp(Ity_I64);
   5575    IRTemp  g1   = newTemp(Ity_I64);
   5576    IRTemp  amt  = newTemp(Ity_I32);
   5577    IRTemp  amt8 = newTemp(Ity_I8);
   5578 
   5579    if (epartIsReg(rm)) {
   5580       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
   5581       DIP("%s %s,%s\n", opname,
   5582                         nameMMXReg(eregOfRM(rm)),
   5583                         nameMMXReg(gregOfRM(rm)) );
   5584       delta++;
   5585    } else {
   5586       addr = disAMode ( &alen, sorb, delta, dis_buf );
   5587       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   5588       DIP("%s %s,%s\n", opname,
   5589                         dis_buf,
   5590                         nameMMXReg(gregOfRM(rm)) );
   5591       delta += alen;
   5592    }
   5593    assign( g0,   getMMXReg(gregOfRM(rm)) );
   5594    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   5595 
   5596    shl = shr = sar = False;
   5597    size = 0;
   5598    switch (op) {
   5599       case Iop_ShlN16x4: shl = True; size = 32; break;
   5600       case Iop_ShlN32x2: shl = True; size = 32; break;
   5601       case Iop_Shl64:    shl = True; size = 64; break;
   5602       case Iop_ShrN16x4: shr = True; size = 16; break;
   5603       case Iop_ShrN32x2: shr = True; size = 32; break;
   5604       case Iop_Shr64:    shr = True; size = 64; break;
   5605       case Iop_SarN16x4: sar = True; size = 16; break;
   5606       case Iop_SarN32x2: sar = True; size = 32; break;
   5607       default: vassert(0);
   5608    }
   5609 
   5610    if (shl || shr) {
   5611      assign(
   5612         g1,
   5613         IRExpr_Mux0X(
   5614            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   5615            mkU64(0),
   5616            binop(op, mkexpr(g0), mkexpr(amt8))
   5617         )
   5618      );
   5619    } else
   5620    if (sar) {
   5621      assign(
   5622         g1,
   5623         IRExpr_Mux0X(
   5624            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   5625            binop(op, mkexpr(g0), mkU8(size-1)),
   5626            binop(op, mkexpr(g0), mkexpr(amt8))
   5627         )
   5628      );
   5629    } else {
   5630       /*NOTREACHED*/
   5631       vassert(0);
   5632    }
   5633 
   5634    putMMXReg( gregOfRM(rm), mkexpr(g1) );
   5635    return delta;
   5636 }
   5637 
   5638 
   5639 /* Vector by scalar shift of E by an immediate byte.  This is a
   5640    straight copy of dis_SSE_shiftE_imm. */
   5641 
   5642 static
   5643 UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
   5644 {
   5645    Bool    shl, shr, sar;
   5646    UChar   rm   = getIByte(delta);
   5647    IRTemp  e0   = newTemp(Ity_I64);
   5648    IRTemp  e1   = newTemp(Ity_I64);
   5649    UChar   amt, size;
   5650    vassert(epartIsReg(rm));
   5651    vassert(gregOfRM(rm) == 2
   5652            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   5653    amt = getIByte(delta+1);
   5654    delta += 2;
   5655    DIP("%s $%d,%s\n", opname,
   5656                       (Int)amt,
   5657                       nameMMXReg(eregOfRM(rm)) );
   5658 
   5659    assign( e0, getMMXReg(eregOfRM(rm)) );
   5660 
   5661    shl = shr = sar = False;
   5662    size = 0;
   5663    switch (op) {
   5664       case Iop_ShlN16x4: shl = True; size = 16; break;
   5665       case Iop_ShlN32x2: shl = True; size = 32; break;
   5666       case Iop_Shl64:    shl = True; size = 64; break;
   5667       case Iop_SarN16x4: sar = True; size = 16; break;
   5668       case Iop_SarN32x2: sar = True; size = 32; break;
   5669       case Iop_ShrN16x4: shr = True; size = 16; break;
   5670       case Iop_ShrN32x2: shr = True; size = 32; break;
   5671       case Iop_Shr64:    shr = True; size = 64; break;
   5672       default: vassert(0);
   5673    }
   5674 
   5675    if (shl || shr) {
   5676       assign( e1, amt >= size
   5677                      ? mkU64(0)
   5678                      : binop(op, mkexpr(e0), mkU8(amt))
   5679       );
   5680    } else
   5681    if (sar) {
   5682       assign( e1, amt >= size
   5683                      ? binop(op, mkexpr(e0), mkU8(size-1))
   5684                      : binop(op, mkexpr(e0), mkU8(amt))
   5685       );
   5686    } else {
   5687       /*NOTREACHED*/
   5688       vassert(0);
   5689    }
   5690 
   5691    putMMXReg( eregOfRM(rm), mkexpr(e1) );
   5692    return delta;
   5693 }
   5694 
   5695 
   5696 /* Completely handle all MMX instructions except emms. */
   5697 
   5698 static
   5699 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
   5700 {
   5701    Int   len;
   5702    UChar modrm;
   5703    HChar dis_buf[50];
   5704    UChar opc = getIByte(delta);
   5705    delta++;
   5706 
   5707    /* dis_MMX handles all insns except emms. */
   5708    do_MMX_preamble();
   5709 
   5710    switch (opc) {
   5711 
   5712       case 0x6E:
   5713          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
   5714          if (sz != 4)
   5715             goto mmx_decode_failure;
   5716          modrm = getIByte(delta);
   5717          if (epartIsReg(modrm)) {
   5718             delta++;
   5719             putMMXReg(
   5720                gregOfRM(modrm),
   5721                binop( Iop_32HLto64,
   5722                       mkU32(0),
   5723                       getIReg(4, eregOfRM(modrm)) ) );
   5724             DIP("movd %s, %s\n",
   5725                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5726          } else {
   5727             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5728             delta += len;
   5729             putMMXReg(
   5730                gregOfRM(modrm),
   5731                binop( Iop_32HLto64,
   5732                       mkU32(0),
   5733                       loadLE(Ity_I32, mkexpr(addr)) ) );
   5734             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
   5735          }
   5736          break;
   5737 
   5738       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
   5739          if (sz != 4)
   5740             goto mmx_decode_failure;
   5741          modrm = getIByte(delta);
   5742          if (epartIsReg(modrm)) {
   5743             delta++;
   5744             putIReg( 4, eregOfRM(modrm),
   5745                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5746             DIP("movd %s, %s\n",
   5747                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   5748          } else {
   5749             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5750             delta += len;
   5751             storeLE( mkexpr(addr),
   5752                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5753             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
   5754          }
   5755          break;
   5756 
   5757       case 0x6F:
   5758          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   5759          if (sz != 4)
   5760             goto mmx_decode_failure;
   5761          modrm = getIByte(delta);
   5762          if (epartIsReg(modrm)) {
   5763             delta++;
   5764             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
   5765             DIP("movq %s, %s\n",
   5766                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5767          } else {
   5768             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5769             delta += len;
   5770             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   5771             DIP("movq %s, %s\n",
   5772                 dis_buf, nameMMXReg(gregOfRM(modrm)));
   5773          }
   5774          break;
   5775 
   5776       case 0x7F:
   5777          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   5778          if (sz != 4)
   5779             goto mmx_decode_failure;
   5780          modrm = getIByte(delta);
   5781          if (epartIsReg(modrm)) {
   5782             delta++;
   5783             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
   5784             DIP("movq %s, %s\n",
   5785                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
   5786          } else {
   5787             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5788             delta += len;
   5789             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   5790             DIP("mov(nt)q %s, %s\n",
   5791                 nameMMXReg(gregOfRM(modrm)), dis_buf);
   5792          }
   5793          break;
   5794 
   5795       case 0xFC:
   5796       case 0xFD:
   5797       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   5798          if (sz != 4)
   5799             goto mmx_decode_failure;
   5800          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
   5801          break;
   5802 
   5803       case 0xEC:
   5804       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5805          if (sz != 4)
   5806             goto mmx_decode_failure;
   5807          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
   5808          break;
   5809 
   5810       case 0xDC:
   5811       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5812          if (sz != 4)
   5813             goto mmx_decode_failure;
   5814          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
   5815          break;
   5816 
   5817       case 0xF8:
   5818       case 0xF9:
   5819       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   5820          if (sz != 4)
   5821             goto mmx_decode_failure;
   5822          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
   5823          break;
   5824 
   5825       case 0xE8:
   5826       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5827          if (sz != 4)
   5828             goto mmx_decode_failure;
   5829          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
   5830          break;
   5831 
   5832       case 0xD8:
   5833       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5834          if (sz != 4)
   5835             goto mmx_decode_failure;
   5836          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
   5837          break;
   5838 
   5839       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   5840          if (sz != 4)
   5841             goto mmx_decode_failure;
   5842          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
   5843          break;
   5844 
   5845       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   5846          if (sz != 4)
   5847             goto mmx_decode_failure;
   5848          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
   5849          break;
   5850 
   5851       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   5852          vassert(sz == 4);
   5853          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
   5854          break;
   5855 
   5856       case 0x74:
   5857       case 0x75:
   5858       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   5859          if (sz != 4)
   5860             goto mmx_decode_failure;
   5861          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
   5862          break;
   5863 
   5864       case 0x64:
   5865       case 0x65:
   5866       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   5867          if (sz != 4)
   5868             goto mmx_decode_failure;
   5869          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
   5870          break;
   5871 
   5872       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   5873          if (sz != 4)
   5874             goto mmx_decode_failure;
   5875          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
   5876          break;
   5877 
   5878       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   5879          if (sz != 4)
   5880             goto mmx_decode_failure;
   5881          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
   5882          break;
   5883 
   5884       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   5885          if (sz != 4)
   5886             goto mmx_decode_failure;
   5887          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
   5888          break;
   5889 
   5890       case 0x68:
   5891       case 0x69:
   5892       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   5893          if (sz != 4)
   5894             goto mmx_decode_failure;
   5895          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
   5896          break;
   5897 
   5898       case 0x60:
   5899       case 0x61:
   5900       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5901          if (sz != 4)
   5902             goto mmx_decode_failure;
   5903          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
   5904          break;
   5905 
   5906       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   5907          if (sz != 4)
   5908             goto mmx_decode_failure;
   5909          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
   5910          break;
   5911 
   5912       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   5913          if (sz != 4)
   5914             goto mmx_decode_failure;
   5915          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
   5916          break;
   5917 
   5918       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   5919          if (sz != 4)
   5920             goto mmx_decode_failure;
   5921          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
   5922          break;
   5923 
   5924       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   5925          if (sz != 4)
   5926             goto mmx_decode_failure;
   5927          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
   5928          break;
   5929 
   5930 #     define SHIFT_BY_REG(_name,_op)                                 \
   5931                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
   5932                 break;
   5933 
   5934       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5935       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   5936       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   5937       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   5938 
   5939       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5940       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   5941       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   5942       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   5943 
   5944       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   5945       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   5946       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   5947 
   5948 #     undef SHIFT_BY_REG
   5949 
   5950       case 0x71:
   5951       case 0x72:
   5952       case 0x73: {
   5953          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   5954          UChar byte2, subopc;
   5955          if (sz != 4)
   5956             goto mmx_decode_failure;
   5957          byte2  = getIByte(delta);           /* amode / sub-opcode */
   5958          subopc = toUChar( (byte2 >> 3) & 7 );
   5959 
   5960 #        define SHIFT_BY_IMM(_name,_op)                         \
   5961              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   5962              } while (0)
   5963 
   5964               if (subopc == 2 /*SRL*/ && opc == 0x71)
   5965                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   5966          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   5967                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   5968          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   5969                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   5970 
   5971          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   5972                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   5973          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   5974                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   5975 
   5976          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   5977                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   5978          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   5979                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   5980          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   5981                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   5982 
   5983          else goto mmx_decode_failure;
   5984 
   5985 #        undef SHIFT_BY_IMM
   5986          break;
   5987       }
   5988 
   5989       case 0xF7: {
   5990          IRTemp addr    = newTemp(Ity_I32);
   5991          IRTemp regD    = newTemp(Ity_I64);
   5992          IRTemp regM    = newTemp(Ity_I64);
   5993          IRTemp mask    = newTemp(Ity_I64);
   5994          IRTemp olddata = newTemp(Ity_I64);
   5995          IRTemp newdata = newTemp(Ity_I64);
   5996 
   5997          modrm = getIByte(delta);
   5998          if (sz != 4 || (!epartIsReg(modrm)))
   5999             goto mmx_decode_failure;
   6000          delta++;
   6001 
   6002          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   6003          assign( regM, getMMXReg( eregOfRM(modrm) ));
   6004          assign( regD, getMMXReg( gregOfRM(modrm) ));
   6005          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   6006          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   6007          assign( newdata,
   6008                  binop(Iop_Or64,
   6009                        binop(Iop_And64,
   6010                              mkexpr(regD),
   6011                              mkexpr(mask) ),
   6012                        binop(Iop_And64,
   6013                              mkexpr(olddata),
   6014                              unop(Iop_Not64, mkexpr(mask)))) );
   6015          storeLE( mkexpr(addr), mkexpr(newdata) );
   6016          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
   6017                                  nameMMXReg( gregOfRM(modrm) ) );
   6018          break;
   6019       }
   6020 
   6021       /* --- MMX decode failure --- */
   6022       default:
   6023       mmx_decode_failure:
   6024          *decode_ok = False;
   6025          return delta; /* ignored */
   6026 
   6027    }
   6028 
   6029    *decode_ok = True;
   6030    return delta;
   6031 }
   6032 
   6033 
   6034 /*------------------------------------------------------------*/
   6035 /*--- More misc arithmetic and other obscure insns.        ---*/
   6036 /*------------------------------------------------------------*/
   6037 
   6038 /* Double length left and right shifts.  Apparently only required in
   6039    v-size (no b- variant). */
   6040 static
   6041 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
   6042                        Int delta, UChar modrm,
   6043                        Int sz,
   6044                        IRExpr* shift_amt,
   6045                        Bool amt_is_literal,
   6046                        HChar* shift_amt_txt,
   6047                        Bool left_shift )
   6048 {
   6049    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   6050       for printing it.   And eip on entry points at the modrm byte. */
   6051    Int len;
   6052    HChar dis_buf[50];
   6053 
   6054    IRType ty       = szToITy(sz);
   6055    IRTemp gsrc     = newTemp(ty);
   6056    IRTemp esrc     = newTemp(ty);
   6057    IRTemp addr     = IRTemp_INVALID;
   6058    IRTemp tmpSH    = newTemp(Ity_I8);
   6059    IRTemp tmpL     = IRTemp_INVALID;
   6060    IRTemp tmpRes   = IRTemp_INVALID;
   6061    IRTemp tmpSubSh = IRTemp_INVALID;
   6062    IROp   mkpair;
   6063    IROp   getres;
   6064    IROp   shift;
   6065    IRExpr* mask = NULL;
   6066 
   6067    vassert(sz == 2 || sz == 4);
   6068 
   6069    /* The E-part is the destination; this is shifted.  The G-part
   6070       supplies bits to be shifted into the E-part, but is not
   6071       changed.
   6072 
   6073       If shifting left, form a double-length word with E at the top
   6074       and G at the bottom, and shift this left.  The result is then in
   6075       the high part.
   6076 
   6077       If shifting right, form a double-length word with G at the top
   6078       and E at the bottom, and shift this right.  The result is then
   6079       at the bottom.  */
   6080 
   6081    /* Fetch the operands. */
   6082 
   6083    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
   6084 
   6085    if (epartIsReg(modrm)) {
   6086       delta++;
   6087       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
   6088       DIP("sh%cd%c %s, %s, %s\n",
   6089           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6090           shift_amt_txt,
   6091           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   6092    } else {
   6093       addr = disAMode ( &len, sorb, delta, dis_buf );
   6094       delta += len;
   6095       assign( esrc, loadLE(ty, mkexpr(addr)) );
   6096       DIP("sh%cd%c %s, %s, %s\n",
   6097           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6098           shift_amt_txt,
   6099           nameIReg(sz, gregOfRM(modrm)), dis_buf);
   6100    }
   6101 
   6102    /* Round up the relevant primops. */
   6103 
   6104    if (sz == 4) {
   6105       tmpL     = newTemp(Ity_I64);
   6106       tmpRes   = newTemp(Ity_I32);
   6107       tmpSubSh = newTemp(Ity_I32);
   6108       mkpair   = Iop_32HLto64;
   6109       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
   6110       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
   6111       mask     = mkU8(31);
   6112    } else {
   6113       /* sz == 2 */
   6114       tmpL     = newTemp(Ity_I32);
   6115       tmpRes   = newTemp(Ity_I16);
   6116       tmpSubSh = newTemp(Ity_I16);
   6117       mkpair   = Iop_16HLto32;
   6118       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
   6119       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
   6120       mask     = mkU8(15);
   6121    }
   6122 
   6123    /* Do the shift, calculate the subshift value, and set
   6124       the flag thunk. */
   6125 
   6126    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
   6127 
   6128    if (left_shift)
   6129       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   6130    else
   6131       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
   6132 
   6133    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
   6134    assign( tmpSubSh,
   6135            unop(getres,
   6136                 binop(shift,
   6137                       mkexpr(tmpL),
   6138                       binop(Iop_And8,
   6139                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   6140                             mask))) );
   6141 
   6142    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
   6143                               tmpRes, tmpSubSh, ty, tmpSH );
   6144 
   6145    /* Put result back. */
   6146 
   6147    if (epartIsReg(modrm)) {
   6148       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   6149    } else {
   6150       storeLE( mkexpr(addr), mkexpr(tmpRes) );
   6151    }
   6152 
   6153    if (amt_is_literal) delta++;
   6154    return delta;
   6155 }
   6156 
   6157 
   6158 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   6159    required. */
   6160 
   6161 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   6162 
   6163 static HChar* nameBtOp ( BtOp op )
   6164 {
   6165    switch (op) {
   6166       case BtOpNone:  return "";
   6167       case BtOpSet:   return "s";
   6168       case BtOpReset: return "r";
   6169       case BtOpComp:  return "c";
   6170       default: vpanic("nameBtOp(x86)");
   6171    }
   6172 }
   6173 
   6174 
   6175 static
   6176 UInt dis_bt_G_E ( VexAbiInfo* vbi,
   6177                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
   6178 {
   6179    HChar  dis_buf[50];
   6180    UChar  modrm;
   6181    Int    len;
   6182    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   6183           t_addr1, t_esp, t_mask, t_new;
   6184 
   6185    vassert(sz == 2 || sz == 4);
   6186 
   6187    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   6188              = t_addr0 = t_addr1 = t_esp
   6189              = t_mask = t_new = IRTemp_INVALID;
   6190 
   6191    t_fetched = newTemp(Ity_I8);
   6192    t_new     = newTemp(Ity_I8);
   6193    t_bitno0  = newTemp(Ity_I32);
   6194    t_bitno1  = newTemp(Ity_I32);
   6195    t_bitno2  = newTemp(Ity_I8);
   6196    t_addr1   = newTemp(Ity_I32);
   6197    modrm     = getIByte(delta);
   6198 
   6199    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
   6200 
   6201    if (epartIsReg(modrm)) {
   6202       delta++;
   6203       /* Get it onto the client's stack. */
   6204       t_esp = newTemp(Ity_I32);
   6205       t_addr0 = newTemp(Ity_I32);
   6206 
   6207       /* For the choice of the value 128, see comment in dis_bt_G_E in
   6208          guest_amd64_toIR.c.  We point out here only that 128 is
   6209          fast-cased in Memcheck and is > 0, so seems like a good
   6210          choice. */
   6211       vassert(vbi->guest_stack_redzone_size == 0);
   6212       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
   6213       putIReg(4, R_ESP, mkexpr(t_esp));
   6214 
   6215       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
   6216 
   6217       /* Make t_addr0 point at it. */
   6218       assign( t_addr0, mkexpr(t_esp) );
   6219 
   6220       /* Mask out upper bits of the shift amount, since we're doing a
   6221          reg. */
   6222       assign( t_bitno1, binop(Iop_And32,
   6223                               mkexpr(t_bitno0),
   6224                               mkU32(sz == 4 ? 31 : 15)) );
   6225 
   6226    } else {
   6227       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
   6228       delta += len;
   6229       assign( t_bitno1, mkexpr(t_bitno0) );
   6230    }
   6231 
   6232    /* At this point: t_addr0 is the address being operated on.  If it
   6233       was a reg, we will have pushed it onto the client's stack.
   6234       t_bitno1 is the bit number, suitably masked in the case of a
   6235       reg.  */
   6236 
   6237    /* Now the main sequence. */
   6238    assign( t_addr1,
   6239            binop(Iop_Add32,
   6240                  mkexpr(t_addr0),
   6241                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
   6242 
   6243    /* t_addr1 now holds effective address */
   6244 
   6245    assign( t_bitno2,
   6246            unop(Iop_32to8,
   6247                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
   6248 
   6249    /* t_bitno2 contains offset of bit within byte */
   6250 
   6251    if (op != BtOpNone) {
   6252       t_mask = newTemp(Ity_I8);
   6253       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   6254    }
   6255 
   6256    /* t_mask is now a suitable byte mask */
   6257 
   6258    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   6259 
   6260    if (op != BtOpNone) {
   6261       switch (op) {
   6262          case BtOpSet:
   6263             assign( t_new,
   6264                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6265             break;
   6266          case BtOpComp:
   6267             assign( t_new,
   6268                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6269             break;
   6270          case BtOpReset:
   6271             assign( t_new,
   6272                     binop(Iop_And8, mkexpr(t_fetched),
   6273                                     unop(Iop_Not8, mkexpr(t_mask))) );
   6274             break;
   6275          default:
   6276             vpanic("dis_bt_G_E(x86)");
   6277       }
   6278       if (locked && !epartIsReg(modrm)) {
   6279          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   6280                                  mkexpr(t_new)/*new*/,
   6281                                  guest_EIP_curr_instr );
   6282       } else {
   6283          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   6284       }
   6285    }
   6286 
   6287    /* Side effect done; now get selected bit into Carry flag */
   6288    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   6289    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6290    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6291    stmt( IRStmt_Put(
   6292             OFFB_CC_DEP1,
   6293             binop(Iop_And32,
   6294                   binop(Iop_Shr32,
   6295                         unop(Iop_8Uto32, mkexpr(t_fetched)),
   6296                         mkexpr(t_bitno2)),
   6297                   mkU32(1)))
   6298        );
   6299    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6300       elimination of previous stores to this field work better. */
   6301    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6302 
   6303    /* Move reg operand from stack back to reg */
   6304    if (epartIsReg(modrm)) {
   6305       /* t_esp still points at it. */
   6306       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
   6307       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   6308    }
   6309 
   6310    DIP("bt%s%c %s, %s\n",
   6311        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
   6312        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
   6313 
   6314    return delta;
   6315 }
   6316 
   6317 
   6318 
   6319 /* Handle BSF/BSR.  Only v-size seems necessary. */
   6320 static
   6321 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
   6322 {
   6323    Bool   isReg;
   6324    UChar  modrm;
   6325    HChar  dis_buf[50];
   6326 
   6327    IRType ty  = szToITy(sz);
   6328    IRTemp src = newTemp(ty);
   6329    IRTemp dst = newTemp(ty);
   6330 
   6331    IRTemp src32 = newTemp(Ity_I32);
   6332    IRTemp dst32 = newTemp(Ity_I32);
   6333    IRTemp src8  = newTemp(Ity_I8);
   6334 
   6335    vassert(sz == 4 || sz == 2);
   6336 
   6337    modrm = getIByte(delta);
   6338 
   6339    isReg = epartIsReg(modrm);
   6340    if (isReg) {
   6341       delta++;
   6342       assign( src, getIReg(sz, eregOfRM(modrm)) );
   6343    } else {
   6344       Int    len;
   6345       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   6346       delta += len;
   6347       assign( src, loadLE(ty, mkexpr(addr)) );
   6348    }
   6349 
   6350    DIP("bs%c%c %s, %s\n",
   6351        fwds ? 'f' : 'r', nameISize(sz),
   6352        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
   6353        nameIReg(sz, gregOfRM(modrm)));
   6354 
   6355    /* Generate an 8-bit expression which is zero iff the
   6356       original is zero, and nonzero otherwise */
   6357    assign( src8,
   6358            unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
   6359                            mkexpr(src), mkU(ty,0))) );
   6360 
   6361    /* Flags: Z is 1 iff source value is zero.  All others
   6362       are undefined -- we force them to zero. */
   6363    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6364    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6365    stmt( IRStmt_Put(
   6366             OFFB_CC_DEP1,
   6367             IRExpr_Mux0X( mkexpr(src8),
   6368                           /* src==0 */
   6369                           mkU32(X86G_CC_MASK_Z),
   6370                           /* src!=0 */
   6371                           mkU32(0)
   6372                         )
   6373        ));
   6374    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6375       elimination of previous stores to this field work better. */
   6376    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6377 
   6378    /* Result: iff source value is zero, we can't use
   6379       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
   6380       But anyway, Intel x86 semantics say the result is undefined in
   6381       such situations.  Hence handle the zero case specially. */
   6382 
   6383    /* Bleh.  What we compute:
   6384 
   6385           bsf32:  if src == 0 then 0 else  Ctz32(src)
   6386           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
   6387 
   6388           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
   6389           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
   6390 
   6391       First, widen src to 32 bits if it is not already.
   6392 
   6393       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
   6394       dst register unchanged when src == 0.  Hence change accordingly.
   6395    */
   6396    if (sz == 2)
   6397       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   6398    else
   6399       assign( src32, mkexpr(src) );
   6400 
   6401    /* The main computation, guarding against zero. */
   6402    assign( dst32,
   6403            IRExpr_Mux0X(
   6404               mkexpr(src8),
   6405               /* src == 0 -- leave dst unchanged */
   6406               widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
   6407               /* src != 0 */
   6408               fwds ? unop(Iop_Ctz32, mkexpr(src32))
   6409                    : binop(Iop_Sub32,
   6410                            mkU32(31),
   6411                            unop(Iop_Clz32, mkexpr(src32)))
   6412            )
   6413          );
   6414 
   6415    if (sz == 2)
   6416       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   6417    else
   6418       assign( dst, mkexpr(dst32) );
   6419 
   6420    /* dump result back */
   6421    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
   6422 
   6423    return delta;
   6424 }
   6425 
   6426 
   6427 static
   6428 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
   6429 {
   6430    IRType ty = szToITy(sz);
   6431    IRTemp t1 = newTemp(ty);
   6432    IRTemp t2 = newTemp(ty);
   6433    vassert(sz == 2 || sz == 4);
   6434    assign( t1, getIReg(sz, R_EAX) );
   6435    assign( t2, getIReg(sz, reg) );
   6436    putIReg( sz, R_EAX, mkexpr(t2) );
   6437    putIReg( sz, reg, mkexpr(t1) );
   6438    DIP("xchg%c %s, %s\n",
   6439        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
   6440 }
   6441 
   6442 
   6443 static
   6444 void codegen_SAHF ( void )
   6445 {
   6446    /* Set the flags to:
   6447       (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
   6448       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6449                 |X86G_CC_MASK_P|X86G_CC_MASK_C)
   6450    */
   6451    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6452                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6453    IRTemp oldflags   = newTemp(Ity_I32);
   6454    assign( oldflags, mk_x86g_calculate_eflags_all() );
   6455    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6456    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6457    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6458    stmt( IRStmt_Put( OFFB_CC_DEP1,
   6459          binop(Iop_Or32,
   6460                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
   6461                binop(Iop_And32,
   6462                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
   6463                      mkU32(mask_SZACP))
   6464               )
   6465    ));
   6466    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6467       elimination of previous stores to this field work better. */
   6468    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6469 }
   6470 
   6471 
   6472 static
   6473 void codegen_LAHF ( void  )
   6474 {
   6475    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   6476    IRExpr* eax_with_hole;
   6477    IRExpr* new_byte;
   6478    IRExpr* new_eax;
   6479    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6480                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6481 
   6482    IRTemp  flags = newTemp(Ity_I32);
   6483    assign( flags, mk_x86g_calculate_eflags_all() );
   6484 
   6485    eax_with_hole
   6486       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
   6487    new_byte
   6488       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
   6489                         mkU32(1<<1));
   6490    new_eax
   6491       = binop(Iop_Or32, eax_with_hole,
   6492                         binop(Iop_Shl32, new_byte, mkU8(8)));
   6493    putIReg(4, R_EAX, new_eax);
   6494 }
   6495 
   6496 
   6497 static
   6498 UInt dis_cmpxchg_G_E ( UChar       sorb,
   6499                        Bool        locked,
   6500                        Int         size,
   6501                        Int         delta0 )
   6502 {
   6503    HChar dis_buf[50];
   6504    Int   len;
   6505 
   6506    IRType ty    = szToITy(size);
   6507    IRTemp acc   = newTemp(ty);
   6508    IRTemp src   = newTemp(ty);
   6509    IRTemp dest  = newTemp(ty);
   6510    IRTemp dest2 = newTemp(ty);
   6511    IRTemp acc2  = newTemp(ty);
   6512    IRTemp cond8 = newTemp(Ity_I8);
   6513    IRTemp addr  = IRTemp_INVALID;
   6514    UChar  rm    = getUChar(delta0);
   6515 
   6516    /* There are 3 cases to consider:
   6517 
   6518       reg-reg: ignore any lock prefix, generate sequence based
   6519                on Mux0X
   6520 
   6521       reg-mem, not locked: ignore any lock prefix, generate sequence
   6522                            based on Mux0X
   6523 
   6524       reg-mem, locked: use IRCAS
   6525    */
   6526    if (epartIsReg(rm)) {
   6527       /* case 1 */
   6528       assign( dest, getIReg(size, eregOfRM(rm)) );
   6529       delta0++;
   6530       assign( src, getIReg(size, gregOfRM(rm)) );
   6531       assign( acc, getIReg(size, R_EAX) );
   6532       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6533       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6534       assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
   6535       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6536       putIReg(size, R_EAX, mkexpr(acc2));
   6537       putIReg(size, eregOfRM(rm), mkexpr(dest2));
   6538       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6539                                nameIReg(size,gregOfRM(rm)),
   6540                                nameIReg(size,eregOfRM(rm)) );
   6541    }
   6542    else if (!epartIsReg(rm) && !locked) {
   6543       /* case 2 */
   6544       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6545       assign( dest, loadLE(ty, mkexpr(addr)) );
   6546       delta0 += len;
   6547       assign( src, getIReg(size, gregOfRM(rm)) );
   6548       assign( acc, getIReg(size, R_EAX) );
   6549       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6550       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6551       assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
   6552       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6553       putIReg(size, R_EAX, mkexpr(acc2));
   6554       storeLE( mkexpr(addr), mkexpr(dest2) );
   6555       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6556                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6557    }
   6558    else if (!epartIsReg(rm) && locked) {
   6559       /* case 3 */
   6560       /* src is new value.  acc is expected value.  dest is old value.
   6561          Compute success from the output of the IRCAS, and steer the
   6562          new value for EAX accordingly: in case of success, EAX is
   6563          unchanged. */
   6564       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6565       delta0 += len;
   6566       assign( src, getIReg(size, gregOfRM(rm)) );
   6567       assign( acc, getIReg(size, R_EAX) );
   6568       stmt( IRStmt_CAS(
   6569          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   6570                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   6571       ));
   6572       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6573       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6574       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6575       putIReg(size, R_EAX, mkexpr(acc2));
   6576       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6577                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6578    }
   6579    else vassert(0);
   6580 
   6581    return delta0;
   6582 }
   6583 
   6584 
   6585 /* Handle conditional move instructions of the form
   6586       cmovcc E(reg-or-mem), G(reg)
   6587 
   6588    E(src) is reg-or-mem
   6589    G(dst) is reg.
   6590 
   6591    If E is reg, -->    GET %E, tmps
   6592                        GET %G, tmpd
   6593                        CMOVcc tmps, tmpd
   6594                        PUT tmpd, %G
   6595 
   6596    If E is mem  -->    (getAddr E) -> tmpa
   6597                        LD (tmpa), tmps
   6598                        GET %G, tmpd
   6599                        CMOVcc tmps, tmpd
   6600                        PUT tmpd, %G
   6601 */
   6602 static
   6603 UInt dis_cmov_E_G ( UChar       sorb,
   6604                     Int         sz,
   6605                     X86Condcode cond,
   6606                     Int         delta0 )
   6607 {
   6608    UChar rm  = getIByte(delta0);
   6609    HChar dis_buf[50];
   6610    Int   len;
   6611 
   6612    IRType ty   = szToITy(sz);
   6613    IRTemp tmps = newTemp(ty);
   6614    IRTemp tmpd = newTemp(ty);
   6615 
   6616    if (epartIsReg(rm)) {
   6617       assign( tmps, getIReg(sz, eregOfRM(rm)) );
   6618       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6619 
   6620       putIReg(sz, gregOfRM(rm),
   6621                   IRExpr_Mux0X( unop(Iop_1Uto8,
   6622                                      mk_x86g_calculate_condition(cond)),
   6623                                 mkexpr(tmpd),
   6624                                 mkexpr(tmps) )
   6625              );
   6626       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6627                               name_X86Condcode(cond),
   6628                               nameIReg(sz,eregOfRM(rm)),
   6629                               nameIReg(sz,gregOfRM(rm)));
   6630       return 1+delta0;
   6631    }
   6632 
   6633    /* E refers to memory */
   6634    {
   6635       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6636       assign( tmps, loadLE(ty, mkexpr(addr)) );
   6637       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6638 
   6639       putIReg(sz, gregOfRM(rm),
   6640                   IRExpr_Mux0X( unop(Iop_1Uto8,
   6641                                      mk_x86g_calculate_condition(cond)),
   6642                                 mkexpr(tmpd),
   6643                                 mkexpr(tmps) )
   6644              );
   6645 
   6646       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6647                               name_X86Condcode(cond),
   6648                               dis_buf,
   6649                               nameIReg(sz,gregOfRM(rm)));
   6650       return len+delta0;
   6651    }
   6652 }
   6653 
   6654 
   6655 static
   6656 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
   6657                     Bool* decodeOK )
   6658 {
   6659    Int   len;
   6660    UChar rm = getIByte(delta0);
   6661    HChar dis_buf[50];
   6662 
   6663    IRType ty    = szToITy(sz);
   6664    IRTemp tmpd  = newTemp(ty);
   6665    IRTemp tmpt0 = newTemp(ty);
   6666    IRTemp tmpt1 = newTemp(ty);
   6667 
   6668    /* There are 3 cases to consider:
   6669 
   6670       reg-reg: ignore any lock prefix,
   6671                generate 'naive' (non-atomic) sequence
   6672 
   6673       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   6674                            (non-atomic) sequence
   6675 
   6676       reg-mem, locked: use IRCAS
   6677    */
   6678 
   6679    if (epartIsReg(rm)) {
   6680       /* case 1 */
   6681       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
   6682       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6683       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6684                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6685       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6686       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
   6687       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6688       DIP("xadd%c %s, %s\n",
   6689           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
   6690           				 nameIReg(sz,eregOfRM(rm)));
   6691       *decodeOK = True;
   6692       return 1+delta0;
   6693    }
   6694    else if (!epartIsReg(rm) && !locked) {
   6695       /* case 2 */
   6696       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6697       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6698       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6699       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6700                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6701       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   6702       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6703       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6704       DIP("xadd%c %s, %s\n",
   6705           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6706       *decodeOK = True;
   6707       return len+delta0;
   6708    }
   6709    else if (!epartIsReg(rm) && locked) {
   6710       /* case 3 */
   6711       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6712       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6713       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6714       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6715                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6716       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   6717                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
   6718       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6719       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6720       DIP("xadd%c %s, %s\n",
   6721           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6722       *decodeOK = True;
   6723       return len+delta0;
   6724    }
   6725    /*UNREACHED*/
   6726    vassert(0);
   6727 }
   6728 
   6729 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   6730 
   6731 static
   6732 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
   6733 {
   6734    Int    len;
   6735    IRTemp addr;
   6736    UChar  rm  = getIByte(delta0);
   6737    HChar  dis_buf[50];
   6738 
   6739    if (epartIsReg(rm)) {
   6740       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   6741       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   6742       return 1+delta0;
   6743    } else {
   6744       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6745       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   6746       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   6747       return len+delta0;
   6748    }
   6749 }
   6750 
   6751 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   6752    dst is ireg and sz==4, zero out top half of it.  */
   6753 
   6754 static
   6755 UInt dis_mov_Sw_Ew ( UChar sorb,
   6756                      Int   sz,
   6757                      Int   delta0 )
   6758 {
   6759    Int    len;
   6760    IRTemp addr;
   6761    UChar  rm  = getIByte(delta0);
   6762    HChar  dis_buf[50];
   6763 
   6764    vassert(sz == 2 || sz == 4);
   6765 
   6766    if (epartIsReg(rm)) {
   6767       if (sz == 4)
   6768          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   6769       else
   6770          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   6771 
   6772       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   6773       return 1+delta0;
   6774    } else {
   6775       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6776       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   6777       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   6778       return len+delta0;
   6779    }
   6780 }
   6781 
   6782 
   6783 static
   6784 void dis_push_segreg ( UInt sreg, Int sz )
   6785 {
   6786     IRTemp t1 = newTemp(Ity_I16);
   6787     IRTemp ta = newTemp(Ity_I32);
   6788     vassert(sz == 2 || sz == 4);
   6789 
   6790     assign( t1, getSReg(sreg) );
   6791     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   6792     putIReg(4, R_ESP, mkexpr(ta));
   6793     storeLE( mkexpr(ta), mkexpr(t1) );
   6794 
   6795     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6796 }
   6797 
   6798 static
   6799 void dis_pop_segreg ( UInt sreg, Int sz )
   6800 {
   6801     IRTemp t1 = newTemp(Ity_I16);
   6802     IRTemp ta = newTemp(Ity_I32);
   6803     vassert(sz == 2 || sz == 4);
   6804 
   6805     assign( ta, getIReg(4, R_ESP) );
   6806     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   6807 
   6808     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   6809     putSReg( sreg, mkexpr(t1) );
   6810     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6811 }
   6812 
   6813 static
   6814 void dis_ret ( UInt d32 )
   6815 {
   6816    IRTemp t1 = newTemp(Ity_I32), t2 = newTemp(Ity_I32);
   6817    assign(t1, getIReg(4,R_ESP));
   6818    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   6819    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
   6820    jmp_treg(Ijk_Ret,t2);
   6821 }
   6822 
   6823 /*------------------------------------------------------------*/
   6824 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   6825 /*------------------------------------------------------------*/
   6826 
   6827 /* Worker function; do not call directly.
   6828    Handles full width G = G `op` E   and   G = (not G) `op` E.
   6829 */
   6830 
   6831 static UInt dis_SSE_E_to_G_all_wrk (
   6832                UChar sorb, Int delta,
   6833                HChar* opname, IROp op,
   6834                Bool   invertG
   6835             )
   6836 {
   6837    HChar   dis_buf[50];
   6838    Int     alen;
   6839    IRTemp  addr;
   6840    UChar   rm = getIByte(delta);
   6841    IRExpr* gpart
   6842       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
   6843                 : getXMMReg(gregOfRM(rm));
   6844    if (epartIsReg(rm)) {
   6845       putXMMReg( gregOfRM(rm),
   6846                  binop(op, gpart,
   6847                            getXMMReg(eregOfRM(rm))) );
   6848       DIP("%s %s,%s\n", opname,
   6849                         nameXMMReg(eregOfRM(rm)),
   6850                         nameXMMReg(gregOfRM(rm)) );
   6851       return delta+1;
   6852    } else {
   6853       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6854       putXMMReg( gregOfRM(rm),
   6855                  binop(op, gpart,
   6856                            loadLE(Ity_V128, mkexpr(addr))) );
   6857       DIP("%s %s,%s\n", opname,
   6858                         dis_buf,
   6859                         nameXMMReg(gregOfRM(rm)) );
   6860       return delta+alen;
   6861    }
   6862 }
   6863 
   6864 
   6865 /* All lanes SSE binary operation, G = G `op` E. */
   6866 
   6867 static
   6868 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
   6869 {
   6870    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
   6871 }
   6872 
   6873 /* All lanes SSE binary operation, G = (not G) `op` E. */
   6874 
   6875 static
   6876 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
   6877                                HChar* opname, IROp op )
   6878 {
   6879    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
   6880 }
   6881 
   6882 
   6883 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   6884 
   6885 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
   6886                                   HChar* opname, IROp op )
   6887 {
   6888    HChar   dis_buf[50];
   6889    Int     alen;
   6890    IRTemp  addr;
   6891    UChar   rm = getIByte(delta);
   6892    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   6893    if (epartIsReg(rm)) {
   6894       putXMMReg( gregOfRM(rm),
   6895                  binop(op, gpart,
   6896                            getXMMReg(eregOfRM(rm))) );
   6897       DIP("%s %s,%s\n", opname,
   6898                         nameXMMReg(eregOfRM(rm)),
   6899                         nameXMMReg(gregOfRM(rm)) );
   6900       return delta+1;
   6901    } else {
   6902       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   6903          E operand needs to be made simply of zeroes. */
   6904       IRTemp epart = newTemp(Ity_V128);
   6905       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6906       assign( epart, unop( Iop_32UtoV128,
   6907                            loadLE(Ity_I32, mkexpr(addr))) );
   6908       putXMMReg( gregOfRM(rm),
   6909                  binop(op, gpart, mkexpr(epart)) );
   6910       DIP("%s %s,%s\n", opname,
   6911                         dis_buf,
   6912                         nameXMMReg(gregOfRM(rm)) );
   6913       return delta+alen;
   6914    }
   6915 }
   6916 
   6917 
   6918 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   6919 
   6920 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
   6921                                   HChar* opname, IROp op )
   6922 {
   6923    HChar   dis_buf[50];
   6924    Int     alen;
   6925    IRTemp  addr;
   6926    UChar   rm = getIByte(delta);
   6927    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   6928    if (epartIsReg(rm)) {
   6929       putXMMReg( gregOfRM(rm),
   6930                  binop(op, gpart,
   6931                            getXMMReg(eregOfRM(rm))) );
   6932       DIP("%s %s,%s\n", opname,
   6933                         nameXMMReg(eregOfRM(rm)),
   6934                         nameXMMReg(gregOfRM(rm)) );
   6935       return delta+1;
   6936    } else {
   6937       /* We can only do a 64-bit memory read, so the upper half of the
   6938          E operand needs to be made simply of zeroes. */
   6939       IRTemp epart = newTemp(Ity_V128);
   6940       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6941       assign( epart, unop( Iop_64UtoV128,
   6942                            loadLE(Ity_I64, mkexpr(addr))) );
   6943       putXMMReg( gregOfRM(rm),
   6944                  binop(op, gpart, mkexpr(epart)) );
   6945       DIP("%s %s,%s\n", opname,
   6946                         dis_buf,
   6947                         nameXMMReg(gregOfRM(rm)) );
   6948       return delta+alen;
   6949    }
   6950 }
   6951 
   6952 
   6953 /* All lanes unary SSE operation, G = op(E). */
   6954 
   6955 static UInt dis_SSE_E_to_G_unary_all (
   6956                UChar sorb, Int delta,
   6957                HChar* opname, IROp op
   6958             )
   6959 {
   6960    HChar   dis_buf[50];
   6961    Int     alen;
   6962    IRTemp  addr;
   6963    UChar   rm = getIByte(delta);
   6964    if (epartIsReg(rm)) {
   6965       putXMMReg( gregOfRM(rm),
   6966                  unop(op, getXMMReg(eregOfRM(rm))) );
   6967       DIP("%s %s,%s\n", opname,
   6968                         nameXMMReg(eregOfRM(rm)),
   6969                         nameXMMReg(gregOfRM(rm)) );
   6970       return delta+1;
   6971    } else {
   6972       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6973       putXMMReg( gregOfRM(rm),
   6974                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   6975       DIP("%s %s,%s\n", opname,
   6976                         dis_buf,
   6977                         nameXMMReg(gregOfRM(rm)) );
   6978       return delta+alen;
   6979    }
   6980 }
   6981 
   6982 
   6983 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   6984 
   6985 static UInt dis_SSE_E_to_G_unary_lo32 (
   6986                UChar sorb, Int delta,
   6987                HChar* opname, IROp op
   6988             )
   6989 {
   6990    /* First we need to get the old G value and patch the low 32 bits
   6991       of the E operand into it.  Then apply op and write back to G. */
   6992    HChar   dis_buf[50];
   6993    Int     alen;
   6994    IRTemp  addr;
   6995    UChar   rm = getIByte(delta);
   6996    IRTemp  oldG0 = newTemp(Ity_V128);
   6997    IRTemp  oldG1 = newTemp(Ity_V128);
   6998 
   6999    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7000 
   7001    if (epartIsReg(rm)) {
   7002       assign( oldG1,
   7003               binop( Iop_SetV128lo32,
   7004                      mkexpr(oldG0),
   7005                      getXMMRegLane32(eregOfRM(rm), 0)) );
   7006       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7007       DIP("%s %s,%s\n", opname,
   7008                         nameXMMReg(eregOfRM(rm)),
   7009                         nameXMMReg(gregOfRM(rm)) );
   7010       return delta+1;
   7011    } else {
   7012       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7013       assign( oldG1,
   7014               binop( Iop_SetV128lo32,
   7015                      mkexpr(oldG0),
   7016                      loadLE(Ity_I32, mkexpr(addr)) ));
   7017       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7018       DIP("%s %s,%s\n", opname,
   7019                         dis_buf,
   7020                         nameXMMReg(gregOfRM(rm)) );
   7021       return delta+alen;
   7022    }
   7023 }
   7024 
   7025 
   7026 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   7027 
   7028 static UInt dis_SSE_E_to_G_unary_lo64 (
   7029                UChar sorb, Int delta,
   7030                HChar* opname, IROp op
   7031             )
   7032 {
   7033    /* First we need to get the old G value and patch the low 64 bits
   7034       of the E operand into it.  Then apply op and write back to G. */
   7035    HChar   dis_buf[50];
   7036    Int     alen;
   7037    IRTemp  addr;
   7038    UChar   rm = getIByte(delta);
   7039    IRTemp  oldG0 = newTemp(Ity_V128);
   7040    IRTemp  oldG1 = newTemp(Ity_V128);
   7041 
   7042    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7043 
   7044    if (epartIsReg(rm)) {
   7045       assign( oldG1,
   7046               binop( Iop_SetV128lo64,
   7047                      mkexpr(oldG0),
   7048                      getXMMRegLane64(eregOfRM(rm), 0)) );
   7049       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7050       DIP("%s %s,%s\n", opname,
   7051                         nameXMMReg(eregOfRM(rm)),
   7052                         nameXMMReg(gregOfRM(rm)) );
   7053       return delta+1;
   7054    } else {
   7055       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7056       assign( oldG1,
   7057               binop( Iop_SetV128lo64,
   7058                      mkexpr(oldG0),
   7059                      loadLE(Ity_I64, mkexpr(addr)) ));
   7060       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7061       DIP("%s %s,%s\n", opname,
   7062                         dis_buf,
   7063                         nameXMMReg(gregOfRM(rm)) );
   7064       return delta+alen;
   7065    }
   7066 }
   7067 
   7068 
   7069 /* SSE integer binary operation:
   7070       G = G `op` E   (eLeft == False)
   7071       G = E `op` G   (eLeft == True)
   7072 */
   7073 static UInt dis_SSEint_E_to_G(
   7074                UChar sorb, Int delta,
   7075                HChar* opname, IROp op,
   7076                Bool   eLeft
   7077             )
   7078 {
   7079    HChar   dis_buf[50];
   7080    Int     alen;
   7081    IRTemp  addr;
   7082    UChar   rm = getIByte(delta);
   7083    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7084    IRExpr* epart = NULL;
   7085    if (epartIsReg(rm)) {
   7086       epart = getXMMReg(eregOfRM(rm));
   7087       DIP("%s %s,%s\n", opname,
   7088                         nameXMMReg(eregOfRM(rm)),
   7089                         nameXMMReg(gregOfRM(rm)) );
   7090       delta += 1;
   7091    } else {
   7092       addr  = disAMode ( &alen, sorb, delta, dis_buf );
   7093       epart = loadLE(Ity_V128, mkexpr(addr));
   7094       DIP("%s %s,%s\n", opname,
   7095                         dis_buf,
   7096                         nameXMMReg(gregOfRM(rm)) );
   7097       delta += alen;
   7098    }
   7099    putXMMReg( gregOfRM(rm),
   7100               eLeft ? binop(op, epart, gpart)
   7101 	            : binop(op, gpart, epart) );
   7102    return delta;
   7103 }
   7104 
   7105 
   7106 /* Helper for doing SSE FP comparisons. */
   7107 
   7108 static void findSSECmpOp ( Bool* needNot, IROp* op,
   7109                            Int imm8, Bool all_lanes, Int sz )
   7110 {
   7111    imm8 &= 7;
   7112    *needNot = False;
   7113    *op      = Iop_INVALID;
   7114    if (imm8 >= 4) {
   7115       *needNot = True;
   7116       imm8 -= 4;
   7117    }
   7118 
   7119    if (sz == 4 && all_lanes) {
   7120       switch (imm8) {
   7121          case 0: *op = Iop_CmpEQ32Fx4; return;
   7122          case 1: *op = Iop_CmpLT32Fx4; return;
   7123          case 2: *op = Iop_CmpLE32Fx4; return;
   7124          case 3: *op = Iop_CmpUN32Fx4; return;
   7125          default: break;
   7126       }
   7127    }
   7128    if (sz == 4 && !all_lanes) {
   7129       switch (imm8) {
   7130          case 0: *op = Iop_CmpEQ32F0x4; return;
   7131          case 1: *op = Iop_CmpLT32F0x4; return;
   7132          case 2: *op = Iop_CmpLE32F0x4; return;
   7133          case 3: *op = Iop_CmpUN32F0x4; return;
   7134          default: break;
   7135       }
   7136    }
   7137    if (sz == 8 && all_lanes) {
   7138       switch (imm8) {
   7139          case 0: *op = Iop_CmpEQ64Fx2; return;
   7140          case 1: *op = Iop_CmpLT64Fx2; return;
   7141          case 2: *op = Iop_CmpLE64Fx2; return;
   7142          case 3: *op = Iop_CmpUN64Fx2; return;
   7143          default: break;
   7144       }
   7145    }
   7146    if (sz == 8 && !all_lanes) {
   7147       switch (imm8) {
   7148          case 0: *op = Iop_CmpEQ64F0x2; return;
   7149          case 1: *op = Iop_CmpLT64F0x2; return;
   7150          case 2: *op = Iop_CmpLE64F0x2; return;
   7151          case 3: *op = Iop_CmpUN64F0x2; return;
   7152          default: break;
   7153       }
   7154    }
   7155    vpanic("findSSECmpOp(x86,guest)");
   7156 }
   7157 
   7158 /* Handles SSE 32F/64F comparisons. */
   7159 
   7160 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
   7161 				HChar* opname, Bool all_lanes, Int sz )
   7162 {
   7163    HChar   dis_buf[50];
   7164    Int     alen, imm8;
   7165    IRTemp  addr;
   7166    Bool    needNot = False;
   7167    IROp    op      = Iop_INVALID;
   7168    IRTemp  plain   = newTemp(Ity_V128);
   7169    UChar   rm      = getIByte(delta);
   7170    UShort  mask    = 0;
   7171    vassert(sz == 4 || sz == 8);
   7172    if (epartIsReg(rm)) {
   7173       imm8 = getIByte(delta+1);
   7174       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7175       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
   7176                                getXMMReg(eregOfRM(rm))) );
   7177       delta += 2;
   7178       DIP("%s $%d,%s,%s\n", opname,
   7179                             (Int)imm8,
   7180                             nameXMMReg(eregOfRM(rm)),
   7181                             nameXMMReg(gregOfRM(rm)) );
   7182    } else {
   7183       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7184       imm8 = getIByte(delta+alen);
   7185       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7186       assign( plain,
   7187               binop(
   7188                  op,
   7189                  getXMMReg(gregOfRM(rm)),
   7190                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   7191                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   7192                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   7193              )
   7194       );
   7195       delta += alen+1;
   7196       DIP("%s $%d,%s,%s\n", opname,
   7197                             (Int)imm8,
   7198                             dis_buf,
   7199                             nameXMMReg(gregOfRM(rm)) );
   7200    }
   7201 
   7202    if (needNot && all_lanes) {
   7203       putXMMReg( gregOfRM(rm),
   7204                  unop(Iop_NotV128, mkexpr(plain)) );
   7205    }
   7206    else
   7207    if (needNot && !all_lanes) {
   7208       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
   7209       putXMMReg( gregOfRM(rm),
   7210                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   7211    }
   7212    else {
   7213       putXMMReg( gregOfRM(rm), mkexpr(plain) );
   7214    }
   7215 
   7216    return delta;
   7217 }
   7218 
   7219 
   7220 /* Vector by scalar shift of G by the amount specified at the bottom
   7221    of E. */
   7222 
   7223 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
   7224                                  HChar* opname, IROp op )
   7225 {
   7226    HChar   dis_buf[50];
   7227    Int     alen, size;
   7228    IRTemp  addr;
   7229    Bool    shl, shr, sar;
   7230    UChar   rm   = getIByte(delta);
   7231    IRTemp  g0   = newTemp(Ity_V128);
   7232    IRTemp  g1   = newTemp(Ity_V128);
   7233    IRTemp  amt  = newTemp(Ity_I32);
   7234    IRTemp  amt8 = newTemp(Ity_I8);
   7235    if (epartIsReg(rm)) {
   7236       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
   7237       DIP("%s %s,%s\n", opname,
   7238                         nameXMMReg(eregOfRM(rm)),
   7239                         nameXMMReg(gregOfRM(rm)) );
   7240       delta++;
   7241    } else {
   7242       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7243       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   7244       DIP("%s %s,%s\n", opname,
   7245                         dis_buf,
   7246                         nameXMMReg(gregOfRM(rm)) );
   7247       delta += alen;
   7248    }
   7249    assign( g0,   getXMMReg(gregOfRM(rm)) );
   7250    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   7251 
   7252    shl = shr = sar = False;
   7253    size = 0;
   7254    switch (op) {
   7255       case Iop_ShlN16x8: shl = True; size = 32; break;
   7256       case Iop_ShlN32x4: shl = True; size = 32; break;
   7257       case Iop_ShlN64x2: shl = True; size = 64; break;
   7258       case Iop_SarN16x8: sar = True; size = 16; break;
   7259       case Iop_SarN32x4: sar = True; size = 32; break;
   7260       case Iop_ShrN16x8: shr = True; size = 16; break;
   7261       case Iop_ShrN32x4: shr = True; size = 32; break;
   7262       case Iop_ShrN64x2: shr = True; size = 64; break;
   7263       default: vassert(0);
   7264    }
   7265 
   7266    if (shl || shr) {
   7267      assign(
   7268         g1,
   7269         IRExpr_Mux0X(
   7270            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   7271            mkV128(0x0000),
   7272            binop(op, mkexpr(g0), mkexpr(amt8))
   7273         )
   7274      );
   7275    } else
   7276    if (sar) {
   7277      assign(
   7278         g1,
   7279         IRExpr_Mux0X(
   7280            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   7281            binop(op, mkexpr(g0), mkU8(size-1)),
   7282            binop(op, mkexpr(g0), mkexpr(amt8))
   7283         )
   7284      );
   7285    } else {
   7286       /*NOTREACHED*/
   7287       vassert(0);
   7288    }
   7289 
   7290    putXMMReg( gregOfRM(rm), mkexpr(g1) );
   7291    return delta;
   7292 }
   7293 
   7294 
   7295 /* Vector by scalar shift of E by an immediate byte. */
   7296 
   7297 static
   7298 UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
   7299 {
   7300    Bool    shl, shr, sar;
   7301    UChar   rm   = getIByte(delta);
   7302    IRTemp  e0   = newTemp(Ity_V128);
   7303    IRTemp  e1   = newTemp(Ity_V128);
   7304    UChar   amt, size;
   7305    vassert(epartIsReg(rm));
   7306    vassert(gregOfRM(rm) == 2
   7307            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   7308    amt = getIByte(delta+1);
   7309    delta += 2;
   7310    DIP("%s $%d,%s\n", opname,
   7311                       (Int)amt,
   7312                       nameXMMReg(eregOfRM(rm)) );
   7313    assign( e0, getXMMReg(eregOfRM(rm)) );
   7314 
   7315    shl = shr = sar = False;
   7316    size = 0;
   7317    switch (op) {
   7318       case Iop_ShlN16x8: shl = True; size = 16; break;
   7319       case Iop_ShlN32x4: shl = True; size = 32; break;
   7320       case Iop_ShlN64x2: shl = True; size = 64; break;
   7321       case Iop_SarN16x8: sar = True; size = 16; break;
   7322       case Iop_SarN32x4: sar = True; size = 32; break;
   7323       case Iop_ShrN16x8: shr = True; size = 16; break;
   7324       case Iop_ShrN32x4: shr = True; size = 32; break;
   7325       case Iop_ShrN64x2: shr = True; size = 64; break;
   7326       default: vassert(0);
   7327    }
   7328 
   7329    if (shl || shr) {
   7330       assign( e1, amt >= size
   7331                      ? mkV128(0x0000)
   7332                      : binop(op, mkexpr(e0), mkU8(amt))
   7333       );
   7334    } else
   7335    if (sar) {
   7336       assign( e1, amt >= size
   7337                      ? binop(op, mkexpr(e0), mkU8(size-1))
   7338                      : binop(op, mkexpr(e0), mkU8(amt))
   7339       );
   7340    } else {
   7341       /*NOTREACHED*/
   7342       vassert(0);
   7343    }
   7344 
   7345    putXMMReg( eregOfRM(rm), mkexpr(e1) );
   7346    return delta;
   7347 }
   7348 
   7349 
   7350 /* Get the current SSE rounding mode. */
   7351 
   7352 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   7353 {
   7354    return binop( Iop_And32,
   7355                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
   7356                  mkU32(3) );
   7357 }
   7358 
   7359 static void put_sse_roundingmode ( IRExpr* sseround )
   7360 {
   7361    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   7362    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
   7363 }
   7364 
   7365 /* Break a 128-bit value up into four 32-bit ints. */
   7366 
   7367 static void breakup128to32s ( IRTemp t128,
   7368 			      /*OUTs*/
   7369                               IRTemp* t3, IRTemp* t2,
   7370                               IRTemp* t1, IRTemp* t0 )
   7371 {
   7372    IRTemp hi64 = newTemp(Ity_I64);
   7373    IRTemp lo64 = newTemp(Ity_I64);
   7374    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   7375    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   7376 
   7377    vassert(t0 && *t0 == IRTemp_INVALID);
   7378    vassert(t1 && *t1 == IRTemp_INVALID);
   7379    vassert(t2 && *t2 == IRTemp_INVALID);
   7380    vassert(t3 && *t3 == IRTemp_INVALID);
   7381 
   7382    *t0 = newTemp(Ity_I32);
   7383    *t1 = newTemp(Ity_I32);
   7384    *t2 = newTemp(Ity_I32);
   7385    *t3 = newTemp(Ity_I32);
   7386    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   7387    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   7388    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   7389    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   7390 }
   7391 
   7392 /* Construct a 128-bit value from four 32-bit ints. */
   7393 
   7394 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   7395                               IRTemp t1, IRTemp t0 )
   7396 {
   7397    return
   7398       binop( Iop_64HLtoV128,
   7399              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   7400              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   7401    );
   7402 }
   7403 
   7404 /* Break a 64-bit value up into four 16-bit ints. */
   7405 
   7406 static void breakup64to16s ( IRTemp t64,
   7407                              /*OUTs*/
   7408                              IRTemp* t3, IRTemp* t2,
   7409                              IRTemp* t1, IRTemp* t0 )
   7410 {
   7411    IRTemp hi32 = newTemp(Ity_I32);
   7412    IRTemp lo32 = newTemp(Ity_I32);
   7413    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   7414    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   7415 
   7416    vassert(t0 && *t0 == IRTemp_INVALID);
   7417    vassert(t1 && *t1 == IRTemp_INVALID);
   7418    vassert(t2 && *t2 == IRTemp_INVALID);
   7419    vassert(t3 && *t3 == IRTemp_INVALID);
   7420 
   7421    *t0 = newTemp(Ity_I16);
   7422    *t1 = newTemp(Ity_I16);
   7423    *t2 = newTemp(Ity_I16);
   7424    *t3 = newTemp(Ity_I16);
   7425    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   7426    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   7427    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   7428    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   7429 }
   7430 
   7431 /* Construct a 64-bit value from four 16-bit ints. */
   7432 
   7433 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   7434                              IRTemp t1, IRTemp t0 )
   7435 {
   7436    return
   7437       binop( Iop_32HLto64,
   7438              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   7439              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   7440    );
   7441 }
   7442 
   7443 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
   7444    in the given 32-bit temporary.  The flags that are set are: O S Z A
   7445    C P D ID AC.
   7446 
   7447    In all cases, code to set AC is generated.  However, VEX actually
   7448    ignores the AC value and so can optionally emit an emulation
   7449    warning when it is enabled.  In this routine, an emulation warning
   7450    is only emitted if emit_AC_emwarn is True, in which case
   7451    next_insn_EIP must be correct (this allows for correct code
   7452    generation for popfl/popfw).  If emit_AC_emwarn is False,
   7453    next_insn_EIP is unimportant (this allows for easy if kludgey code
   7454    generation for IRET.) */
   7455 
   7456 static
   7457 void set_EFLAGS_from_value ( IRTemp t1,
   7458                              Bool   emit_AC_emwarn,
   7459                              Addr32 next_insn_EIP )
   7460 {
   7461    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
   7462 
   7463    /* t1 is the flag word.  Mask out everything except OSZACP and set
   7464       the flags thunk to X86G_CC_OP_COPY. */
   7465    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   7466    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   7467    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7468                      binop(Iop_And32,
   7469                            mkexpr(t1),
   7470                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   7471                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
   7472                                   | X86G_CC_MASK_S| X86G_CC_MASK_O )
   7473                           )
   7474                     )
   7475        );
   7476    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   7477       elimination of previous stores to this field work better. */
   7478    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   7479 
   7480    /* Also need to set the D flag, which is held in bit 10 of t1.
   7481       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   7482    stmt( IRStmt_Put(
   7483             OFFB_DFLAG,
   7484             IRExpr_Mux0X(
   7485                unop(Iop_32to8,
   7486                     binop(Iop_And32,
   7487                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
   7488                           mkU32(1))),
   7489                mkU32(1),
   7490                mkU32(0xFFFFFFFF)))
   7491        );
   7492 
   7493    /* Set the ID flag */
   7494    stmt( IRStmt_Put(
   7495             OFFB_IDFLAG,
   7496             IRExpr_Mux0X(
   7497                unop(Iop_32to8,
   7498                     binop(Iop_And32,
   7499                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
   7500                           mkU32(1))),
   7501                mkU32(0),
   7502                mkU32(1)))
   7503        );
   7504 
   7505    /* And set the AC flag.  If setting it 1 to, possibly emit an
   7506       emulation warning. */
   7507    stmt( IRStmt_Put(
   7508             OFFB_ACFLAG,
   7509             IRExpr_Mux0X(
   7510                unop(Iop_32to8,
   7511                     binop(Iop_And32,
   7512                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
   7513                           mkU32(1))),
   7514                mkU32(0),
   7515                mkU32(1)))
   7516        );
   7517 
   7518    if (emit_AC_emwarn) {
   7519       put_emwarn( mkU32(EmWarn_X86_acFlag) );
   7520       stmt(
   7521          IRStmt_Exit(
   7522             binop( Iop_CmpNE32,
   7523                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
   7524                    mkU32(0) ),
   7525             Ijk_EmWarn,
   7526             IRConst_U32( next_insn_EIP )
   7527          )
   7528       );
   7529    }
   7530 }
   7531 
   7532 
   7533 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   7534    values (aa,bb), computes, for each of the 4 16-bit lanes:
   7535 
   7536    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   7537 */
   7538 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   7539 {
   7540    IRTemp aa      = newTemp(Ity_I64);
   7541    IRTemp bb      = newTemp(Ity_I64);
   7542    IRTemp aahi32s = newTemp(Ity_I64);
   7543    IRTemp aalo32s = newTemp(Ity_I64);
   7544    IRTemp bbhi32s = newTemp(Ity_I64);
   7545    IRTemp bblo32s = newTemp(Ity_I64);
   7546    IRTemp rHi     = newTemp(Ity_I64);
   7547    IRTemp rLo     = newTemp(Ity_I64);
   7548    IRTemp one32x2 = newTemp(Ity_I64);
   7549    assign(aa, aax);
   7550    assign(bb, bbx);
   7551    assign( aahi32s,
   7552            binop(Iop_SarN32x2,
   7553                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   7554                  mkU8(16) ));
   7555    assign( aalo32s,
   7556            binop(Iop_SarN32x2,
   7557                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   7558                  mkU8(16) ));
   7559    assign( bbhi32s,
   7560            binop(Iop_SarN32x2,
   7561                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   7562                  mkU8(16) ));
   7563    assign( bblo32s,
   7564            binop(Iop_SarN32x2,
   7565                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   7566                  mkU8(16) ));
   7567    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   7568    assign(
   7569       rHi,
   7570       binop(
   7571          Iop_ShrN32x2,
   7572          binop(
   7573             Iop_Add32x2,
   7574             binop(
   7575                Iop_ShrN32x2,
   7576                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   7577                mkU8(14)
   7578             ),
   7579             mkexpr(one32x2)
   7580          ),
   7581          mkU8(1)
   7582       )
   7583    );
   7584    assign(
   7585       rLo,
   7586       binop(
   7587          Iop_ShrN32x2,
   7588          binop(
   7589             Iop_Add32x2,
   7590             binop(
   7591                Iop_ShrN32x2,
   7592                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   7593                mkU8(14)
   7594             ),
   7595             mkexpr(one32x2)
   7596          ),
   7597          mkU8(1)
   7598       )
   7599    );
   7600    return
   7601       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   7602 }
   7603 
   7604 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   7605    values (aa,bb), computes, for each lane:
   7606 
   7607           if aa_lane < 0 then - bb_lane
   7608      else if aa_lane > 0 then bb_lane
   7609      else 0
   7610 */
   7611 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   7612 {
   7613    IRTemp aa       = newTemp(Ity_I64);
   7614    IRTemp bb       = newTemp(Ity_I64);
   7615    IRTemp zero     = newTemp(Ity_I64);
   7616    IRTemp bbNeg    = newTemp(Ity_I64);
   7617    IRTemp negMask  = newTemp(Ity_I64);
   7618    IRTemp posMask  = newTemp(Ity_I64);
   7619    IROp   opSub    = Iop_INVALID;
   7620    IROp   opCmpGTS = Iop_INVALID;
   7621 
   7622    switch (laneszB) {
   7623       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   7624       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   7625       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   7626       default: vassert(0);
   7627    }
   7628 
   7629    assign( aa,      aax );
   7630    assign( bb,      bbx );
   7631    assign( zero,    mkU64(0) );
   7632    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   7633    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   7634    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   7635 
   7636    return
   7637       binop(Iop_Or64,
   7638             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   7639             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   7640 
   7641 }
   7642 
   7643 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   7644    value aa, computes, for each lane
   7645 
   7646    if aa < 0 then -aa else aa
   7647 
   7648    Note that the result is interpreted as unsigned, so that the
   7649    absolute value of the most negative signed input can be
   7650    represented.
   7651 */
   7652 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   7653 {
   7654    IRTemp aa      = newTemp(Ity_I64);
   7655    IRTemp zero    = newTemp(Ity_I64);
   7656    IRTemp aaNeg   = newTemp(Ity_I64);
   7657    IRTemp negMask = newTemp(Ity_I64);
   7658    IRTemp posMask = newTemp(Ity_I64);
   7659    IROp   opSub   = Iop_INVALID;
   7660    IROp   opSarN  = Iop_INVALID;
   7661 
   7662    switch (laneszB) {
   7663       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   7664       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   7665       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   7666       default: vassert(0);
   7667    }
   7668 
   7669    assign( aa,      aax );
   7670    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   7671    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   7672    assign( zero,    mkU64(0) );
   7673    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   7674    return
   7675       binop(Iop_Or64,
   7676             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   7677             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   7678 }
   7679 
   7680 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   7681                                         IRTemp lo64, Int byteShift )
   7682 {
   7683    vassert(byteShift >= 1 && byteShift <= 7);
   7684    return
   7685       binop(Iop_Or64,
   7686             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   7687             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   7688       );
   7689 }
   7690 
   7691 /* Generate a SIGSEGV followed by a restart of the current instruction
   7692    if effective_addr is not 16-aligned.  This is required behaviour
   7693    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   7694    This assumes that guest_RIP_curr_instr is set correctly! */
   7695 /* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
   7696  * the check. Need to enable it once TSan stops generating unaligned
   7697  * accesses in the wrappers.
   7698  * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
   7699 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
   7700 {
   7701    stmt(
   7702       IRStmt_Exit(
   7703          binop(Iop_CmpNE32,
   7704                binop(Iop_And32,mkexpr(effective_addr),mkU32(0x0)),
   7705                mkU32(0)),
   7706          Ijk_SigSEGV,
   7707          IRConst_U32(guest_EIP_curr_instr)
   7708       )
   7709    );
   7710 }
   7711 
   7712 
   7713 /* Helper for deciding whether a given insn (starting at the opcode
   7714    byte) may validly be used with a LOCK prefix.  The following insns
   7715    may be used with LOCK when their destination operand is in memory.
   7716    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   7717 
   7718    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   7719    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   7720    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   7721    SBB        81 /3,  81 /3,  82 /x,  83 /3,  18,  19
   7722    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   7723    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   7724    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   7725 
   7726    DEC        FE /1,  FF /1
   7727    INC        FE /0,  FF /0
   7728 
   7729    NEG        F6 /3,  F7 /3
   7730    NOT        F6 /2,  F7 /2
   7731 
   7732    XCHG       86, 87
   7733 
   7734    BTC        0F BB,  0F BA /7
   7735    BTR        0F B3,  0F BA /6
   7736    BTS        0F AB,  0F BA /5
   7737 
   7738    CMPXCHG    0F B0,  0F B1
   7739    CMPXCHG8B  0F C7 /1
   7740 
   7741    XADD       0F C0,  0F C1
   7742 
   7743    ------------------------------
   7744 
   7745    80 /0  =  addb $imm8,  rm8
   7746    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   7747    82 /0  =  addb $imm8,  rm8
   7748    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   7749 
   7750    00     =  addb r8,  rm8
   7751    01     =  addl r32, rm32  and  addw r16, rm16
   7752 
   7753    Same for ADD OR ADC SBB AND SUB XOR
   7754 
   7755    FE /1  = dec rm8
   7756    FF /1  = dec rm32  and  dec rm16
   7757 
   7758    FE /0  = inc rm8
   7759    FF /0  = inc rm32  and  inc rm16
   7760 
   7761    F6 /3  = neg rm8
   7762    F7 /3  = neg rm32  and  neg rm16
   7763 
   7764    F6 /2  = not rm8
   7765    F7 /2  = not rm32  and  not rm16
   7766 
   7767    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   7768    OF BA /7  = btcw $imm8, rm16  and  btcw $imm8, rm32
   7769 
   7770    Same for BTS, BTR
   7771 */
   7772 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   7773 {
   7774    switch (opc[0]) {
   7775       case 0x00: case 0x01: case 0x08: case 0x09:
   7776       case 0x10: case 0x11: case 0x18: case 0x19:
   7777       case 0x20: case 0x21: case 0x28: case 0x29:
   7778       case 0x30: case 0x31:
   7779          if (!epartIsReg(opc[1]))
   7780             return True;
   7781          break;
   7782 
   7783       case 0x80: case 0x81: case 0x82: case 0x83:
   7784          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
   7785              && !epartIsReg(opc[1]))
   7786             return True;
   7787          break;
   7788 
   7789       case 0xFE: case 0xFF:
   7790          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
   7791              && !epartIsReg(opc[1]))
   7792             return True;
   7793          break;
   7794 
   7795       case 0xF6: case 0xF7:
   7796          if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
   7797              && !epartIsReg(opc[1]))
   7798             return True;
   7799          break;
   7800 
   7801       case 0x86: case 0x87:
   7802          if (!epartIsReg(opc[1]))
   7803             return True;
   7804          break;
   7805 
   7806       case 0x0F: {
   7807          switch (opc[1]) {
   7808             case 0xBB: case 0xB3: case 0xAB:
   7809                if (!epartIsReg(opc[2]))
   7810                   return True;
   7811                break;
   7812             case 0xBA:
   7813                if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
   7814                    && !epartIsReg(opc[2]))
   7815                   return True;
   7816                break;
   7817             case 0xB0: case 0xB1:
   7818                if (!epartIsReg(opc[2]))
   7819                   return True;
   7820                break;
   7821             case 0xC7:
   7822                if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   7823                   return True;
   7824                break;
   7825             case 0xC0: case 0xC1:
   7826                if (!epartIsReg(opc[2]))
   7827                   return True;
   7828                break;
   7829             default:
   7830                break;
   7831          } /* switch (opc[1]) */
   7832          break;
   7833       }
   7834 
   7835       default:
   7836          break;
   7837    } /* switch (opc[0]) */
   7838 
   7839    return False;
   7840 }
   7841 
   7842 
   7843 /*------------------------------------------------------------*/
   7844 /*--- Disassemble a single instruction                     ---*/
   7845 /*------------------------------------------------------------*/
   7846 
   7847 /* Disassemble a single instruction into IR.  The instruction is
   7848    located in host memory at &guest_code[delta].  *expect_CAS is set
   7849    to True if the resulting IR is expected to contain an IRCAS
   7850    statement, and False if it's not expected to.  This makes it
   7851    possible for the caller of disInstr_X86_WRK to check that
   7852    LOCK-prefixed instructions are at least plausibly translated, in
   7853    that it becomes possible to check that a (validly) LOCK-prefixed
   7854    instruction generates a translation containing an IRCAS, and
   7855    instructions without LOCK prefixes don't generate translations
   7856    containing an IRCAS.
   7857 */
   7858 static
   7859 DisResult disInstr_X86_WRK (
   7860              /*OUT*/Bool* expect_CAS,
   7861              Bool         put_IP,
   7862              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   7863              Bool         resteerCisOk,
   7864              void*        callback_opaque,
   7865              Long         delta64,
   7866              VexArchInfo* archinfo,
   7867              VexAbiInfo*  vbi
   7868           )
   7869 {
   7870    IRType    ty;
   7871    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   7872    Int       alen;
   7873    UChar     opc, modrm, abyte, pre;
   7874    UInt      d32;
   7875    HChar     dis_buf[50];
   7876    Int       am_sz, d_sz, n_prefixes;
   7877    DisResult dres;
   7878    UChar*    insn; /* used in SSE decoders */
   7879 
   7880    /* The running delta */
   7881    Int delta = (Int)delta64;
   7882 
   7883    /* Holds eip at the start of the insn, so that we can print
   7884       consistent error messages for unimplemented insns. */
   7885    Int delta_start = delta;
   7886 
   7887    /* sz denotes the nominal data-op size of the insn; we change it to
   7888       2 if an 0x66 prefix is seen */
   7889    Int sz = 4;
   7890 
   7891    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
   7892       prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
   7893       indicating the prefix.  */
   7894    UChar sorb = 0;
   7895 
   7896    /* Gets set to True if a LOCK prefix is seen. */
   7897    Bool pfx_lock = False;
   7898 
   7899    /* Set result defaults. */
   7900    dres.whatNext   = Dis_Continue;
   7901    dres.len        = 0;
   7902    dres.continueAt = 0;
   7903 
   7904    *expect_CAS = False;
   7905 
   7906    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   7907 
   7908    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
   7909    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
   7910 
   7911    /* We may be asked to update the guest EIP before going further. */
   7912    if (put_IP)
   7913       stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr)) );
   7914 
   7915    /* Spot "Special" instructions (see comment at top of file). */
   7916    {
   7917       UChar* code = (UChar*)(guest_code + delta);
   7918       /* Spot the 12-byte preamble:
   7919          C1C703   roll $3,  %edi
   7920          C1C70D   roll $13, %edi
   7921          C1C71D   roll $29, %edi
   7922          C1C713   roll $19, %edi
   7923       */
   7924       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
   7925           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
   7926           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
   7927           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
   7928          /* Got a "Special" instruction preamble.  Which one is it? */
   7929          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
   7930             /* %EDX = client_request ( %EAX ) */
   7931             DIP("%%edx = client_request ( %%eax )\n");
   7932             delta += 14;
   7933             jmp_lit(Ijk_ClientReq, guest_EIP_bbstart+delta);
   7934             dres.whatNext = Dis_StopHere;
   7935             goto decode_success;
   7936          }
   7937          else
   7938          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
   7939             /* %EAX = guest_NRADDR */
   7940             DIP("%%eax = guest_NRADDR\n");
   7941             delta += 14;
   7942             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
   7943             goto decode_success;
   7944          }
   7945          else
   7946          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
   7947             /* call-noredir *%EAX */
   7948             DIP("call-noredir *%%eax\n");
   7949             delta += 14;
   7950             t1 = newTemp(Ity_I32);
   7951             assign(t1, getIReg(4,R_EAX));
   7952             t2 = newTemp(Ity_I32);
   7953             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   7954             putIReg(4, R_ESP, mkexpr(t2));
   7955             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
   7956             jmp_treg(Ijk_NoRedir,t1);
   7957             dres.whatNext = Dis_StopHere;
   7958             goto decode_success;
   7959          }
   7960          /* We don't know what it is. */
   7961          goto decode_failure;
   7962          /*NOTREACHED*/
   7963       }
   7964    }
   7965 
   7966    /* Handle a couple of weird-ass NOPs that have been observed in the
   7967       wild. */
   7968    {
   7969       UChar* code = (UChar*)(guest_code + delta);
   7970       /* Sun's JVM 1.5.0 uses the following as a NOP:
   7971          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
   7972       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
   7973           && code[3] == 0x65 && code[4] == 0x90) {
   7974          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
   7975          delta += 5;
   7976          goto decode_success;
   7977       }
   7978       /* Don't barf on recent binutils padding,
   7979          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
   7980          66 2e 0f 1f 84 00 00 00 00 00
   7981          66 66 2e 0f 1f 84 00 00 00 00 00
   7982          66 66 66 2e 0f 1f 84 00 00 00 00 00
   7983          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   7984          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   7985          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   7986       */
   7987       if (code[0] == 0x66) {
   7988          Int data16_cnt;
   7989          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
   7990             if (code[data16_cnt] != 0x66)
   7991                break;
   7992          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
   7993              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
   7994              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
   7995              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
   7996              && code[data16_cnt + 8] == 0x00 ) {
   7997             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
   7998             delta += 9 + data16_cnt;
   7999             goto decode_success;
   8000          }
   8001       }
   8002    }
   8003 
   8004    /* Normal instruction handling starts here. */
   8005 
   8006    /* Deal with some but not all prefixes:
   8007          66(oso)
   8008          F0(lock)
   8009          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
   8010       Not dealt with (left in place):
   8011          F2 F3
   8012    */
   8013    n_prefixes = 0;
   8014    while (True) {
   8015       if (n_prefixes > 7) goto decode_failure;
   8016       pre = getUChar(delta);
   8017       switch (pre) {
   8018          case 0x66:
   8019             sz = 2;
   8020             break;
   8021          case 0xF0:
   8022             pfx_lock = True;
   8023             *expect_CAS = True;
   8024             break;
   8025          case 0x3E: /* %DS: */
   8026          case 0x26: /* %ES: */
   8027          case 0x64: /* %FS: */
   8028          case 0x65: /* %GS: */
   8029             if (sorb != 0)
   8030                goto decode_failure; /* only one seg override allowed */
   8031             sorb = pre;
   8032             break;
   8033          case 0x2E: { /* %CS: */
   8034             /* 2E prefix on a conditional branch instruction is a
   8035                branch-prediction hint, which can safely be ignored.  */
   8036             UChar op1 = getIByte(delta+1);
   8037             UChar op2 = getIByte(delta+2);
   8038             if ((op1 >= 0x70 && op1 <= 0x7F)
   8039                 || (op1 == 0xE3)
   8040                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
   8041                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
   8042             } else {
   8043                /* All other CS override cases are not handled */
   8044                goto decode_failure;
   8045             }
   8046             break;
   8047          }
   8048          case 0x36: /* %SS: */
   8049             /* SS override cases are not handled */
   8050             goto decode_failure;
   8051          default:
   8052             goto not_a_prefix;
   8053       }
   8054       n_prefixes++;
   8055       delta++;
   8056    }
   8057 
   8058    not_a_prefix:
   8059 
   8060    /* Now we should be looking at the primary opcode byte or the
   8061       leading F2 or F3.  Check that any LOCK prefix is actually
   8062       allowed. */
   8063 
   8064    if (pfx_lock) {
   8065       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   8066          DIP("lock ");
   8067       } else {
   8068          *expect_CAS = False;
   8069          goto decode_failure;
   8070       }
   8071    }
   8072 
   8073 
   8074    /* ---------------------------------------------------- */
   8075    /* --- The SSE decoder.                             --- */
   8076    /* ---------------------------------------------------- */
   8077 
   8078    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   8079       previous life? */
   8080 
   8081    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
   8082       later section, further on. */
   8083 
   8084    insn = (UChar*)&guest_code[delta];
   8085 
   8086    /* Treat fxsave specially.  It should be doable even on an SSE0
   8087       (Pentium-II class) CPU.  Hence be prepared to handle it on
   8088       any subarchitecture variant.
   8089    */
   8090 
   8091    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   8092    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8093        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
   8094       IRDirty* d;
   8095       modrm = getIByte(delta+2);
   8096       vassert(sz == 4);
   8097       vassert(!epartIsReg(modrm));
   8098 
   8099       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8100       delta += 2+alen;
   8101       gen_SEGV_if_not_16_aligned(addr);
   8102 
   8103       DIP("fxsave %s\n", dis_buf);
   8104 
   8105       /* Uses dirty helper:
   8106             void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
   8107       d = unsafeIRDirty_0_N (
   8108              0/*regparms*/,
   8109              "x86g_dirtyhelper_FXSAVE",
   8110              &x86g_dirtyhelper_FXSAVE,
   8111              mkIRExprVec_1( mkexpr(addr) )
   8112           );
   8113       d->needsBBP = True;
   8114 
   8115       /* declare we're writing memory */
   8116       d->mFx   = Ifx_Write;
   8117       d->mAddr = mkexpr(addr);
   8118       d->mSize = 512;
   8119 
   8120       /* declare we're reading guest state */
   8121       d->nFxState = 7;
   8122 
   8123       d->fxState[0].fx     = Ifx_Read;
   8124       d->fxState[0].offset = OFFB_FTOP;
   8125       d->fxState[0].size   = sizeof(UInt);
   8126 
   8127       d->fxState[1].fx     = Ifx_Read;
   8128       d->fxState[1].offset = OFFB_FPREGS;
   8129       d->fxState[1].size   = 8 * sizeof(ULong);
   8130 
   8131       d->fxState[2].fx     = Ifx_Read;
   8132       d->fxState[2].offset = OFFB_FPTAGS;
   8133       d->fxState[2].size   = 8 * sizeof(UChar);
   8134 
   8135       d->fxState[3].fx     = Ifx_Read;
   8136       d->fxState[3].offset = OFFB_FPROUND;
   8137       d->fxState[3].size   = sizeof(UInt);
   8138 
   8139       d->fxState[4].fx     = Ifx_Read;
   8140       d->fxState[4].offset = OFFB_FC3210;
   8141       d->fxState[4].size   = sizeof(UInt);
   8142 
   8143       d->fxState[5].fx     = Ifx_Read;
   8144       d->fxState[5].offset = OFFB_XMM0;
   8145       d->fxState[5].size   = 8 * sizeof(U128);
   8146 
   8147       d->fxState[6].fx     = Ifx_Read;
   8148       d->fxState[6].offset = OFFB_SSEROUND;
   8149       d->fxState[6].size   = sizeof(UInt);
   8150 
   8151       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8152 	 images are packed back-to-back.  If not, the value of
   8153 	 d->fxState[5].size is wrong. */
   8154       vassert(16 == sizeof(U128));
   8155       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8156 
   8157       stmt( IRStmt_Dirty(d) );
   8158 
   8159       goto decode_success;
   8160    }
   8161 
   8162    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   8163    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8164        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
   8165       IRDirty* d;
   8166       modrm = getIByte(delta+2);
   8167       vassert(sz == 4);
   8168       vassert(!epartIsReg(modrm));
   8169 
   8170       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8171       delta += 2+alen;
   8172       gen_SEGV_if_not_16_aligned(addr);
   8173 
   8174       DIP("fxrstor %s\n", dis_buf);
   8175 
   8176       /* Uses dirty helper:
   8177             VexEmWarn x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
   8178          NOTE:
   8179             the VexEmWarn value is simply ignored (unlike for FRSTOR)
   8180       */
   8181       d = unsafeIRDirty_0_N (
   8182              0/*regparms*/,
   8183              "x86g_dirtyhelper_FXRSTOR",
   8184              &x86g_dirtyhelper_FXRSTOR,
   8185              mkIRExprVec_1( mkexpr(addr) )
   8186           );
   8187       d->needsBBP = True;
   8188 
   8189       /* declare we're reading memory */
   8190       d->mFx   = Ifx_Read;
   8191       d->mAddr = mkexpr(addr);
   8192       d->mSize = 512;
   8193 
   8194       /* declare we're writing guest state */
   8195       d->nFxState = 7;
   8196 
   8197       d->fxState[0].fx     = Ifx_Write;
   8198       d->fxState[0].offset = OFFB_FTOP;
   8199       d->fxState[0].size   = sizeof(UInt);
   8200 
   8201       d->fxState[1].fx     = Ifx_Write;
   8202       d->fxState[1].offset = OFFB_FPREGS;
   8203       d->fxState[1].size   = 8 * sizeof(ULong);
   8204 
   8205       d->fxState[2].fx     = Ifx_Write;
   8206       d->fxState[2].offset = OFFB_FPTAGS;
   8207       d->fxState[2].size   = 8 * sizeof(UChar);
   8208 
   8209       d->fxState[3].fx     = Ifx_Write;
   8210       d->fxState[3].offset = OFFB_FPROUND;
   8211       d->fxState[3].size   = sizeof(UInt);
   8212 
   8213       d->fxState[4].fx     = Ifx_Write;
   8214       d->fxState[4].offset = OFFB_FC3210;
   8215       d->fxState[4].size   = sizeof(UInt);
   8216 
   8217       d->fxState[5].fx     = Ifx_Write;
   8218       d->fxState[5].offset = OFFB_XMM0;
   8219       d->fxState[5].size   = 8 * sizeof(U128);
   8220 
   8221       d->fxState[6].fx     = Ifx_Write;
   8222       d->fxState[6].offset = OFFB_SSEROUND;
   8223       d->fxState[6].size   = sizeof(UInt);
   8224 
   8225       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8226 	 images are packed back-to-back.  If not, the value of
   8227 	 d->fxState[5].size is wrong. */
   8228       vassert(16 == sizeof(U128));
   8229       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8230 
   8231       stmt( IRStmt_Dirty(d) );
   8232 
   8233       goto decode_success;
   8234    }
   8235 
   8236    /* ------ SSE decoder main ------ */
   8237 
   8238    /* Skip parts of the decoder which don't apply given the stated
   8239       guest subarchitecture. */
   8240    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
   8241       goto after_sse_decoders;
   8242 
   8243    /* Otherwise we must be doing sse1 or sse2, so we can at least try
   8244       for SSE1 here. */
   8245 
   8246    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   8247    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
   8248       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
   8249       goto decode_success;
   8250    }
   8251 
   8252    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   8253    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
   8254       vassert(sz == 4);
   8255       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
   8256       goto decode_success;
   8257    }
   8258 
   8259    /* 0F 55 = ANDNPS -- G = (not G) and E */
   8260    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
   8261       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
   8262       goto decode_success;
   8263    }
   8264 
   8265    /* 0F 54 = ANDPS -- G = G and E */
   8266    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
   8267       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
   8268       goto decode_success;
   8269    }
   8270 
   8271    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   8272    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
   8273       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
   8274       goto decode_success;
   8275    }
   8276 
   8277    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   8278    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
   8279       vassert(sz == 4);
   8280       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
   8281       goto decode_success;
   8282    }
   8283 
   8284    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   8285    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   8286    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   8287       IRTemp argL = newTemp(Ity_F32);
   8288       IRTemp argR = newTemp(Ity_F32);
   8289       modrm = getIByte(delta+2);
   8290       if (epartIsReg(modrm)) {
   8291          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   8292          delta += 2+1;
   8293          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8294                                   nameXMMReg(gregOfRM(modrm)) );
   8295       } else {
   8296          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8297 	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   8298          delta += 2+alen;
   8299          DIP("[u]comiss %s,%s\n", dis_buf,
   8300                                   nameXMMReg(gregOfRM(modrm)) );
   8301       }
   8302       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   8303 
   8304       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   8305       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   8306       stmt( IRStmt_Put(
   8307                OFFB_CC_DEP1,
   8308                binop( Iop_And32,
   8309                       binop(Iop_CmpF64,
   8310                             unop(Iop_F32toF64,mkexpr(argL)),
   8311                             unop(Iop_F32toF64,mkexpr(argR))),
   8312                       mkU32(0x45)
   8313           )));
   8314       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8315          elimination of previous stores to this field work better. */
   8316       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   8317       goto decode_success;
   8318    }
   8319 
   8320    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   8321       half xmm */
   8322    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
   8323       IRTemp arg64 = newTemp(Ity_I64);
   8324       IRTemp rmode = newTemp(Ity_I32);
   8325       vassert(sz == 4);
   8326 
   8327       modrm = getIByte(delta+2);
   8328       do_MMX_preamble();
   8329       if (epartIsReg(modrm)) {
   8330          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   8331          delta += 2+1;
   8332          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8333                                  nameXMMReg(gregOfRM(modrm)));
   8334       } else {
   8335          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8336 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   8337          delta += 2+alen;
   8338          DIP("cvtpi2ps %s,%s\n", dis_buf,
   8339                                  nameXMMReg(gregOfRM(modrm)) );
   8340       }
   8341 
   8342       assign( rmode, get_sse_roundingmode() );
   8343 
   8344       putXMMRegLane32F(
   8345          gregOfRM(modrm), 0,
   8346          binop(Iop_F64toF32,
   8347                mkexpr(rmode),
   8348                unop(Iop_I32StoF64,
   8349                     unop(Iop_64to32, mkexpr(arg64)) )) );
   8350 
   8351       putXMMRegLane32F(
   8352          gregOfRM(modrm), 1,
   8353          binop(Iop_F64toF32,
   8354                mkexpr(rmode),
   8355                unop(Iop_I32StoF64,
   8356                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
   8357 
   8358       goto decode_success;
   8359    }
   8360 
   8361    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
   8362       quarter xmm */
   8363    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
   8364       IRTemp arg32 = newTemp(Ity_I32);
   8365       IRTemp rmode = newTemp(Ity_I32);
   8366       vassert(sz == 4);
   8367 
   8368       modrm = getIByte(delta+3);
   8369       if (epartIsReg(modrm)) {
   8370          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   8371          delta += 3+1;
   8372          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   8373                                  nameXMMReg(gregOfRM(modrm)));
   8374       } else {
   8375          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8376 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   8377          delta += 3+alen;
   8378          DIP("cvtsi2ss %s,%s\n", dis_buf,
   8379                                  nameXMMReg(gregOfRM(modrm)) );
   8380       }
   8381 
   8382       assign( rmode, get_sse_roundingmode() );
   8383 
   8384       putXMMRegLane32F(
   8385          gregOfRM(modrm), 0,
   8386          binop(Iop_F64toF32,
   8387                mkexpr(rmode),
   8388                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   8389 
   8390       goto decode_success;
   8391    }
   8392 
   8393    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8394       I32 in mmx, according to prevailing SSE rounding mode */
   8395    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8396       I32 in mmx, rounding towards zero */
   8397    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   8398       IRTemp dst64  = newTemp(Ity_I64);
   8399       IRTemp rmode  = newTemp(Ity_I32);
   8400       IRTemp f32lo  = newTemp(Ity_F32);
   8401       IRTemp f32hi  = newTemp(Ity_F32);
   8402       Bool   r2zero = toBool(insn[1] == 0x2C);
   8403 
   8404       do_MMX_preamble();
   8405       modrm = getIByte(delta+2);
   8406 
   8407       if (epartIsReg(modrm)) {
   8408          delta += 2+1;
   8409 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8410 	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
   8411          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8412                                    nameXMMReg(eregOfRM(modrm)),
   8413                                    nameMMXReg(gregOfRM(modrm)));
   8414       } else {
   8415          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8416 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8417 	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
   8418                                               mkexpr(addr),
   8419                                               mkU32(4) )));
   8420          delta += 2+alen;
   8421          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8422                                    dis_buf,
   8423                                    nameMMXReg(gregOfRM(modrm)));
   8424       }
   8425 
   8426       if (r2zero) {
   8427          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   8428       } else {
   8429          assign( rmode, get_sse_roundingmode() );
   8430       }
   8431 
   8432       assign(
   8433          dst64,
   8434          binop( Iop_32HLto64,
   8435                 binop( Iop_F64toI32S,
   8436                        mkexpr(rmode),
   8437                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   8438                 binop( Iop_F64toI32S,
   8439                        mkexpr(rmode),
   8440                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8441               )
   8442       );
   8443 
   8444       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   8445       goto decode_success;
   8446    }
   8447 
   8448    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
   8449       I32 in ireg, according to prevailing SSE rounding mode */
   8450    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
   8451       I32 in ireg, rounding towards zero */
   8452    if (insn[0] == 0xF3 && insn[1] == 0x0F
   8453        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   8454       IRTemp rmode = newTemp(Ity_I32);
   8455       IRTemp f32lo = newTemp(Ity_F32);
   8456       Bool   r2zero = toBool(insn[2] == 0x2C);
   8457       vassert(sz == 4);
   8458 
   8459       modrm = getIByte(delta+3);
   8460       if (epartIsReg(modrm)) {
   8461          delta += 3+1;
   8462 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8463          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8464                                    nameXMMReg(eregOfRM(modrm)),
   8465                                    nameIReg(4, gregOfRM(modrm)));
   8466       } else {
   8467          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8468 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8469          delta += 3+alen;
   8470          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8471                                    dis_buf,
   8472                                    nameIReg(4, gregOfRM(modrm)));
   8473       }
   8474 
   8475       if (r2zero) {
   8476          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   8477       } else {
   8478          assign( rmode, get_sse_roundingmode() );
   8479       }
   8480 
   8481       putIReg(4, gregOfRM(modrm),
   8482                  binop( Iop_F64toI32S,
   8483                         mkexpr(rmode),
   8484                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8485       );
   8486 
   8487       goto decode_success;
   8488    }
   8489 
   8490    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   8491    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
   8492       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
   8493       goto decode_success;
   8494    }
   8495 
   8496    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   8497    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
   8498       vassert(sz == 4);
   8499       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
   8500       goto decode_success;
   8501    }
   8502 
   8503    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   8504    if (insn[0] == 0x0F && insn[1] == 0xAE
   8505        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
   8506 
   8507       IRTemp t64 = newTemp(Ity_I64);
   8508       IRTemp ew = newTemp(Ity_I32);
   8509 
   8510       modrm = getIByte(delta+2);
   8511       vassert(!epartIsReg(modrm));
   8512       vassert(sz == 4);
   8513 
   8514       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8515       delta += 2+alen;
   8516       DIP("ldmxcsr %s\n", dis_buf);
   8517 
   8518       /* The only thing we observe in %mxcsr is the rounding mode.
   8519          Therefore, pass the 32-bit value (SSE native-format control
   8520          word) to a clean helper, getting back a 64-bit value, the
   8521          lower half of which is the SSEROUND value to store, and the
   8522          upper half of which is the emulation-warning token which may
   8523          be generated.
   8524       */
   8525       /* ULong x86h_check_ldmxcsr ( UInt ); */
   8526       assign( t64, mkIRExprCCall(
   8527                       Ity_I64, 0/*regparms*/,
   8528                       "x86g_check_ldmxcsr",
   8529                       &x86g_check_ldmxcsr,
   8530                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
   8531                    )
   8532             );
   8533 
   8534       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   8535       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   8536       put_emwarn( mkexpr(ew) );
   8537       /* Finally, if an emulation warning was reported, side-exit to
   8538          the next insn, reporting the warning, so that Valgrind's
   8539          dispatcher sees the warning. */
   8540       stmt(
   8541          IRStmt_Exit(
   8542             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   8543             Ijk_EmWarn,
   8544             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   8545          )
   8546       );
   8547       goto decode_success;
   8548    }
   8549 
   8550    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8551    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   8552    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
   8553       Bool ok = False;
   8554       delta = dis_MMX( &ok, sorb, sz, delta+1 );
   8555       if (!ok)
   8556          goto decode_failure;
   8557       goto decode_success;
   8558    }
   8559 
   8560    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   8561    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
   8562       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
   8563       goto decode_success;
   8564    }
   8565 
   8566    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   8567    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
   8568       vassert(sz == 4);
   8569       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
   8570       goto decode_success;
   8571    }
   8572 
   8573    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   8574    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
   8575       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
   8576       goto decode_success;
   8577    }
   8578 
   8579    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   8580    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
   8581       vassert(sz == 4);
   8582       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
   8583       goto decode_success;
   8584    }
   8585 
   8586    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   8587    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   8588    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
   8589       modrm = getIByte(delta+2);
   8590       if (epartIsReg(modrm)) {
   8591          putXMMReg( gregOfRM(modrm),
   8592                     getXMMReg( eregOfRM(modrm) ));
   8593          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8594                                   nameXMMReg(gregOfRM(modrm)));
   8595          delta += 2+1;
   8596       } else {
   8597          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8598          if (insn[1] == 0x28/*movaps*/)
   8599             gen_SEGV_if_not_16_aligned( addr );
   8600          putXMMReg( gregOfRM(modrm),
   8601                     loadLE(Ity_V128, mkexpr(addr)) );
   8602          DIP("mov[ua]ps %s,%s\n", dis_buf,
   8603                                   nameXMMReg(gregOfRM(modrm)));
   8604          delta += 2+alen;
   8605       }
   8606       goto decode_success;
   8607    }
   8608 
   8609    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   8610    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   8611    if (sz == 4 && insn[0] == 0x0F
   8612        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   8613       modrm = getIByte(delta+2);
   8614       if (epartIsReg(modrm)) {
   8615          /* fall through; awaiting test case */
   8616       } else {
   8617          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8618          if (insn[1] == 0x29/*movaps*/)
   8619             gen_SEGV_if_not_16_aligned( addr );
   8620          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   8621          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   8622                                   dis_buf );
   8623          delta += 2+alen;
   8624          goto decode_success;
   8625       }
   8626    }
   8627 
   8628    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   8629    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   8630    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
   8631       modrm = getIByte(delta+2);
   8632       if (epartIsReg(modrm)) {
   8633          delta += 2+1;
   8634          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   8635                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
   8636          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8637                                nameXMMReg(gregOfRM(modrm)));
   8638       } else {
   8639          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8640          delta += 2+alen;
   8641          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   8642                           loadLE(Ity_I64, mkexpr(addr)) );
   8643          DIP("movhps %s,%s\n", dis_buf,
   8644                                nameXMMReg( gregOfRM(modrm) ));
   8645       }
   8646       goto decode_success;
   8647    }
   8648 
   8649    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   8650    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
   8651       if (!epartIsReg(insn[2])) {
   8652          delta += 2;
   8653          addr = disAMode ( &alen, sorb, delta, dis_buf );
   8654          delta += alen;
   8655          storeLE( mkexpr(addr),
   8656                   getXMMRegLane64( gregOfRM(insn[2]),
   8657                                    1/*upper lane*/ ) );
   8658          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   8659                                dis_buf);
   8660          goto decode_success;
   8661       }
   8662       /* else fall through */
   8663    }
   8664 
   8665    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   8666    /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   8667    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
   8668       modrm = getIByte(delta+2);
   8669       if (epartIsReg(modrm)) {
   8670          delta += 2+1;
   8671          putXMMRegLane64( gregOfRM(modrm),
   8672                           0/*lower lane*/,
   8673                           getXMMRegLane64( eregOfRM(modrm), 1 ));
   8674          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
   8675                                  nameXMMReg(gregOfRM(modrm)));
   8676       } else {
   8677          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8678          delta += 2+alen;
   8679          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   8680                           loadLE(Ity_I64, mkexpr(addr)) );
   8681          DIP("movlps %s, %s\n",
   8682              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   8683       }
   8684       goto decode_success;
   8685    }
   8686 
   8687    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   8688    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
   8689       if (!epartIsReg(insn[2])) {
   8690          delta += 2;
   8691          addr = disAMode ( &alen, sorb, delta, dis_buf );
   8692          delta += alen;
   8693          storeLE( mkexpr(addr),
   8694                   getXMMRegLane64( gregOfRM(insn[2]),
   8695                                    0/*lower lane*/ ) );
   8696          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   8697                                 dis_buf);
   8698          goto decode_success;
   8699       }
   8700       /* else fall through */
   8701    }
   8702 
   8703    /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   8704       to 4 lowest bits of ireg(G) */
   8705    if (insn[0] == 0x0F && insn[1] == 0x50) {
   8706       modrm = getIByte(delta+2);
   8707       if (sz == 4 && epartIsReg(modrm)) {
   8708          Int src;
   8709          t0 = newTemp(Ity_I32);
   8710          t1 = newTemp(Ity_I32);
   8711          t2 = newTemp(Ity_I32);
   8712          t3 = newTemp(Ity_I32);
   8713          delta += 2+1;
   8714          src = eregOfRM(modrm);
   8715          assign( t0, binop( Iop_And32,
   8716                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
   8717                             mkU32(1) ));
   8718          assign( t1, binop( Iop_And32,
   8719                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
   8720                             mkU32(2) ));
   8721          assign( t2, binop( Iop_And32,
   8722                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
   8723                             mkU32(4) ));
   8724          assign( t3, binop( Iop_And32,
   8725                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
   8726                             mkU32(8) ));
   8727          putIReg(4, gregOfRM(modrm),
   8728                     binop(Iop_Or32,
   8729                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   8730                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
   8731                          )
   8732                  );
   8733          DIP("movmskps %s,%s\n", nameXMMReg(src),
   8734                                  nameIReg(4, gregOfRM(modrm)));
   8735          goto decode_success;
   8736       }
   8737       /* else fall through */
   8738    }
   8739 
   8740    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   8741    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   8742    if (insn[0] == 0x0F && insn[1] == 0x2B) {
   8743       modrm = getIByte(delta+2);
   8744       if (!epartIsReg(modrm)) {
   8745          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8746          gen_SEGV_if_not_16_aligned( addr );
   8747          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   8748          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   8749                                  dis_buf,
   8750                                  nameXMMReg(gregOfRM(modrm)));
   8751          delta += 2+alen;
   8752          goto decode_success;
   8753       }
   8754       /* else fall through */
   8755    }
   8756 
   8757    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8758    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   8759       Intel manual does not say anything about the usual business of
   8760       the FP reg tags getting trashed whenever an MMX insn happens.
   8761       So we just leave them alone.
   8762    */
   8763    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   8764       modrm = getIByte(delta+2);
   8765       if (sz == 4 && !epartIsReg(modrm)) {
   8766          /* do_MMX_preamble(); Intel docs don't specify this */
   8767          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8768          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   8769          DIP("movntq %s,%s\n", dis_buf,
   8770                                nameMMXReg(gregOfRM(modrm)));
   8771          delta += 2+alen;
   8772          goto decode_success;
   8773       }
   8774       /* else fall through */
   8775    }
   8776 
   8777    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   8778       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   8779    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
   8780       vassert(sz == 4);
   8781       modrm = getIByte(delta+3);
   8782       if (epartIsReg(modrm)) {
   8783          putXMMRegLane32( gregOfRM(modrm), 0,
   8784                           getXMMRegLane32( eregOfRM(modrm), 0 ));
   8785          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8786                               nameXMMReg(gregOfRM(modrm)));
   8787          delta += 3+1;
   8788       } else {
   8789          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8790          /* zero bits 127:64 */
   8791          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   8792          /* zero bits 63:32 */
   8793          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
   8794          /* write bits 31:0 */
   8795          putXMMRegLane32( gregOfRM(modrm), 0,
   8796                           loadLE(Ity_I32, mkexpr(addr)) );
   8797          DIP("movss %s,%s\n", dis_buf,
   8798                               nameXMMReg(gregOfRM(modrm)));
   8799          delta += 3+alen;
   8800       }
   8801       goto decode_success;
   8802    }
   8803 
   8804    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   8805       or lo 1/4 xmm). */
   8806    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
   8807       vassert(sz == 4);
   8808       modrm = getIByte(delta+3);
   8809       if (epartIsReg(modrm)) {
   8810          /* fall through, we don't yet have a test case */
   8811       } else {
   8812          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8813          storeLE( mkexpr(addr),
   8814                   getXMMRegLane32(gregOfRM(modrm), 0) );
   8815          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   8816                               dis_buf);
   8817          delta += 3+alen;
   8818          goto decode_success;
   8819       }
   8820    }
   8821 
   8822    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   8823    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
   8824       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
   8825       goto decode_success;
   8826    }
   8827 
   8828    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   8829    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
   8830       vassert(sz == 4);
   8831       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
   8832       goto decode_success;
   8833    }
   8834 
   8835    /* 0F 56 = ORPS -- G = G or E */
   8836    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
   8837       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
   8838       goto decode_success;
   8839    }
   8840 
   8841    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8842    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   8843    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
   8844       do_MMX_preamble();
   8845       delta = dis_MMXop_regmem_to_reg (
   8846                 sorb, delta+2, insn[1], "pavgb", False );
   8847       goto decode_success;
   8848    }
   8849 
   8850    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8851    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   8852    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
   8853       do_MMX_preamble();
   8854       delta = dis_MMXop_regmem_to_reg (
   8855                 sorb, delta+2, insn[1], "pavgw", False );
   8856       goto decode_success;
   8857    }
   8858 
   8859    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8860    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   8861       zero-extend of it in ireg(G). */
   8862    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   8863       modrm = insn[2];
   8864       if (sz == 4 && epartIsReg(modrm)) {
   8865          IRTemp sV = newTemp(Ity_I64);
   8866          t5 = newTemp(Ity_I16);
   8867          do_MMX_preamble();
   8868          assign(sV, getMMXReg(eregOfRM(modrm)));
   8869          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   8870          switch (insn[3] & 3) {
   8871             case 0:  assign(t5, mkexpr(t0)); break;
   8872             case 1:  assign(t5, mkexpr(t1)); break;
   8873             case 2:  assign(t5, mkexpr(t2)); break;
   8874             case 3:  assign(t5, mkexpr(t3)); break;
   8875             default: vassert(0); /*NOTREACHED*/
   8876          }
   8877          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
   8878          DIP("pextrw $%d,%s,%s\n",
   8879              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
   8880                            nameIReg(4,gregOfRM(modrm)));
   8881          delta += 4;
   8882          goto decode_success;
   8883       }
   8884       /* else fall through */
   8885    }
   8886 
   8887    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8888    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   8889       put it into the specified lane of mmx(G). */
   8890    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
   8891       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   8892          mmx reg.  t4 is the new lane value.  t5 is the original
   8893          mmx value. t6 is the new mmx value. */
   8894       Int lane;
   8895       t4 = newTemp(Ity_I16);
   8896       t5 = newTemp(Ity_I64);
   8897       t6 = newTemp(Ity_I64);
   8898       modrm = insn[2];
   8899       do_MMX_preamble();
   8900 
   8901       assign(t5, getMMXReg(gregOfRM(modrm)));
   8902       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   8903 
   8904       if (epartIsReg(modrm)) {
   8905          assign(t4, getIReg(2, eregOfRM(modrm)));
   8906          delta += 3+1;
   8907          lane = insn[3+1-1];
   8908          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8909                                    nameIReg(2,eregOfRM(modrm)),
   8910                                    nameMMXReg(gregOfRM(modrm)));
   8911       } else {
   8912          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8913          delta += 3+alen;
   8914          lane = insn[3+alen-1];
   8915          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   8916          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8917                                    dis_buf,
   8918                                    nameMMXReg(gregOfRM(modrm)));
   8919       }
   8920 
   8921       switch (lane & 3) {
   8922          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   8923          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   8924          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   8925          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   8926          default: vassert(0); /*NOTREACHED*/
   8927       }
   8928       putMMXReg(gregOfRM(modrm), mkexpr(t6));
   8929       goto decode_success;
   8930    }
   8931 
   8932    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8933    /* 0F EE = PMAXSW -- 16x4 signed max */
   8934    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
   8935       do_MMX_preamble();
   8936       delta = dis_MMXop_regmem_to_reg (
   8937                 sorb, delta+2, insn[1], "pmaxsw", False );
   8938       goto decode_success;
   8939    }
   8940 
   8941    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8942    /* 0F DE = PMAXUB -- 8x8 unsigned max */
   8943    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
   8944       do_MMX_preamble();
   8945       delta = dis_MMXop_regmem_to_reg (
   8946                 sorb, delta+2, insn[1], "pmaxub", False );
   8947       goto decode_success;
   8948    }
   8949 
   8950    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8951    /* 0F EA = PMINSW -- 16x4 signed min */
   8952    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
   8953       do_MMX_preamble();
   8954       delta = dis_MMXop_regmem_to_reg (
   8955                 sorb, delta+2, insn[1], "pminsw", False );
   8956       goto decode_success;
   8957    }
   8958 
   8959    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8960    /* 0F DA = PMINUB -- 8x8 unsigned min */
   8961    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
   8962       do_MMX_preamble();
   8963       delta = dis_MMXop_regmem_to_reg (
   8964                 sorb, delta+2, insn[1], "pminub", False );
   8965       goto decode_success;
   8966    }
   8967 
   8968    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8969    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   8970       mmx(E), turn them into a byte, and put zero-extend of it in
   8971       ireg(G). */
   8972    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
   8973       modrm = insn[2];
   8974       if (epartIsReg(modrm)) {
   8975          do_MMX_preamble();
   8976          t0 = newTemp(Ity_I64);
   8977          t1 = newTemp(Ity_I32);
   8978          assign(t0, getMMXReg(eregOfRM(modrm)));
   8979          assign(t1, mkIRExprCCall(
   8980                        Ity_I32, 0/*regparms*/,
   8981                        "x86g_calculate_mmx_pmovmskb",
   8982                        &x86g_calculate_mmx_pmovmskb,
   8983                        mkIRExprVec_1(mkexpr(t0))));
   8984          putIReg(4, gregOfRM(modrm), mkexpr(t1));
   8985          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8986                                  nameIReg(4,gregOfRM(modrm)));
   8987          delta += 3;
   8988          goto decode_success;
   8989       }
   8990       /* else fall through */
   8991    }
   8992 
   8993    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8994    /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   8995    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
   8996       do_MMX_preamble();
   8997       delta = dis_MMXop_regmem_to_reg (
   8998                 sorb, delta+2, insn[1], "pmuluh", False );
   8999       goto decode_success;
   9000    }
   9001 
   9002    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   9003    /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   9004    /* 0F 18 /2 = PREFETCHT1 */
   9005    /* 0F 18 /3 = PREFETCHT2 */
   9006    if (insn[0] == 0x0F && insn[1] == 0x18
   9007        && !epartIsReg(insn[2])
   9008        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
   9009       HChar* hintstr = "??";
   9010 
   9011       modrm = getIByte(delta+2);
   9012       vassert(!epartIsReg(modrm));
   9013 
   9014       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9015       delta += 2+alen;
   9016 
   9017       switch (gregOfRM(modrm)) {
   9018          case 0: hintstr = "nta"; break;
   9019          case 1: hintstr = "t0"; break;
   9020          case 2: hintstr = "t1"; break;
   9021          case 3: hintstr = "t2"; break;
   9022          default: vassert(0); /*NOTREACHED*/
   9023       }
   9024 
   9025       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9026       goto decode_success;
   9027    }
   9028 
   9029    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   9030    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   9031    if (insn[0] == 0x0F && insn[1] == 0x0D
   9032        && !epartIsReg(insn[2])
   9033        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
   9034       HChar* hintstr = "??";
   9035 
   9036       modrm = getIByte(delta+2);
   9037       vassert(!epartIsReg(modrm));
   9038 
   9039       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9040       delta += 2+alen;
   9041 
   9042       switch (gregOfRM(modrm)) {
   9043          case 0: hintstr = ""; break;
   9044          case 1: hintstr = "w"; break;
   9045          default: vassert(0); /*NOTREACHED*/
   9046       }
   9047 
   9048       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9049       goto decode_success;
   9050    }
   9051 
   9052    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9053    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   9054    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
   9055       do_MMX_preamble();
   9056       delta = dis_MMXop_regmem_to_reg (
   9057                  sorb, delta+2, insn[1], "psadbw", False );
   9058       goto decode_success;
   9059    }
   9060 
   9061    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9062    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   9063    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
   9064       Int order;
   9065       IRTemp sV, dV, s3, s2, s1, s0;
   9066       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   9067       sV = newTemp(Ity_I64);
   9068       dV = newTemp(Ity_I64);
   9069       do_MMX_preamble();
   9070       modrm = insn[2];
   9071       if (epartIsReg(modrm)) {
   9072          assign( sV, getMMXReg(eregOfRM(modrm)) );
   9073          order = (Int)insn[3];
   9074          delta += 2+2;
   9075          DIP("pshufw $%d,%s,%s\n", order,
   9076                                    nameMMXReg(eregOfRM(modrm)),
   9077                                    nameMMXReg(gregOfRM(modrm)));
   9078       } else {
   9079          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9080          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   9081 	 order = (Int)insn[2+alen];
   9082          delta += 3+alen;
   9083          DIP("pshufw $%d,%s,%s\n", order,
   9084                                    dis_buf,
   9085                                    nameMMXReg(gregOfRM(modrm)));
   9086       }
   9087       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   9088 
   9089 #     define SEL(n) \
   9090                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9091       assign(dV,
   9092 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   9093                           SEL((order>>2)&3), SEL((order>>0)&3) )
   9094       );
   9095       putMMXReg(gregOfRM(modrm), mkexpr(dV));
   9096 #     undef SEL
   9097       goto decode_success;
   9098    }
   9099 
   9100    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   9101    if (insn[0] == 0x0F && insn[1] == 0x53) {
   9102       vassert(sz == 4);
   9103       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9104                                         "rcpps", Iop_Recip32Fx4 );
   9105       goto decode_success;
   9106    }
   9107 
   9108    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   9109    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
   9110       vassert(sz == 4);
   9111       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9112                                          "rcpss", Iop_Recip32F0x4 );
   9113       goto decode_success;
   9114    }
   9115 
   9116    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   9117    if (insn[0] == 0x0F && insn[1] == 0x52) {
   9118       vassert(sz == 4);
   9119       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9120                                         "rsqrtps", Iop_RSqrt32Fx4 );
   9121       goto decode_success;
   9122    }
   9123 
   9124    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   9125    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
   9126       vassert(sz == 4);
   9127       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9128                                          "rsqrtss", Iop_RSqrt32F0x4 );
   9129       goto decode_success;
   9130    }
   9131 
   9132    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   9133    if (insn[0] == 0x0F && insn[1] == 0xAE
   9134        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   9135       vassert(sz == 4);
   9136       delta += 3;
   9137       /* Insert a memory fence.  It's sometimes important that these
   9138          are carried through to the generated code. */
   9139       stmt( IRStmt_MBE(Imbe_Fence) );
   9140       DIP("sfence\n");
   9141       goto decode_success;
   9142    }
   9143 
   9144    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   9145    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
   9146       Int    select;
   9147       IRTemp sV, dV;
   9148       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9149       sV = newTemp(Ity_V128);
   9150       dV = newTemp(Ity_V128);
   9151       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9152       modrm = insn[2];
   9153       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9154 
   9155       if (epartIsReg(modrm)) {
   9156          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9157          select = (Int)insn[3];
   9158          delta += 2+2;
   9159          DIP("shufps $%d,%s,%s\n", select,
   9160                                    nameXMMReg(eregOfRM(modrm)),
   9161                                    nameXMMReg(gregOfRM(modrm)));
   9162       } else {
   9163          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9164          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9165          select = (Int)insn[2+alen];
   9166          delta += 3+alen;
   9167          DIP("shufps $%d,%s,%s\n", select,
   9168                                    dis_buf,
   9169                                    nameXMMReg(gregOfRM(modrm)));
   9170       }
   9171 
   9172       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9173       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9174 
   9175 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   9176 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9177 
   9178       putXMMReg(
   9179          gregOfRM(modrm),
   9180          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
   9181                        SELD((select>>2)&3), SELD((select>>0)&3) )
   9182       );
   9183 
   9184 #     undef SELD
   9185 #     undef SELS
   9186 
   9187       goto decode_success;
   9188    }
   9189 
   9190    /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
   9191    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
   9192       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9193                                         "sqrtps", Iop_Sqrt32Fx4 );
   9194       goto decode_success;
   9195    }
   9196 
   9197    /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
   9198    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
   9199       vassert(sz == 4);
   9200       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9201                                          "sqrtss", Iop_Sqrt32F0x4 );
   9202       goto decode_success;
   9203    }
   9204 
   9205    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   9206    if (insn[0] == 0x0F && insn[1] == 0xAE
   9207        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
   9208       modrm = getIByte(delta+2);
   9209       vassert(sz == 4);
   9210       vassert(!epartIsReg(modrm));
   9211 
   9212       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9213       delta += 2+alen;
   9214 
   9215       /* Fake up a native SSE mxcsr word.  The only thing it depends
   9216          on is SSEROUND[1:0], so call a clean helper to cook it up.
   9217       */
   9218       /* UInt x86g_create_mxcsr ( UInt sseround ) */
   9219       DIP("stmxcsr %s\n", dis_buf);
   9220       storeLE( mkexpr(addr),
   9221                mkIRExprCCall(
   9222                   Ity_I32, 0/*regp*/,
   9223                   "x86g_create_mxcsr", &x86g_create_mxcsr,
   9224                   mkIRExprVec_1( get_sse_roundingmode() )
   9225                )
   9226              );
   9227       goto decode_success;
   9228    }
   9229 
   9230    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   9231    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
   9232       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
   9233       goto decode_success;
   9234    }
   9235 
   9236    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   9237    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
   9238       vassert(sz == 4);
   9239       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
   9240       goto decode_success;
   9241    }
   9242 
   9243    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   9244    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   9245    /* These just appear to be special cases of SHUFPS */
   9246    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   9247       IRTemp sV, dV;
   9248       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9249       Bool hi = toBool(insn[1] == 0x15);
   9250       sV = newTemp(Ity_V128);
   9251       dV = newTemp(Ity_V128);
   9252       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9253       modrm = insn[2];
   9254       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9255 
   9256       if (epartIsReg(modrm)) {
   9257          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9258          delta += 2+1;
   9259          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9260                                   nameXMMReg(eregOfRM(modrm)),
   9261                                   nameXMMReg(gregOfRM(modrm)));
   9262       } else {
   9263          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9264          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9265          delta += 2+alen;
   9266          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9267                                   dis_buf,
   9268                                   nameXMMReg(gregOfRM(modrm)));
   9269       }
   9270 
   9271       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9272       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9273 
   9274       if (hi) {
   9275          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
   9276       } else {
   9277          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
   9278       }
   9279 
   9280       goto decode_success;
   9281    }
   9282 
   9283    /* 0F 57 = XORPS -- G = G xor E */
   9284    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
   9285       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
   9286       goto decode_success;
   9287    }
   9288 
   9289    /* ---------------------------------------------------- */
   9290    /* --- end of the SSE decoder.                      --- */
   9291    /* ---------------------------------------------------- */
   9292 
   9293    /* ---------------------------------------------------- */
   9294    /* --- start of the SSE2 decoder.                   --- */
   9295    /* ---------------------------------------------------- */
   9296 
   9297    /* Skip parts of the decoder which don't apply given the stated
   9298       guest subarchitecture. */
   9299    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   9300       goto after_sse_decoders; /* no SSE2 capabilities */
   9301 
   9302    insn = (UChar*)&guest_code[delta];
   9303 
   9304    /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   9305    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
   9306       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
   9307       goto decode_success;
   9308    }
   9309 
   9310    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   9311    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
   9312       vassert(sz == 4);
   9313       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
   9314       goto decode_success;
   9315    }
   9316 
   9317    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   9318    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
   9319       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
   9320       goto decode_success;
   9321    }
   9322 
   9323    /* 66 0F 54 = ANDPD -- G = G and E */
   9324    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
   9325       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
   9326       goto decode_success;
   9327    }
   9328 
   9329    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   9330    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
   9331       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
   9332       goto decode_success;
   9333    }
   9334 
   9335    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   9336    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
   9337       vassert(sz == 4);
   9338       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
   9339       goto decode_success;
   9340    }
   9341 
   9342    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   9343    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   9344    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   9345       IRTemp argL = newTemp(Ity_F64);
   9346       IRTemp argR = newTemp(Ity_F64);
   9347       modrm = getIByte(delta+2);
   9348       if (epartIsReg(modrm)) {
   9349          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   9350          delta += 2+1;
   9351          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9352                                   nameXMMReg(gregOfRM(modrm)) );
   9353       } else {
   9354          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9355 	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   9356          delta += 2+alen;
   9357          DIP("[u]comisd %s,%s\n", dis_buf,
   9358                                   nameXMMReg(gregOfRM(modrm)) );
   9359       }
   9360       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   9361 
   9362       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   9363       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   9364       stmt( IRStmt_Put(
   9365                OFFB_CC_DEP1,
   9366                binop( Iop_And32,
   9367                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
   9368                       mkU32(0x45)
   9369           )));
   9370       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   9371          elimination of previous stores to this field work better. */
   9372       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   9373       goto decode_success;
   9374    }
   9375 
   9376    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   9377       F64 in xmm(G) */
   9378    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9379       IRTemp arg64 = newTemp(Ity_I64);
   9380       vassert(sz == 4);
   9381 
   9382       modrm = getIByte(delta+3);
   9383       if (epartIsReg(modrm)) {
   9384          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
   9385          delta += 3+1;
   9386          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9387                                  nameXMMReg(gregOfRM(modrm)));
   9388       } else {
   9389          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9390 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9391          delta += 3+alen;
   9392          DIP("cvtdq2pd %s,%s\n", dis_buf,
   9393                                  nameXMMReg(gregOfRM(modrm)) );
   9394       }
   9395 
   9396       putXMMRegLane64F(
   9397          gregOfRM(modrm), 0,
   9398          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   9399       );
   9400 
   9401       putXMMRegLane64F(
   9402          gregOfRM(modrm), 1,
   9403          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   9404       );
   9405 
   9406       goto decode_success;
   9407    }
   9408 
   9409    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   9410       xmm(G) */
   9411    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9412       IRTemp argV  = newTemp(Ity_V128);
   9413       IRTemp rmode = newTemp(Ity_I32);
   9414 
   9415       modrm = getIByte(delta+2);
   9416       if (epartIsReg(modrm)) {
   9417          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9418          delta += 2+1;
   9419          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9420                                  nameXMMReg(gregOfRM(modrm)));
   9421       } else {
   9422          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9423 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9424          delta += 2+alen;
   9425          DIP("cvtdq2ps %s,%s\n", dis_buf,
   9426                                  nameXMMReg(gregOfRM(modrm)) );
   9427       }
   9428 
   9429       assign( rmode, get_sse_roundingmode() );
   9430       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9431 
   9432 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9433                              mkexpr(rmode),                   \
   9434                              unop(Iop_I32StoF64,mkexpr(_t)))
   9435 
   9436       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
   9437       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
   9438       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9439       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9440 
   9441 #     undef CVT
   9442 
   9443       goto decode_success;
   9444    }
   9445 
   9446    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9447       lo half xmm(G), and zero upper half */
   9448    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9449       IRTemp argV  = newTemp(Ity_V128);
   9450       IRTemp rmode = newTemp(Ity_I32);
   9451       vassert(sz == 4);
   9452 
   9453       modrm = getIByte(delta+3);
   9454       if (epartIsReg(modrm)) {
   9455          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9456          delta += 3+1;
   9457          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9458                                  nameXMMReg(gregOfRM(modrm)));
   9459       } else {
   9460          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9461 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9462          delta += 3+alen;
   9463          DIP("cvtpd2dq %s,%s\n", dis_buf,
   9464                                  nameXMMReg(gregOfRM(modrm)) );
   9465       }
   9466 
   9467       assign( rmode, get_sse_roundingmode() );
   9468       t0 = newTemp(Ity_F64);
   9469       t1 = newTemp(Ity_F64);
   9470       assign( t0, unop(Iop_ReinterpI64asF64,
   9471                        unop(Iop_V128to64, mkexpr(argV))) );
   9472       assign( t1, unop(Iop_ReinterpI64asF64,
   9473                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9474 
   9475 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9476                              mkexpr(rmode),                   \
   9477                              mkexpr(_t) )
   9478 
   9479       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9480       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9481       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9482       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9483 
   9484 #     undef CVT
   9485 
   9486       goto decode_success;
   9487    }
   9488 
   9489    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9490       I32 in mmx, according to prevailing SSE rounding mode */
   9491    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9492       I32 in mmx, rounding towards zero */
   9493    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   9494       IRTemp dst64  = newTemp(Ity_I64);
   9495       IRTemp rmode  = newTemp(Ity_I32);
   9496       IRTemp f64lo  = newTemp(Ity_F64);
   9497       IRTemp f64hi  = newTemp(Ity_F64);
   9498       Bool   r2zero = toBool(insn[1] == 0x2C);
   9499 
   9500       do_MMX_preamble();
   9501       modrm = getIByte(delta+2);
   9502 
   9503       if (epartIsReg(modrm)) {
   9504          delta += 2+1;
   9505 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9506 	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
   9507          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9508                                    nameXMMReg(eregOfRM(modrm)),
   9509                                    nameMMXReg(gregOfRM(modrm)));
   9510       } else {
   9511          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9512 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9513 	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
   9514                                               mkexpr(addr),
   9515                                               mkU32(8) )));
   9516          delta += 2+alen;
   9517          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   9518                                    dis_buf,
   9519                                    nameMMXReg(gregOfRM(modrm)));
   9520       }
   9521 
   9522       if (r2zero) {
   9523          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   9524       } else {
   9525          assign( rmode, get_sse_roundingmode() );
   9526       }
   9527 
   9528       assign(
   9529          dst64,
   9530          binop( Iop_32HLto64,
   9531                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   9532                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   9533               )
   9534       );
   9535 
   9536       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   9537       goto decode_success;
   9538    }
   9539 
   9540    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   9541       lo half xmm(G), and zero upper half */
   9542    /* Note, this is practically identical to CVTPD2DQ.  It would have
   9543       been nicer to merge them together, but the insn[] offsets differ
   9544       by one. */
   9545    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9546       IRTemp argV  = newTemp(Ity_V128);
   9547       IRTemp rmode = newTemp(Ity_I32);
   9548 
   9549       modrm = getIByte(delta+2);
   9550       if (epartIsReg(modrm)) {
   9551          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9552          delta += 2+1;
   9553          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9554                                  nameXMMReg(gregOfRM(modrm)));
   9555       } else {
   9556          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9557 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9558          delta += 2+alen;
   9559          DIP("cvtpd2ps %s,%s\n", dis_buf,
   9560                                  nameXMMReg(gregOfRM(modrm)) );
   9561       }
   9562 
   9563       assign( rmode, get_sse_roundingmode() );
   9564       t0 = newTemp(Ity_F64);
   9565       t1 = newTemp(Ity_F64);
   9566       assign( t0, unop(Iop_ReinterpI64asF64,
   9567                        unop(Iop_V128to64, mkexpr(argV))) );
   9568       assign( t1, unop(Iop_ReinterpI64asF64,
   9569                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9570 
   9571 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9572                              mkexpr(rmode),                   \
   9573                              mkexpr(_t) )
   9574 
   9575       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
   9576       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
   9577       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9578       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9579 
   9580 #     undef CVT
   9581 
   9582       goto decode_success;
   9583    }
   9584 
   9585    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   9586       xmm(G) */
   9587    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
   9588       IRTemp arg64 = newTemp(Ity_I64);
   9589 
   9590       modrm = getIByte(delta+2);
   9591       if (epartIsReg(modrm)) {
   9592          /* Only switch to MMX mode if the source is a MMX register.
   9593             This is inconsistent with all other instructions which
   9594             convert between XMM and (M64 or MMX), which always switch
   9595             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   9596             least, that's what the Intel docs seem to me to say.
   9597             Fixes #210264. */
   9598          do_MMX_preamble();
   9599          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   9600          delta += 2+1;
   9601          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9602                                  nameXMMReg(gregOfRM(modrm)));
   9603       } else {
   9604          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9605 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9606          delta += 2+alen;
   9607          DIP("cvtpi2pd %s,%s\n", dis_buf,
   9608                                  nameXMMReg(gregOfRM(modrm)) );
   9609       }
   9610 
   9611       putXMMRegLane64F(
   9612          gregOfRM(modrm), 0,
   9613          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   9614       );
   9615 
   9616       putXMMRegLane64F(
   9617          gregOfRM(modrm), 1,
   9618          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   9619       );
   9620 
   9621       goto decode_success;
   9622    }
   9623 
   9624    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9625       xmm(G) */
   9626    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9627       IRTemp argV  = newTemp(Ity_V128);
   9628       IRTemp rmode = newTemp(Ity_I32);
   9629 
   9630       modrm = getIByte(delta+2);
   9631       if (epartIsReg(modrm)) {
   9632          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9633          delta += 2+1;
   9634          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9635                                  nameXMMReg(gregOfRM(modrm)));
   9636       } else {
   9637          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9638 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9639          delta += 2+alen;
   9640          DIP("cvtps2dq %s,%s\n", dis_buf,
   9641                                  nameXMMReg(gregOfRM(modrm)) );
   9642       }
   9643 
   9644       assign( rmode, get_sse_roundingmode() );
   9645       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9646 
   9647       /* This is less than ideal.  If it turns out to be a performance
   9648 	 bottleneck it can be improved. */
   9649 #     define CVT(_t)                            \
   9650         binop( Iop_F64toI32S,                   \
   9651                mkexpr(rmode),                   \
   9652                unop( Iop_F32toF64,              \
   9653                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9654 
   9655       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9656       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9657       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9658       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9659 
   9660 #     undef CVT
   9661 
   9662       goto decode_success;
   9663    }
   9664 
   9665    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   9666       F64 in xmm(G). */
   9667    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9668       IRTemp f32lo = newTemp(Ity_F32);
   9669       IRTemp f32hi = newTemp(Ity_F32);
   9670 
   9671       modrm = getIByte(delta+2);
   9672       if (epartIsReg(modrm)) {
   9673          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
   9674          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
   9675          delta += 2+1;
   9676          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9677                                  nameXMMReg(gregOfRM(modrm)));
   9678       } else {
   9679          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9680 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   9681 	 assign( f32hi, loadLE(Ity_F32,
   9682                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
   9683          delta += 2+alen;
   9684          DIP("cvtps2pd %s,%s\n", dis_buf,
   9685                                  nameXMMReg(gregOfRM(modrm)) );
   9686       }
   9687 
   9688       putXMMRegLane64F( gregOfRM(modrm), 1,
   9689                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   9690       putXMMRegLane64F( gregOfRM(modrm), 0,
   9691                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   9692 
   9693       goto decode_success;
   9694    }
   9695 
   9696    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
   9697       I32 in ireg, according to prevailing SSE rounding mode */
   9698    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
   9699       I32 in ireg, rounding towards zero */
   9700    if (insn[0] == 0xF2 && insn[1] == 0x0F
   9701        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   9702       IRTemp rmode = newTemp(Ity_I32);
   9703       IRTemp f64lo = newTemp(Ity_F64);
   9704       Bool   r2zero = toBool(insn[2] == 0x2C);
   9705       vassert(sz == 4);
   9706 
   9707       modrm = getIByte(delta+3);
   9708       if (epartIsReg(modrm)) {
   9709          delta += 3+1;
   9710 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9711          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9712                                    nameXMMReg(eregOfRM(modrm)),
   9713                                    nameIReg(4, gregOfRM(modrm)));
   9714       } else {
   9715          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9716 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9717          delta += 3+alen;
   9718          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9719                                    dis_buf,
   9720                                    nameIReg(4, gregOfRM(modrm)));
   9721       }
   9722 
   9723       if (r2zero) {
   9724          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9725       } else {
   9726          assign( rmode, get_sse_roundingmode() );
   9727       }
   9728 
   9729       putIReg(4, gregOfRM(modrm),
   9730                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   9731 
   9732       goto decode_success;
   9733    }
   9734 
   9735    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   9736       low 1/4 xmm(G), according to prevailing SSE rounding mode */
   9737    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9738       IRTemp rmode = newTemp(Ity_I32);
   9739       IRTemp f64lo = newTemp(Ity_F64);
   9740       vassert(sz == 4);
   9741 
   9742       modrm = getIByte(delta+3);
   9743       if (epartIsReg(modrm)) {
   9744          delta += 3+1;
   9745 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9746          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9747                                  nameXMMReg(gregOfRM(modrm)));
   9748       } else {
   9749          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9750 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9751          delta += 3+alen;
   9752          DIP("cvtsd2ss %s,%s\n", dis_buf,
   9753                                  nameXMMReg(gregOfRM(modrm)));
   9754       }
   9755 
   9756       assign( rmode, get_sse_roundingmode() );
   9757       putXMMRegLane32F(
   9758          gregOfRM(modrm), 0,
   9759          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   9760       );
   9761 
   9762       goto decode_success;
   9763    }
   9764 
   9765    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
   9766       half xmm */
   9767    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
   9768       IRTemp arg32 = newTemp(Ity_I32);
   9769       vassert(sz == 4);
   9770 
   9771       modrm = getIByte(delta+3);
   9772       if (epartIsReg(modrm)) {
   9773          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   9774          delta += 3+1;
   9775          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   9776                                  nameXMMReg(gregOfRM(modrm)));
   9777       } else {
   9778          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9779 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   9780          delta += 3+alen;
   9781          DIP("cvtsi2sd %s,%s\n", dis_buf,
   9782                                  nameXMMReg(gregOfRM(modrm)) );
   9783       }
   9784 
   9785       putXMMRegLane64F(
   9786          gregOfRM(modrm), 0,
   9787          unop(Iop_I32StoF64, mkexpr(arg32)) );
   9788 
   9789       goto decode_success;
   9790    }
   9791 
   9792    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   9793       low half xmm(G) */
   9794    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9795       IRTemp f32lo = newTemp(Ity_F32);
   9796       vassert(sz == 4);
   9797 
   9798       modrm = getIByte(delta+3);
   9799       if (epartIsReg(modrm)) {
   9800          delta += 3+1;
   9801 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   9802          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9803                                  nameXMMReg(gregOfRM(modrm)));
   9804       } else {
   9805          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9806 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   9807          delta += 3+alen;
   9808          DIP("cvtss2sd %s,%s\n", dis_buf,
   9809                                  nameXMMReg(gregOfRM(modrm)));
   9810       }
   9811 
   9812       putXMMRegLane64F( gregOfRM(modrm), 0,
   9813                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   9814 
   9815       goto decode_success;
   9816    }
   9817 
   9818    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9819       lo half xmm(G), and zero upper half, rounding towards zero */
   9820    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
   9821       IRTemp argV  = newTemp(Ity_V128);
   9822       IRTemp rmode = newTemp(Ity_I32);
   9823 
   9824       modrm = getIByte(delta+2);
   9825       if (epartIsReg(modrm)) {
   9826          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9827          delta += 2+1;
   9828          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9829                                   nameXMMReg(gregOfRM(modrm)));
   9830       } else {
   9831          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9832 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9833          delta += 2+alen;
   9834          DIP("cvttpd2dq %s,%s\n", dis_buf,
   9835                                   nameXMMReg(gregOfRM(modrm)) );
   9836       }
   9837 
   9838       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9839 
   9840       t0 = newTemp(Ity_F64);
   9841       t1 = newTemp(Ity_F64);
   9842       assign( t0, unop(Iop_ReinterpI64asF64,
   9843                        unop(Iop_V128to64, mkexpr(argV))) );
   9844       assign( t1, unop(Iop_ReinterpI64asF64,
   9845                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9846 
   9847 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9848                              mkexpr(rmode),                   \
   9849                              mkexpr(_t) )
   9850 
   9851       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9852       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9853       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9854       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9855 
   9856 #     undef CVT
   9857 
   9858       goto decode_success;
   9859    }
   9860 
   9861    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9862       xmm(G), rounding towards zero */
   9863    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
   9864       IRTemp argV  = newTemp(Ity_V128);
   9865       IRTemp rmode = newTemp(Ity_I32);
   9866       vassert(sz == 4);
   9867 
   9868       modrm = getIByte(delta+3);
   9869       if (epartIsReg(modrm)) {
   9870          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9871          delta += 3+1;
   9872          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9873                                   nameXMMReg(gregOfRM(modrm)));
   9874       } else {
   9875          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9876 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9877          delta += 3+alen;
   9878          DIP("cvttps2dq %s,%s\n", dis_buf,
   9879                                   nameXMMReg(gregOfRM(modrm)) );
   9880       }
   9881 
   9882       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9883       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9884 
   9885       /* This is less than ideal.  If it turns out to be a performance
   9886 	 bottleneck it can be improved. */
   9887 #     define CVT(_t)                            \
   9888         binop( Iop_F64toI32S,                   \
   9889                mkexpr(rmode),                   \
   9890                unop( Iop_F32toF64,              \
   9891                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9892 
   9893       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9894       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9895       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9896       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9897 
   9898 #     undef CVT
   9899 
   9900       goto decode_success;
   9901    }
   9902 
   9903    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   9904    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
   9905       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
   9906       goto decode_success;
   9907    }
   9908 
   9909    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   9910    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
   9911       vassert(sz == 4);
   9912       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
   9913       goto decode_success;
   9914    }
   9915 
   9916    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   9917    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   9918    if (insn[0] == 0x0F && insn[1] == 0xAE
   9919        && epartIsReg(insn[2])
   9920        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
   9921       vassert(sz == 4);
   9922       delta += 3;
   9923       /* Insert a memory fence.  It's sometimes important that these
   9924          are carried through to the generated code. */
   9925       stmt( IRStmt_MBE(Imbe_Fence) );
   9926       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
   9927       goto decode_success;
   9928    }
   9929 
   9930    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   9931    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
   9932       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
   9933       goto decode_success;
   9934    }
   9935 
   9936    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   9937    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
   9938       vassert(sz == 4);
   9939       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
   9940       goto decode_success;
   9941    }
   9942 
   9943    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   9944    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
   9945       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
   9946       goto decode_success;
   9947    }
   9948 
   9949    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   9950    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
   9951       vassert(sz == 4);
   9952       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
   9953       goto decode_success;
   9954    }
   9955 
   9956    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   9957    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   9958    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   9959    if (sz == 2 && insn[0] == 0x0F
   9960        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
   9961       HChar* wot = insn[1]==0x28 ? "apd" :
   9962                    insn[1]==0x10 ? "upd" : "dqa";
   9963       modrm = getIByte(delta+2);
   9964       if (epartIsReg(modrm)) {
   9965          putXMMReg( gregOfRM(modrm),
   9966                     getXMMReg( eregOfRM(modrm) ));
   9967          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
   9968                                    nameXMMReg(gregOfRM(modrm)));
   9969          delta += 2+1;
   9970       } else {
   9971          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9972          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   9973             gen_SEGV_if_not_16_aligned( addr );
   9974          putXMMReg( gregOfRM(modrm),
   9975                     loadLE(Ity_V128, mkexpr(addr)) );
   9976          DIP("mov%s %s,%s\n", wot, dis_buf,
   9977                                    nameXMMReg(gregOfRM(modrm)));
   9978          delta += 2+alen;
   9979       }
   9980       goto decode_success;
   9981    }
   9982 
   9983    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   9984    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   9985    if (sz == 2 && insn[0] == 0x0F
   9986        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   9987       HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   9988       modrm = getIByte(delta+2);
   9989       if (epartIsReg(modrm)) {
   9990          /* fall through; awaiting test case */
   9991       } else {
   9992          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9993          if (insn[1] == 0x29/*movapd*/)
   9994             gen_SEGV_if_not_16_aligned( addr );
   9995          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9996          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
   9997                                    dis_buf );
   9998          delta += 2+alen;
   9999          goto decode_success;
   10000       }
   10001    }
   10002 
   10003    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
   10004    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
   10005       modrm = getIByte(delta+2);
   10006       if (epartIsReg(modrm)) {
   10007          delta += 2+1;
   10008          putXMMReg(
   10009             gregOfRM(modrm),
   10010             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
   10011          );
   10012          DIP("movd %s, %s\n",
   10013              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
   10014       } else {
   10015          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10016          delta += 2+alen;
   10017          putXMMReg(
   10018             gregOfRM(modrm),
   10019             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   10020          );
   10021          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
   10022       }
   10023       goto decode_success;
   10024    }
   10025 
   10026    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
   10027    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
   10028       modrm = getIByte(delta+2);
   10029       if (epartIsReg(modrm)) {
   10030          delta += 2+1;
   10031          putIReg( 4, eregOfRM(modrm),
   10032                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10033          DIP("movd %s, %s\n",
   10034              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   10035       } else {
   10036          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10037          delta += 2+alen;
   10038          storeLE( mkexpr(addr),
   10039                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10040          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10041       }
   10042       goto decode_success;
   10043    }
   10044 
   10045    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   10046    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
   10047       modrm = getIByte(delta+2);
   10048       if (epartIsReg(modrm)) {
   10049          delta += 2+1;
   10050          putXMMReg( eregOfRM(modrm),
   10051                     getXMMReg(gregOfRM(modrm)) );
   10052          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10053                                 nameXMMReg(eregOfRM(modrm)));
   10054       } else {
   10055          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10056          delta += 2+alen;
   10057          gen_SEGV_if_not_16_aligned( addr );
   10058          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10059          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10060       }
   10061       goto decode_success;
   10062    }
   10063 
   10064    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   10065    /* Unfortunately can't simply use the MOVDQA case since the
   10066       prefix lengths are different (66 vs F3) */
   10067    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
   10068       vassert(sz == 4);
   10069       modrm = getIByte(delta+3);
   10070       if (epartIsReg(modrm)) {
   10071          putXMMReg( gregOfRM(modrm),
   10072                     getXMMReg( eregOfRM(modrm) ));
   10073          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10074                                nameXMMReg(gregOfRM(modrm)));
   10075          delta += 3+1;
   10076       } else {
   10077          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10078          putXMMReg( gregOfRM(modrm),
   10079                     loadLE(Ity_V128, mkexpr(addr)) );
   10080          DIP("movdqu %s,%s\n", dis_buf,
   10081                                nameXMMReg(gregOfRM(modrm)));
   10082          delta += 3+alen;
   10083       }
   10084       goto decode_success;
   10085    }
   10086 
   10087    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   10088    /* Unfortunately can't simply use the MOVDQA case since the
   10089       prefix lengths are different (66 vs F3) */
   10090    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
   10091       vassert(sz == 4);
   10092       modrm = getIByte(delta+3);
   10093       if (epartIsReg(modrm)) {
   10094          delta += 3+1;
   10095          putXMMReg( eregOfRM(modrm),
   10096                     getXMMReg(gregOfRM(modrm)) );
   10097          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10098                                 nameXMMReg(eregOfRM(modrm)));
   10099       } else {
   10100          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   10101          delta += 3+alen;
   10102          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10103          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10104       }
   10105       goto decode_success;
   10106    }
   10107 
   10108    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   10109    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10110       vassert(sz == 4);
   10111       modrm = getIByte(delta+3);
   10112       if (epartIsReg(modrm)) {
   10113          do_MMX_preamble();
   10114          putMMXReg( gregOfRM(modrm),
   10115                     getXMMRegLane64( eregOfRM(modrm), 0 ));
   10116          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10117                                 nameMMXReg(gregOfRM(modrm)));
   10118          delta += 3+1;
   10119          goto decode_success;
   10120       } else {
   10121          /* fall through, apparently no mem case for this insn */
   10122       }
   10123    }
   10124 
   10125    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   10126    /* This seems identical to MOVHPS.  This instruction encoding is
   10127       completely crazy. */
   10128    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
   10129       modrm = getIByte(delta+2);
   10130       if (epartIsReg(modrm)) {
   10131          /* fall through; apparently reg-reg is not possible */
   10132       } else {
   10133          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10134          delta += 2+alen;
   10135          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   10136                           loadLE(Ity_I64, mkexpr(addr)) );
   10137          DIP("movhpd %s,%s\n", dis_buf,
   10138                                nameXMMReg( gregOfRM(modrm) ));
   10139          goto decode_success;
   10140       }
   10141    }
   10142 
   10143    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   10144    /* Again, this seems identical to MOVHPS. */
   10145    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
   10146       if (!epartIsReg(insn[2])) {
   10147          delta += 2;
   10148          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10149          delta += alen;
   10150          storeLE( mkexpr(addr),
   10151                   getXMMRegLane64( gregOfRM(insn[2]),
   10152                                    1/*upper lane*/ ) );
   10153          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10154                                dis_buf);
   10155          goto decode_success;
   10156       }
   10157       /* else fall through */
   10158    }
   10159 
   10160    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   10161    /* Identical to MOVLPS ? */
   10162    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
   10163       modrm = getIByte(delta+2);
   10164       if (epartIsReg(modrm)) {
   10165          /* fall through; apparently reg-reg is not possible */
   10166       } else {
   10167          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10168          delta += 2+alen;
   10169          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   10170                           loadLE(Ity_I64, mkexpr(addr)) );
   10171          DIP("movlpd %s, %s\n",
   10172              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   10173          goto decode_success;
   10174       }
   10175    }
   10176 
   10177    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   10178    /* Identical to MOVLPS ? */
   10179    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
   10180       if (!epartIsReg(insn[2])) {
   10181          delta += 2;
   10182          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10183          delta += alen;
   10184          storeLE( mkexpr(addr),
   10185                   getXMMRegLane64( gregOfRM(insn[2]),
   10186                                    0/*lower lane*/ ) );
   10187          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10188                                 dis_buf);
   10189          goto decode_success;
   10190       }
   10191       /* else fall through */
   10192    }
   10193 
   10194    /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   10195       2 lowest bits of ireg(G) */
   10196    if (insn[0] == 0x0F && insn[1] == 0x50) {
   10197       modrm = getIByte(delta+2);
   10198       if (sz == 2 && epartIsReg(modrm)) {
   10199          Int src;
   10200          t0 = newTemp(Ity_I32);
   10201          t1 = newTemp(Ity_I32);
   10202          delta += 2+1;
   10203          src = eregOfRM(modrm);
   10204          assign( t0, binop( Iop_And32,
   10205                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
   10206                             mkU32(1) ));
   10207          assign( t1, binop( Iop_And32,
   10208                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
   10209                             mkU32(2) ));
   10210          putIReg(4, gregOfRM(modrm),
   10211                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
   10212                  );
   10213          DIP("movmskpd %s,%s\n", nameXMMReg(src),
   10214                                  nameIReg(4, gregOfRM(modrm)));
   10215          goto decode_success;
   10216       }
   10217       /* else fall through */
   10218    }
   10219 
   10220    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   10221    if (insn[0] == 0x0F && insn[1] == 0xF7) {
   10222       modrm = getIByte(delta+2);
   10223       if (sz == 2 && epartIsReg(modrm)) {
   10224          IRTemp regD    = newTemp(Ity_V128);
   10225          IRTemp mask    = newTemp(Ity_V128);
   10226          IRTemp olddata = newTemp(Ity_V128);
   10227          IRTemp newdata = newTemp(Ity_V128);
   10228                 addr    = newTemp(Ity_I32);
   10229 
   10230          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   10231          assign( regD, getXMMReg( gregOfRM(modrm) ));
   10232 
   10233          /* Unfortunately can't do the obvious thing with SarN8x16
   10234             here since that can't be re-emitted as SSE2 code - no such
   10235             insn. */
   10236 	 assign(
   10237             mask,
   10238             binop(Iop_64HLtoV128,
   10239                   binop(Iop_SarN8x8,
   10240                         getXMMRegLane64( eregOfRM(modrm), 1 ),
   10241                         mkU8(7) ),
   10242                   binop(Iop_SarN8x8,
   10243                         getXMMRegLane64( eregOfRM(modrm), 0 ),
   10244                         mkU8(7) ) ));
   10245          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10246          assign( newdata,
   10247                  binop(Iop_OrV128,
   10248                        binop(Iop_AndV128,
   10249                              mkexpr(regD),
   10250                              mkexpr(mask) ),
   10251                        binop(Iop_AndV128,
   10252                              mkexpr(olddata),
   10253                              unop(Iop_NotV128, mkexpr(mask)))) );
   10254          storeLE( mkexpr(addr), mkexpr(newdata) );
   10255 
   10256          delta += 2+1;
   10257          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
   10258                                    nameXMMReg( gregOfRM(modrm) ) );
   10259          goto decode_success;
   10260       }
   10261       /* else fall through */
   10262    }
   10263 
   10264    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   10265    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   10266       modrm = getIByte(delta+2);
   10267       if (sz == 2 && !epartIsReg(modrm)) {
   10268          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10269          gen_SEGV_if_not_16_aligned( addr );
   10270          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10271          DIP("movntdq %s,%s\n", dis_buf,
   10272                                 nameXMMReg(gregOfRM(modrm)));
   10273          delta += 2+alen;
   10274          goto decode_success;
   10275       }
   10276       /* else fall through */
   10277    }
   10278 
   10279    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   10280    if (insn[0] == 0x0F && insn[1] == 0xC3) {
   10281       vassert(sz == 4);
   10282       modrm = getIByte(delta+2);
   10283       if (!epartIsReg(modrm)) {
   10284          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10285          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
   10286          DIP("movnti %s,%s\n", dis_buf,
   10287                                nameIReg(4, gregOfRM(modrm)));
   10288          delta += 2+alen;
   10289          goto decode_success;
   10290       }
   10291       /* else fall through */
   10292    }
   10293 
   10294    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   10295       or lo half xmm).  */
   10296    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
   10297       modrm = getIByte(delta+2);
   10298       if (epartIsReg(modrm)) {
   10299          /* fall through, awaiting test case */
   10300          /* dst: lo half copied, hi half zeroed */
   10301       } else {
   10302          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10303          storeLE( mkexpr(addr),
   10304                   getXMMRegLane64( gregOfRM(modrm), 0 ));
   10305          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
   10306          delta += 2+alen;
   10307          goto decode_success;
   10308       }
   10309    }
   10310 
   10311    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   10312       hi half). */
   10313    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10314       vassert(sz == 4);
   10315       modrm = getIByte(delta+3);
   10316       if (epartIsReg(modrm)) {
   10317          do_MMX_preamble();
   10318          putXMMReg( gregOfRM(modrm),
   10319                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
   10320          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10321                                 nameXMMReg(gregOfRM(modrm)));
   10322          delta += 3+1;
   10323          goto decode_success;
   10324       } else {
   10325          /* fall through, apparently no mem case for this insn */
   10326       }
   10327    }
   10328 
   10329    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   10330       G (lo half xmm).  Upper half of G is zeroed out. */
   10331    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   10332       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   10333       If E is reg, upper half of G is unchanged. */
   10334    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
   10335        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
   10336       vassert(sz == 4);
   10337       modrm = getIByte(delta+3);
   10338       if (epartIsReg(modrm)) {
   10339          putXMMRegLane64( gregOfRM(modrm), 0,
   10340                           getXMMRegLane64( eregOfRM(modrm), 0 ));
   10341          if (insn[0] == 0xF3/*MOVQ*/) {
   10342             /* zero bits 127:64 */
   10343             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10344          }
   10345          DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10346                               nameXMMReg(gregOfRM(modrm)));
   10347          delta += 3+1;
   10348       } else {
   10349          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10350          /* zero bits 127:64 */
   10351          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10352          /* write bits 63:0 */
   10353          putXMMRegLane64( gregOfRM(modrm), 0,
   10354                           loadLE(Ity_I64, mkexpr(addr)) );
   10355          DIP("movsd %s,%s\n", dis_buf,
   10356                               nameXMMReg(gregOfRM(modrm)));
   10357          delta += 3+alen;
   10358       }
   10359       goto decode_success;
   10360    }
   10361 
   10362    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   10363       or lo half xmm). */
   10364    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
   10365       vassert(sz == 4);
   10366       modrm = getIByte(delta+3);
   10367       if (epartIsReg(modrm)) {
   10368          putXMMRegLane64( eregOfRM(modrm), 0,
   10369                           getXMMRegLane64( gregOfRM(modrm), 0 ));
   10370          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10371                               nameXMMReg(eregOfRM(modrm)));
   10372          delta += 3+1;
   10373       } else {
   10374          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10375          storeLE( mkexpr(addr),
   10376                   getXMMRegLane64(gregOfRM(modrm), 0) );
   10377          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10378                               dis_buf);
   10379          delta += 3+alen;
   10380       }
   10381       goto decode_success;
   10382    }
   10383 
   10384    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   10385    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
   10386       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
   10387       goto decode_success;
   10388    }
   10389 
   10390    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   10391    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
   10392       vassert(sz == 4);
   10393       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
   10394       goto decode_success;
   10395    }
   10396 
   10397    /* 66 0F 56 = ORPD -- G = G or E */
   10398    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
   10399       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
   10400       goto decode_success;
   10401    }
   10402 
   10403    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   10404    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
   10405       Int    select;
   10406       IRTemp sV = newTemp(Ity_V128);
   10407       IRTemp dV = newTemp(Ity_V128);
   10408       IRTemp s1 = newTemp(Ity_I64);
   10409       IRTemp s0 = newTemp(Ity_I64);
   10410       IRTemp d1 = newTemp(Ity_I64);
   10411       IRTemp d0 = newTemp(Ity_I64);
   10412 
   10413       modrm = insn[2];
   10414       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10415 
   10416       if (epartIsReg(modrm)) {
   10417          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10418          select = (Int)insn[3];
   10419          delta += 2+2;
   10420          DIP("shufpd $%d,%s,%s\n", select,
   10421                                    nameXMMReg(eregOfRM(modrm)),
   10422                                    nameXMMReg(gregOfRM(modrm)));
   10423       } else {
   10424          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10425          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10426          select = (Int)insn[2+alen];
   10427          delta += 3+alen;
   10428          DIP("shufpd $%d,%s,%s\n", select,
   10429                                    dis_buf,
   10430                                    nameXMMReg(gregOfRM(modrm)));
   10431       }
   10432 
   10433       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10434       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10435       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10436       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10437 
   10438 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10439 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10440 
   10441       putXMMReg(
   10442          gregOfRM(modrm),
   10443          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
   10444       );
   10445 
   10446 #     undef SELD
   10447 #     undef SELS
   10448 
   10449       goto decode_success;
   10450    }
   10451 
   10452    /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   10453    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
   10454       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   10455                                         "sqrtpd", Iop_Sqrt64Fx2 );
   10456       goto decode_success;
   10457    }
   10458 
   10459    /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   10460    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
   10461       vassert(sz == 4);
   10462       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
   10463                                          "sqrtsd", Iop_Sqrt64F0x2 );
   10464       goto decode_success;
   10465    }
   10466 
   10467    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   10468    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
   10469       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
   10470       goto decode_success;
   10471    }
   10472 
   10473    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   10474    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
   10475       vassert(sz == 4);
   10476       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
   10477       goto decode_success;
   10478    }
   10479 
   10480    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   10481    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   10482    /* These just appear to be special cases of SHUFPD */
   10483    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   10484       IRTemp s1 = newTemp(Ity_I64);
   10485       IRTemp s0 = newTemp(Ity_I64);
   10486       IRTemp d1 = newTemp(Ity_I64);
   10487       IRTemp d0 = newTemp(Ity_I64);
   10488       IRTemp sV = newTemp(Ity_V128);
   10489       IRTemp dV = newTemp(Ity_V128);
   10490       Bool   hi = toBool(insn[1] == 0x15);
   10491 
   10492       modrm = insn[2];
   10493       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10494 
   10495       if (epartIsReg(modrm)) {
   10496          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10497          delta += 2+1;
   10498          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10499                                   nameXMMReg(eregOfRM(modrm)),
   10500                                   nameXMMReg(gregOfRM(modrm)));
   10501       } else {
   10502          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10503          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10504          delta += 2+alen;
   10505          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10506                                   dis_buf,
   10507                                   nameXMMReg(gregOfRM(modrm)));
   10508       }
   10509 
   10510       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10511       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10512       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10513       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10514 
   10515       if (hi) {
   10516          putXMMReg( gregOfRM(modrm),
   10517                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   10518       } else {
   10519          putXMMReg( gregOfRM(modrm),
   10520                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   10521       }
   10522 
   10523       goto decode_success;
   10524    }
   10525 
   10526    /* 66 0F 57 = XORPD -- G = G xor E */
   10527    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
   10528       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
   10529       goto decode_success;
   10530    }
   10531 
   10532    /* 66 0F 6B = PACKSSDW */
   10533    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
   10534       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10535                                  "packssdw",
   10536                                  Iop_QNarrowBin32Sto16Sx8, True );
   10537       goto decode_success;
   10538    }
   10539 
   10540    /* 66 0F 63 = PACKSSWB */
   10541    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
   10542       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10543                                  "packsswb",
   10544                                  Iop_QNarrowBin16Sto8Sx16, True );
   10545       goto decode_success;
   10546    }
   10547 
   10548    /* 66 0F 67 = PACKUSWB */
   10549    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
   10550       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10551                                  "packuswb",
   10552                                  Iop_QNarrowBin16Sto8Ux16, True );
   10553       goto decode_success;
   10554    }
   10555 
   10556    /* 66 0F FC = PADDB */
   10557    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
   10558       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10559                                  "paddb", Iop_Add8x16, False );
   10560       goto decode_success;
   10561    }
   10562 
   10563    /* 66 0F FE = PADDD */
   10564    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
   10565       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10566                                  "paddd", Iop_Add32x4, False );
   10567       goto decode_success;
   10568    }
   10569 
   10570    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10571    /* 0F D4 = PADDQ -- add 64x1 */
   10572    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10573       do_MMX_preamble();
   10574       delta = dis_MMXop_regmem_to_reg (
   10575                 sorb, delta+2, insn[1], "paddq", False );
   10576       goto decode_success;
   10577    }
   10578 
   10579    /* 66 0F D4 = PADDQ */
   10580    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10581       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10582                                  "paddq", Iop_Add64x2, False );
   10583       goto decode_success;
   10584    }
   10585 
   10586    /* 66 0F FD = PADDW */
   10587    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
   10588       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10589                                  "paddw", Iop_Add16x8, False );
   10590       goto decode_success;
   10591    }
   10592 
   10593    /* 66 0F EC = PADDSB */
   10594    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
   10595       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10596                                  "paddsb", Iop_QAdd8Sx16, False );
   10597       goto decode_success;
   10598    }
   10599 
   10600    /* 66 0F ED = PADDSW */
   10601    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
   10602       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10603                                  "paddsw", Iop_QAdd16Sx8, False );
   10604       goto decode_success;
   10605    }
   10606 
   10607    /* 66 0F DC = PADDUSB */
   10608    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
   10609       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10610                                  "paddusb", Iop_QAdd8Ux16, False );
   10611       goto decode_success;
   10612    }
   10613 
   10614    /* 66 0F DD = PADDUSW */
   10615    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
   10616       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10617                                  "paddusw", Iop_QAdd16Ux8, False );
   10618       goto decode_success;
   10619    }
   10620 
   10621    /* 66 0F DB = PAND */
   10622    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
   10623       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
   10624       goto decode_success;
   10625    }
   10626 
   10627    /* 66 0F DF = PANDN */
   10628    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
   10629       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
   10630       goto decode_success;
   10631    }
   10632 
   10633    /* 66 0F E0 = PAVGB */
   10634    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
   10635       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10636                                  "pavgb", Iop_Avg8Ux16, False );
   10637       goto decode_success;
   10638    }
   10639 
   10640    /* 66 0F E3 = PAVGW */
   10641    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
   10642       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10643                                  "pavgw", Iop_Avg16Ux8, False );
   10644       goto decode_success;
   10645    }
   10646 
   10647    /* 66 0F 74 = PCMPEQB */
   10648    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
   10649       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10650                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   10651       goto decode_success;
   10652    }
   10653 
   10654    /* 66 0F 76 = PCMPEQD */
   10655    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
   10656       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10657                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   10658       goto decode_success;
   10659    }
   10660 
   10661    /* 66 0F 75 = PCMPEQW */
   10662    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
   10663       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10664                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   10665       goto decode_success;
   10666    }
   10667 
   10668    /* 66 0F 64 = PCMPGTB */
   10669    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
   10670       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10671                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   10672       goto decode_success;
   10673    }
   10674 
   10675    /* 66 0F 66 = PCMPGTD */
   10676    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
   10677       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10678                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   10679       goto decode_success;
   10680    }
   10681 
   10682    /* 66 0F 65 = PCMPGTW */
   10683    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
   10684       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10685                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   10686       goto decode_success;
   10687    }
   10688 
   10689    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   10690       zero-extend of it in ireg(G). */
   10691    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   10692       modrm = insn[2];
   10693       if (sz == 2 && epartIsReg(modrm)) {
   10694          t5 = newTemp(Ity_V128);
   10695          t4 = newTemp(Ity_I16);
   10696          assign(t5, getXMMReg(eregOfRM(modrm)));
   10697          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
   10698          switch (insn[3] & 7) {
   10699             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
   10700             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
   10701             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
   10702             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
   10703             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
   10704             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
   10705             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
   10706             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
   10707             default: vassert(0); /*NOTREACHED*/
   10708          }
   10709          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
   10710          DIP("pextrw $%d,%s,%s\n",
   10711              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
   10712                            nameIReg(4,gregOfRM(modrm)));
   10713          delta += 4;
   10714          goto decode_success;
   10715       }
   10716       /* else fall through */
   10717    }
   10718 
   10719    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   10720       put it into the specified lane of xmm(G). */
   10721    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
   10722       Int lane;
   10723       t4 = newTemp(Ity_I16);
   10724       modrm = insn[2];
   10725 
   10726       if (epartIsReg(modrm)) {
   10727          assign(t4, getIReg(2, eregOfRM(modrm)));
   10728          delta += 3+1;
   10729          lane = insn[3+1-1];
   10730          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10731                                    nameIReg(2,eregOfRM(modrm)),
   10732                                    nameXMMReg(gregOfRM(modrm)));
   10733       } else {
   10734          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10735          delta += 3+alen;
   10736          lane = insn[3+alen-1];
   10737          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   10738          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10739                                    dis_buf,
   10740                                    nameXMMReg(gregOfRM(modrm)));
   10741       }
   10742 
   10743       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
   10744       goto decode_success;
   10745    }
   10746 
   10747    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   10748       E(xmm or mem) to G(xmm) */
   10749    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
   10750       IRTemp s1V  = newTemp(Ity_V128);
   10751       IRTemp s2V  = newTemp(Ity_V128);
   10752       IRTemp dV   = newTemp(Ity_V128);
   10753       IRTemp s1Hi = newTemp(Ity_I64);
   10754       IRTemp s1Lo = newTemp(Ity_I64);
   10755       IRTemp s2Hi = newTemp(Ity_I64);
   10756       IRTemp s2Lo = newTemp(Ity_I64);
   10757       IRTemp dHi  = newTemp(Ity_I64);
   10758       IRTemp dLo  = newTemp(Ity_I64);
   10759       modrm = insn[2];
   10760       if (epartIsReg(modrm)) {
   10761          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   10762          delta += 2+1;
   10763          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10764                                 nameXMMReg(gregOfRM(modrm)));
   10765       } else {
   10766          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10767          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   10768          delta += 2+alen;
   10769          DIP("pmaddwd %s,%s\n", dis_buf,
   10770                                 nameXMMReg(gregOfRM(modrm)));
   10771       }
   10772       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   10773       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   10774       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   10775       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   10776       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   10777       assign( dHi, mkIRExprCCall(
   10778                       Ity_I64, 0/*regparms*/,
   10779                       "x86g_calculate_mmx_pmaddwd",
   10780                       &x86g_calculate_mmx_pmaddwd,
   10781                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   10782                    ));
   10783       assign( dLo, mkIRExprCCall(
   10784                       Ity_I64, 0/*regparms*/,
   10785                       "x86g_calculate_mmx_pmaddwd",
   10786                       &x86g_calculate_mmx_pmaddwd,
   10787                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   10788                    ));
   10789       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   10790       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   10791       goto decode_success;
   10792    }
   10793 
   10794    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   10795    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
   10796       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10797                                  "pmaxsw", Iop_Max16Sx8, False );
   10798       goto decode_success;
   10799    }
   10800 
   10801    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   10802    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
   10803       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10804                                  "pmaxub", Iop_Max8Ux16, False );
   10805       goto decode_success;
   10806    }
   10807 
   10808    /* 66 0F EA = PMINSW -- 16x8 signed min */
   10809    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
   10810       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10811                                  "pminsw", Iop_Min16Sx8, False );
   10812       goto decode_success;
   10813    }
   10814 
   10815    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   10816    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
   10817       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10818                                  "pminub", Iop_Min8Ux16, False );
   10819       goto decode_success;
   10820    }
   10821 
   10822    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
   10823       xmm(E), pack them into a 16-bit value, and put zero-extend of it
   10824       in ireg(G).  Doing this directly is just too cumbersome; give up
   10825       therefore and call a helper. */
   10826    /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   10827    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
   10828       modrm = insn[2];
   10829       if (epartIsReg(modrm)) {
   10830          t0 = newTemp(Ity_I64);
   10831          t1 = newTemp(Ity_I64);
   10832          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
   10833          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
   10834          t5 = newTemp(Ity_I32);
   10835          assign(t5, mkIRExprCCall(
   10836                        Ity_I32, 0/*regparms*/,
   10837                        "x86g_calculate_sse_pmovmskb",
   10838                        &x86g_calculate_sse_pmovmskb,
   10839                        mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
   10840          putIReg(4, gregOfRM(modrm), mkexpr(t5));
   10841          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10842                                  nameIReg(4,gregOfRM(modrm)));
   10843          delta += 3;
   10844          goto decode_success;
   10845       }
   10846       /* else fall through */
   10847    }
   10848 
   10849    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   10850    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
   10851       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10852                                  "pmulhuw", Iop_MulHi16Ux8, False );
   10853       goto decode_success;
   10854    }
   10855 
   10856    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   10857    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
   10858       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10859                                  "pmulhw", Iop_MulHi16Sx8, False );
   10860       goto decode_success;
   10861    }
   10862 
   10863    /* 66 0F D5 = PMULLW -- 16x8 multiply */
   10864    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
   10865       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10866                                  "pmullw", Iop_Mul16x8, False );
   10867       goto decode_success;
   10868    }
   10869 
   10870    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10871    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   10872       0 to form 64-bit result */
   10873    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
   10874       IRTemp sV = newTemp(Ity_I64);
   10875       IRTemp dV = newTemp(Ity_I64);
   10876       t1 = newTemp(Ity_I32);
   10877       t0 = newTemp(Ity_I32);
   10878       modrm = insn[2];
   10879 
   10880       do_MMX_preamble();
   10881       assign( dV, getMMXReg(gregOfRM(modrm)) );
   10882 
   10883       if (epartIsReg(modrm)) {
   10884          assign( sV, getMMXReg(eregOfRM(modrm)) );
   10885          delta += 2+1;
   10886          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10887                                 nameMMXReg(gregOfRM(modrm)));
   10888       } else {
   10889          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10890          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   10891          delta += 2+alen;
   10892          DIP("pmuludq %s,%s\n", dis_buf,
   10893                                 nameMMXReg(gregOfRM(modrm)));
   10894       }
   10895 
   10896       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   10897       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   10898       putMMXReg( gregOfRM(modrm),
   10899                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   10900       goto decode_success;
   10901    }
   10902 
   10903    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   10904       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   10905       half */
   10906    /* This is a really poor translation -- could be improved if
   10907       performance critical */
   10908    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
   10909       IRTemp sV, dV;
   10910       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10911       sV = newTemp(Ity_V128);
   10912       dV = newTemp(Ity_V128);
   10913       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10914       t1 = newTemp(Ity_I64);
   10915       t0 = newTemp(Ity_I64);
   10916       modrm = insn[2];
   10917       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10918 
   10919       if (epartIsReg(modrm)) {
   10920          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10921          delta += 2+1;
   10922          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10923                                 nameXMMReg(gregOfRM(modrm)));
   10924       } else {
   10925          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10926          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10927          delta += 2+alen;
   10928          DIP("pmuludq %s,%s\n", dis_buf,
   10929                                 nameXMMReg(gregOfRM(modrm)));
   10930       }
   10931 
   10932       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   10933       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   10934 
   10935       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   10936       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
   10937       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   10938       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
   10939       goto decode_success;
   10940    }
   10941 
   10942    /* 66 0F EB = POR */
   10943    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
   10944       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
   10945       goto decode_success;
   10946    }
   10947 
   10948    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   10949       from E(xmm or mem) to G(xmm) */
   10950    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
   10951       IRTemp s1V  = newTemp(Ity_V128);
   10952       IRTemp s2V  = newTemp(Ity_V128);
   10953       IRTemp dV   = newTemp(Ity_V128);
   10954       IRTemp s1Hi = newTemp(Ity_I64);
   10955       IRTemp s1Lo = newTemp(Ity_I64);
   10956       IRTemp s2Hi = newTemp(Ity_I64);
   10957       IRTemp s2Lo = newTemp(Ity_I64);
   10958       IRTemp dHi  = newTemp(Ity_I64);
   10959       IRTemp dLo  = newTemp(Ity_I64);
   10960       modrm = insn[2];
   10961       if (epartIsReg(modrm)) {
   10962          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   10963          delta += 2+1;
   10964          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10965                                nameXMMReg(gregOfRM(modrm)));
   10966       } else {
   10967          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10968          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   10969          delta += 2+alen;
   10970          DIP("psadbw %s,%s\n", dis_buf,
   10971                                nameXMMReg(gregOfRM(modrm)));
   10972       }
   10973       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   10974       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   10975       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   10976       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   10977       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   10978       assign( dHi, mkIRExprCCall(
   10979                       Ity_I64, 0/*regparms*/,
   10980                       "x86g_calculate_mmx_psadbw",
   10981                       &x86g_calculate_mmx_psadbw,
   10982                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   10983                    ));
   10984       assign( dLo, mkIRExprCCall(
   10985                       Ity_I64, 0/*regparms*/,
   10986                       "x86g_calculate_mmx_psadbw",
   10987                       &x86g_calculate_mmx_psadbw,
   10988                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   10989                    ));
   10990       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   10991       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   10992       goto decode_success;
   10993    }
   10994 
   10995    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   10996    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
   10997       Int order;
   10998       IRTemp sV, dV, s3, s2, s1, s0;
   10999       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11000       sV = newTemp(Ity_V128);
   11001       dV = newTemp(Ity_V128);
   11002       modrm = insn[2];
   11003       if (epartIsReg(modrm)) {
   11004          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11005          order = (Int)insn[3];
   11006          delta += 2+2;
   11007          DIP("pshufd $%d,%s,%s\n", order,
   11008                                    nameXMMReg(eregOfRM(modrm)),
   11009                                    nameXMMReg(gregOfRM(modrm)));
   11010       } else {
   11011          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11012          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11013 	 order = (Int)insn[2+alen];
   11014          delta += 3+alen;
   11015          DIP("pshufd $%d,%s,%s\n", order,
   11016                                    dis_buf,
   11017                                    nameXMMReg(gregOfRM(modrm)));
   11018       }
   11019       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11020 
   11021 #     define SEL(n) \
   11022                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11023       assign(dV,
   11024 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   11025                            SEL((order>>2)&3), SEL((order>>0)&3) )
   11026       );
   11027       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11028 #     undef SEL
   11029       goto decode_success;
   11030    }
   11031 
   11032    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   11033       mem) to G(xmm), and copy lower half */
   11034    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
   11035       Int order;
   11036       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   11037       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11038       sV   = newTemp(Ity_V128);
   11039       dV   = newTemp(Ity_V128);
   11040       sVhi = newTemp(Ity_I64);
   11041       dVhi = newTemp(Ity_I64);
   11042       modrm = insn[3];
   11043       if (epartIsReg(modrm)) {
   11044          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11045          order = (Int)insn[4];
   11046          delta += 4+1;
   11047          DIP("pshufhw $%d,%s,%s\n", order,
   11048                                     nameXMMReg(eregOfRM(modrm)),
   11049                                     nameXMMReg(gregOfRM(modrm)));
   11050       } else {
   11051          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11052          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11053 	 order = (Int)insn[3+alen];
   11054          delta += 4+alen;
   11055          DIP("pshufhw $%d,%s,%s\n", order,
   11056                                     dis_buf,
   11057                                     nameXMMReg(gregOfRM(modrm)));
   11058       }
   11059       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   11060       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   11061 
   11062 #     define SEL(n) \
   11063                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11064       assign(dVhi,
   11065 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11066                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11067       );
   11068       assign(dV, binop( Iop_64HLtoV128,
   11069                         mkexpr(dVhi),
   11070                         unop(Iop_V128to64, mkexpr(sV))) );
   11071       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11072 #     undef SEL
   11073       goto decode_success;
   11074    }
   11075 
   11076    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   11077       mem) to G(xmm), and copy upper half */
   11078    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
   11079       Int order;
   11080       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   11081       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11082       sV   = newTemp(Ity_V128);
   11083       dV   = newTemp(Ity_V128);
   11084       sVlo = newTemp(Ity_I64);
   11085       dVlo = newTemp(Ity_I64);
   11086       modrm = insn[3];
   11087       if (epartIsReg(modrm)) {
   11088          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11089          order = (Int)insn[4];
   11090          delta += 4+1;
   11091          DIP("pshuflw $%d,%s,%s\n", order,
   11092                                     nameXMMReg(eregOfRM(modrm)),
   11093                                     nameXMMReg(gregOfRM(modrm)));
   11094       } else {
   11095          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11096          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11097 	 order = (Int)insn[3+alen];
   11098          delta += 4+alen;
   11099          DIP("pshuflw $%d,%s,%s\n", order,
   11100                                     dis_buf,
   11101                                     nameXMMReg(gregOfRM(modrm)));
   11102       }
   11103       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   11104       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   11105 
   11106 #     define SEL(n) \
   11107                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11108       assign(dVlo,
   11109 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11110                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11111       );
   11112       assign(dV, binop( Iop_64HLtoV128,
   11113                         unop(Iop_V128HIto64, mkexpr(sV)),
   11114                         mkexpr(dVlo) ) );
   11115       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11116 #     undef SEL
   11117       goto decode_success;
   11118    }
   11119 
   11120    /* 66 0F 72 /6 ib = PSLLD by immediate */
   11121    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11122        && epartIsReg(insn[2])
   11123        && gregOfRM(insn[2]) == 6) {
   11124       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
   11125       goto decode_success;
   11126    }
   11127 
   11128    /* 66 0F F2 = PSLLD by E */
   11129    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
   11130       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
   11131       goto decode_success;
   11132    }
   11133 
   11134    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   11135    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11136        && epartIsReg(insn[2])
   11137        && gregOfRM(insn[2]) == 7) {
   11138       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11139       Int    imm = (Int)insn[3];
   11140       Int    reg = eregOfRM(insn[2]);
   11141       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   11142       vassert(imm >= 0 && imm <= 255);
   11143       delta += 4;
   11144 
   11145       sV    = newTemp(Ity_V128);
   11146       dV    = newTemp(Ity_V128);
   11147       hi64  = newTemp(Ity_I64);
   11148       lo64  = newTemp(Ity_I64);
   11149       hi64r = newTemp(Ity_I64);
   11150       lo64r = newTemp(Ity_I64);
   11151 
   11152       if (imm >= 16) {
   11153          putXMMReg(reg, mkV128(0x0000));
   11154          goto decode_success;
   11155       }
   11156 
   11157       assign( sV, getXMMReg(reg) );
   11158       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11159       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11160 
   11161       if (imm == 0) {
   11162          assign( lo64r, mkexpr(lo64) );
   11163          assign( hi64r, mkexpr(hi64) );
   11164       }
   11165       else
   11166       if (imm == 8) {
   11167          assign( lo64r, mkU64(0) );
   11168          assign( hi64r, mkexpr(lo64) );
   11169       }
   11170       else
   11171       if (imm > 8) {
   11172          assign( lo64r, mkU64(0) );
   11173          assign( hi64r, binop( Iop_Shl64,
   11174                                mkexpr(lo64),
   11175                                mkU8( 8*(imm-8) ) ));
   11176       } else {
   11177          assign( lo64r, binop( Iop_Shl64,
   11178                                mkexpr(lo64),
   11179                                mkU8(8 * imm) ));
   11180          assign( hi64r,
   11181                  binop( Iop_Or64,
   11182                         binop(Iop_Shl64, mkexpr(hi64),
   11183                                          mkU8(8 * imm)),
   11184                         binop(Iop_Shr64, mkexpr(lo64),
   11185                                          mkU8(8 * (8 - imm)) )
   11186                       )
   11187                );
   11188       }
   11189       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11190       putXMMReg(reg, mkexpr(dV));
   11191       goto decode_success;
   11192    }
   11193 
   11194    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   11195    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11196        && epartIsReg(insn[2])
   11197        && gregOfRM(insn[2]) == 6) {
   11198       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
   11199       goto decode_success;
   11200    }
   11201 
   11202    /* 66 0F F3 = PSLLQ by E */
   11203    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
   11204       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
   11205       goto decode_success;
   11206    }
   11207 
   11208    /* 66 0F 71 /6 ib = PSLLW by immediate */
   11209    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11210        && epartIsReg(insn[2])
   11211        && gregOfRM(insn[2]) == 6) {
   11212       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
   11213       goto decode_success;
   11214    }
   11215 
   11216    /* 66 0F F1 = PSLLW by E */
   11217    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
   11218       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
   11219       goto decode_success;
   11220    }
   11221 
   11222    /* 66 0F 72 /4 ib = PSRAD by immediate */
   11223    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11224        && epartIsReg(insn[2])
   11225        && gregOfRM(insn[2]) == 4) {
   11226       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
   11227       goto decode_success;
   11228    }
   11229 
   11230    /* 66 0F E2 = PSRAD by E */
   11231    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
   11232       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
   11233       goto decode_success;
   11234    }
   11235 
   11236    /* 66 0F 71 /4 ib = PSRAW by immediate */
   11237    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11238        && epartIsReg(insn[2])
   11239        && gregOfRM(insn[2]) == 4) {
   11240       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
   11241       goto decode_success;
   11242    }
   11243 
   11244    /* 66 0F E1 = PSRAW by E */
   11245    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
   11246       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
   11247       goto decode_success;
   11248    }
   11249 
   11250    /* 66 0F 72 /2 ib = PSRLD by immediate */
   11251    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11252        && epartIsReg(insn[2])
   11253        && gregOfRM(insn[2]) == 2) {
   11254       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
   11255       goto decode_success;
   11256    }
   11257 
   11258    /* 66 0F D2 = PSRLD by E */
   11259    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
   11260       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
   11261       goto decode_success;
   11262    }
   11263 
   11264    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   11265    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11266        && epartIsReg(insn[2])
   11267        && gregOfRM(insn[2]) == 3) {
   11268       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11269       Int    imm = (Int)insn[3];
   11270       Int    reg = eregOfRM(insn[2]);
   11271       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   11272       vassert(imm >= 0 && imm <= 255);
   11273       delta += 4;
   11274 
   11275       sV    = newTemp(Ity_V128);
   11276       dV    = newTemp(Ity_V128);
   11277       hi64  = newTemp(Ity_I64);
   11278       lo64  = newTemp(Ity_I64);
   11279       hi64r = newTemp(Ity_I64);
   11280       lo64r = newTemp(Ity_I64);
   11281 
   11282       if (imm >= 16) {
   11283          putXMMReg(reg, mkV128(0x0000));
   11284          goto decode_success;
   11285       }
   11286 
   11287       assign( sV, getXMMReg(reg) );
   11288       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11289       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11290 
   11291       if (imm == 0) {
   11292          assign( lo64r, mkexpr(lo64) );
   11293          assign( hi64r, mkexpr(hi64) );
   11294       }
   11295       else
   11296       if (imm == 8) {
   11297          assign( hi64r, mkU64(0) );
   11298          assign( lo64r, mkexpr(hi64) );
   11299       }
   11300       else
   11301       if (imm > 8) {
   11302          assign( hi64r, mkU64(0) );
   11303          assign( lo64r, binop( Iop_Shr64,
   11304                                mkexpr(hi64),
   11305                                mkU8( 8*(imm-8) ) ));
   11306       } else {
   11307          assign( hi64r, binop( Iop_Shr64,
   11308                                mkexpr(hi64),
   11309                                mkU8(8 * imm) ));
   11310          assign( lo64r,
   11311                  binop( Iop_Or64,
   11312                         binop(Iop_Shr64, mkexpr(lo64),
   11313                                          mkU8(8 * imm)),
   11314                         binop(Iop_Shl64, mkexpr(hi64),
   11315                                          mkU8(8 * (8 - imm)) )
   11316                       )
   11317                );
   11318       }
   11319 
   11320       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11321       putXMMReg(reg, mkexpr(dV));
   11322       goto decode_success;
   11323    }
   11324 
   11325    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   11326    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11327        && epartIsReg(insn[2])
   11328        && gregOfRM(insn[2]) == 2) {
   11329       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
   11330       goto decode_success;
   11331    }
   11332 
   11333    /* 66 0F D3 = PSRLQ by E */
   11334    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
   11335       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
   11336       goto decode_success;
   11337    }
   11338 
   11339    /* 66 0F 71 /2 ib = PSRLW by immediate */
   11340    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11341        && epartIsReg(insn[2])
   11342        && gregOfRM(insn[2]) == 2) {
   11343       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
   11344       goto decode_success;
   11345    }
   11346 
   11347    /* 66 0F D1 = PSRLW by E */
   11348    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
   11349       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
   11350       goto decode_success;
   11351    }
   11352 
   11353    /* 66 0F F8 = PSUBB */
   11354    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
   11355       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11356                                  "psubb", Iop_Sub8x16, False );
   11357       goto decode_success;
   11358    }
   11359 
   11360    /* 66 0F FA = PSUBD */
   11361    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
   11362       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11363                                  "psubd", Iop_Sub32x4, False );
   11364       goto decode_success;
   11365    }
   11366 
   11367    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11368    /* 0F FB = PSUBQ -- sub 64x1 */
   11369    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11370       do_MMX_preamble();
   11371       delta = dis_MMXop_regmem_to_reg (
   11372                 sorb, delta+2, insn[1], "psubq", False );
   11373       goto decode_success;
   11374    }
   11375 
   11376    /* 66 0F FB = PSUBQ */
   11377    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11378       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11379                                  "psubq", Iop_Sub64x2, False );
   11380       goto decode_success;
   11381    }
   11382 
   11383    /* 66 0F F9 = PSUBW */
   11384    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
   11385       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11386                                  "psubw", Iop_Sub16x8, False );
   11387       goto decode_success;
   11388    }
   11389 
   11390    /* 66 0F E8 = PSUBSB */
   11391    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
   11392       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11393                                  "psubsb", Iop_QSub8Sx16, False );
   11394       goto decode_success;
   11395    }
   11396 
   11397    /* 66 0F E9 = PSUBSW */
   11398    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
   11399       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11400                                  "psubsw", Iop_QSub16Sx8, False );
   11401       goto decode_success;
   11402    }
   11403 
   11404    /* 66 0F D8 = PSUBUSB */
   11405    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
   11406       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11407                                  "psubusb", Iop_QSub8Ux16, False );
   11408       goto decode_success;
   11409    }
   11410 
   11411    /* 66 0F D9 = PSUBUSW */
   11412    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
   11413       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11414                                  "psubusw", Iop_QSub16Ux8, False );
   11415       goto decode_success;
   11416    }
   11417 
   11418    /* 66 0F 68 = PUNPCKHBW */
   11419    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
   11420       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11421                                  "punpckhbw",
   11422                                  Iop_InterleaveHI8x16, True );
   11423       goto decode_success;
   11424    }
   11425 
   11426    /* 66 0F 6A = PUNPCKHDQ */
   11427    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
   11428       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11429                                  "punpckhdq",
   11430                                  Iop_InterleaveHI32x4, True );
   11431       goto decode_success;
   11432    }
   11433 
   11434    /* 66 0F 6D = PUNPCKHQDQ */
   11435    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
   11436       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11437                                  "punpckhqdq",
   11438                                  Iop_InterleaveHI64x2, True );
   11439       goto decode_success;
   11440    }
   11441 
   11442    /* 66 0F 69 = PUNPCKHWD */
   11443    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
   11444       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11445                                  "punpckhwd",
   11446                                  Iop_InterleaveHI16x8, True );
   11447       goto decode_success;
   11448    }
   11449 
   11450    /* 66 0F 60 = PUNPCKLBW */
   11451    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
   11452       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11453                                  "punpcklbw",
   11454                                  Iop_InterleaveLO8x16, True );
   11455       goto decode_success;
   11456    }
   11457 
   11458    /* 66 0F 62 = PUNPCKLDQ */
   11459    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
   11460       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11461                                  "punpckldq",
   11462                                  Iop_InterleaveLO32x4, True );
   11463       goto decode_success;
   11464    }
   11465 
   11466    /* 66 0F 6C = PUNPCKLQDQ */
   11467    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
   11468       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11469                                  "punpcklqdq",
   11470                                  Iop_InterleaveLO64x2, True );
   11471       goto decode_success;
   11472    }
   11473 
   11474    /* 66 0F 61 = PUNPCKLWD */
   11475    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
   11476       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11477                                  "punpcklwd",
   11478                                  Iop_InterleaveLO16x8, True );
   11479       goto decode_success;
   11480    }
   11481 
   11482    /* 66 0F EF = PXOR */
   11483    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
   11484       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
   11485       goto decode_success;
   11486    }
   11487 
   11488 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   11489 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   11490 //--        && (!epartIsReg(insn[2]))
   11491 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   11492 //--       Bool store = gregOfRM(insn[2]) == 0;
   11493 //--       vg_assert(sz == 4);
   11494 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   11495 //--       t1   = LOW24(pair);
   11496 //--       eip += 2+HI8(pair);
   11497 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   11498 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   11499 //--                   Lit16, (UShort)insn[2],
   11500 //--                   TempReg, t1 );
   11501 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   11502 //--       goto decode_success;
   11503 //--    }
   11504 
   11505    /* 0F AE /7 = CLFLUSH -- flush cache line */
   11506    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   11507        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   11508 
   11509       /* This is something of a hack.  We need to know the size of the
   11510          cache line containing addr.  Since we don't (easily), assume
   11511          256 on the basis that no real cache would have a line that
   11512          big.  It's safe to invalidate more stuff than we need, just
   11513          inefficient. */
   11514       UInt lineszB = 256;
   11515 
   11516       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11517       delta += 2+alen;
   11518 
   11519       /* Round addr down to the start of the containing block. */
   11520       stmt( IRStmt_Put(
   11521                OFFB_TISTART,
   11522                binop( Iop_And32,
   11523                       mkexpr(addr),
   11524                       mkU32( ~(lineszB-1) ))) );
   11525 
   11526       stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
   11527 
   11528       irsb->jumpkind = Ijk_TInval;
   11529       irsb->next     = mkU32(guest_EIP_bbstart+delta);
   11530       dres.whatNext  = Dis_StopHere;
   11531 
   11532       DIP("clflush %s\n", dis_buf);
   11533       goto decode_success;
   11534    }
   11535 
   11536    /* ---------------------------------------------------- */
   11537    /* --- end of the SSE2 decoder.                     --- */
   11538    /* ---------------------------------------------------- */
   11539 
   11540    /* ---------------------------------------------------- */
   11541    /* --- start of the SSE3 decoder.                   --- */
   11542    /* ---------------------------------------------------- */
   11543 
   11544    /* Skip parts of the decoder which don't apply given the stated
   11545       guest subarchitecture. */
   11546    /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
   11547    /* In fact this is highly bogus; we accept SSE3 insns even on a
   11548       SSE2-only guest since they turn into IR which can be re-emitted
   11549       successfully on an SSE2 host. */
   11550    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   11551       goto after_sse_decoders; /* no SSE3 capabilities */
   11552 
   11553    insn = (UChar*)&guest_code[delta];
   11554 
   11555    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   11556       duplicating some lanes (2:2:0:0). */
   11557    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   11558       duplicating some lanes (3:3:1:1). */
   11559    if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
   11560        && (insn[2] == 0x12 || insn[2] == 0x16)) {
   11561       IRTemp s3, s2, s1, s0;
   11562       IRTemp sV  = newTemp(Ity_V128);
   11563       Bool   isH = insn[2] == 0x16;
   11564       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11565 
   11566       modrm = insn[3];
   11567       if (epartIsReg(modrm)) {
   11568          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11569          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11570                                   nameXMMReg(eregOfRM(modrm)),
   11571                                   nameXMMReg(gregOfRM(modrm)));
   11572          delta += 3+1;
   11573       } else {
   11574          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11575          gen_SEGV_if_not_16_aligned( addr );
   11576          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11577          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11578 	     dis_buf,
   11579              nameXMMReg(gregOfRM(modrm)));
   11580          delta += 3+alen;
   11581       }
   11582 
   11583       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11584       putXMMReg( gregOfRM(modrm),
   11585                  isH ? mk128from32s( s3, s3, s1, s1 )
   11586                      : mk128from32s( s2, s2, s0, s0 ) );
   11587       goto decode_success;
   11588    }
   11589 
   11590    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   11591       duplicating some lanes (0:1:0:1). */
   11592    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
   11593       IRTemp sV = newTemp(Ity_V128);
   11594       IRTemp d0 = newTemp(Ity_I64);
   11595 
   11596       modrm = insn[3];
   11597       if (epartIsReg(modrm)) {
   11598          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11599          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11600                                 nameXMMReg(gregOfRM(modrm)));
   11601          delta += 3+1;
   11602          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   11603       } else {
   11604          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11605          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   11606          DIP("movddup %s,%s\n", dis_buf,
   11607                                 nameXMMReg(gregOfRM(modrm)));
   11608          delta += 3+alen;
   11609       }
   11610 
   11611       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   11612       goto decode_success;
   11613    }
   11614 
   11615    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   11616    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
   11617       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11618       IRTemp eV   = newTemp(Ity_V128);
   11619       IRTemp gV   = newTemp(Ity_V128);
   11620       IRTemp addV = newTemp(Ity_V128);
   11621       IRTemp subV = newTemp(Ity_V128);
   11622       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11623 
   11624       modrm = insn[3];
   11625       if (epartIsReg(modrm)) {
   11626          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11627          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11628                                  nameXMMReg(gregOfRM(modrm)));
   11629          delta += 3+1;
   11630       } else {
   11631          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11632          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11633          DIP("addsubps %s,%s\n", dis_buf,
   11634                                  nameXMMReg(gregOfRM(modrm)));
   11635          delta += 3+alen;
   11636       }
   11637 
   11638       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11639 
   11640       assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
   11641       assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
   11642 
   11643       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   11644       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   11645 
   11646       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
   11647       goto decode_success;
   11648    }
   11649 
   11650    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   11651    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
   11652       IRTemp eV   = newTemp(Ity_V128);
   11653       IRTemp gV   = newTemp(Ity_V128);
   11654       IRTemp addV = newTemp(Ity_V128);
   11655       IRTemp subV = newTemp(Ity_V128);
   11656       IRTemp a1     = newTemp(Ity_I64);
   11657       IRTemp s0     = newTemp(Ity_I64);
   11658 
   11659       modrm = insn[2];
   11660       if (epartIsReg(modrm)) {
   11661          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11662          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11663                                  nameXMMReg(gregOfRM(modrm)));
   11664          delta += 2+1;
   11665       } else {
   11666          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11667          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11668          DIP("addsubpd %s,%s\n", dis_buf,
   11669                                  nameXMMReg(gregOfRM(modrm)));
   11670          delta += 2+alen;
   11671       }
   11672 
   11673       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11674 
   11675       assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
   11676       assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
   11677 
   11678       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11679       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11680 
   11681       putXMMReg( gregOfRM(modrm),
   11682                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11683       goto decode_success;
   11684    }
   11685 
   11686    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   11687    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   11688    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
   11689        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
   11690       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   11691       IRTemp eV     = newTemp(Ity_V128);
   11692       IRTemp gV     = newTemp(Ity_V128);
   11693       IRTemp leftV  = newTemp(Ity_V128);
   11694       IRTemp rightV = newTemp(Ity_V128);
   11695       Bool   isAdd  = insn[2] == 0x7C;
   11696       HChar* str    = isAdd ? "add" : "sub";
   11697       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   11698 
   11699       modrm = insn[3];
   11700       if (epartIsReg(modrm)) {
   11701          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11702          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11703                                    nameXMMReg(gregOfRM(modrm)));
   11704          delta += 3+1;
   11705       } else {
   11706          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11707          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11708          DIP("h%sps %s,%s\n", str, dis_buf,
   11709                                    nameXMMReg(gregOfRM(modrm)));
   11710          delta += 3+alen;
   11711       }
   11712 
   11713       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11714 
   11715       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   11716       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   11717 
   11718       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   11719       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   11720 
   11721       putXMMReg( gregOfRM(modrm),
   11722                  binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   11723                        mkexpr(leftV), mkexpr(rightV) ) );
   11724       goto decode_success;
   11725    }
   11726 
   11727    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   11728    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   11729    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   11730       IRTemp e1     = newTemp(Ity_I64);
   11731       IRTemp e0     = newTemp(Ity_I64);
   11732       IRTemp g1     = newTemp(Ity_I64);
   11733       IRTemp g0     = newTemp(Ity_I64);
   11734       IRTemp eV     = newTemp(Ity_V128);
   11735       IRTemp gV     = newTemp(Ity_V128);
   11736       IRTemp leftV  = newTemp(Ity_V128);
   11737       IRTemp rightV = newTemp(Ity_V128);
   11738       Bool   isAdd  = insn[1] == 0x7C;
   11739       HChar* str    = isAdd ? "add" : "sub";
   11740 
   11741       modrm = insn[2];
   11742       if (epartIsReg(modrm)) {
   11743          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11744          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11745                                    nameXMMReg(gregOfRM(modrm)));
   11746          delta += 2+1;
   11747       } else {
   11748          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11749          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11750          DIP("h%spd %s,%s\n", str, dis_buf,
   11751                               nameXMMReg(gregOfRM(modrm)));
   11752          delta += 2+alen;
   11753       }
   11754 
   11755       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11756 
   11757       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   11758       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   11759       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   11760       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   11761 
   11762       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   11763       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   11764 
   11765       putXMMReg( gregOfRM(modrm),
   11766                  binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   11767                        mkexpr(leftV), mkexpr(rightV) ) );
   11768       goto decode_success;
   11769    }
   11770 
   11771    /* F2 0F F0 = LDDQU -- move from E (mem only) to G (xmm). */
   11772    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
   11773       modrm = getIByte(delta+3);
   11774       if (epartIsReg(modrm)) {
   11775          goto decode_failure;
   11776       } else {
   11777          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11778          putXMMReg( gregOfRM(modrm),
   11779                     loadLE(Ity_V128, mkexpr(addr)) );
   11780          DIP("lddqu %s,%s\n", dis_buf,
   11781                               nameXMMReg(gregOfRM(modrm)));
   11782          delta += 3+alen;
   11783       }
   11784       goto decode_success;
   11785    }
   11786 
   11787    /* ---------------------------------------------------- */
   11788    /* --- end of the SSE3 decoder.                     --- */
   11789    /* ---------------------------------------------------- */
   11790 
   11791    /* ---------------------------------------------------- */
   11792    /* --- start of the SSSE3 decoder.                  --- */
   11793    /* ---------------------------------------------------- */
   11794 
   11795    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   11796       Unsigned Bytes (MMX) */
   11797    if (sz == 4
   11798        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   11799       IRTemp sV        = newTemp(Ity_I64);
   11800       IRTemp dV        = newTemp(Ity_I64);
   11801       IRTemp sVoddsSX  = newTemp(Ity_I64);
   11802       IRTemp sVevensSX = newTemp(Ity_I64);
   11803       IRTemp dVoddsZX  = newTemp(Ity_I64);
   11804       IRTemp dVevensZX = newTemp(Ity_I64);
   11805 
   11806       modrm = insn[3];
   11807       do_MMX_preamble();
   11808       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11809 
   11810       if (epartIsReg(modrm)) {
   11811          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11812          delta += 3+1;
   11813          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   11814                                   nameMMXReg(gregOfRM(modrm)));
   11815       } else {
   11816          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11817          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11818          delta += 3+alen;
   11819          DIP("pmaddubsw %s,%s\n", dis_buf,
   11820                                   nameMMXReg(gregOfRM(modrm)));
   11821       }
   11822 
   11823       /* compute dV unsigned x sV signed */
   11824       assign( sVoddsSX,
   11825               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   11826       assign( sVevensSX,
   11827               binop(Iop_SarN16x4,
   11828                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   11829                     mkU8(8)) );
   11830       assign( dVoddsZX,
   11831               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   11832       assign( dVevensZX,
   11833               binop(Iop_ShrN16x4,
   11834                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   11835                     mkU8(8)) );
   11836 
   11837       putMMXReg(
   11838          gregOfRM(modrm),
   11839          binop(Iop_QAdd16Sx4,
   11840                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   11841                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   11842          )
   11843       );
   11844       goto decode_success;
   11845    }
   11846 
   11847    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   11848       Unsigned Bytes (XMM) */
   11849    if (sz == 2
   11850        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   11851       IRTemp sV        = newTemp(Ity_V128);
   11852       IRTemp dV        = newTemp(Ity_V128);
   11853       IRTemp sVoddsSX  = newTemp(Ity_V128);
   11854       IRTemp sVevensSX = newTemp(Ity_V128);
   11855       IRTemp dVoddsZX  = newTemp(Ity_V128);
   11856       IRTemp dVevensZX = newTemp(Ity_V128);
   11857 
   11858       modrm = insn[3];
   11859       assign( dV, getXMMReg(gregOfRM(modrm)) );
   11860 
   11861       if (epartIsReg(modrm)) {
   11862          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11863          delta += 3+1;
   11864          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11865                                   nameXMMReg(gregOfRM(modrm)));
   11866       } else {
   11867          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11868          gen_SEGV_if_not_16_aligned( addr );
   11869          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11870          delta += 3+alen;
   11871          DIP("pmaddubsw %s,%s\n", dis_buf,
   11872                                   nameXMMReg(gregOfRM(modrm)));
   11873       }
   11874 
   11875       /* compute dV unsigned x sV signed */
   11876       assign( sVoddsSX,
   11877               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   11878       assign( sVevensSX,
   11879               binop(Iop_SarN16x8,
   11880                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   11881                     mkU8(8)) );
   11882       assign( dVoddsZX,
   11883               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   11884       assign( dVevensZX,
   11885               binop(Iop_ShrN16x8,
   11886                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   11887                     mkU8(8)) );
   11888 
   11889       putXMMReg(
   11890          gregOfRM(modrm),
   11891          binop(Iop_QAdd16Sx8,
   11892                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   11893                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   11894          )
   11895       );
   11896       goto decode_success;
   11897    }
   11898 
   11899    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   11900    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   11901       mmx) and G to G (mmx). */
   11902    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   11903       mmx) and G to G (mmx). */
   11904    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   11905       to G (mmx). */
   11906    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   11907       to G (mmx). */
   11908    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   11909       to G (mmx). */
   11910    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   11911       to G (mmx). */
   11912 
   11913    if (sz == 4
   11914        && insn[0] == 0x0F && insn[1] == 0x38
   11915        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   11916            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   11917       HChar* str    = "???";
   11918       IROp   opV64  = Iop_INVALID;
   11919       IROp   opCatO = Iop_CatOddLanes16x4;
   11920       IROp   opCatE = Iop_CatEvenLanes16x4;
   11921       IRTemp sV     = newTemp(Ity_I64);
   11922       IRTemp dV     = newTemp(Ity_I64);
   11923 
   11924       modrm = insn[3];
   11925 
   11926       switch (insn[2]) {
   11927          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   11928          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   11929          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   11930          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   11931          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   11932          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   11933          default: vassert(0);
   11934       }
   11935       if (insn[2] == 0x02 || insn[2] == 0x06) {
   11936          opCatO = Iop_InterleaveHI32x2;
   11937          opCatE = Iop_InterleaveLO32x2;
   11938       }
   11939 
   11940       do_MMX_preamble();
   11941       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11942 
   11943       if (epartIsReg(modrm)) {
   11944          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11945          delta += 3+1;
   11946          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   11947                                   nameMMXReg(gregOfRM(modrm)));
   11948       } else {
   11949          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11950          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11951          delta += 3+alen;
   11952          DIP("ph%s %s,%s\n", str, dis_buf,
   11953                                   nameMMXReg(gregOfRM(modrm)));
   11954       }
   11955 
   11956       putMMXReg(
   11957          gregOfRM(modrm),
   11958          binop(opV64,
   11959                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   11960                binop(opCatO,mkexpr(sV),mkexpr(dV))
   11961          )
   11962       );
   11963       goto decode_success;
   11964    }
   11965 
   11966    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   11967       xmm) and G to G (xmm). */
   11968    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   11969       xmm) and G to G (xmm). */
   11970    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   11971       G to G (xmm). */
   11972    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   11973       G to G (xmm). */
   11974    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   11975       G to G (xmm). */
   11976    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   11977       G to G (xmm). */
   11978 
   11979    if (sz == 2
   11980        && insn[0] == 0x0F && insn[1] == 0x38
   11981        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   11982            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   11983       HChar* str    = "???";
   11984       IROp   opV64  = Iop_INVALID;
   11985       IROp   opCatO = Iop_CatOddLanes16x4;
   11986       IROp   opCatE = Iop_CatEvenLanes16x4;
   11987       IRTemp sV     = newTemp(Ity_V128);
   11988       IRTemp dV     = newTemp(Ity_V128);
   11989       IRTemp sHi    = newTemp(Ity_I64);
   11990       IRTemp sLo    = newTemp(Ity_I64);
   11991       IRTemp dHi    = newTemp(Ity_I64);
   11992       IRTemp dLo    = newTemp(Ity_I64);
   11993 
   11994       modrm = insn[3];
   11995 
   11996       switch (insn[2]) {
   11997          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   11998          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   11999          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12000          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12001          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12002          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12003          default: vassert(0);
   12004       }
   12005       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12006          opCatO = Iop_InterleaveHI32x2;
   12007          opCatE = Iop_InterleaveLO32x2;
   12008       }
   12009 
   12010       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12011 
   12012       if (epartIsReg(modrm)) {
   12013          assign( sV, getXMMReg( eregOfRM(modrm)) );
   12014          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12015                                   nameXMMReg(gregOfRM(modrm)));
   12016          delta += 3+1;
   12017       } else {
   12018          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12019          gen_SEGV_if_not_16_aligned( addr );
   12020          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12021          DIP("ph%s %s,%s\n", str, dis_buf,
   12022                              nameXMMReg(gregOfRM(modrm)));
   12023          delta += 3+alen;
   12024       }
   12025 
   12026       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12027       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12028       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12029       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12030 
   12031       /* This isn't a particularly efficient way to compute the
   12032          result, but at least it avoids a proliferation of IROps,
   12033          hence avoids complicating all the backends. */
   12034       putXMMReg(
   12035          gregOfRM(modrm),
   12036          binop(Iop_64HLtoV128,
   12037                binop(opV64,
   12038                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   12039                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   12040                ),
   12041                binop(opV64,
   12042                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   12043                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   12044                )
   12045          )
   12046       );
   12047       goto decode_success;
   12048    }
   12049 
   12050    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   12051       (MMX) */
   12052    if (sz == 4
   12053        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12054       IRTemp sV = newTemp(Ity_I64);
   12055       IRTemp dV = newTemp(Ity_I64);
   12056 
   12057       modrm = insn[3];
   12058       do_MMX_preamble();
   12059       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12060 
   12061       if (epartIsReg(modrm)) {
   12062          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12063          delta += 3+1;
   12064          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12065                                  nameMMXReg(gregOfRM(modrm)));
   12066       } else {
   12067          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12068          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12069          delta += 3+alen;
   12070          DIP("pmulhrsw %s,%s\n", dis_buf,
   12071                                  nameMMXReg(gregOfRM(modrm)));
   12072       }
   12073 
   12074       putMMXReg(
   12075          gregOfRM(modrm),
   12076          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   12077       );
   12078       goto decode_success;
   12079    }
   12080 
   12081    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   12082       Scale (XMM) */
   12083    if (sz == 2
   12084        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12085       IRTemp sV  = newTemp(Ity_V128);
   12086       IRTemp dV  = newTemp(Ity_V128);
   12087       IRTemp sHi = newTemp(Ity_I64);
   12088       IRTemp sLo = newTemp(Ity_I64);
   12089       IRTemp dHi = newTemp(Ity_I64);
   12090       IRTemp dLo = newTemp(Ity_I64);
   12091 
   12092       modrm = insn[3];
   12093       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12094 
   12095       if (epartIsReg(modrm)) {
   12096          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12097          delta += 3+1;
   12098          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12099                                  nameXMMReg(gregOfRM(modrm)));
   12100       } else {
   12101          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12102          gen_SEGV_if_not_16_aligned( addr );
   12103          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12104          delta += 3+alen;
   12105          DIP("pmulhrsw %s,%s\n", dis_buf,
   12106                                  nameXMMReg(gregOfRM(modrm)));
   12107       }
   12108 
   12109       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12110       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12111       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12112       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12113 
   12114       putXMMReg(
   12115          gregOfRM(modrm),
   12116          binop(Iop_64HLtoV128,
   12117                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   12118                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   12119          )
   12120       );
   12121       goto decode_success;
   12122    }
   12123 
   12124    /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   12125    /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   12126    /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   12127    if (sz == 4
   12128        && insn[0] == 0x0F && insn[1] == 0x38
   12129        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12130       IRTemp sV      = newTemp(Ity_I64);
   12131       IRTemp dV      = newTemp(Ity_I64);
   12132       HChar* str     = "???";
   12133       Int    laneszB = 0;
   12134 
   12135       switch (insn[2]) {
   12136          case 0x08: laneszB = 1; str = "b"; break;
   12137          case 0x09: laneszB = 2; str = "w"; break;
   12138          case 0x0A: laneszB = 4; str = "d"; break;
   12139          default: vassert(0);
   12140       }
   12141 
   12142       modrm = insn[3];
   12143       do_MMX_preamble();
   12144       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12145 
   12146       if (epartIsReg(modrm)) {
   12147          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12148          delta += 3+1;
   12149          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12150                                      nameMMXReg(gregOfRM(modrm)));
   12151       } else {
   12152          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12153          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12154          delta += 3+alen;
   12155          DIP("psign%s %s,%s\n", str, dis_buf,
   12156                                      nameMMXReg(gregOfRM(modrm)));
   12157       }
   12158 
   12159       putMMXReg(
   12160          gregOfRM(modrm),
   12161          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   12162       );
   12163       goto decode_success;
   12164    }
   12165 
   12166    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   12167    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   12168    /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   12169    if (sz == 2
   12170        && insn[0] == 0x0F && insn[1] == 0x38
   12171        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12172       IRTemp sV      = newTemp(Ity_V128);
   12173       IRTemp dV      = newTemp(Ity_V128);
   12174       IRTemp sHi     = newTemp(Ity_I64);
   12175       IRTemp sLo     = newTemp(Ity_I64);
   12176       IRTemp dHi     = newTemp(Ity_I64);
   12177       IRTemp dLo     = newTemp(Ity_I64);
   12178       HChar* str     = "???";
   12179       Int    laneszB = 0;
   12180 
   12181       switch (insn[2]) {
   12182          case 0x08: laneszB = 1; str = "b"; break;
   12183          case 0x09: laneszB = 2; str = "w"; break;
   12184          case 0x0A: laneszB = 4; str = "d"; break;
   12185          default: vassert(0);
   12186       }
   12187 
   12188       modrm = insn[3];
   12189       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12190 
   12191       if (epartIsReg(modrm)) {
   12192          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12193          delta += 3+1;
   12194          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12195                                      nameXMMReg(gregOfRM(modrm)));
   12196       } else {
   12197          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12198          gen_SEGV_if_not_16_aligned( addr );
   12199          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12200          delta += 3+alen;
   12201          DIP("psign%s %s,%s\n", str, dis_buf,
   12202                                      nameXMMReg(gregOfRM(modrm)));
   12203       }
   12204 
   12205       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12206       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12207       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12208       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12209 
   12210       putXMMReg(
   12211          gregOfRM(modrm),
   12212          binop(Iop_64HLtoV128,
   12213                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   12214                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   12215          )
   12216       );
   12217       goto decode_success;
   12218    }
   12219 
   12220    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   12221    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   12222    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   12223    if (sz == 4
   12224        && insn[0] == 0x0F && insn[1] == 0x38
   12225        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12226       IRTemp sV      = newTemp(Ity_I64);
   12227       HChar* str     = "???";
   12228       Int    laneszB = 0;
   12229 
   12230       switch (insn[2]) {
   12231          case 0x1C: laneszB = 1; str = "b"; break;
   12232          case 0x1D: laneszB = 2; str = "w"; break;
   12233          case 0x1E: laneszB = 4; str = "d"; break;
   12234          default: vassert(0);
   12235       }
   12236 
   12237       modrm = insn[3];
   12238       do_MMX_preamble();
   12239 
   12240       if (epartIsReg(modrm)) {
   12241          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12242          delta += 3+1;
   12243          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12244                                     nameMMXReg(gregOfRM(modrm)));
   12245       } else {
   12246          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12247          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12248          delta += 3+alen;
   12249          DIP("pabs%s %s,%s\n", str, dis_buf,
   12250                                     nameMMXReg(gregOfRM(modrm)));
   12251       }
   12252 
   12253       putMMXReg(
   12254          gregOfRM(modrm),
   12255          dis_PABS_helper( mkexpr(sV), laneszB )
   12256       );
   12257       goto decode_success;
   12258    }
   12259 
   12260    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   12261    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   12262    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   12263    if (sz == 2
   12264        && insn[0] == 0x0F && insn[1] == 0x38
   12265        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12266       IRTemp sV      = newTemp(Ity_V128);
   12267       IRTemp sHi     = newTemp(Ity_I64);
   12268       IRTemp sLo     = newTemp(Ity_I64);
   12269       HChar* str     = "???";
   12270       Int    laneszB = 0;
   12271 
   12272       switch (insn[2]) {
   12273          case 0x1C: laneszB = 1; str = "b"; break;
   12274          case 0x1D: laneszB = 2; str = "w"; break;
   12275          case 0x1E: laneszB = 4; str = "d"; break;
   12276          default: vassert(0);
   12277       }
   12278 
   12279       modrm = insn[3];
   12280 
   12281       if (epartIsReg(modrm)) {
   12282          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12283          delta += 3+1;
   12284          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12285                                     nameXMMReg(gregOfRM(modrm)));
   12286       } else {
   12287          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12288          gen_SEGV_if_not_16_aligned( addr );
   12289          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12290          delta += 3+alen;
   12291          DIP("pabs%s %s,%s\n", str, dis_buf,
   12292                                     nameXMMReg(gregOfRM(modrm)));
   12293       }
   12294 
   12295       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12296       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12297 
   12298       putXMMReg(
   12299          gregOfRM(modrm),
   12300          binop(Iop_64HLtoV128,
   12301                dis_PABS_helper( mkexpr(sHi), laneszB ),
   12302                dis_PABS_helper( mkexpr(sLo), laneszB )
   12303          )
   12304       );
   12305       goto decode_success;
   12306    }
   12307 
   12308    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   12309    if (sz == 4
   12310        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12311       IRTemp sV  = newTemp(Ity_I64);
   12312       IRTemp dV  = newTemp(Ity_I64);
   12313       IRTemp res = newTemp(Ity_I64);
   12314       /* insn[3] is the ModRM byte; its G field names the destination. */
   12315       modrm = insn[3];
   12316       do_MMX_preamble();
   12317       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12318       /* Fetch the source (sV) and the trailing imm8 byte offset (d32). */
   12319       if (epartIsReg(modrm)) {
   12320          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12321          d32 = (UInt)insn[3+1];
   12322          delta += 3+1+1;
   12323          DIP("palignr $%d,%s,%s\n",  (Int)d32,
   12324                                      nameMMXReg(eregOfRM(modrm)),
   12325                                      nameMMXReg(gregOfRM(modrm)));
   12326       } else {
   12327          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12328          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12329          d32 = (UInt)insn[3+alen];
   12330          delta += 3+alen+1;
   12331          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12332                                    dis_buf,
   12333                                    nameMMXReg(gregOfRM(modrm)));
   12334       }
   12335       /* res = bytes d32..d32+7 of dV:sV (dV is the high half), zero-filled. */
   12336       if (d32 == 0) {
   12337          assign( res, mkexpr(sV) );
   12338       }
   12339       else if (d32 >= 1 && d32 <= 7) {
   12340          assign(res,
   12341                 binop(Iop_Or64,
   12342                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
   12343                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
   12344                      )));
   12345       }
   12346       else if (d32 == 8) {
   12347          assign( res, mkexpr(dV) );
   12348       }
   12349       else if (d32 >= 9 && d32 <= 15) {
   12350          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
   12351       }
   12352       else if (d32 >= 16 && d32 <= 255) {
   12353          assign( res, mkU64(0) );
   12354       }
   12355       else
   12356          vassert(0);
   12357 
   12358       putMMXReg( gregOfRM(modrm), mkexpr(res) );
   12359       goto decode_success;
   12360    }
   12361 
   12362    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   12363    if (sz == 2
   12364        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12365       IRTemp sV  = newTemp(Ity_V128);
   12366       IRTemp dV  = newTemp(Ity_V128);
   12367       IRTemp sHi = newTemp(Ity_I64);
   12368       IRTemp sLo = newTemp(Ity_I64);
   12369       IRTemp dHi = newTemp(Ity_I64);
   12370       IRTemp dLo = newTemp(Ity_I64);
   12371       IRTemp rHi = newTemp(Ity_I64);
   12372       IRTemp rLo = newTemp(Ity_I64);
   12373 
   12374       modrm = insn[3];
   12375       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12376 
   12377       if (epartIsReg(modrm)) {
   12378          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12379          d32 = (UInt)insn[3+1];
   12380          delta += 3+1+1;
   12381          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12382                                     nameXMMReg(eregOfRM(modrm)),
   12383                                     nameXMMReg(gregOfRM(modrm)));
   12384       } else {
   12385          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12386          gen_SEGV_if_not_16_aligned( addr );
   12387          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12388          d32 = (UInt)insn[3+alen];
   12389          delta += 3+alen+1;
   12390          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12391                                     dis_buf,
   12392                                     nameXMMReg(gregOfRM(modrm)));
   12393       }
   12394 
   12395       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12396       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12397       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12398       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12399 
   12400       if (d32 == 0) {
   12401          assign( rHi, mkexpr(sHi) );
   12402          assign( rLo, mkexpr(sLo) );
   12403       }
   12404       else if (d32 >= 1 && d32 <= 7) {
   12405          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
   12406          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
   12407       }
   12408       else if (d32 == 8) {
   12409          assign( rHi, mkexpr(dLo) );
   12410          assign( rLo, mkexpr(sHi) );
   12411       }
   12412       else if (d32 >= 9 && d32 <= 15) {
   12413          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
   12414          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
   12415       }
   12416       else if (d32 == 16) {
   12417          assign( rHi, mkexpr(dHi) );
   12418          assign( rLo, mkexpr(dLo) );
   12419       }
   12420       else if (d32 >= 17 && d32 <= 23) {
   12421          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
   12422          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
   12423       }
   12424       else if (d32 == 24) {
   12425          assign( rHi, mkU64(0) );
   12426          assign( rLo, mkexpr(dHi) );
   12427       }
   12428       else if (d32 >= 25 && d32 <= 31) {
   12429          assign( rHi, mkU64(0) );
   12430          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
   12431       }
   12432       else if (d32 >= 32 && d32 <= 255) {
   12433          assign( rHi, mkU64(0) );
   12434          assign( rLo, mkU64(0) );
   12435       }
   12436       else
   12437          vassert(0);
   12438 
   12439       putXMMReg(
   12440          gregOfRM(modrm),
   12441          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12442       );
   12443       goto decode_success;
   12444    }
   12445 
   12446    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   12447    if (sz == 4
   12448        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12449       IRTemp sV      = newTemp(Ity_I64);
   12450       IRTemp dV      = newTemp(Ity_I64);
   12451 
   12452       modrm = insn[3];
   12453       do_MMX_preamble();
   12454       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12455 
   12456       if (epartIsReg(modrm)) {
   12457          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12458          delta += 3+1;
   12459          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12460                                nameMMXReg(gregOfRM(modrm)));
   12461       } else {
   12462          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12463          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12464          delta += 3+alen;
   12465          DIP("pshufb %s,%s\n", dis_buf,
   12466                                nameMMXReg(gregOfRM(modrm)));
   12467       }
   12468 
   12469       putMMXReg(
   12470          gregOfRM(modrm),
   12471          binop(
   12472             Iop_And64,
   12473             /* permute the lanes */
   12474             binop(
   12475                Iop_Perm8x8,
   12476                mkexpr(dV),
   12477                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   12478             ),
   12479             /* mask off lanes which have (index & 0x80) == 0x80 */
   12480             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   12481          )
   12482       );
   12483       goto decode_success;
   12484    }
   12485 
   12486    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   12487    if (sz == 2
   12488        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12489       IRTemp sV         = newTemp(Ity_V128);
   12490       IRTemp dV         = newTemp(Ity_V128);
   12491       IRTemp sHi        = newTemp(Ity_I64);
   12492       IRTemp sLo        = newTemp(Ity_I64);
   12493       IRTemp dHi        = newTemp(Ity_I64);
   12494       IRTemp dLo        = newTemp(Ity_I64);
   12495       IRTemp rHi        = newTemp(Ity_I64);
   12496       IRTemp rLo        = newTemp(Ity_I64);
   12497       IRTemp sevens     = newTemp(Ity_I64);
   12498       IRTemp mask0x80hi = newTemp(Ity_I64);
   12499       IRTemp mask0x80lo = newTemp(Ity_I64);
   12500       IRTemp maskBit3hi = newTemp(Ity_I64);
   12501       IRTemp maskBit3lo = newTemp(Ity_I64);
   12502       IRTemp sAnd7hi    = newTemp(Ity_I64);
   12503       IRTemp sAnd7lo    = newTemp(Ity_I64);
   12504       IRTemp permdHi    = newTemp(Ity_I64);
   12505       IRTemp permdLo    = newTemp(Ity_I64);
   12506 
   12507       modrm = insn[3];
   12508       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12509 
   12510       if (epartIsReg(modrm)) {
   12511          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12512          delta += 3+1;
   12513          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12514                                nameXMMReg(gregOfRM(modrm)));
   12515       } else {
   12516          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12517          gen_SEGV_if_not_16_aligned( addr );
   12518          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12519          delta += 3+alen;
   12520          DIP("pshufb %s,%s\n", dis_buf,
   12521                                nameXMMReg(gregOfRM(modrm)));
   12522       }
   12523 
   12524       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12525       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12526       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12527       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12528 
   12529       assign( sevens, mkU64(0x0707070707070707ULL) );
   12530 
   12531       /*
   12532       mask0x80hi = Not(SarN8x8(sHi,7))
   12533       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
   12534       sAnd7hi    = And(sHi,sevens)
   12535       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
   12536                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
   12537       rHi        = And(permdHi,mask0x80hi)
   12538       */
   12539       assign(
   12540          mask0x80hi,
   12541          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
   12542 
   12543       assign(
   12544          maskBit3hi,
   12545          binop(Iop_SarN8x8,
   12546                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
   12547                mkU8(7)));
   12548 
   12549       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
   12550 
   12551       assign(
   12552          permdHi,
   12553          binop(
   12554             Iop_Or64,
   12555             binop(Iop_And64,
   12556                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
   12557                   mkexpr(maskBit3hi)),
   12558             binop(Iop_And64,
   12559                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
   12560                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
   12561 
   12562       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
   12563 
   12564       /* And the same for the lower half of the result.  What fun. */
   12565 
   12566       assign(
   12567          mask0x80lo,
   12568          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
   12569 
   12570       assign(
   12571          maskBit3lo,
   12572          binop(Iop_SarN8x8,
   12573                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
   12574                mkU8(7)));
   12575 
   12576       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
   12577 
   12578       assign(
   12579          permdLo,
   12580          binop(
   12581             Iop_Or64,
   12582             binop(Iop_And64,
   12583                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
   12584                   mkexpr(maskBit3lo)),
   12585             binop(Iop_And64,
   12586                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
   12587                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
   12588 
   12589       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
   12590 
   12591       putXMMReg(
   12592          gregOfRM(modrm),
   12593          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12594       );
   12595       goto decode_success;
   12596    }
   12597 
   12598    /* ---------------------------------------------------- */
   12599    /* --- end of the SSSE3 decoder.                    --- */
   12600    /* ---------------------------------------------------- */
   12601 
   12602    /* ---------------------------------------------------- */
   12603    /* --- start of the SSE4 decoder                    --- */
   12604    /* ---------------------------------------------------- */
   12605 
   12606    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   12607       (Partial implementation only -- only deal with cases where
   12608       the rounding mode is specified directly by the immediate byte.)
   12609       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   12610       (Limitations ditto)
   12611       NB: the 0x0B (ROUNDSD) match below is currently commented out. */
   12612    if (sz == 2
   12613        && insn[0] == 0x0F && insn[1] == 0x3A
   12614        && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
   12615 
   12616       Bool   isD = insn[2] == 0x0B;
   12617       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   12618       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   12619       Int    imm = 0;
   12620 
   12621       modrm = insn[3];
   12622       /* Any imm8 bit outside bits 1:0 makes the decode fail. */
   12623       if (epartIsReg(modrm)) {
   12624          assign( src,
   12625                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
   12626                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
   12627          imm = insn[3+1];
   12628          if (imm & ~3) goto decode_failure;
   12629          delta += 3+1+1;
   12630          DIP( "rounds%c $%d,%s,%s\n",
   12631               isD ? 'd' : 's',
   12632               imm, nameXMMReg( eregOfRM(modrm) ),
   12633                    nameXMMReg( gregOfRM(modrm) ) );
   12634       } else {
   12635          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12636          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   12637          imm = insn[3+alen];
   12638          if (imm & ~3) goto decode_failure;
   12639          delta += 3+alen+1;
   12640          DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
   12641               imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
   12642       }
   12643 
   12644       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   12645          that encoding is the same as the encoding for IRRoundingMode,
   12646          we can use that value directly in the IR as a rounding
   12647          mode. */
   12648       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   12649                   mkU32(imm & 3), mkexpr(src)) );
   12650       /* Only lane 0 of the destination register is written. */
   12651       if (isD)
   12652          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
   12653       else
   12654          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
   12655 
   12656       goto decode_success;
   12657    }
   12658 
   12659    /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
   12660       which we can only decode if we're sure this is an AMD cpu that
   12661       supports LZCNT, since otherwise it's BSR, which behaves
   12662       differently. */
   12663    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
   12664        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
   12665       vassert(sz == 2 || sz == 4);
   12666       /*IRType*/ ty  = szToITy(sz);
   12667       IRTemp     src = newTemp(ty);
   12668       modrm = insn[3];
   12669       if (epartIsReg(modrm)) {
   12670          assign(src, getIReg(sz, eregOfRM(modrm)));
   12671          delta += 3+1;
   12672          DIP("lzcnt%c %s, %s\n", nameISize(sz),
   12673              nameIReg(sz, eregOfRM(modrm)),
   12674              nameIReg(sz, gregOfRM(modrm)));
   12675       } else {
   12676          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12677          assign(src, loadLE(ty, mkexpr(addr)));
   12678          delta += 3+alen;
   12679          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
   12680              nameIReg(sz, gregOfRM(modrm)));
   12681       }
   12682 
   12683       IRTemp res = gen_LZCNT(ty, src);
   12684       putIReg(sz, gregOfRM(modrm), mkexpr(res));
   12685 
   12686       // Update flags.  This is pretty lame .. perhaps can do better
   12687       // if this turns out to be performance critical.
   12688       // O S A P are cleared.  Z is set if RESULT == 0.
   12689       // C is set if SRC is zero.
   12690       IRTemp src32 = newTemp(Ity_I32);
   12691       IRTemp res32 = newTemp(Ity_I32);
   12692       assign(src32, widenUto32(mkexpr(src)));
   12693       assign(res32, widenUto32(mkexpr(res)));
   12694 
   12695       IRTemp oszacp = newTemp(Ity_I32);
   12696       assign(
   12697          oszacp,
   12698          binop(Iop_Or32,
   12699                binop(Iop_Shl32,
   12700                      unop(Iop_1Uto32,
   12701                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
   12702                      mkU8(X86G_CC_SHIFT_Z)),
   12703                binop(Iop_Shl32,
   12704                      unop(Iop_1Uto32,
   12705                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
   12706                      mkU8(X86G_CC_SHIFT_C))
   12707          )
   12708       );
   12709 
   12710       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12711       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12712       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12713       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
   12714 
   12715       goto decode_success;
   12716    }
   12717 
   12718    /* ---------------------------------------------------- */
   12719    /* --- end of the SSE4 decoder                      --- */
   12720    /* ---------------------------------------------------- */
   12721 
   12722    after_sse_decoders:
   12723 
   12724    /* ---------------------------------------------------- */
   12725    /* --- deal with misc 0x67 pfxs (addr size override) -- */
   12726    /* ---------------------------------------------------- */
   12727 
   12728    /* 67 E3 = JCXZ (for JECXZ see below) */
   12729    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
   12730       delta += 2;
   12731       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   12732       delta ++;
   12733       stmt( IRStmt_Exit(
   12734                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
   12735                Ijk_Boring,
   12736                IRConst_U32(d32)
   12737             ));
   12738        DIP("jcxz 0x%x\n", d32);
   12739        goto decode_success;
   12740    }
   12741 
   12742    /* ---------------------------------------------------- */
   12743    /* --- start of the baseline insn decoder            -- */
   12744    /* ---------------------------------------------------- */
   12745 
   12746    /* Get the primary opcode. */
   12747    opc = getIByte(delta); delta++;
   12748 
   12749    /* We get here if the current insn isn't SSE, or this CPU doesn't
   12750       support SSE. */
   12751 
   12752    switch (opc) {
   12753 
   12754    /* ------------------------ Control flow --------------- */
   12755 
   12756    case 0xC2: /* RET imm16 */
   12757       d32 = getUDisp16(delta);
   12758       delta += 2;
   12759       dis_ret(d32);
   12760       dres.whatNext = Dis_StopHere;
   12761       DIP("ret %d\n", (Int)d32);
   12762       break;
   12763    case 0xC3: /* RET */
   12764       dis_ret(0);
   12765       dres.whatNext = Dis_StopHere;
   12766       DIP("ret\n");
   12767       break;
   12768 
   12769    case 0xCF: /* IRET */
   12770       /* Note, this is an extremely kludgey and limited implementation
   12771          of iret.  All it really does is:
   12772             popl %EIP; popl %CS; popl %EFLAGS.
   12773          %CS is set but ignored (as it is in (eg) popw %cs). */
   12774       t1 = newTemp(Ity_I32); /* ESP */
   12775       t2 = newTemp(Ity_I32); /* new EIP */
   12776       t3 = newTemp(Ity_I32); /* new CS */
   12777       t4 = newTemp(Ity_I32); /* new EFLAGS */
   12778       assign(t1, getIReg(4,R_ESP));
   12779       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
   12780       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
   12781       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
   12782       /* Get stuff off stack */
   12783       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
   12784       /* set %CS (which is ignored anyway) */
   12785       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
   12786       /* set %EFLAGS */
   12787       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
   12788       /* goto new EIP value */
   12789       jmp_treg(Ijk_Ret,t2);
   12790       dres.whatNext = Dis_StopHere;
   12791       DIP("iret (very kludgey)\n");
   12792       break;
   12793 
   12794    case 0xE8: /* CALL J4 */
   12795       d32 = getUDisp32(delta); delta += 4;
   12796       d32 += (guest_EIP_bbstart+delta);
   12797       /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
   12798       if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
   12799                                          && getIByte(delta) <= 0x5F) {
   12800          /* Specially treat the position-independent-code idiom
   12801                  call X
   12802               X: popl %reg
   12803             as
   12804                  movl %eip, %reg.
   12805             since this generates better code, but for no other reason. */
   12806          Int archReg = getIByte(delta) - 0x58;
   12807          /* vex_printf("-- fPIC thingy\n"); */
   12808          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
   12809          delta++; /* Step over the POP */
   12810          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
   12811       } else {
   12812          /* The normal sequence for a call. */
   12813          t1 = newTemp(Ity_I32);
   12814          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   12815          putIReg(4, R_ESP, mkexpr(t1));
   12816          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
   12817          if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
   12818             /* follow into the call target. */
   12819             dres.whatNext   = Dis_ResteerU;
   12820             dres.continueAt = (Addr64)(Addr32)d32;
   12821          } else {
   12822             jmp_lit(Ijk_Call,d32);
   12823             dres.whatNext = Dis_StopHere;
   12824          }
   12825          DIP("call 0x%x\n",d32);
   12826       }
   12827       break;
   12828 
   12829 //--    case 0xC8: /* ENTER */
   12830 //--       d32 = getUDisp16(eip); eip += 2;
   12831 //--       abyte = getIByte(delta); delta++;
   12832 //--
   12833 //--       vg_assert(sz == 4);
   12834 //--       vg_assert(abyte == 0);
   12835 //--
   12836 //--       t1 = newTemp(cb); t2 = newTemp(cb);
   12837 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   12838 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   12839 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   12840 //--       uLiteral(cb, sz);
   12841 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   12842 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   12843 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   12844 //--       if (d32) {
   12845 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   12846 //--          uLiteral(cb, d32);
   12847 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   12848 //--       }
   12849 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   12850 //--       break;
   12851 
   12852    case 0xC9: /* LEAVE */
   12853       vassert(sz == 4);
   12854       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   12855       assign(t1, getIReg(4,R_EBP));
   12856       /* First PUT ESP looks redundant, but need it because ESP must
   12857          always be up-to-date for Memcheck to work... */
   12858       putIReg(4, R_ESP, mkexpr(t1));
   12859       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   12860       putIReg(4, R_EBP, mkexpr(t2));
   12861       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
   12862       DIP("leave\n");
   12863       break;
   12864 
   12865    /* ---------------- Misc weird-ass insns --------------- */
   12866 
   12867    case 0x27: /* DAA */
   12868    case 0x2F: /* DAS */
   12869    case 0x37: /* AAA */
   12870    case 0x3F: /* AAS */
   12871       /* An ugly implementation for some ugly instructions.  Oh
   12872 	 well. */
   12873       if (sz != 4) goto decode_failure;
   12874       t1 = newTemp(Ity_I32);
   12875       t2 = newTemp(Ity_I32);
   12876       /* Make up a 32-bit value (t1), with the old value of AX in the
   12877          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   12878          bits. */
   12879       assign(t1,
   12880              binop(Iop_16HLto32,
   12881                    unop(Iop_32to16,
   12882                         mk_x86g_calculate_eflags_all()),
   12883                    getIReg(2, R_EAX)
   12884             ));
   12885       /* Call the helper fn, to get a new AX and OSZACP value, and
   12886          poke both back into the guest state.  Also pass the helper
   12887          the actual opcode so it knows which of the 4 instructions it
   12888          is doing the computation for. */
   12889       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
   12890       assign(t2,
   12891               mkIRExprCCall(
   12892                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
   12893                  &x86g_calculate_daa_das_aaa_aas,
   12894                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   12895             ));
   12896      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   12897 
   12898      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12899      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12900      stmt( IRStmt_Put( OFFB_CC_DEP1,
   12901                        binop(Iop_And32,
   12902                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   12903                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   12904                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
   12905                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
   12906                             )
   12907                       )
   12908          );
   12909      /* Set NDEP even though it isn't used.  This makes redundant-PUT
   12910         elimination of previous stores to this field work better. */
   12911      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12912      switch (opc) {
   12913         case 0x27: DIP("daa\n"); break;
   12914         case 0x2F: DIP("das\n"); break;
   12915         case 0x37: DIP("aaa\n"); break;
   12916         case 0x3F: DIP("aas\n"); break;
   12917         default: vassert(0);
   12918      }
   12919      break;
   12920 
   12921    case 0xD4: /* AAM */
   12922    case 0xD5: /* AAD */
   12923       d32 = getIByte(delta); delta++;
   12924       if (sz != 4 || d32 != 10) goto decode_failure;
   12925       t1 = newTemp(Ity_I32);
   12926       t2 = newTemp(Ity_I32);
   12927       /* Make up a 32-bit value (t1), with the old value of AX in the
   12928          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   12929          bits. */
   12930       assign(t1,
   12931              binop(Iop_16HLto32,
   12932                    unop(Iop_32to16,
   12933                         mk_x86g_calculate_eflags_all()),
   12934                    getIReg(2, R_EAX)
   12935             ));
   12936       /* Call the helper fn, to get a new AX and OSZACP value, and
   12937          poke both back into the guest state.  Also pass the helper
   12938          the actual opcode so it knows which of the 2 instructions it
   12939          is doing the computation for. */
   12940       assign(t2,
   12941               mkIRExprCCall(
   12942                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
   12943                  &x86g_calculate_aad_aam,
   12944                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   12945             ));
   12946       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   12947 
   12948       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12949       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12950       stmt( IRStmt_Put( OFFB_CC_DEP1,
   12951                         binop(Iop_And32,
   12952                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   12953                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   12954                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
   12955                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
   12956                              )
   12957                        )
   12958           );
   12959       /* Set NDEP even though it isn't used.  This makes
   12960          redundant-PUT elimination of previous stores to this field
   12961          work better. */
   12962       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12963 
   12964       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   12965       break;
   12966 
   12967    /* ------------------------ CWD/CDQ -------------------- */
   12968 
   12969    case 0x98: /* CBW */
   12970       if (sz == 4) {
   12971          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
   12972          DIP("cwde\n");
   12973       } else {
   12974          vassert(sz == 2);
   12975          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
   12976          DIP("cbw\n");
   12977       }
   12978       break;
   12979 
   12980    case 0x99: /* CWD/CDQ */
   12981       ty = szToITy(sz);
   12982       putIReg(sz, R_EDX,
   12983                   binop(mkSizedOp(ty,Iop_Sar8),
   12984                         getIReg(sz, R_EAX),
   12985                         mkU8(sz == 2 ? 15 : 31)) );
   12986       DIP(sz == 2 ? "cwdq\n" : "cdqq\n");
   12987       break;
   12988 
   12989    /* ------------------------ FPU ops -------------------- */
   12990 
   12991    case 0x9E: /* SAHF */
   12992       codegen_SAHF();
   12993       DIP("sahf\n");
   12994       break;
   12995 
   12996    case 0x9F: /* LAHF */
   12997       codegen_LAHF();
   12998       DIP("lahf\n");
   12999       break;
   13000 
   13001    case 0x9B: /* FWAIT */
   13002       /* ignore? */
   13003       DIP("fwait\n");
   13004       break;
   13005 
   13006    case 0xD8:
   13007    case 0xD9:
   13008    case 0xDA:
   13009    case 0xDB:
   13010    case 0xDC:
   13011    case 0xDD:
   13012    case 0xDE:
   13013    case 0xDF: {
   13014       Int  delta0    = delta;
   13015       Bool decode_OK = False;
   13016       delta = dis_FPU ( &decode_OK, sorb, delta );
   13017       if (!decode_OK) {
   13018          delta = delta0;
   13019          goto decode_failure;
   13020       }
   13021       break;
   13022    }
   13023 
   13024    /* ------------------------ INC & DEC ------------------ */
   13025 
   13026    case 0x40: /* INC eAX */
   13027    case 0x41: /* INC eCX */
   13028    case 0x42: /* INC eDX */
   13029    case 0x43: /* INC eBX */
   13030    case 0x44: /* INC eSP */
   13031    case 0x45: /* INC eBP */
   13032    case 0x46: /* INC eSI */
   13033    case 0x47: /* INC eDI */
   13034       vassert(sz == 2 || sz == 4);
   13035       ty = szToITy(sz);
   13036       t1 = newTemp(ty);
   13037       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
   13038                         getIReg(sz, (UInt)(opc - 0x40)),
   13039                         mkU(ty,1)) );
   13040       setFlags_INC_DEC( True, t1, ty );
   13041       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
   13042       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
   13043       break;
   13044 
   13045    case 0x48: /* DEC eAX */
   13046    case 0x49: /* DEC eCX */
   13047    case 0x4A: /* DEC eDX */
   13048    case 0x4B: /* DEC eBX */
   13049    case 0x4C: /* DEC eSP */
   13050    case 0x4D: /* DEC eBP */
   13051    case 0x4E: /* DEC eSI */
   13052    case 0x4F: /* DEC eDI */
   13053       vassert(sz == 2 || sz == 4);
   13054       ty = szToITy(sz);
   13055       t1 = newTemp(ty);
   13056       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
   13057                         getIReg(sz, (UInt)(opc - 0x48)),
   13058                         mkU(ty,1)) );
   13059       setFlags_INC_DEC( False, t1, ty );
   13060       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
   13061       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
   13062       break;
   13063 
   13064    /* ------------------------ INT ------------------------ */
   13065 
   13066    case 0xCC: /* INT 3 */
   13067       jmp_lit(Ijk_SigTRAP,((Addr32)guest_EIP_bbstart)+delta);
   13068       dres.whatNext = Dis_StopHere;
   13069       DIP("int $0x3\n");
   13070       break;
   13071 
   13072    case 0xCD: /* INT imm8 */
   13073       d32 = getIByte(delta); delta++;
   13074 
   13075       /* For any of the cases where we emit a jump (that is, for all
   13076          currently handled cases), it's important that all ArchRegs
   13077          carry their up-to-date value at this point.  So we declare an
   13078          end-of-block here, which forces any TempRegs caching ArchRegs
   13079          to be flushed. */
   13080 
   13081       /* Handle int $0x40 .. $0x43 by synthesising a segfault and a
   13082          restart of this instruction (hence the "-2" two lines below,
   13083          to get the restart EIP to be this instruction).  This is
   13084          probably Linux-specific and it would be more correct to only
   13085          do this if the VexAbiInfo says that is what we should do. */
   13086       if (d32 >= 0x40 && d32 <= 0x43) {
   13087          jmp_lit(Ijk_SigSEGV,((Addr32)guest_EIP_bbstart)+delta-2);
   13088          dres.whatNext = Dis_StopHere;
   13089          DIP("int $0x%x\n", (Int)d32);
   13090          break;
   13091       }
   13092 
   13093       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
   13094          (darwin syscalls).  As part of this, note where we are, so we
   13095          can back up the guest to this point if the syscall needs to
   13096          be restarted. */
   13097       if (d32 == 0x80) {
   13098          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13099                            mkU32(guest_EIP_curr_instr) ) );
   13100          jmp_lit(Ijk_Sys_int128,((Addr32)guest_EIP_bbstart)+delta);
   13101          dres.whatNext = Dis_StopHere;
   13102          DIP("int $0x80\n");
   13103          break;
   13104       }
   13105       if (d32 == 0x81) {
   13106          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13107                            mkU32(guest_EIP_curr_instr) ) );
   13108          jmp_lit(Ijk_Sys_int129,((Addr32)guest_EIP_bbstart)+delta);
   13109          dres.whatNext = Dis_StopHere;
   13110          DIP("int $0x81\n");
   13111          break;
   13112       }
   13113       if (d32 == 0x82) {
   13114          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13115                            mkU32(guest_EIP_curr_instr) ) );
   13116          jmp_lit(Ijk_Sys_int130,((Addr32)guest_EIP_bbstart)+delta);
   13117          dres.whatNext = Dis_StopHere;
   13118          DIP("int $0x82\n");
   13119          break;
   13120       }
   13121 
   13122       /* none of the above */
   13123       goto decode_failure;
   13124 
   13125    /* ------------------------ Jcond, byte offset --------- */
   13126 
   13127    case 0xEB: /* Jb (jump, byte offset) */
   13128       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13129       delta++;
   13130       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13131          dres.whatNext   = Dis_ResteerU;
   13132          dres.continueAt = (Addr64)(Addr32)d32;
   13133       } else {
   13134          jmp_lit(Ijk_Boring,d32);
   13135          dres.whatNext = Dis_StopHere;
   13136       }
   13137       DIP("jmp-8 0x%x\n", d32);
   13138       break;
   13139 
   13140    case 0xE9: /* Jv (jump, 16/32 offset) */
   13141       vassert(sz == 4); /* JRS added 2004 July 11 */
   13142       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
   13143       delta += sz;
   13144       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13145          dres.whatNext   = Dis_ResteerU;
   13146          dres.continueAt = (Addr64)(Addr32)d32;
   13147       } else {
   13148          jmp_lit(Ijk_Boring,d32);
   13149          dres.whatNext = Dis_StopHere;
   13150       }
   13151       DIP("jmp 0x%x\n", d32);
   13152       break;
   13153 
   13154    case 0x70:
   13155    case 0x71:
   13156    case 0x72: /* JBb/JNAEb (jump below) */
   13157    case 0x73: /* JNBb/JAEb (jump not below) */
   13158    case 0x74: /* JZb/JEb (jump zero) */
   13159    case 0x75: /* JNZb/JNEb (jump not zero) */
   13160    case 0x76: /* JBEb/JNAb (jump below or equal) */
   13161    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   13162    case 0x78: /* JSb (jump negative) */
   13163    case 0x79: /* JNSb (jump not negative) */
   13164    case 0x7A: /* JP (jump parity even) */
   13165    case 0x7B: /* JNP/JPO (jump parity odd) */
   13166    case 0x7C: /* JLb/JNGEb (jump less) */
   13167    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   13168    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   13169    case 0x7F: /* JGb/JNLEb (jump greater) */
   13170     { Int    jmpDelta;
   13171       HChar* comment  = "";
   13172       jmpDelta = (Int)getSDisp8(delta);
   13173       vassert(-128 <= jmpDelta && jmpDelta < 128);
   13174       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
   13175       delta++;
   13176       if (resteerCisOk
   13177           && vex_control.guest_chase_cond
   13178           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13179           && jmpDelta < 0
   13180           && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13181          /* Speculation: assume this backward branch is taken.  So we
   13182             need to emit a side-exit to the insn following this one,
   13183             on the negation of the condition, and continue at the
   13184             branch target address (d32).  If we wind up back at the
   13185             first instruction of the trace, just stop; it's better to
   13186             let the IR loop unroller handle that case. */
   13187          stmt( IRStmt_Exit(
   13188                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
   13189                   Ijk_Boring,
   13190                   IRConst_U32(guest_EIP_bbstart+delta) ) );
   13191          dres.whatNext   = Dis_ResteerC;
   13192          dres.continueAt = (Addr64)(Addr32)d32;
   13193          comment = "(assumed taken)";
   13194       }
   13195       else
   13196       if (resteerCisOk
   13197           && vex_control.guest_chase_cond
   13198           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13199           && jmpDelta >= 0
   13200           && resteerOkFn( callback_opaque,
   13201                           (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   13202          /* Speculation: assume this forward branch is not taken.  So
   13203             we need to emit a side-exit to d32 (the dest) and continue
   13204             disassembling at the insn immediately following this
   13205             one. */
   13206          stmt( IRStmt_Exit(
   13207                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
   13208                   Ijk_Boring,
   13209                   IRConst_U32(d32) ) );
   13210          dres.whatNext   = Dis_ResteerC;
   13211          dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   13212          comment = "(assumed not taken)";
   13213       }
   13214       else {
   13215          /* Conservative default translation - end the block at this
   13216             point. */
   13217          jcc_01( (X86Condcode)(opc - 0x70),
   13218                  (Addr32)(guest_EIP_bbstart+delta), d32);
   13219          dres.whatNext = Dis_StopHere;
   13220       }
   13221       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
   13222       break;
   13223     }
   13224 
   13225    case 0xE3: /* JECXZ (for JCXZ see above) */
   13226       if (sz != 4) goto decode_failure;
   13227       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13228       delta ++;
   13229       stmt( IRStmt_Exit(
   13230                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
   13231             Ijk_Boring,
   13232             IRConst_U32(d32)
   13233           ));
   13234       DIP("jecxz 0x%x\n", d32);
   13235       break;
   13236 
   13237    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   13238    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   13239    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   13240     { /* Again, the docs say this uses ECX/CX as a count depending on
   13241          the address size override, not the operand one.  Since we
   13242          don't handle address size overrides, I guess that means
   13243          ECX. */
   13244       IRExpr* zbit  = NULL;
   13245       IRExpr* count = NULL;
   13246       IRExpr* cond  = NULL;
   13247       HChar*  xtra  = NULL;
   13248 
   13249       if (sz != 4) goto decode_failure;
   13250       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13251       delta++;
   13252       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
   13253 
   13254       count = getIReg(4,R_ECX);
   13255       cond = binop(Iop_CmpNE32, count, mkU32(0));
   13256       switch (opc) {
   13257          case 0xE2:
   13258             xtra = "";
   13259             break;
   13260          case 0xE1:
   13261             xtra = "e";
   13262             zbit = mk_x86g_calculate_condition( X86CondZ );
   13263 	    cond = mkAnd1(cond, zbit);
   13264             break;
   13265          case 0xE0:
   13266             xtra = "ne";
   13267             zbit = mk_x86g_calculate_condition( X86CondNZ );
   13268 	    cond = mkAnd1(cond, zbit);
   13269             break;
   13270          default:
   13271 	    vassert(0);
   13272       }
   13273       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32)) );
   13274 
   13275       DIP("loop%s 0x%x\n", xtra, d32);
   13276       break;
   13277     }
   13278 
   13279    /* ------------------------ IMUL ----------------------- */
   13280 
   13281    case 0x69: /* IMUL Iv, Ev, Gv */
   13282       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
   13283       break;
   13284    case 0x6B: /* IMUL Ib, Ev, Gv */
   13285       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
   13286       break;
   13287 
   13288    /* ------------------------ MOV ------------------------ */
   13289 
   13290    case 0x88: /* MOV Gb,Eb */
   13291       delta = dis_mov_G_E(sorb, 1, delta);
   13292       break;
   13293 
   13294    case 0x89: /* MOV Gv,Ev */
   13295       delta = dis_mov_G_E(sorb, sz, delta);
   13296       break;
   13297 
   13298    case 0x8A: /* MOV Eb,Gb */
   13299       delta = dis_mov_E_G(sorb, 1, delta);
   13300       break;
   13301 
   13302    case 0x8B: /* MOV Ev,Gv */
   13303       delta = dis_mov_E_G(sorb, sz, delta);
   13304       break;
   13305 
   13306    case 0x8D: /* LEA M,Gv */
   13307       if (sz != 4)
   13308          goto decode_failure;
   13309       modrm = getIByte(delta);
   13310       if (epartIsReg(modrm))
   13311          goto decode_failure;
   13312       /* NOTE!  this is the one place where a segment override prefix
   13313          has no effect on the address calculation.  Therefore we pass
   13314          zero instead of sorb here. */
   13315       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
   13316       delta += alen;
   13317       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
   13318       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   13319                             nameIReg(sz,gregOfRM(modrm)));
   13320       break;
   13321 
   13322    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   13323       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   13324       break;
   13325 
   13326    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   13327       delta = dis_mov_Ew_Sw(sorb, delta);
   13328       break;
   13329 
   13330    case 0xA0: /* MOV Ob,AL */
   13331       sz = 1;
   13332       /* Fall through ... */
   13333    case 0xA1: /* MOV Ov,eAX */
   13334       d32 = getUDisp32(delta); delta += 4;
   13335       ty = szToITy(sz);
   13336       addr = newTemp(Ity_I32);
   13337       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13338       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
   13339       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
   13340                                 d32, nameIReg(sz,R_EAX));
   13341       break;
   13342 
   13343    case 0xA2: /* MOV AL,Ob */
   13344       sz = 1;
   13345       /* Fall through ... */
   13346    case 0xA3: /* MOV eAX,Ov */
   13347       d32 = getUDisp32(delta); delta += 4;
   13348       ty = szToITy(sz);
   13349       addr = newTemp(Ity_I32);
   13350       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13351       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
   13352       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
   13353                                 sorbTxt(sorb), d32);
   13354       break;
   13355 
   13356    case 0xB0: /* MOV imm,AL */
   13357    case 0xB1: /* MOV imm,CL */
   13358    case 0xB2: /* MOV imm,DL */
   13359    case 0xB3: /* MOV imm,BL */
   13360    case 0xB4: /* MOV imm,AH */
   13361    case 0xB5: /* MOV imm,CH */
   13362    case 0xB6: /* MOV imm,DH */
   13363    case 0xB7: /* MOV imm,BH */
   13364       d32 = getIByte(delta); delta += 1;
   13365       putIReg(1, opc-0xB0, mkU8(d32));
   13366       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
   13367       break;
   13368 
   13369    case 0xB8: /* MOV imm,eAX */
   13370    case 0xB9: /* MOV imm,eCX */
   13371    case 0xBA: /* MOV imm,eDX */
   13372    case 0xBB: /* MOV imm,eBX */
   13373    case 0xBC: /* MOV imm,eSP */
   13374    case 0xBD: /* MOV imm,eBP */
   13375    case 0xBE: /* MOV imm,eSI */
   13376    case 0xBF: /* MOV imm,eDI */
   13377       d32 = getUDisp(sz,delta); delta += sz;
   13378       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
   13379       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
   13380       break;
   13381 
   13382    case 0xC6: /* MOV Ib,Eb */
   13383       sz = 1;
   13384       goto do_Mov_I_E;
   13385    case 0xC7: /* MOV Iv,Ev */
   13386       goto do_Mov_I_E;
   13387 
   13388    do_Mov_I_E:
   13389       modrm = getIByte(delta);
   13390       if (epartIsReg(modrm)) {
   13391          delta++; /* mod/rm byte */
   13392          d32 = getUDisp(sz,delta); delta += sz;
   13393          putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
   13394          DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
   13395                                   nameIReg(sz,eregOfRM(modrm)));
   13396       } else {
   13397          addr = disAMode ( &alen, sorb, delta, dis_buf );
   13398          delta += alen;
   13399          d32 = getUDisp(sz,delta); delta += sz;
   13400          storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
   13401          DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   13402       }
   13403       break;
   13404 
   13405    /* ------------------------ opl imm, A ----------------- */
   13406 
   13407    case 0x04: /* ADD Ib, AL */
   13408       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
   13409       break;
   13410    case 0x05: /* ADD Iv, eAX */
   13411       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
   13412       break;
   13413 
   13414    case 0x0C: /* OR Ib, AL */
   13415       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
   13416       break;
   13417    case 0x0D: /* OR Iv, eAX */
   13418       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   13419       break;
   13420 
   13421    case 0x14: /* ADC Ib, AL */
   13422       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
   13423       break;
   13424    case 0x15: /* ADC Iv, eAX */
   13425       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   13426       break;
   13427 
   13428    case 0x1C: /* SBB Ib, AL */
   13429       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   13430       break;
   13431    case 0x1D: /* SBB Iv, eAX */
   13432       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   13433       break;
   13434 
   13435    case 0x24: /* AND Ib, AL */
   13436       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
   13437       break;
   13438    case 0x25: /* AND Iv, eAX */
   13439       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   13440       break;
   13441 
   13442    case 0x2C: /* SUB Ib, AL */
   13443       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
   13444       break;
   13445    case 0x2D: /* SUB Iv, eAX */
   13446       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   13447       break;
   13448 
   13449    case 0x34: /* XOR Ib, AL */
   13450       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
   13451       break;
   13452    case 0x35: /* XOR Iv, eAX */
   13453       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   13454       break;
   13455 
   13456    case 0x3C: /* CMP Ib, AL */
   13457       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
   13458       break;
   13459    case 0x3D: /* CMP Iv, eAX */
   13460       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   13461       break;
   13462 
   13463    case 0xA8: /* TEST Ib, AL */
   13464       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
   13465       break;
   13466    case 0xA9: /* TEST Iv, eAX */
   13467       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   13468       break;
   13469 
   13470    /* ------------------------ opl Ev, Gv ----------------- */
   13471 
   13472    case 0x02: /* ADD Eb,Gb */
   13473       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
   13474       break;
   13475    case 0x03: /* ADD Ev,Gv */
   13476       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
   13477       break;
   13478 
   13479    case 0x0A: /* OR Eb,Gb */
   13480       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
   13481       break;
   13482    case 0x0B: /* OR Ev,Gv */
   13483       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
   13484       break;
   13485 
   13486    case 0x12: /* ADC Eb,Gb */
   13487       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
   13488       break;
   13489    case 0x13: /* ADC Ev,Gv */
   13490       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
   13491       break;
   13492 
   13493    case 0x1A: /* SBB Eb,Gb */
   13494       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
   13495       break;
   13496    case 0x1B: /* SBB Ev,Gv */
   13497       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
   13498       break;
   13499 
   13500    case 0x22: /* AND Eb,Gb */
   13501       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
   13502       break;
   13503    case 0x23: /* AND Ev,Gv */
   13504       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
   13505       break;
   13506 
   13507    case 0x2A: /* SUB Eb,Gb */
   13508       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
   13509       break;
   13510    case 0x2B: /* SUB Ev,Gv */
   13511       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
   13512       break;
   13513 
   13514    case 0x32: /* XOR Eb,Gb */
   13515       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
   13516       break;
   13517    case 0x33: /* XOR Ev,Gv */
   13518       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
   13519       break;
   13520 
   13521    case 0x3A: /* CMP Eb,Gb */
   13522       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
   13523       break;
   13524    case 0x3B: /* CMP Ev,Gv */
   13525       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
   13526       break;
   13527 
   13528    case 0x84: /* TEST Eb,Gb */
   13529       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
   13530       break;
   13531    case 0x85: /* TEST Ev,Gv */
   13532       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
   13533       break;
   13534 
   13535    /* ------------------------ opl Gv, Ev ----------------- */
   13536 
   13537    case 0x00: /* ADD Gb,Eb */
   13538       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13539                             Iop_Add8, True, 1, delta, "add" );
   13540       break;
   13541    case 0x01: /* ADD Gv,Ev */
   13542       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13543                             Iop_Add8, True, sz, delta, "add" );
   13544       break;
   13545 
   13546    case 0x08: /* OR Gb,Eb */
   13547       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13548                             Iop_Or8, True, 1, delta, "or" );
   13549       break;
   13550    case 0x09: /* OR Gv,Ev */
   13551       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13552                             Iop_Or8, True, sz, delta, "or" );
   13553       break;
   13554 
   13555    case 0x10: /* ADC Gb,Eb */
   13556       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13557                             Iop_Add8, True, 1, delta, "adc" );
   13558       break;
   13559    case 0x11: /* ADC Gv,Ev */
   13560       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13561                             Iop_Add8, True, sz, delta, "adc" );
   13562       break;
   13563 
   13564    case 0x18: /* SBB Gb,Eb */
   13565       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13566                             Iop_Sub8, True, 1, delta, "sbb" );
   13567       break;
   13568    case 0x19: /* SBB Gv,Ev */
   13569       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13570                             Iop_Sub8, True, sz, delta, "sbb" );
   13571       break;
   13572 
   13573    case 0x20: /* AND Gb,Eb */
   13574       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13575                             Iop_And8, True, 1, delta, "and" );
   13576       break;
   13577    case 0x21: /* AND Gv,Ev */
   13578       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13579                             Iop_And8, True, sz, delta, "and" );
   13580       break;
   13581 
   13582    case 0x28: /* SUB Gb,Eb */
   13583       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13584                             Iop_Sub8, True, 1, delta, "sub" );
   13585       break;
   13586    case 0x29: /* SUB Gv,Ev */
   13587       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13588                             Iop_Sub8, True, sz, delta, "sub" );
   13589       break;
   13590 
   13591    case 0x30: /* XOR Gb,Eb */
   13592       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13593                             Iop_Xor8, True, 1, delta, "xor" );
   13594       break;
   13595    case 0x31: /* XOR Gv,Ev */
   13596       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13597                             Iop_Xor8, True, sz, delta, "xor" );
   13598       break;
   13599 
   13600    case 0x38: /* CMP Gb,Eb */
   13601       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13602                             Iop_Sub8, False, 1, delta, "cmp" );
   13603       break;
   13604    case 0x39: /* CMP Gv,Ev */
   13605       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13606                             Iop_Sub8, False, sz, delta, "cmp" );
   13607       break;
   13608 
   13609    /* ------------------------ POP ------------------------ */
   13610 
   13611    case 0x58: /* POP eAX */
   13612    case 0x59: /* POP eCX */
   13613    case 0x5A: /* POP eDX */
   13614    case 0x5B: /* POP eBX */
   13615    case 0x5D: /* POP eBP */
   13616    case 0x5E: /* POP eSI */
   13617    case 0x5F: /* POP eDI */
   13618    case 0x5C: /* POP eSP */
   13619       vassert(sz == 2 || sz == 4);
   13620       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
   13621       assign(t2, getIReg(4, R_ESP));
   13622       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   13623       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13624       putIReg(sz, opc-0x58, mkexpr(t1));
   13625       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
   13626       break;
   13627 
   13628    case 0x9D: /* POPF */
   13629       vassert(sz == 2 || sz == 4);
   13630       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13631       assign(t2, getIReg(4, R_ESP));
   13632       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
   13633       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13634 
   13635       /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
   13636 	 value in t1. */
   13637       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
   13638                                  ((Addr32)guest_EIP_bbstart)+delta );
   13639 
   13640       DIP("popf%c\n", nameISize(sz));
   13641       break;
   13642 
   13643    case 0x61: /* POPA */
   13644       /* This is almost certainly wrong for sz==2.  So ... */
   13645       if (sz != 4) goto decode_failure;
   13646 
   13647       /* t5 is the old %ESP value. */
   13648       t5 = newTemp(Ity_I32);
   13649       assign( t5, getIReg(4, R_ESP) );
   13650 
   13651       /* Reload all the registers, except %esp. */
   13652       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   13653       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   13654       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   13655       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   13656       /* ignore saved %ESP */
   13657       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   13658       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   13659       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   13660 
   13661       /* and move %ESP back up */
   13662       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   13663 
   13664       DIP("popa%c\n", nameISize(sz));
   13665       break;
   13666 
   13667    case 0x8F: /* POPL/POPW m32 */
   13668      { Int    len;
   13669        UChar  rm = getIByte(delta);
   13670 
   13671        /* make sure this instruction is correct POP */
   13672        if (epartIsReg(rm) || gregOfRM(rm) != 0)
   13673           goto decode_failure;
   13674        /* and has correct size */
   13675        if (sz != 4 && sz != 2)
   13676           goto decode_failure;
   13677        ty = szToITy(sz);
   13678 
   13679        t1 = newTemp(Ity_I32); /* stack address */
   13680        t3 = newTemp(ty); /* data */
   13681        /* set t1 to ESP: t1 = ESP */
   13682        assign( t1, getIReg(4, R_ESP) );
   13683        /* load M[ESP] to virtual register t3: t3 = M[t1] */
   13684        assign( t3, loadLE(ty, mkexpr(t1)) );
   13685 
   13686        /* increase ESP; must be done before the STORE.  Intel manual says:
   13687             If the ESP register is used as a base register for addressing
   13688             a destination operand in memory, the POP instruction computes
   13689             the effective address of the operand after it increments the
   13690             ESP register.
   13691        */
   13692        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
   13693 
   13694        /* resolve MODR/M */
   13695        addr = disAMode ( &len, sorb, delta, dis_buf);
   13696        storeLE( mkexpr(addr), mkexpr(t3) );
   13697 
   13698        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
   13699 
   13700        delta += len;
   13701        break;
   13702      }
   13703 
   13704    case 0x1F: /* POP %DS */
   13705       dis_pop_segreg( R_DS, sz ); break;
   13706    case 0x07: /* POP %ES */
   13707       dis_pop_segreg( R_ES, sz ); break;
   13708    case 0x17: /* POP %SS */
   13709       dis_pop_segreg( R_SS, sz ); break;
   13710 
   13711    /* ------------------------ PUSH ----------------------- */
   13712 
   13713    case 0x50: /* PUSH eAX */
   13714    case 0x51: /* PUSH eCX */
   13715    case 0x52: /* PUSH eDX */
   13716    case 0x53: /* PUSH eBX */
   13717    case 0x55: /* PUSH eBP */
   13718    case 0x56: /* PUSH eSI */
   13719    case 0x57: /* PUSH eDI */
   13720    case 0x54: /* PUSH eSP */
   13721       /* This is the Right Way, in that the value to be pushed is
   13722          established before %esp is changed, so that pushl %esp
   13723          correctly pushes the old value. */
   13724       vassert(sz == 2 || sz == 4);
   13725       ty = sz==2 ? Ity_I16 : Ity_I32;
   13726       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
   13727       assign(t1, getIReg(sz, opc-0x50));
   13728       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
   13729       putIReg(4, R_ESP, mkexpr(t2) );
   13730       storeLE(mkexpr(t2),mkexpr(t1));
   13731       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
   13732       break;
   13733 
   13734 
   13735    case 0x68: /* PUSH Iv */
   13736       d32 = getUDisp(sz,delta); delta += sz;
   13737       goto do_push_I;
   13738    case 0x6A: /* PUSH Ib, sign-extended to sz */
   13739       d32 = getSDisp8(delta); delta += 1;
   13740       goto do_push_I;
   13741    do_push_I:
   13742       ty = szToITy(sz);
   13743       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
   13744       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   13745       putIReg(4, R_ESP, mkexpr(t1) );
   13746       /* stop mkU16 asserting if d32 is a negative 16-bit number
   13747          (bug #132813) */
   13748       if (ty == Ity_I16)
   13749          d32 &= 0xFFFF;
   13750       storeLE( mkexpr(t1), mkU(ty,d32) );
   13751       DIP("push%c $0x%x\n", nameISize(sz), d32);
   13752       break;
   13753 
   13754    case 0x9C: /* PUSHF */ {
   13755       vassert(sz == 2 || sz == 4);
   13756 
   13757       t1 = newTemp(Ity_I32);
   13758       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   13759       putIReg(4, R_ESP, mkexpr(t1) );
   13760 
   13761       /* Calculate OSZACP, and patch in fixed fields as per
   13762          Intel docs.
   13763          - bit 1 is always 1
   13764          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
   13765       */
   13766       t2 = newTemp(Ity_I32);
   13767       assign( t2, binop(Iop_Or32,
   13768                         mk_x86g_calculate_eflags_all(),
   13769                         mkU32( (1<<1)|(1<<9) ) ));
   13770 
   13771       /* Patch in the D flag.  This can simply be a copy of bit 10 of
   13772          baseBlock[OFFB_DFLAG]. */
   13773       t3 = newTemp(Ity_I32);
   13774       assign( t3, binop(Iop_Or32,
   13775                         mkexpr(t2),
   13776                         binop(Iop_And32,
   13777                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
   13778                               mkU32(1<<10)))
   13779             );
   13780 
   13781       /* And patch in the ID flag. */
   13782       t4 = newTemp(Ity_I32);
   13783       assign( t4, binop(Iop_Or32,
   13784                         mkexpr(t3),
   13785                         binop(Iop_And32,
   13786                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
   13787                                                mkU8(21)),
   13788                               mkU32(1<<21)))
   13789             );
   13790 
   13791       /* And patch in the AC flag. */
   13792       t5 = newTemp(Ity_I32);
   13793       assign( t5, binop(Iop_Or32,
   13794                         mkexpr(t4),
   13795                         binop(Iop_And32,
   13796                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
   13797                                                mkU8(18)),
   13798                               mkU32(1<<18)))
   13799             );
   13800 
   13801       /* if sz==2, the stored value needs to be narrowed. */
   13802       if (sz == 2)
   13803         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
   13804       else
   13805         storeLE( mkexpr(t1), mkexpr(t5) );
   13806 
   13807       DIP("pushf%c\n", nameISize(sz));
   13808       break;
   13809    }
   13810 
   13811    case 0x60: /* PUSHA */
   13812       /* This is almost certainly wrong for sz==2.  So ... */
   13813       if (sz != 4) goto decode_failure;
   13814 
   13815       /* This is the Right Way, in that the value to be pushed is
   13816          established before %esp is changed, so that pusha
   13817          correctly pushes the old %esp value.  New value of %esp is
   13818          pushed at start. */
   13819       /* t0 is the %ESP value we're going to push. */
   13820       t0 = newTemp(Ity_I32);
   13821       assign( t0, getIReg(4, R_ESP) );
   13822 
   13823       /* t5 will be the new %ESP value. */
   13824       t5 = newTemp(Ity_I32);
   13825       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   13826 
   13827       /* Update guest state before prodding memory. */
   13828       putIReg(4, R_ESP, mkexpr(t5));
   13829 
   13830       /* Dump all the registers. */
   13831       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   13832       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   13833       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   13834       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   13835       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   13836       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   13837       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   13838       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   13839 
   13840       DIP("pusha%c\n", nameISize(sz));
   13841       break;
   13842 
   13843    case 0x0E: /* PUSH %CS */
   13844       dis_push_segreg( R_CS, sz ); break;
   13845    case 0x1E: /* PUSH %DS */
   13846       dis_push_segreg( R_DS, sz ); break;
   13847    case 0x06: /* PUSH %ES */
   13848       dis_push_segreg( R_ES, sz ); break;
   13849    case 0x16: /* PUSH %SS */
   13850       dis_push_segreg( R_SS, sz ); break;
   13851 
   13852    /* ------------------------ SCAS et al ----------------- */
   13853 
   13854    case 0xA4: /* MOVS, no REP prefix */
   13855    case 0xA5:
   13856       if (sorb != 0)
   13857          goto decode_failure; /* else dis_string_op asserts */
   13858       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   13859       break;
   13860 
   13861   case 0xA6: /* CMPSb, no REP prefix */
   13862   case 0xA7:
   13863       if (sorb != 0)
   13864          goto decode_failure; /* else dis_string_op asserts */
   13865       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   13866       break;
   13867 
   13868    case 0xAA: /* STOS, no REP prefix */
   13869    case 0xAB:
   13870       if (sorb != 0)
   13871          goto decode_failure; /* else dis_string_op asserts */
   13872       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
   13873       break;
   13874 
   13875    case 0xAC: /* LODS, no REP prefix */
   13876    case 0xAD:
   13877       if (sorb != 0)
   13878          goto decode_failure; /* else dis_string_op asserts */
   13879       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
   13880       break;
   13881 
   13882    case 0xAE: /* SCAS, no REP prefix */
   13883    case 0xAF:
   13884       if (sorb != 0)
   13885          goto decode_failure; /* else dis_string_op asserts */
   13886       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   13887       break;
   13888 
   13889 
   13890    case 0xFC: /* CLD */
   13891       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
   13892       DIP("cld\n");
   13893       break;
   13894 
   13895    case 0xFD: /* STD */
   13896       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
   13897       DIP("std\n");
   13898       break;
   13899 
   13900    case 0xF8: /* CLC */
   13901    case 0xF9: /* STC */
   13902    case 0xF5: /* CMC */
   13903       t0 = newTemp(Ity_I32);
   13904       t1 = newTemp(Ity_I32);
   13905       assign( t0, mk_x86g_calculate_eflags_all() );
   13906       switch (opc) {
   13907          case 0xF8:
   13908             assign( t1, binop(Iop_And32, mkexpr(t0),
   13909                                          mkU32(~X86G_CC_MASK_C)));
   13910             DIP("clc\n");
   13911             break;
   13912          case 0xF9:
   13913             assign( t1, binop(Iop_Or32, mkexpr(t0),
   13914                                         mkU32(X86G_CC_MASK_C)));
   13915             DIP("stc\n");
   13916             break;
   13917          case 0xF5:
   13918             assign( t1, binop(Iop_Xor32, mkexpr(t0),
   13919                                          mkU32(X86G_CC_MASK_C)));
   13920             DIP("cmc\n");
   13921             break;
   13922          default:
   13923             vpanic("disInstr(x86)(clc/stc/cmc)");
   13924       }
   13925       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13926       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13927       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   13928       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   13929          elimination of previous stores to this field work better. */
   13930       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13931       break;
   13932 
   13933    case 0xD6: /* SALC */
   13934       t0 = newTemp(Ity_I32);
   13935       t1 = newTemp(Ity_I32);
   13936       assign( t0,  binop(Iop_And32,
   13937                          mk_x86g_calculate_eflags_c(),
   13938                          mkU32(1)) );
   13939       assign( t1, binop(Iop_Sar32,
   13940                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
   13941                         mkU8(31)) );
   13942       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
   13943       DIP("salc\n");
   13944       break;
   13945 
   13946    /* REPNE prefix insn */
   13947    case 0xF2: {
   13948       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   13949       if (sorb != 0) goto decode_failure;
   13950       abyte = getIByte(delta); delta++;
   13951 
   13952       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   13953       dres.whatNext = Dis_StopHere;
   13954 
   13955       switch (abyte) {
   13956       /* According to the Intel manual, "repne movs" should never occur, but
   13957        * in practice it has happened, so allow for it here... */
   13958       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   13959       case 0xA5:
   13960          dis_REP_op ( X86CondNZ, dis_MOVS, sz, eip_orig,
   13961                                  guest_EIP_bbstart+delta, "repne movs" );
   13962          break;
   13963 
   13964       case 0xA6: sz = 1;   /* REPNE CMP<sz> */
   13965       case 0xA7:
   13966          dis_REP_op ( X86CondNZ, dis_CMPS, sz, eip_orig,
   13967                                  guest_EIP_bbstart+delta, "repne cmps" );
   13968          break;
   13969 
   13970       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
   13971       case 0xAB:
   13972          dis_REP_op ( X86CondNZ, dis_STOS, sz, eip_orig,
   13973                                  guest_EIP_bbstart+delta, "repne stos" );
   13974          break;
   13975 
   13976       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   13977       case 0xAF:
   13978          dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
   13979                                  guest_EIP_bbstart+delta, "repne scas" );
   13980          break;
   13981 
   13982       default:
   13983          goto decode_failure;
   13984       }
   13985       break;
   13986    }
   13987 
   13988    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
   13989       for the rest, it means REP) */
   13990    case 0xF3: {
   13991       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   13992       if (sorb != 0) goto decode_failure;
   13993       abyte = getIByte(delta); delta++;
   13994 
   13995       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   13996       dres.whatNext = Dis_StopHere;
   13997 
   13998       switch (abyte) {
   13999       case 0xA4: sz = 1;   /* REP MOVS<sz> */
   14000       case 0xA5:
   14001          dis_REP_op ( X86CondAlways, dis_MOVS, sz, eip_orig,
   14002                                      guest_EIP_bbstart+delta, "rep movs" );
   14003          break;
   14004 
   14005       case 0xA6: sz = 1;   /* REPE CMP<sz> */
   14006       case 0xA7:
   14007          dis_REP_op ( X86CondZ, dis_CMPS, sz, eip_orig,
   14008                                 guest_EIP_bbstart+delta, "repe cmps" );
   14009          break;
   14010 
   14011       case 0xAA: sz = 1;   /* REP STOS<sz> */
   14012       case 0xAB:
   14013          dis_REP_op ( X86CondAlways, dis_STOS, sz, eip_orig,
   14014                                      guest_EIP_bbstart+delta, "rep stos" );
   14015          break;
   14016 
   14017       case 0xAC: sz = 1;   /* REP LODS<sz> */
   14018       case 0xAD:
   14019          dis_REP_op ( X86CondAlways, dis_LODS, sz, eip_orig,
   14020                                      guest_EIP_bbstart+delta, "rep lods" );
   14021          break;
   14022 
   14023       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
   14024       case 0xAF:
   14025          dis_REP_op ( X86CondZ, dis_SCAS, sz, eip_orig,
   14026                                 guest_EIP_bbstart+delta, "repe scas" );
   14027          break;
   14028 
   14029       case 0x90:           /* REP NOP (PAUSE) */
   14030          /* a hint to the P4 re spin-wait loop */
   14031          DIP("rep nop (P4 pause)\n");
   14032          /* "observe" the hint.  The Vex client needs to be careful not
   14033             to cause very long delays as a result, though. */
   14034          jmp_lit(Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
   14035          dres.whatNext = Dis_StopHere;
   14036          break;
   14037 
   14038       case 0xC3:           /* REP RET -- same as normal ret? */
   14039          dis_ret(0);
   14040          dres.whatNext = Dis_StopHere;
   14041          DIP("rep ret\n");
   14042          break;
   14043 
   14044       default:
   14045          goto decode_failure;
   14046       }
   14047       break;
   14048    }
   14049 
   14050    /* ------------------------ XCHG ----------------------- */
   14051 
   14052    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   14053       prefix; hence it must be translated with an IRCAS (at least, the
   14054       memory variant). */
   14055    case 0x86: /* XCHG Gb,Eb */
   14056       sz = 1;
   14057       /* Fall through ... */
   14058    case 0x87: /* XCHG Gv,Ev */
   14059       modrm = getIByte(delta);
   14060       ty = szToITy(sz);
   14061       t1 = newTemp(ty); t2 = newTemp(ty);
   14062       if (epartIsReg(modrm)) {
   14063          assign(t1, getIReg(sz, eregOfRM(modrm)));
   14064          assign(t2, getIReg(sz, gregOfRM(modrm)));
   14065          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
   14066          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
   14067          delta++;
   14068          DIP("xchg%c %s, %s\n",
   14069              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
   14070                             nameIReg(sz,eregOfRM(modrm)));
   14071       } else {
   14072          *expect_CAS = True;
   14073          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14074          assign( t1, loadLE(ty,mkexpr(addr)) );
   14075          assign( t2, getIReg(sz,gregOfRM(modrm)) );
   14076          casLE( mkexpr(addr),
   14077                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   14078          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
   14079          delta += alen;
   14080          DIP("xchg%c %s, %s\n", nameISize(sz),
   14081                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
   14082       }
   14083       break;
   14084 
   14085    case 0x90: /* XCHG eAX,eAX */
   14086       DIP("nop\n");
   14087       break;
   14088    case 0x91: /* XCHG eAX,eCX */
   14089    case 0x92: /* XCHG eAX,eDX */
   14090    case 0x93: /* XCHG eAX,eBX */
   14091    case 0x94: /* XCHG eAX,eSP */
   14092    case 0x95: /* XCHG eAX,eBP */
   14093    case 0x96: /* XCHG eAX,eSI */
   14094    case 0x97: /* XCHG eAX,eDI */
   14095       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
   14096       break;
   14097 
   14098    /* ------------------------ XLAT ----------------------- */
   14099 
   14100    case 0xD7: /* XLAT */
   14101       if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
   14102       putIReg(
   14103          1,
   14104          R_EAX/*AL*/,
   14105          loadLE(Ity_I8,
   14106                 handleSegOverride(
   14107                    sorb,
   14108                    binop(Iop_Add32,
   14109                          getIReg(4, R_EBX),
   14110                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
   14111 
   14112       DIP("xlat%c [ebx]\n", nameISize(sz));
   14113       break;
   14114 
   14115    /* ------------------------ IN / OUT ----------------------- */
   14116 
   14117    case 0xE4: /* IN imm8, AL */
   14118       sz = 1;
   14119       t1 = newTemp(Ity_I32);
   14120       abyte = getIByte(delta); delta++;
   14121       assign(t1, mkU32( abyte & 0xFF ));
   14122       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14123       goto do_IN;
   14124    case 0xE5: /* IN imm8, eAX */
   14125       vassert(sz == 2 || sz == 4);
   14126       t1 = newTemp(Ity_I32);
   14127       abyte = getIByte(delta); delta++;
   14128       assign(t1, mkU32( abyte & 0xFF ));
   14129       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14130       goto do_IN;
   14131    case 0xEC: /* IN %DX, AL */
   14132       sz = 1;
   14133       t1 = newTemp(Ity_I32);
   14134       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14135       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14136                                          nameIReg(sz,R_EAX));
   14137       goto do_IN;
   14138    case 0xED: /* IN %DX, eAX */
   14139       vassert(sz == 2 || sz == 4);
   14140       t1 = newTemp(Ity_I32);
   14141       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14142       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14143                                          nameIReg(sz,R_EAX));
   14144       goto do_IN;
   14145    do_IN: {
   14146       /* At this point, sz indicates the width, and t1 is a 32-bit
   14147          value giving port number. */
   14148       IRDirty* d;
   14149       vassert(sz == 1 || sz == 2 || sz == 4);
   14150       ty = szToITy(sz);
   14151       t2 = newTemp(Ity_I32);
   14152       d = unsafeIRDirty_1_N(
   14153              t2,
   14154              0/*regparms*/,
   14155              "x86g_dirtyhelper_IN",
   14156              &x86g_dirtyhelper_IN,
   14157              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
   14158           );
   14159       /* do the call, dumping the result in t2. */
   14160       stmt( IRStmt_Dirty(d) );
   14161       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
   14162       break;
   14163    }
   14164 
   14165    case 0xE6: /* OUT AL, imm8 */
   14166       sz = 1;
   14167       t1 = newTemp(Ity_I32);
   14168       abyte = getIByte(delta); delta++;
   14169       assign( t1, mkU32( abyte & 0xFF ) );
   14170       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14171       goto do_OUT;
   14172    case 0xE7: /* OUT eAX, imm8 */
   14173       vassert(sz == 2 || sz == 4);
   14174       t1 = newTemp(Ity_I32);
   14175       abyte = getIByte(delta); delta++;
   14176       assign( t1, mkU32( abyte & 0xFF ) );
   14177       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14178       goto do_OUT;
   14179    case 0xEE: /* OUT AL, %DX */
   14180       sz = 1;
   14181       t1 = newTemp(Ity_I32);
   14182       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14183       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14184                                           nameIReg(2,R_EDX));
   14185       goto do_OUT;
   14186    case 0xEF: /* OUT eAX, %DX */
   14187       vassert(sz == 2 || sz == 4);
   14188       t1 = newTemp(Ity_I32);
   14189       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14190       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14191                                           nameIReg(2,R_EDX));
   14192       goto do_OUT;
   14193    do_OUT: {
   14194       /* At this point, sz indicates the width, and t1 is a 32-bit
   14195          value giving port number. */
   14196       IRDirty* d;
   14197       vassert(sz == 1 || sz == 2 || sz == 4);
   14198       ty = szToITy(sz);
   14199       d = unsafeIRDirty_0_N(
   14200              0/*regparms*/,
   14201              "x86g_dirtyhelper_OUT",
   14202              &x86g_dirtyhelper_OUT,
   14203              mkIRExprVec_3( mkexpr(t1),
   14204                             widenUto32( getIReg(sz, R_EAX) ),
   14205                             mkU32(sz) )
   14206           );
   14207       stmt( IRStmt_Dirty(d) );
   14208       break;
   14209    }
   14210 
   14211    /* ------------------------ (Grp1 extensions) ---------- */
   14212 
   14213    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
   14214                  case 0x80, but only in 32-bit mode. */
   14215       /* fallthru */
   14216    case 0x80: /* Grp1 Ib,Eb */
   14217       modrm = getIByte(delta);
   14218       am_sz = lengthAMode(delta);
   14219       sz    = 1;
   14220       d_sz  = 1;
   14221       d32   = getUChar(delta + am_sz);
   14222       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14223       break;
   14224 
   14225    case 0x81: /* Grp1 Iv,Ev */
   14226       modrm = getIByte(delta);
   14227       am_sz = lengthAMode(delta);
   14228       d_sz  = sz;
   14229       d32   = getUDisp(d_sz, delta + am_sz);
   14230       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14231       break;
   14232 
   14233    case 0x83: /* Grp1 Ib,Ev */
   14234       modrm = getIByte(delta);
   14235       am_sz = lengthAMode(delta);
   14236       d_sz  = 1;
   14237       d32   = getSDisp8(delta + am_sz);
   14238       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14239       break;
   14240 
   14241    /* ------------------------ (Grp2 extensions) ---------- */
   14242 
   14243    case 0xC0: { /* Grp2 Ib,Eb */
   14244       Bool decode_OK = True;
   14245       modrm = getIByte(delta);
   14246       am_sz = lengthAMode(delta);
   14247       d_sz  = 1;
   14248       d32   = getUChar(delta + am_sz);
   14249       sz    = 1;
   14250       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14251                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14252       if (!decode_OK)
   14253          goto decode_failure;
   14254       break;
   14255    }
   14256    case 0xC1: { /* Grp2 Ib,Ev */
   14257       Bool decode_OK = True;
   14258       modrm = getIByte(delta);
   14259       am_sz = lengthAMode(delta);
   14260       d_sz  = 1;
   14261       d32   = getUChar(delta + am_sz);
   14262       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14263                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14264       if (!decode_OK)
   14265          goto decode_failure;
   14266       break;
   14267    }
   14268    case 0xD0: { /* Grp2 1,Eb */
   14269       Bool decode_OK = True;
   14270       modrm = getIByte(delta);
   14271       am_sz = lengthAMode(delta);
   14272       d_sz  = 0;
   14273       d32   = 1;
   14274       sz    = 1;
   14275       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14276                          mkU8(d32), NULL, &decode_OK );
   14277       if (!decode_OK)
   14278          goto decode_failure;
   14279       break;
   14280    }
   14281    case 0xD1: { /* Grp2 1,Ev */
   14282       Bool decode_OK = True;
   14283       modrm = getUChar(delta);
   14284       am_sz = lengthAMode(delta);
   14285       d_sz  = 0;
   14286       d32   = 1;
   14287       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14288                          mkU8(d32), NULL, &decode_OK );
   14289       if (!decode_OK)
   14290          goto decode_failure;
   14291       break;
   14292    }
   14293    case 0xD2: { /* Grp2 CL,Eb */
   14294       Bool decode_OK = True;
   14295       modrm = getUChar(delta);
   14296       am_sz = lengthAMode(delta);
   14297       d_sz  = 0;
   14298       sz    = 1;
   14299       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14300                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14301       if (!decode_OK)
   14302          goto decode_failure;
   14303       break;
   14304    }
   14305    case 0xD3: { /* Grp2 CL,Ev */
   14306       Bool decode_OK = True;
   14307       modrm = getIByte(delta);
   14308       am_sz = lengthAMode(delta);
   14309       d_sz  = 0;
   14310       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14311                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14312       if (!decode_OK)
   14313          goto decode_failure;
   14314       break;
   14315    }
   14316 
   14317    /* ------------------------ (Grp3 extensions) ---------- */
   14318 
   14319    case 0xF6: { /* Grp3 Eb */
   14320       Bool decode_OK = True;
   14321       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
   14322       if (!decode_OK)
   14323          goto decode_failure;
   14324       break;
   14325    }
   14326    case 0xF7: { /* Grp3 Ev */
   14327       Bool decode_OK = True;
   14328       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
   14329       if (!decode_OK)
   14330          goto decode_failure;
   14331       break;
   14332    }
   14333 
   14334    /* ------------------------ (Grp4 extensions) ---------- */
   14335 
   14336    case 0xFE: { /* Grp4 Eb */
   14337       Bool decode_OK = True;
   14338       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
   14339       if (!decode_OK)
   14340          goto decode_failure;
   14341       break;
   14342    }
   14343 
   14344    /* ------------------------ (Grp5 extensions) ---------- */
   14345 
   14346    case 0xFF: { /* Grp5 Ev */
   14347       Bool decode_OK = True;
   14348       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
   14349       if (!decode_OK)
   14350          goto decode_failure;
   14351       break;
   14352    }
   14353 
   14354    /* ------------------------ Escapes to 2-byte opcodes -- */
   14355 
   14356    case 0x0F: {
   14357       opc = getIByte(delta); delta++;
   14358       switch (opc) {
   14359 
   14360       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   14361 
   14362       case 0xBA: { /* Grp8 Ib,Ev */
   14363          Bool decode_OK = False;
   14364          modrm = getUChar(delta);
   14365          am_sz = lengthAMode(delta);
   14366          d32   = getSDisp8(delta + am_sz);
   14367          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
   14368                                 am_sz, sz, d32, &decode_OK );
   14369          if (!decode_OK)
   14370             goto decode_failure;
   14371          break;
   14372       }
   14373 
   14374       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   14375 
   14376       case 0xBC: /* BSF Gv,Ev */
   14377          delta = dis_bs_E_G ( sorb, sz, delta, True );
   14378          break;
   14379       case 0xBD: /* BSR Gv,Ev */
   14380          delta = dis_bs_E_G ( sorb, sz, delta, False );
   14381          break;
   14382 
   14383       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   14384 
   14385       case 0xC8: /* BSWAP %eax */
   14386       case 0xC9:
   14387       case 0xCA:
   14388       case 0xCB:
   14389       case 0xCC:
   14390       case 0xCD:
   14391       case 0xCE:
   14392       case 0xCF: /* BSWAP %edi */
   14393          /* AFAICS from the Intel docs, this only exists at size 4. */
   14394          vassert(sz == 4);
   14395          t1 = newTemp(Ity_I32);
   14396          t2 = newTemp(Ity_I32);
   14397          assign( t1, getIReg(4, opc-0xC8) );
   14398 
   14399          assign( t2,
   14400             binop(Iop_Or32,
   14401                binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   14402             binop(Iop_Or32,
   14403                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   14404                                 mkU32(0x00FF0000)),
   14405             binop(Iop_Or32,
   14406                binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   14407                                 mkU32(0x0000FF00)),
   14408                binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   14409                                 mkU32(0x000000FF) )
   14410             )))
   14411          );
   14412 
   14413          putIReg(4, opc-0xC8, mkexpr(t2));
   14414          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
   14415          break;
   14416 
   14417       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   14418 
   14419       case 0xA3: /* BT Gv,Ev */
   14420          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
   14421          break;
   14422       case 0xB3: /* BTR Gv,Ev */
   14423          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
   14424          break;
   14425       case 0xAB: /* BTS Gv,Ev */
   14426          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
   14427          break;
   14428       case 0xBB: /* BTC Gv,Ev */
   14429          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
   14430          break;
   14431 
   14432       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   14433 
   14434       case 0x40:
   14435       case 0x41:
   14436       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   14437       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   14438       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   14439       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   14440       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   14441       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   14442       case 0x48: /* CMOVSb (cmov negative) */
   14443       case 0x49: /* CMOVNSb (cmov not negative) */
   14444       case 0x4A: /* CMOVP (cmov parity even) */
   14445       case 0x4B: /* CMOVNP (cmov parity odd) */
   14446       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   14447       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   14448       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   14449       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   14450          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
   14451          break;
   14452 
   14453       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   14454 
   14455       case 0xB0: /* CMPXCHG Gb,Eb */
   14456          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
   14457          break;
   14458       case 0xB1: /* CMPXCHG Gv,Ev */
   14459          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
   14460          break;
   14461 
   14462       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
   14463          IRTemp expdHi    = newTemp(Ity_I32);
   14464          IRTemp expdLo    = newTemp(Ity_I32);
   14465          IRTemp dataHi    = newTemp(Ity_I32);
   14466          IRTemp dataLo    = newTemp(Ity_I32);
   14467          IRTemp oldHi     = newTemp(Ity_I32);
   14468          IRTemp oldLo     = newTemp(Ity_I32);
   14469          IRTemp flags_old = newTemp(Ity_I32);
   14470          IRTemp flags_new = newTemp(Ity_I32);
   14471          IRTemp success   = newTemp(Ity_I1);
   14472 
   14473          /* Translate this using a DCAS, even if there is no LOCK
   14474             prefix.  Life is too short to bother with generating two
   14475             different translations for the with/without-LOCK-prefix
   14476             cases. */
   14477          *expect_CAS = True;
   14478 
   14479 	 /* Decode, and generate address. */
   14480          if (sz != 4) goto decode_failure;
   14481          modrm = getIByte(delta);
   14482          if (epartIsReg(modrm)) goto decode_failure;
   14483          if (gregOfRM(modrm) != 1) goto decode_failure;
   14484          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14485          delta += alen;
   14486 
   14487          /* Get the expected and new values. */
   14488          assign( expdHi, getIReg(4,R_EDX) );
   14489          assign( expdLo, getIReg(4,R_EAX) );
   14490          assign( dataHi, getIReg(4,R_ECX) );
   14491          assign( dataLo, getIReg(4,R_EBX) );
   14492 
   14493          /* Do the DCAS */
   14494          stmt( IRStmt_CAS(
   14495                   mkIRCAS( oldHi, oldLo,
   14496                            Iend_LE, mkexpr(addr),
   14497                            mkexpr(expdHi), mkexpr(expdLo),
   14498                            mkexpr(dataHi), mkexpr(dataLo)
   14499                )));
   14500 
   14501          /* success when oldHi:oldLo == expdHi:expdLo */
   14502          assign( success,
   14503                  binop(Iop_CasCmpEQ32,
   14504                        binop(Iop_Or32,
   14505                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
   14506                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
   14507                        ),
   14508                        mkU32(0)
   14509                  ));
   14510 
   14511          /* If the DCAS is successful, that is to say oldHi:oldLo ==
   14512             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
   14513             which is where they came from originally.  Both the actual
   14514             contents of these two regs, and any shadow values, are
   14515             unchanged.  If the DCAS fails then we're putting into
   14516             EDX:EAX the value seen in memory. */
   14517          putIReg(4, R_EDX,
   14518                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   14519                                   mkexpr(oldHi),
   14520                                   mkexpr(expdHi)
   14521                 ));
   14522          putIReg(4, R_EAX,
   14523                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   14524                                   mkexpr(oldLo),
   14525                                   mkexpr(expdLo)
   14526                 ));
   14527 
   14528          /* Copy the success bit into the Z flag and leave the others
   14529             unchanged */
   14530          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
   14531          assign(
   14532             flags_new,
   14533             binop(Iop_Or32,
   14534                   binop(Iop_And32, mkexpr(flags_old),
   14535                                    mkU32(~X86G_CC_MASK_Z)),
   14536                   binop(Iop_Shl32,
   14537                         binop(Iop_And32,
   14538                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
   14539                         mkU8(X86G_CC_SHIFT_Z)) ));
   14540 
   14541          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14542          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   14543          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14544          /* Set NDEP even though it isn't used.  This makes
   14545             redundant-PUT elimination of previous stores to this field
   14546             work better. */
   14547          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14548 
   14549          /* Sheesh.  Aren't you glad it was me and not you that had to
   14550 	    write and validate all this grunge? */
   14551 
   14552 	 DIP("cmpxchg8b %s\n", dis_buf);
   14553 	 break;
   14554       }
   14555 
   14556       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   14557 
   14558       case 0xA2: { /* CPUID */
   14559          /* Uses dirty helper:
   14560                void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
   14561             declared to mod eax, ecx; wr ebx, edx
   14562          */
   14563          IRDirty* d     = NULL;
   14564          HChar*   fName = NULL;
   14565          void*    fAddr = NULL;
   14566          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
   14567             fName = "x86g_dirtyhelper_CPUID_sse2";
   14568             fAddr = &x86g_dirtyhelper_CPUID_sse2;
   14569          }
   14570          else
   14571          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
   14572             fName = "x86g_dirtyhelper_CPUID_sse1";
   14573             fAddr = &x86g_dirtyhelper_CPUID_sse1;
   14574          }
   14575          else
   14576          if (archinfo->hwcaps == 0/*no SSE*/) {
   14577             fName = "x86g_dirtyhelper_CPUID_sse0";
   14578             fAddr = &x86g_dirtyhelper_CPUID_sse0;
   14579          } else
   14580             vpanic("disInstr(x86)(cpuid)");
   14581 
   14582          vassert(fName); vassert(fAddr);
   14583          d = unsafeIRDirty_0_N ( 0/*regparms*/,
   14584                                  fName, fAddr, mkIRExprVec_0() );
   14585          /* declare guest state effects */
   14586          d->needsBBP = True;
   14587          d->nFxState = 4;
   14588          d->fxState[0].fx     = Ifx_Modify;
   14589          d->fxState[0].offset = OFFB_EAX;
   14590          d->fxState[0].size   = 4;
   14591          d->fxState[1].fx     = Ifx_Write;
   14592          d->fxState[1].offset = OFFB_EBX;
   14593          d->fxState[1].size   = 4;
   14594          d->fxState[2].fx     = Ifx_Modify;
   14595          d->fxState[2].offset = OFFB_ECX;
   14596          d->fxState[2].size   = 4;
   14597          d->fxState[3].fx     = Ifx_Write;
   14598          d->fxState[3].offset = OFFB_EDX;
   14599          d->fxState[3].size   = 4;
   14600          /* execute the dirty call, side-effecting guest state */
   14601          stmt( IRStmt_Dirty(d) );
   14602          /* CPUID is a serialising insn.  So, just in case someone is
   14603             using it as a memory fence ... */
   14604          stmt( IRStmt_MBE(Imbe_Fence) );
   14605          DIP("cpuid\n");
   14606          break;
   14607       }
   14608 
   14609 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
   14610 //--             goto decode_failure;
   14611 //--
   14612 //--          t1 = newTemp(cb);
   14613 //--          t2 = newTemp(cb);
   14614 //--          t3 = newTemp(cb);
   14615 //--          t4 = newTemp(cb);
   14616 //--          uInstr0(cb, CALLM_S, 0);
   14617 //--
   14618 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
   14619 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
   14620 //--
   14621 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
   14622 //--          uLiteral(cb, 0);
   14623 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
   14624 //--
   14625 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
   14626 //--          uLiteral(cb, 0);
   14627 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
   14628 //--
   14629 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
   14630 //--          uLiteral(cb, 0);
   14631 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
   14632 //--
   14633 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
   14634 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
   14635 //--
   14636 //--          uInstr1(cb, POP,   4, TempReg, t4);
   14637 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
   14638 //--
   14639 //--          uInstr1(cb, POP,   4, TempReg, t3);
   14640 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
   14641 //--
   14642 //--          uInstr1(cb, POP,   4, TempReg, t2);
   14643 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
   14644 //--
   14645 //--          uInstr1(cb, POP,   4, TempReg, t1);
   14646 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
   14647 //--
   14648 //--          uInstr0(cb, CALLM_E, 0);
   14649 //--          DIP("cpuid\n");
   14650 //--          break;
   14651 //--
   14652       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   14653 
   14654       case 0xB6: /* MOVZXb Eb,Gv */
   14655          if (sz != 2 && sz != 4)
   14656             goto decode_failure;
   14657          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
   14658          break;
   14659 
   14660       case 0xB7: /* MOVZXw Ew,Gv */
   14661          if (sz != 4)
   14662             goto decode_failure;
   14663          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
   14664          break;
   14665 
   14666       case 0xBE: /* MOVSXb Eb,Gv */
   14667          if (sz != 2 && sz != 4)
   14668             goto decode_failure;
   14669          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
   14670          break;
   14671 
   14672       case 0xBF: /* MOVSXw Ew,Gv */
   14673          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
   14674             goto decode_failure;
   14675          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
   14676          break;
   14677 
   14678 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   14679 //--
   14680 //--       case 0xC3: /* MOVNTI Gv,Ev */
   14681 //--          vg_assert(sz == 4);
   14682 //--          modrm = getUChar(eip);
   14683 //--          vg_assert(!epartIsReg(modrm));
   14684 //--          t1 = newTemp(cb);
   14685 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   14686 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   14687 //--          t2 = LOW24(pair);
   14688 //--          eip += HI8(pair);
   14689 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   14690 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   14691 //--          break;
   14692 
   14693       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   14694 
   14695       case 0xAF: /* IMUL Ev, Gv */
   14696          delta = dis_mul_E_G ( sorb, sz, delta );
   14697          break;
   14698 
   14699       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   14700 
   14701       case 0x1F:
   14702          modrm = getUChar(delta);
   14703          if (epartIsReg(modrm)) goto decode_failure;
   14704          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14705          delta += alen;
   14706          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   14707          break;
   14708 
   14709       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   14710       case 0x80:
   14711       case 0x81:
   14712       case 0x82: /* JBb/JNAEb (jump below) */
   14713       case 0x83: /* JNBb/JAEb (jump not below) */
   14714       case 0x84: /* JZb/JEb (jump zero) */
   14715       case 0x85: /* JNZb/JNEb (jump not zero) */
   14716       case 0x86: /* JBEb/JNAb (jump below or equal) */
   14717       case 0x87: /* JNBEb/JAb (jump not below or equal) */
   14718       case 0x88: /* JSb (jump negative) */
   14719       case 0x89: /* JNSb (jump not negative) */
   14720       case 0x8A: /* JP (jump parity even) */
   14721       case 0x8B: /* JNP/JPO (jump parity odd) */
   14722       case 0x8C: /* JLb/JNGEb (jump less) */
   14723       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
   14724       case 0x8E: /* JLEb/JNGb (jump less or equal) */
   14725       case 0x8F: /* JGb/JNLEb (jump greater) */
   14726        { Int    jmpDelta;
   14727          HChar* comment  = "";
   14728          jmpDelta = (Int)getUDisp32(delta);
   14729          d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
   14730          delta += 4;
   14731          if (resteerCisOk
   14732              && vex_control.guest_chase_cond
   14733              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   14734              && jmpDelta < 0
   14735              && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   14736             /* Speculation: assume this backward branch is taken.  So
   14737                we need to emit a side-exit to the insn following this
   14738                one, on the negation of the condition, and continue at
   14739                the branch target address (d32).  If we wind up back at
   14740                the first instruction of the trace, just stop; it's
   14741                better to let the IR loop unroller handle that case.*/
   14742             stmt( IRStmt_Exit(
   14743                      mk_x86g_calculate_condition((X86Condcode)
   14744                                                  (1 ^ (opc - 0x80))),
   14745                      Ijk_Boring,
   14746                      IRConst_U32(guest_EIP_bbstart+delta) ) );
   14747             dres.whatNext   = Dis_ResteerC;
   14748             dres.continueAt = (Addr64)(Addr32)d32;
   14749             comment = "(assumed taken)";
   14750          }
   14751          else
   14752          if (resteerCisOk
   14753              && vex_control.guest_chase_cond
   14754              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   14755              && jmpDelta >= 0
   14756              && resteerOkFn( callback_opaque,
   14757                              (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   14758             /* Speculation: assume this forward branch is not taken.
   14759                So we need to emit a side-exit to d32 (the dest) and
   14760                continue disassembling at the insn immediately
   14761                following this one. */
   14762             stmt( IRStmt_Exit(
   14763                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
   14764                      Ijk_Boring,
   14765                      IRConst_U32(d32) ) );
   14766             dres.whatNext   = Dis_ResteerC;
   14767             dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   14768             comment = "(assumed not taken)";
   14769          }
   14770          else {
   14771             /* Conservative default translation - end the block at
   14772                this point. */
   14773             jcc_01( (X86Condcode)(opc - 0x80),
   14774                     (Addr32)(guest_EIP_bbstart+delta), d32);
   14775             dres.whatNext = Dis_StopHere;
   14776          }
   14777          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
   14778          break;
   14779        }
   14780 
   14781       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   14782       case 0x31: { /* RDTSC */
   14783          IRTemp   val  = newTemp(Ity_I64);
   14784          IRExpr** args = mkIRExprVec_0();
   14785          IRDirty* d    = unsafeIRDirty_1_N (
   14786                             val,
   14787                             0/*regparms*/,
   14788                             "x86g_dirtyhelper_RDTSC",
   14789                             &x86g_dirtyhelper_RDTSC,
   14790                             args
   14791                          );
   14792          /* execute the dirty call, dumping the result in val. */
   14793          stmt( IRStmt_Dirty(d) );
   14794          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
   14795          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
   14796          DIP("rdtsc\n");
   14797          break;
   14798       }
   14799 
   14800       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   14801 
   14802       case 0xA1: /* POP %FS */
   14803          dis_pop_segreg( R_FS, sz ); break;
   14804       case 0xA9: /* POP %GS */
   14805          dis_pop_segreg( R_GS, sz ); break;
   14806 
   14807       case 0xA0: /* PUSH %FS */
   14808          dis_push_segreg( R_FS, sz ); break;
   14809       case 0xA8: /* PUSH %GS */
   14810          dis_push_segreg( R_GS, sz ); break;
   14811 
   14812       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   14813       case 0x90:
   14814       case 0x91:
   14815       case 0x92: /* set-Bb/set-NAEb (jump below) */
   14816       case 0x93: /* set-NBb/set-AEb (jump not below) */
   14817       case 0x94: /* set-Zb/set-Eb (jump zero) */
   14818       case 0x95: /* set-NZb/set-NEb (jump not zero) */
   14819       case 0x96: /* set-BEb/set-NAb (jump below or equal) */
   14820       case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
   14821       case 0x98: /* set-Sb (jump negative) */
   14822       case 0x99: /* set-NSb (jump not negative) */
   14823       case 0x9A: /* set-P (jump parity even) */
   14824       case 0x9B: /* set-NP (jump parity odd) */
   14825       case 0x9C: /* set-Lb/set-NGEb (jump less) */
   14826       case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
   14827       case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
   14828       case 0x9F: /* set-Gb/set-NLEb (jump greater) */
   14829          t1 = newTemp(Ity_I8);
   14830          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
   14831          modrm = getIByte(delta);
   14832          if (epartIsReg(modrm)) {
   14833             delta++;
   14834             putIReg(1, eregOfRM(modrm), mkexpr(t1));
   14835             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
   14836                               nameIReg(1,eregOfRM(modrm)));
   14837          } else {
   14838            addr = disAMode ( &alen, sorb, delta, dis_buf );
   14839            delta += alen;
   14840            storeLE( mkexpr(addr), mkexpr(t1) );
   14841            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
   14842          }
   14843          break;
   14844 
   14845       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   14846 
   14847       case 0xA4: /* SHLDv imm8,Gv,Ev */
   14848          modrm = getIByte(delta);
   14849          d32   = delta + lengthAMode(delta);
   14850          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   14851          delta = dis_SHLRD_Gv_Ev (
   14852                   sorb, delta, modrm, sz,
   14853                   mkU8(getIByte(d32)), True, /* literal */
   14854                   dis_buf, True );
   14855          break;
   14856       case 0xA5: /* SHLDv %cl,Gv,Ev */
   14857          modrm = getIByte(delta);
   14858          delta = dis_SHLRD_Gv_Ev (
   14859                     sorb, delta, modrm, sz,
   14860                     getIReg(1,R_ECX), False, /* not literal */
   14861                     "%cl", True );
   14862          break;
   14863 
   14864       case 0xAC: /* SHRDv imm8,Gv,Ev */
   14865          modrm = getIByte(delta);
   14866          d32   = delta + lengthAMode(delta);
   14867          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   14868          delta = dis_SHLRD_Gv_Ev (
   14869                     sorb, delta, modrm, sz,
   14870                     mkU8(getIByte(d32)), True, /* literal */
   14871                     dis_buf, False );
   14872          break;
   14873       case 0xAD: /* SHRDv %cl,Gv,Ev */
   14874          modrm = getIByte(delta);
   14875          delta = dis_SHLRD_Gv_Ev (
   14876                     sorb, delta, modrm, sz,
   14877                     getIReg(1,R_ECX), False, /* not literal */
   14878                     "%cl", False );
   14879          break;
   14880 
   14881       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
   14882 
   14883       case 0x34:
   14884          /* Simple implementation needing a long explanation.
   14885 
   14886             sysenter is a kind of syscall entry.  The key thing here
   14887             is that the return address is not known -- that is
   14888             something that is beyond Vex's knowledge.  So this IR
   14889             forces a return to the scheduler, which can do what it
   14890             likes to simulate the sysenter, but it MUST set this
   14891             thread's guest_EIP field with the continuation address
   14892             before resuming execution.  If that doesn't happen, the
   14893             thread will jump to address zero, which is probably
   14894             fatal.
   14895          */
   14896 
   14897          /* Note where we are, so we can back up the guest to this
   14898             point if the syscall needs to be restarted. */
   14899          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   14900                            mkU32(guest_EIP_curr_instr) ) );
   14901          jmp_lit(Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
   14902          dres.whatNext = Dis_StopHere;
   14903          DIP("sysenter");
   14904          break;
   14905 
   14906       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   14907 
   14908       case 0xC0: { /* XADD Gb,Eb */
   14909          Bool decodeOK;
   14910          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
   14911          if (!decodeOK) goto decode_failure;
   14912          break;
   14913       }
   14914       case 0xC1: { /* XADD Gv,Ev */
   14915          Bool decodeOK;
   14916          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
   14917          if (!decodeOK) goto decode_failure;
   14918          break;
   14919       }
   14920 
   14921       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   14922 
   14923       case 0x71:
   14924       case 0x72:
   14925       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   14926 
   14927       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   14928       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   14929       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   14930       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   14931 
   14932       case 0xFC:
   14933       case 0xFD:
   14934       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   14935 
   14936       case 0xEC:
   14937       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14938 
   14939       case 0xDC:
   14940       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14941 
   14942       case 0xF8:
   14943       case 0xF9:
   14944       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   14945 
   14946       case 0xE8:
   14947       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14948 
   14949       case 0xD8:
   14950       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14951 
   14952       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   14953       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   14954 
   14955       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   14956 
   14957       case 0x74:
   14958       case 0x75:
   14959       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   14960 
   14961       case 0x64:
   14962       case 0x65:
   14963       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   14964 
   14965       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   14966       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   14967       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   14968 
   14969       case 0x68:
   14970       case 0x69:
   14971       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   14972 
   14973       case 0x60:
   14974       case 0x61:
   14975       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   14976 
   14977       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   14978       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   14979       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   14980       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   14981 
   14982       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   14983       case 0xF2:
   14984       case 0xF3:
   14985 
   14986       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   14987       case 0xD2:
   14988       case 0xD3:
   14989 
   14990       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   14991       case 0xE2:
   14992       {
   14993          Int  delta0    = delta-1;
   14994          Bool decode_OK = False;
   14995 
   14996          /* If sz==2 this is SSE, and we assume sse idec has
   14997             already spotted those cases by now. */
   14998          if (sz != 4)
   14999             goto decode_failure;
   15000 
   15001          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
   15002          if (!decode_OK) {
   15003             delta = delta0;
   15004             goto decode_failure;
   15005          }
   15006          break;
   15007       }
   15008 
   15009       case 0x0E: /* FEMMS */
   15010       case 0x77: /* EMMS */
   15011          if (sz != 4)
   15012             goto decode_failure;
   15013          do_EMMS_preamble();
   15014          DIP("{f}emms\n");
   15015          break;
   15016 
   15017       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   15018       case 0x01: /* 0F 01 /0 -- SGDT */
   15019                  /* 0F 01 /1 -- SIDT */
   15020       {
   15021           /* This is really revolting, but ... since each processor
   15022              (core) only has one IDT and one GDT, just let the guest
   15023              see it (pass-through semantics).  I can't see any way to
   15024              construct a faked-up value, so don't bother to try. */
   15025          modrm = getUChar(delta);
   15026          addr = disAMode ( &alen, sorb, delta, dis_buf );
   15027          delta += alen;
   15028          if (epartIsReg(modrm)) goto decode_failure;
   15029          if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
   15030             goto decode_failure;
   15031          switch (gregOfRM(modrm)) {
   15032             case 0: DIP("sgdt %s\n", dis_buf); break;
   15033             case 1: DIP("sidt %s\n", dis_buf); break;
   15034             default: vassert(0); /*NOTREACHED*/
   15035          }
   15036 
   15037          IRDirty* d = unsafeIRDirty_0_N (
   15038                           0/*regparms*/,
   15039                           "x86g_dirtyhelper_SxDT",
   15040                           &x86g_dirtyhelper_SxDT,
   15041                           mkIRExprVec_2( mkexpr(addr),
   15042                                          mkU32(gregOfRM(modrm)) )
   15043                       );
   15044          /* declare we're writing memory */
   15045          d->mFx   = Ifx_Write;
   15046          d->mAddr = mkexpr(addr);
   15047          d->mSize = 6;
   15048          stmt( IRStmt_Dirty(d) );
   15049          break;
   15050       }
   15051 
   15052       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   15053 
   15054       default:
   15055          goto decode_failure;
   15056    } /* switch (opc) for the 2-byte opcodes */
   15057    goto decode_success;
   15058    } /* case 0x0F: of primary opcode */
   15059 
   15060    /* ------------------------ ??? ------------------------ */
   15061 
   15062   default:
   15063   decode_failure:
   15064    /* All decode failures end up here. */
   15065    vex_printf("vex x86->IR: unhandled instruction bytes: "
   15066               "0x%x 0x%x 0x%x 0x%x\n",
   15067               (Int)getIByte(delta_start+0),
   15068               (Int)getIByte(delta_start+1),
   15069               (Int)getIByte(delta_start+2),
   15070               (Int)getIByte(delta_start+3) );
   15071 
   15072    /* Tell the dispatcher that this insn cannot be decoded, and so has
   15073       not been executed, and (is currently) the next to be executed.
   15074       EIP should be up-to-date since it made so at the start of each
   15075       insn, but nevertheless be paranoid and update it again right
   15076       now. */
   15077    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
   15078    jmp_lit(Ijk_NoDecode, guest_EIP_curr_instr);
   15079    dres.whatNext = Dis_StopHere;
   15080    dres.len = 0;
   15081    /* We also need to say that a CAS is not expected now, regardless
   15082       of what it might have been set to at the start of the function,
   15083       since the IR that we've emitted just above (to synthesise a
   15084       SIGILL) does not involve any CAS, and presumably no other IR has
   15085       been emitted for this (non-decoded) insn. */
   15086    *expect_CAS = False;
   15087    return dres;
   15088 
   15089    } /* switch (opc) for the main (primary) opcode switch. */
   15090 
   15091   decode_success:
   15092    /* All decode successes end up here. */
   15093    DIP("\n");
   15094    dres.len = delta - delta_start;
   15095    return dres;
   15096 }
   15097 
   15098 #undef DIP
   15099 #undef DIS
   15100 
   15101 
   15102 /*------------------------------------------------------------*/
   15103 /*--- Top-level fn                                         ---*/
   15104 /*------------------------------------------------------------*/
   15105 
   15106 /* Disassemble a single instruction into IR.  The instruction
   15107    is located in host memory at &guest_code[delta]. */
   15108 
   15109 DisResult disInstr_X86 ( IRSB*        irsb_IN,
   15110                          Bool         put_IP,
   15111                          Bool         (*resteerOkFn) ( void*, Addr64 ),
   15112                          Bool         resteerCisOk,
   15113                          void*        callback_opaque,
   15114                          UChar*       guest_code_IN,
   15115                          Long         delta,
   15116                          Addr64       guest_IP,
   15117                          VexArch      guest_arch,
   15118                          VexArchInfo* archinfo,
   15119                          VexAbiInfo*  abiinfo,
   15120                          Bool         host_bigendian_IN )
   15121 {
   15122    Int       i, x1, x2;
   15123    Bool      expect_CAS, has_CAS;
   15124    DisResult dres;
   15125 
   15126    /* Set globals (see top of this file) */
   15127    vassert(guest_arch == VexArchX86);
   15128    guest_code           = guest_code_IN;
   15129    irsb                 = irsb_IN;
   15130    host_is_bigendian    = host_bigendian_IN;
   15131    guest_EIP_curr_instr = (Addr32)guest_IP;
   15132    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
   15133 
   15134    x1 = irsb_IN->stmts_used;
   15135    expect_CAS = False;
   15136    dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
   15137                              resteerCisOk,
   15138                              callback_opaque,
   15139                              delta, archinfo, abiinfo );
   15140    x2 = irsb_IN->stmts_used;
   15141    vassert(x2 >= x1);
   15142 
   15143    /* See comment at the top of disInstr_X86_WRK for meaning of
   15144       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   15145       IRCAS as directed by the returned expect_CAS value. */
   15146    has_CAS = False;
   15147    for (i = x1; i < x2; i++) {
   15148       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   15149          has_CAS = True;
   15150    }
   15151 
   15152    if (expect_CAS != has_CAS) {
   15153       /* inconsistency detected.  re-disassemble the instruction so as
   15154          to generate a useful error message; then assert. */
   15155       vex_traceflags |= VEX_TRACE_FE;
   15156       dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
   15157                                 resteerCisOk,
   15158                                 callback_opaque,
   15159                                 delta, archinfo, abiinfo );
   15160       for (i = x1; i < x2; i++) {
   15161          vex_printf("\t\t");
   15162          ppIRStmt(irsb_IN->stmts[i]);
   15163          vex_printf("\n");
   15164       }
   15165       /* Failure of this assertion is serious and denotes a bug in
   15166          disInstr. */
   15167       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
   15168    }
   15169 
   15170    return dres;
   15171 }
   15172 
   15173 
   15174 /*--------------------------------------------------------------------*/
   15175 /*--- end                                         guest_x86_toIR.c ---*/
   15176 /*--------------------------------------------------------------------*/
   15177