Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                       guest_x86_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates x86 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 32-bit value is being written.
     42 
     43    FUCOMI(P): what happens to A and S flags?  Currently are forced
     44       to zero.
     45 
     46    x87 FP Limitations:
     47 
     48    * all arithmetic done at 64 bits
     49 
     50    * no FP exceptions, except for handling stack over/underflow
     51 
     52    * FP rounding mode observed only for float->int conversions
     53      and int->float conversions which could lose accuracy, and
     54      for float-to-float rounding.  For all other operations,
     55      round-to-nearest is used, regardless.
     56 
     57    * some of the FCOM cases could do with testing -- not convinced
     58      that the args are the right way round.
     59 
     60    * FSAVE does not re-initialise the FPU; it should do
     61 
     62    * FINIT not only initialises the FPU environment, it also
     63      zeroes all the FP registers.  It should leave the registers
     64      unchanged.
     65 
     66    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     67    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     68    only way to observe eflags[1], a proper fix would be to make that
     69    bit be set by PUSHF.
     70 
     71    The state of %eflags.AC (alignment check, bit 18) is recorded by
     72    the simulation (viz, if you set it with popf then a pushf produces
     73    the value you set it to), but it is otherwise ignored.  In
     74    particular, setting it to 1 does NOT cause alignment checking to
     75    happen.  Programs that set it to 1 and then rely on the resulting
     76    SIGBUSs to inform them of misaligned accesses will not work.
     77 
     78    Implementation of sysenter is necessarily partial.  sysenter is a
     79    kind of system call entry.  When doing a sysenter, the return
     80    address is not known -- that is something that is beyond Vex's
     81    knowledge.  So the generated IR forces a return to the scheduler,
     82    which can do what it likes to simulate the sysenter, but it MUST
     83    set this thread's guest_EIP field with the continuation address
     84    before resuming execution.  If that doesn't happen, the thread will
     85    jump to address zero, which is probably fatal.
     86 
     87    This module uses global variables and so is not MT-safe (if that
     88    should ever become relevant).
     89 
     90    The delta values are 32-bit ints, not 64-bit ints.  That means
     91    this module may not work right if run on a 64-bit host.  That should
     92    be fixed properly, really -- if anyone ever wants to use Vex to
     93    translate x86 code for execution on a 64-bit host.
     94 
     95    casLE (implementation of lock-prefixed insns) and rep-prefixed
     96    insns: the side-exit back to the start of the insn is done with
     97    Ijk_Boring.  This is quite wrong, it should be done with
     98    Ijk_NoRedir, since otherwise the side exit, which is intended to
     99    restart the instruction for whatever reason, could go somewhere
    100    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    101    no-redir jumps performance critical, at least for rep-prefixed
    102    instructions, since all iterations thereof would involve such a
    103    jump.  It's not such a big deal with casLE since the side exit is
    104    only taken if the CAS fails, that is, the location is contended,
    105    which is relatively unlikely.
    106 
    107    XXXX: Nov 2009: handling of SWP on ARM suffers from the same
    108    problem.
    109 
    110    Note also, the test for CAS success vs failure is done using
    111    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    112    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    113    shouldn't definedness-check these comparisons.  See
    114    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    115    background/rationale.
    116 */
    117 
    118 /* Performance holes:
    119 
    120    - fcom ; fstsw %ax ; sahf
    121      sahf does not update the O flag (sigh) and so O needs to
    122      be computed.  This is done expensively; it would be better
    123      to have a calculate_eflags_o helper.
    124 
    125    - emwarns; some FP codes can generate huge numbers of these
    126      if the fpucw is changed in an inner loop.  It would be
    127      better for the guest state to have an emwarn-enable reg
    128      which can be set zero or nonzero.  If it is zero, emwarns
    129      are not flagged, and instead control just flows all the
    130      way through bbs as usual.
    131 */
    132 
    133 /* "Special" instructions.
    134 
    135    This instruction decoder can decode four special instructions
    136    which mean nothing natively (are no-ops as far as regs/mem are
    137    concerned) but have meaning for supporting Valgrind.  A special
    138    instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
    139    C1C713 (in the standard interpretation, that means: roll $3, %edi;
    140    roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
    141    one of the following 4 is allowed (standard interpretation in
    142    parentheses):
    143 
    144       87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
    145       87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
    146       87D2 (xchgl %edx,%edx)   call-noredir *%EAX
    147       87FF (xchgl %edi,%edi)   IR injection
    148 
    149    Any other bytes following the 12-byte preamble are illegal and
    150    constitute a failure in instruction decoding.  This all assumes
    151    that the preamble will never occur except in specific code
    152    fragments designed for Valgrind to catch.
    153 
    154    No prefixes may precede a "Special" instruction.
    155 */
    156 
    157 /* LOCK prefixed instructions.  These are translated using IR-level
    158    CAS statements (IRCAS) and are believed to preserve atomicity, even
    159    from the point of view of some other process racing against a
    160    simulated one (presumably they communicate via a shared memory
    161    segment).
    162 
    163    Handlers which are aware of LOCK prefixes are:
    164       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    165       dis_cmpxchg_G_E  (cmpxchg)
    166       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    167       dis_Grp3         (not, neg)
    168       dis_Grp4         (inc, dec)
    169       dis_Grp5         (inc, dec)
    170       dis_Grp8_Imm     (bts, btc, btr)
    171       dis_bt_G_E       (bts, btc, btr)
    172       dis_xadd_G_E     (xadd)
    173 */
    174 
    175 
    176 #include "libvex_basictypes.h"
    177 #include "libvex_ir.h"
    178 #include "libvex.h"
    179 #include "libvex_guest_x86.h"
    180 
    181 #include "main_util.h"
    182 #include "main_globals.h"
    183 #include "guest_generic_bb_to_IR.h"
    184 #include "guest_generic_x87.h"
    185 #include "guest_x86_defs.h"
    186 
    187 
    188 /*------------------------------------------------------------*/
    189 /*--- Globals                                              ---*/
    190 /*------------------------------------------------------------*/
    191 
    192 /* These are set at the start of the translation of an insn, right
    193    down in disInstr_X86, so that we don't have to pass them around
    194    endlessly.  They are all constant during the translation of any
    195    given insn. */
    196 
/* We need to know this to do sub-register accesses correctly.  The
   register-offset helpers in this file assert that it is False, so
   only little-endian hosts are handled by them. */
static Bool host_is_bigendian;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed).  The getIByte/getUChar/get?Disp* helpers
   index it with a byte offset ('delta'). */
static UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr32 guest_EIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr32 guest_EIP_curr_instr;

/* The IRSB* into which we're generating code.  stmt() appends to it
   and newTemp() allocates temporaries in its type environment. */
static IRSB* irsb;
    213 
    214 
    215 /*------------------------------------------------------------*/
    216 /*--- Debugging output                                     ---*/
    217 /*------------------------------------------------------------*/
    218 
    219 #define DIP(format, args...)           \
    220    if (vex_traceflags & VEX_TRACE_FE)  \
    221       vex_printf(format, ## args)
    222 
    223 #define DIS(buf, format, args...)      \
    224    if (vex_traceflags & VEX_TRACE_FE)  \
    225       vex_sprintf(buf, format, ## args)
    226 
    227 
    228 /*------------------------------------------------------------*/
    229 /*--- Offsets of various parts of the x86 guest state.     ---*/
    230 /*------------------------------------------------------------*/
    231 
/* Byte offsets of individual fields within VexGuestX86State.  These
   are the values passed as the guest-state offset to IRExpr_Get and
   IRStmt_Put by the accessor helpers in this file. */

/* 32-bit integer registers. */
#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)

/* Instruction pointer. */
#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)

/* The four words of the flags thunk (CC_OP/CC_DEP1/CC_DEP2/CC_NDEP). */
#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)

/* x87 FP state (first element of the register and tag arrays) and
   the separately-held flag fields. */
#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)

/* Segment registers, plus the LDT and GDT fields. */
#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)

/* SSE rounding field and the eight XMM registers. */
#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)

/* Emulation-note field. */
#define OFFB_EMNOTE    offsetof(VexGuestX86State,guest_EMNOTE)

/* CMSTART/CMLEN and NRADDR fields. */
#define OFFB_CMSTART   offsetof(VexGuestX86State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestX86State,guest_CMLEN)
#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)

/* IP_AT_SYSCALL field. */
#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
    283 
    284 
    285 /*------------------------------------------------------------*/
    286 /*--- Helper bits and pieces for deconstructing the        ---*/
    287 /*--- x86 insn stream.                                     ---*/
    288 /*------------------------------------------------------------*/
    289 
/* This is the Intel register encoding -- integer regs.  These are
   the 0 .. 7 'archreg' values accepted by getIReg/putIReg. */
#define R_EAX 0
#define R_ECX 1
#define R_EDX 2
#define R_EBX 3
#define R_ESP 4
#define R_EBP 5
#define R_ESI 6
#define R_EDI 7

/* 8-bit register numbers for the low and second byte of EAX.  For
   1-byte accesses integerGuestRegOffset maps numbers 0 .. 3 to the
   first byte of a register and 4 .. 7 to the byte at offset 1. */
#define R_AL (0+R_EAX)
#define R_AH (4+R_EAX)

/* This is the Intel register encoding -- segment regs.  These are
   the 'sreg' values accepted by getSReg/putSReg. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5
    310 
    311 
    312 /* Add a statement to the list held by "irbb". */
    313 static void stmt ( IRStmt* st )
    314 {
    315    addStmtToIRSB( irsb, st );
    316 }
    317 
    318 /* Generate a new temporary of the given type. */
    319 static IRTemp newTemp ( IRType ty )
    320 {
    321    vassert(isPlausibleIRType(ty));
    322    return newIRTemp( irsb->tyenv, ty );
    323 }
    324 
    325 /* Various simple conversions */
    326 
    327 static UInt extend_s_8to32( UInt x )
    328 {
    329    return (UInt)((((Int)x) << 24) >> 24);
    330 }
    331 
    332 static UInt extend_s_16to32 ( UInt x )
    333 {
    334    return (UInt)((((Int)x) << 16) >> 16);
    335 }
    336 
    337 /* Fetch a byte from the guest insn stream. */
    338 static UChar getIByte ( Int delta )
    339 {
    340    return guest_code[delta];
    341 }
    342 
    343 /* Extract the reg field from a modRM byte. */
    344 static Int gregOfRM ( UChar mod_reg_rm )
    345 {
    346    return (Int)( (mod_reg_rm >> 3) & 7 );
    347 }
    348 
    349 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    350    register or memory.  If so, the byte will have the form 11XXXYYY,
    351    where YYY is the register number. */
    352 static Bool epartIsReg ( UChar mod_reg_rm )
    353 {
    354    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    355 }
    356 
    357 /* ... and extract the register number ... */
    358 static Int eregOfRM ( UChar mod_reg_rm )
    359 {
    360    return (Int)(mod_reg_rm & 0x7);
    361 }
    362 
    363 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    364 
    365 static UChar getUChar ( Int delta )
    366 {
    367    UChar v = guest_code[delta+0];
    368    return toUChar(v);
    369 }
    370 
    371 static UInt getUDisp16 ( Int delta )
    372 {
    373    UInt v = guest_code[delta+1]; v <<= 8;
    374    v |= guest_code[delta+0];
    375    return v & 0xFFFF;
    376 }
    377 
    378 static UInt getUDisp32 ( Int delta )
    379 {
    380    UInt v = guest_code[delta+3]; v <<= 8;
    381    v |= guest_code[delta+2]; v <<= 8;
    382    v |= guest_code[delta+1]; v <<= 8;
    383    v |= guest_code[delta+0];
    384    return v;
    385 }
    386 
    387 static UInt getUDisp ( Int size, Int delta )
    388 {
    389    switch (size) {
    390       case 4: return getUDisp32(delta);
    391       case 2: return getUDisp16(delta);
    392       case 1: return (UInt)getUChar(delta);
    393       default: vpanic("getUDisp(x86)");
    394    }
    395    return 0; /*notreached*/
    396 }
    397 
    398 
    399 /* Get a byte value out of the insn stream and sign-extend to 32
    400    bits. */
    401 static UInt getSDisp8 ( Int delta )
    402 {
    403    return extend_s_8to32( (UInt) (guest_code[delta]) );
    404 }
    405 
    406 static UInt getSDisp16 ( Int delta0 )
    407 {
    408    UChar* eip = (UChar*)(&guest_code[delta0]);
    409    UInt d = *eip++;
    410    d |= ((*eip++) << 8);
    411    return extend_s_16to32(d);
    412 }
    413 
    414 static UInt getSDisp ( Int size, Int delta )
    415 {
    416    switch (size) {
    417       case 4: return getUDisp32(delta);
    418       case 2: return getSDisp16(delta);
    419       case 1: return getSDisp8(delta);
    420       default: vpanic("getSDisp(x86)");
    421   }
    422   return 0; /*notreached*/
    423 }
    424 
    425 
    426 /*------------------------------------------------------------*/
    427 /*--- Helpers for constructing IR.                         ---*/
    428 /*------------------------------------------------------------*/
    429 
    430 /* Create a 1/2/4 byte read of an x86 integer registers.  For 16/8 bit
    431    register references, we need to take the host endianness into
    432    account.  Supplied value is 0 .. 7 and in the Intel instruction
    433    encoding. */
    434 
    435 static IRType szToITy ( Int n )
    436 {
    437    switch (n) {
    438       case 1: return Ity_I8;
    439       case 2: return Ity_I16;
    440       case 4: return Ity_I32;
    441       default: vpanic("szToITy(x86)");
    442    }
    443 }
    444 
    445 /* On a little-endian host, less significant bits of the guest
    446    registers are at lower addresses.  Therefore, if a reference to a
    447    register low half has the safe guest state offset as a reference to
    448    the full register.
    449 */
    450 static Int integerGuestRegOffset ( Int sz, UInt archreg )
    451 {
    452    vassert(archreg < 8);
    453 
    454    /* Correct for little-endian host only. */
    455    vassert(!host_is_bigendian);
    456 
    457    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
    458       switch (archreg) {
    459          case R_EAX: return OFFB_EAX;
    460          case R_EBX: return OFFB_EBX;
    461          case R_ECX: return OFFB_ECX;
    462          case R_EDX: return OFFB_EDX;
    463          case R_ESI: return OFFB_ESI;
    464          case R_EDI: return OFFB_EDI;
    465          case R_ESP: return OFFB_ESP;
    466          case R_EBP: return OFFB_EBP;
    467          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
    468       }
    469    }
    470 
    471    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    472    switch (archreg-4) {
    473       case R_EAX: return 1+ OFFB_EAX;
    474       case R_EBX: return 1+ OFFB_EBX;
    475       case R_ECX: return 1+ OFFB_ECX;
    476       case R_EDX: return 1+ OFFB_EDX;
    477       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    478    }
    479 
    480    /* NOTREACHED */
    481    vpanic("integerGuestRegOffset(x86,le)");
    482 }
    483 
    484 static Int segmentGuestRegOffset ( UInt sreg )
    485 {
    486    switch (sreg) {
    487       case R_ES: return OFFB_ES;
    488       case R_CS: return OFFB_CS;
    489       case R_SS: return OFFB_SS;
    490       case R_DS: return OFFB_DS;
    491       case R_FS: return OFFB_FS;
    492       case R_GS: return OFFB_GS;
    493       default: vpanic("segmentGuestRegOffset(x86)");
    494    }
    495 }
    496 
    497 static Int xmmGuestRegOffset ( UInt xmmreg )
    498 {
    499    switch (xmmreg) {
    500       case 0: return OFFB_XMM0;
    501       case 1: return OFFB_XMM1;
    502       case 2: return OFFB_XMM2;
    503       case 3: return OFFB_XMM3;
    504       case 4: return OFFB_XMM4;
    505       case 5: return OFFB_XMM5;
    506       case 6: return OFFB_XMM6;
    507       case 7: return OFFB_XMM7;
    508       default: vpanic("xmmGuestRegOffset");
    509    }
    510 }
    511 
    512 /* Lanes of vector registers are always numbered from zero being the
    513    least significant lane (rightmost in the register).  */
    514 
    515 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
    516 {
    517    /* Correct for little-endian host only. */
    518    vassert(!host_is_bigendian);
    519    vassert(laneno >= 0 && laneno < 8);
    520    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
    521 }
    522 
    523 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
    524 {
    525    /* Correct for little-endian host only. */
    526    vassert(!host_is_bigendian);
    527    vassert(laneno >= 0 && laneno < 4);
    528    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
    529 }
    530 
    531 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
    532 {
    533    /* Correct for little-endian host only. */
    534    vassert(!host_is_bigendian);
    535    vassert(laneno >= 0 && laneno < 2);
    536    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
    537 }
    538 
    539 static IRExpr* getIReg ( Int sz, UInt archreg )
    540 {
    541    vassert(sz == 1 || sz == 2 || sz == 4);
    542    vassert(archreg < 8);
    543    return IRExpr_Get( integerGuestRegOffset(sz,archreg),
    544                       szToITy(sz) );
    545 }
    546 
    547 /* Ditto, but write to a reg instead. */
    548 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
    549 {
    550    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    551    switch (sz) {
    552       case 1: vassert(ty == Ity_I8); break;
    553       case 2: vassert(ty == Ity_I16); break;
    554       case 4: vassert(ty == Ity_I32); break;
    555       default: vpanic("putIReg(x86)");
    556    }
    557    vassert(archreg < 8);
    558    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
    559 }
    560 
    561 static IRExpr* getSReg ( UInt sreg )
    562 {
    563    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
    564 }
    565 
    566 static void putSReg ( UInt sreg, IRExpr* e )
    567 {
    568    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    569    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
    570 }
    571 
    572 static IRExpr* getXMMReg ( UInt xmmreg )
    573 {
    574    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
    575 }
    576 
    577 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
    578 {
    579    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
    580 }
    581 
    582 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
    583 {
    584    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
    585 }
    586 
    587 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
    588 {
    589    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
    590 }
    591 
    592 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
    593 {
    594    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
    595 }
    596 
    597 static void putXMMReg ( UInt xmmreg, IRExpr* e )
    598 {
    599    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
    600    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
    601 }
    602 
    603 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
    604 {
    605    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
    606    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    607 }
    608 
    609 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
    610 {
    611    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
    612    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    613 }
    614 
    615 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
    616 {
    617    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
    618    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    619 }
    620 
    621 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
    622 {
    623    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
    624    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    625 }
    626 
    627 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
    628 {
    629    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    630    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
    631 }
    632 
    633 static void assign ( IRTemp dst, IRExpr* e )
    634 {
    635    stmt( IRStmt_WrTmp(dst, e) );
    636 }
    637 
    638 static void storeLE ( IRExpr* addr, IRExpr* data )
    639 {
    640    stmt( IRStmt_Store(Iend_LE, addr, data) );
    641 }
    642 
    643 static IRExpr* unop ( IROp op, IRExpr* a )
    644 {
    645    return IRExpr_Unop(op, a);
    646 }
    647 
    648 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    649 {
    650    return IRExpr_Binop(op, a1, a2);
    651 }
    652 
    653 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    654 {
    655    return IRExpr_Triop(op, a1, a2, a3);
    656 }
    657 
    658 static IRExpr* mkexpr ( IRTemp tmp )
    659 {
    660    return IRExpr_RdTmp(tmp);
    661 }
    662 
    663 static IRExpr* mkU8 ( UInt i )
    664 {
    665    vassert(i < 256);
    666    return IRExpr_Const(IRConst_U8( (UChar)i ));
    667 }
    668 
    669 static IRExpr* mkU16 ( UInt i )
    670 {
    671    vassert(i < 65536);
    672    return IRExpr_Const(IRConst_U16( (UShort)i ));
    673 }
    674 
    675 static IRExpr* mkU32 ( UInt i )
    676 {
    677    return IRExpr_Const(IRConst_U32(i));
    678 }
    679 
    680 static IRExpr* mkU64 ( ULong i )
    681 {
    682    return IRExpr_Const(IRConst_U64(i));
    683 }
    684 
    685 static IRExpr* mkU ( IRType ty, UInt i )
    686 {
    687    if (ty == Ity_I8)  return mkU8(i);
    688    if (ty == Ity_I16) return mkU16(i);
    689    if (ty == Ity_I32) return mkU32(i);
    690    /* If this panics, it usually means you passed a size (1,2,4)
    691       value as the IRType, rather than a real IRType. */
    692    vpanic("mkU(x86)");
    693 }
    694 
    695 static IRExpr* mkV128 ( UShort mask )
    696 {
    697    return IRExpr_Const(IRConst_V128(mask));
    698 }
    699 
    700 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    701 {
    702    return IRExpr_Load(Iend_LE, ty, addr);
    703 }
    704 
    705 static IROp mkSizedOp ( IRType ty, IROp op8 )
    706 {
    707    Int adj;
    708    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    709    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    710            || op8 == Iop_Mul8
    711            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    712            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    713            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    714            || op8 == Iop_CasCmpNE8
    715            || op8 == Iop_ExpCmpNE8
    716            || op8 == Iop_Not8);
    717    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    718    return adj + op8;
    719 }
    720 
    721 static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
    722 {
    723    if (szSmall == 1 && szBig == 4) {
    724       return signd ? Iop_8Sto32 : Iop_8Uto32;
    725    }
    726    if (szSmall == 1 && szBig == 2) {
    727       return signd ? Iop_8Sto16 : Iop_8Uto16;
    728    }
    729    if (szSmall == 2 && szBig == 4) {
    730       return signd ? Iop_16Sto32 : Iop_16Uto32;
    731    }
    732    vpanic("mkWidenOp(x86,guest)");
    733 }
    734 
    735 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
    736 {
    737    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
    738    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
    739    return unop(Iop_32to1,
    740                binop(Iop_And32,
    741                      unop(Iop_1Uto32,x),
    742                      unop(Iop_1Uto32,y)));
    743 }
    744 
    745 /* Generate a compare-and-swap operation, operating on memory at
    746    'addr'.  The expected value is 'expVal' and the new value is
    747    'newVal'.  If the operation fails, then transfer control (with a
    748    no-redir jump (XXX no -- see comment at top of this file)) to
    749    'restart_point', which is presumably the address of the guest
    750    instruction again -- retrying, essentially. */
    751 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
    752                     Addr32 restart_point )
    753 {
    754    IRCAS* cas;
    755    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
    756    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
    757    IRTemp oldTmp = newTemp(tyE);
    758    IRTemp expTmp = newTemp(tyE);
    759    vassert(tyE == tyN);
    760    vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
    761    assign(expTmp, expVal);
    762    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
    763                   NULL, mkexpr(expTmp), NULL, newVal );
    764    stmt( IRStmt_CAS(cas) );
    765    stmt( IRStmt_Exit(
    766             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
    767                    mkexpr(oldTmp), mkexpr(expTmp) ),
    768             Ijk_Boring, /*Ijk_NoRedir*/
    769             IRConst_U32( restart_point ),
    770             OFFB_EIP
    771          ));
    772 }
    773 
    774 
    775 /*------------------------------------------------------------*/
    776 /*--- Helpers for %eflags.                                 ---*/
    777 /*------------------------------------------------------------*/
    778 
    779 /* -------------- Evaluating the flags-thunk. -------------- */
    780 
    781 /* Build IR to calculate all the eflags from stored
    782    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    783    Ity_I32. */
    784 static IRExpr* mk_x86g_calculate_eflags_all ( void )
    785 {
    786    IRExpr** args
    787       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    788                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    789                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    790                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    791    IRExpr* call
    792       = mkIRExprCCall(
    793            Ity_I32,
    794            0/*regparm*/,
    795            "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
    796            args
    797         );
    798    /* Exclude OP and NDEP from definedness checking.  We're only
    799       interested in DEP1 and DEP2. */
    800    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    801    return call;
    802 }
    803 
    804 /* Build IR to calculate some particular condition from stored
    805    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    806    Ity_Bit. */
    807 static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
    808 {
    809    IRExpr** args
    810       = mkIRExprVec_5( mkU32(cond),
    811                        IRExpr_Get(OFFB_CC_OP,  Ity_I32),
    812                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    813                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    814                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    815    IRExpr* call
    816       = mkIRExprCCall(
    817            Ity_I32,
    818            0/*regparm*/,
    819            "x86g_calculate_condition", &x86g_calculate_condition,
    820            args
    821         );
    822    /* Exclude the requested condition, OP and NDEP from definedness
    823       checking.  We're only interested in DEP1 and DEP2. */
    824    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
    825    return unop(Iop_32to1, call);
    826 }
    827 
    828 /* Build IR to calculate just the carry flag from stored
    829    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
    830 static IRExpr* mk_x86g_calculate_eflags_c ( void )
    831 {
    832    IRExpr** args
    833       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    834                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    835                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    836                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    837    IRExpr* call
    838       = mkIRExprCCall(
    839            Ity_I32,
    840            3/*regparm*/,
    841            "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
    842            args
    843         );
    844    /* Exclude OP and NDEP from definedness checking.  We're only
    845       interested in DEP1 and DEP2. */
    846    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    847    return call;
    848 }
    849 
    850 
    851 /* -------------- Building the flags-thunk. -------------- */
    852 
    853 /* The machinery in this section builds the flag-thunk following a
    854    flag-setting operation.  Hence the various setFlags_* functions.
    855 */
    856 
    857 static Bool isAddSub ( IROp op8 )
    858 {
    859    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
    860 }
    861 
    862 static Bool isLogic ( IROp op8 )
    863 {
    864    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
    865 }
    866 
    867 /* U-widen 8/16/32 bit int expr to 32. */
    868 static IRExpr* widenUto32 ( IRExpr* e )
    869 {
    870    switch (typeOfIRExpr(irsb->tyenv,e)) {
    871       case Ity_I32: return e;
    872       case Ity_I16: return unop(Iop_16Uto32,e);
    873       case Ity_I8:  return unop(Iop_8Uto32,e);
    874       default: vpanic("widenUto32");
    875    }
    876 }
    877 
    878 /* S-widen 8/16/32 bit int expr to 32. */
    879 static IRExpr* widenSto32 ( IRExpr* e )
    880 {
    881    switch (typeOfIRExpr(irsb->tyenv,e)) {
    882       case Ity_I32: return e;
    883       case Ity_I16: return unop(Iop_16Sto32,e);
    884       case Ity_I8:  return unop(Iop_8Sto32,e);
    885       default: vpanic("widenSto32");
    886    }
    887 }
    888 
    889 /* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
    890    of these combinations make sense. */
    891 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
    892 {
    893    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
    894    if (src_ty == dst_ty)
    895       return e;
    896    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
    897       return unop(Iop_32to16, e);
    898    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
    899       return unop(Iop_32to8, e);
    900 
    901    vex_printf("\nsrc, dst tys are: ");
    902    ppIRType(src_ty);
    903    vex_printf(", ");
    904    ppIRType(dst_ty);
    905    vex_printf("\n");
    906    vpanic("narrowTo(x86)");
    907 }
    908 
    909 
    910 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
    911    auto-sized up to the real op. */
    912 
    913 static
    914 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
    915 {
    916    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    917 
    918    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    919 
    920    switch (op8) {
    921       case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
    922       case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
    923       default:       ppIROp(op8);
    924                      vpanic("setFlags_DEP1_DEP2(x86)");
    925    }
    926    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    927    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    928    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
    929    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    930       elimination of previous stores to this field work better. */
    931    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    932 }
    933 
    934 
    935 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
    936 
    937 static
    938 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
    939 {
    940    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    941 
    942    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    943 
    944    switch (op8) {
    945       case Iop_Or8:
    946       case Iop_And8:
    947       case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
    948       default:       ppIROp(op8);
    949                      vpanic("setFlags_DEP1(x86)");
    950    }
    951    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    952    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    953    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
    954    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    955       elimination of previous stores to this field work better. */
    956    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    957 }
    958 
    959 
    960 /* For shift operations, we put in the result and the undershifted
    961    result.  Except if the shift amount is zero, the thunk is left
    962    unchanged. */
    963 
    964 static void setFlags_DEP1_DEP2_shift ( IROp    op32,
    965                                        IRTemp  res,
    966                                        IRTemp  resUS,
    967                                        IRType  ty,
    968                                        IRTemp  guard )
    969 {
    970    Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
    971 
    972    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    973    vassert(guard);
    974 
    975    /* Both kinds of right shifts are handled by the same thunk
    976       operation. */
    977    switch (op32) {
    978       case Iop_Shr32:
    979       case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
    980       case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
    981       default:        ppIROp(op32);
    982                       vpanic("setFlags_DEP1_DEP2_shift(x86)");
    983    }
    984 
    985    /* guard :: Ity_I8.  We need to convert it to I1. */
    986    IRTemp guardB = newTemp(Ity_I1);
    987    assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
    988 
    989    /* DEP1 contains the result, DEP2 contains the undershifted value. */
    990    stmt( IRStmt_Put( OFFB_CC_OP,
    991                      IRExpr_ITE( mkexpr(guardB),
    992                                  mkU32(ccOp),
    993                                  IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
    994    stmt( IRStmt_Put( OFFB_CC_DEP1,
    995                      IRExpr_ITE( mkexpr(guardB),
    996                                  widenUto32(mkexpr(res)),
    997                                  IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
    998    stmt( IRStmt_Put( OFFB_CC_DEP2,
    999                      IRExpr_ITE( mkexpr(guardB),
   1000                                  widenUto32(mkexpr(resUS)),
   1001                                  IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   1002    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1003       elimination of previous stores to this field work better. */
   1004    stmt( IRStmt_Put( OFFB_CC_NDEP,
   1005                      IRExpr_ITE( mkexpr(guardB),
   1006                                  mkU32(0),
   1007                                  IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   1008 }
   1009 
   1010 
   1011 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1012    the former value of the carry flag, which unfortunately we have to
   1013    compute. */
   1014 
   1015 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1016 {
   1017    Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
   1018 
   1019    ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   1020    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   1021 
   1022    /* This has to come first, because calculating the C flag
   1023       may require reading all four thunk fields. */
   1024    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   1025    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   1026    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   1027    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   1028 }
   1029 
   1030 
   1031 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1032    two arguments. */
   1033 
   1034 static
   1035 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
   1036 {
   1037    switch (ty) {
   1038       case Ity_I8:
   1039          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
   1040          break;
   1041       case Ity_I16:
   1042          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
   1043          break;
   1044       case Ity_I32:
   1045          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
   1046          break;
   1047       default:
   1048          vpanic("setFlags_MUL(x86)");
   1049    }
   1050    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   1051    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   1052    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1053       elimination of previous stores to this field work better. */
   1054    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1055 }
   1056 
   1057 
   1058 /* -------------- Condition codes. -------------- */
   1059 
   1060 /* Condition codes, using the Intel encoding.  */
   1061 
   1062 static const HChar* name_X86Condcode ( X86Condcode cond )
   1063 {
   1064    switch (cond) {
   1065       case X86CondO:      return "o";
   1066       case X86CondNO:     return "no";
   1067       case X86CondB:      return "b";
   1068       case X86CondNB:     return "nb";
   1069       case X86CondZ:      return "z";
   1070       case X86CondNZ:     return "nz";
   1071       case X86CondBE:     return "be";
   1072       case X86CondNBE:    return "nbe";
   1073       case X86CondS:      return "s";
   1074       case X86CondNS:     return "ns";
   1075       case X86CondP:      return "p";
   1076       case X86CondNP:     return "np";
   1077       case X86CondL:      return "l";
   1078       case X86CondNL:     return "nl";
   1079       case X86CondLE:     return "le";
   1080       case X86CondNLE:    return "nle";
   1081       case X86CondAlways: return "ALWAYS";
   1082       default: vpanic("name_X86Condcode");
   1083    }
   1084 }
   1085 
   1086 static
   1087 X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
   1088                                       Bool*        needInvert )
   1089 {
   1090    vassert(cond >= X86CondO && cond <= X86CondNLE);
   1091    if (cond & 1) {
   1092       *needInvert = True;
   1093       return cond-1;
   1094    } else {
   1095       *needInvert = False;
   1096       return cond;
   1097    }
   1098 }
   1099 
   1100 
   1101 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1102 
   1103 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1104    appropriately.
   1105 
   1106    Optionally, generate a store for the 'tres' value.  This can either
   1107    be a normal store, or it can be a cas-with-possible-failure style
   1108    store:
   1109 
   1110    if taddr is IRTemp_INVALID, then no store is generated.
   1111 
   1112    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1113    the address) is generated:
   1114 
   1115      if texpVal is IRTemp_INVALID then a normal store is
   1116      generated, and restart_point must be zero (it is irrelevant).
   1117 
   1118      if texpVal is not IRTemp_INVALID then a cas-style store is
   1119      generated.  texpVal is the expected value, restart_point
   1120      is the restart point if the store fails, and texpVal must
   1121      have the same type as tres.
   1122 */
   1123 static void helper_ADC ( Int sz,
   1124                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1125                          /* info about optional store: */
   1126                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1127 {
   1128    UInt    thunkOp;
   1129    IRType  ty    = szToITy(sz);
   1130    IRTemp  oldc  = newTemp(Ity_I32);
   1131    IRTemp  oldcn = newTemp(ty);
   1132    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   1133    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1134 
   1135    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1136    vassert(sz == 1 || sz == 2 || sz == 4);
   1137    thunkOp = sz==4 ? X86G_CC_OP_ADCL
   1138                    : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
   1139 
   1140    /* oldc = old carry flag, 0 or 1 */
   1141    assign( oldc,  binop(Iop_And32,
   1142                         mk_x86g_calculate_eflags_c(),
   1143                         mkU32(1)) );
   1144 
   1145    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1146 
   1147    assign( tres, binop(plus,
   1148                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   1149                        mkexpr(oldcn)) );
   1150 
   1151    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1152       start of this function. */
   1153    if (taddr != IRTemp_INVALID) {
   1154       if (texpVal == IRTemp_INVALID) {
   1155          vassert(restart_point == 0);
   1156          storeLE( mkexpr(taddr), mkexpr(tres) );
   1157       } else {
   1158          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1159          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1160          casLE( mkexpr(taddr),
   1161                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1162       }
   1163    }
   1164 
   1165    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1166    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   1167    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1168                                                          mkexpr(oldcn)) )) );
   1169    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1170 }
   1171 
   1172 
   1173 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1174    appropriately.  As with helper_ADC, possibly generate a store of
   1175    the result -- see comments on helper_ADC for details.
   1176 */
   1177 static void helper_SBB ( Int sz,
   1178                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1179                          /* info about optional store: */
   1180                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1181 {
   1182    UInt    thunkOp;
   1183    IRType  ty    = szToITy(sz);
   1184    IRTemp  oldc  = newTemp(Ity_I32);
   1185    IRTemp  oldcn = newTemp(ty);
   1186    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   1187    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1188 
   1189    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1190    vassert(sz == 1 || sz == 2 || sz == 4);
   1191    thunkOp = sz==4 ? X86G_CC_OP_SBBL
   1192                    : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
   1193 
   1194    /* oldc = old carry flag, 0 or 1 */
   1195    assign( oldc, binop(Iop_And32,
   1196                        mk_x86g_calculate_eflags_c(),
   1197                        mkU32(1)) );
   1198 
   1199    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1200 
   1201    assign( tres, binop(minus,
   1202                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   1203                        mkexpr(oldcn)) );
   1204 
   1205    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1206       start of this function. */
   1207    if (taddr != IRTemp_INVALID) {
   1208       if (texpVal == IRTemp_INVALID) {
   1209          vassert(restart_point == 0);
   1210          storeLE( mkexpr(taddr), mkexpr(tres) );
   1211       } else {
   1212          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1213          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1214          casLE( mkexpr(taddr),
   1215                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1216       }
   1217    }
   1218 
   1219    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1220    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   1221    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1222                                                          mkexpr(oldcn)) )) );
   1223    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1224 }
   1225 
   1226 
   1227 /* -------------- Helpers for disassembly printing. -------------- */
   1228 
   1229 static const HChar* nameGrp1 ( Int opc_aux )
   1230 {
   1231    static const HChar* grp1_names[8]
   1232      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1233    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   1234    return grp1_names[opc_aux];
   1235 }
   1236 
   1237 static const HChar* nameGrp2 ( Int opc_aux )
   1238 {
   1239    static const HChar* grp2_names[8]
   1240      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1241    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   1242    return grp2_names[opc_aux];
   1243 }
   1244 
   1245 static const HChar* nameGrp4 ( Int opc_aux )
   1246 {
   1247    static const HChar* grp4_names[8]
   1248      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1249    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   1250    return grp4_names[opc_aux];
   1251 }
   1252 
   1253 static const HChar* nameGrp5 ( Int opc_aux )
   1254 {
   1255    static const HChar* grp5_names[8]
   1256      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1257    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   1258    return grp5_names[opc_aux];
   1259 }
   1260 
   1261 static const HChar* nameGrp8 ( Int opc_aux )
   1262 {
   1263    static const HChar* grp8_names[8]
   1264      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   1265    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   1266    return grp8_names[opc_aux];
   1267 }
   1268 
   1269 static const HChar* nameIReg ( Int size, Int reg )
   1270 {
   1271    static const HChar* ireg32_names[8]
   1272      = { "%eax", "%ecx", "%edx", "%ebx",
   1273          "%esp", "%ebp", "%esi", "%edi" };
   1274    static const HChar* ireg16_names[8]
   1275      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   1276    static const HChar* ireg8_names[8]
   1277      = { "%al", "%cl", "%dl", "%bl",
   1278          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   1279    if (reg < 0 || reg > 7) goto bad;
   1280    switch (size) {
   1281       case 4: return ireg32_names[reg];
   1282       case 2: return ireg16_names[reg];
   1283       case 1: return ireg8_names[reg];
   1284    }
   1285   bad:
   1286    vpanic("nameIReg(X86)");
   1287    return NULL; /*notreached*/
   1288 }
   1289 
   1290 static const HChar* nameSReg ( UInt sreg )
   1291 {
   1292    switch (sreg) {
   1293       case R_ES: return "%es";
   1294       case R_CS: return "%cs";
   1295       case R_SS: return "%ss";
   1296       case R_DS: return "%ds";
   1297       case R_FS: return "%fs";
   1298       case R_GS: return "%gs";
   1299       default: vpanic("nameSReg(x86)");
   1300    }
   1301 }
   1302 
   1303 static const HChar* nameMMXReg ( Int mmxreg )
   1304 {
   1305    static const HChar* mmx_names[8]
   1306      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   1307    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   1308    return mmx_names[mmxreg];
   1309 }
   1310 
   1311 static const HChar* nameXMMReg ( Int xmmreg )
   1312 {
   1313    static const HChar* xmm_names[8]
   1314      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
   1315          "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   1316    if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   1317    return xmm_names[xmmreg];
   1318 }
   1319 
   1320 static const HChar* nameMMXGran ( Int gran )
   1321 {
   1322    switch (gran) {
   1323       case 0: return "b";
   1324       case 1: return "w";
   1325       case 2: return "d";
   1326       case 3: return "q";
   1327       default: vpanic("nameMMXGran(x86,guest)");
   1328    }
   1329 }
   1330 
   1331 static HChar nameISize ( Int size )
   1332 {
   1333    switch (size) {
   1334       case 4: return 'l';
   1335       case 2: return 'w';
   1336       case 1: return 'b';
   1337       default: vpanic("nameISize(x86)");
   1338    }
   1339 }
   1340 
   1341 
   1342 /*------------------------------------------------------------*/
   1343 /*--- JMP helpers                                          ---*/
   1344 /*------------------------------------------------------------*/
   1345 
   1346 static void jmp_lit( /*MOD*/DisResult* dres,
   1347                      IRJumpKind kind, Addr32 d32 )
   1348 {
   1349    vassert(dres->whatNext    == Dis_Continue);
   1350    vassert(dres->len         == 0);
   1351    vassert(dres->continueAt  == 0);
   1352    vassert(dres->jk_StopHere == Ijk_INVALID);
   1353    dres->whatNext    = Dis_StopHere;
   1354    dres->jk_StopHere = kind;
   1355    stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
   1356 }
   1357 
   1358 static void jmp_treg( /*MOD*/DisResult* dres,
   1359                       IRJumpKind kind, IRTemp t )
   1360 {
   1361    vassert(dres->whatNext    == Dis_Continue);
   1362    vassert(dres->len         == 0);
   1363    vassert(dres->continueAt  == 0);
   1364    vassert(dres->jk_StopHere == Ijk_INVALID);
   1365    dres->whatNext    = Dis_StopHere;
   1366    dres->jk_StopHere = kind;
   1367    stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
   1368 }
   1369 
   1370 static
   1371 void jcc_01( /*MOD*/DisResult* dres,
   1372              X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
   1373 {
   1374    Bool        invert;
   1375    X86Condcode condPos;
   1376    vassert(dres->whatNext    == Dis_Continue);
   1377    vassert(dres->len         == 0);
   1378    vassert(dres->continueAt  == 0);
   1379    vassert(dres->jk_StopHere == Ijk_INVALID);
   1380    dres->whatNext    = Dis_StopHere;
   1381    dres->jk_StopHere = Ijk_Boring;
   1382    condPos = positiveIse_X86Condcode ( cond, &invert );
   1383    if (invert) {
   1384       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1385                          Ijk_Boring,
   1386                          IRConst_U32(d32_false),
   1387                          OFFB_EIP ) );
   1388       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
   1389    } else {
   1390       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1391                          Ijk_Boring,
   1392                          IRConst_U32(d32_true),
   1393                          OFFB_EIP ) );
   1394       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
   1395    }
   1396 }
   1397 
   1398 
   1399 /*------------------------------------------------------------*/
   1400 /*--- Disassembling addressing modes                       ---*/
   1401 /*------------------------------------------------------------*/
   1402 
   1403 static
   1404 const HChar* sorbTxt ( UChar sorb )
   1405 {
   1406    switch (sorb) {
   1407       case 0:    return ""; /* no override */
   1408       case 0x3E: return "%ds";
   1409       case 0x26: return "%es:";
   1410       case 0x64: return "%fs:";
   1411       case 0x65: return "%gs:";
   1412       default: vpanic("sorbTxt(x86,guest)");
   1413    }
   1414 }
   1415 
   1416 
   1417 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   1418    linear address by adding any required segment override as indicated
   1419    by sorb. */
   1420 static
   1421 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
   1422 {
   1423    Int    sreg;
   1424    IRType hWordTy;
   1425    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   1426 
   1427    if (sorb == 0)
   1428       /* the common case - no override */
   1429       return virtual;
   1430 
   1431    switch (sorb) {
   1432       case 0x3E: sreg = R_DS; break;
   1433       case 0x26: sreg = R_ES; break;
   1434       case 0x64: sreg = R_FS; break;
   1435       case 0x65: sreg = R_GS; break;
   1436       default: vpanic("handleSegOverride(x86,guest)");
   1437    }
   1438 
   1439    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   1440 
   1441    seg_selector = newTemp(Ity_I32);
   1442    ldt_ptr      = newTemp(hWordTy);
   1443    gdt_ptr      = newTemp(hWordTy);
   1444    r64          = newTemp(Ity_I64);
   1445 
   1446    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   1447    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   1448    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   1449 
   1450    /*
   1451    Call this to do the translation and limit checks:
   1452    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   1453                                  UInt seg_selector, UInt virtual_addr )
   1454    */
   1455    assign(
   1456       r64,
   1457       mkIRExprCCall(
   1458          Ity_I64,
   1459          0/*regparms*/,
   1460          "x86g_use_seg_selector",
   1461          &x86g_use_seg_selector,
   1462          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   1463                         mkexpr(seg_selector), virtual)
   1464       )
   1465    );
   1466 
   1467    /* If the high 32 of the result are non-zero, there was a
   1468       failure in address translation.  In which case, make a
   1469       quick exit.
   1470    */
   1471    stmt(
   1472       IRStmt_Exit(
   1473          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   1474          Ijk_MapFail,
   1475          IRConst_U32( guest_EIP_curr_instr ),
   1476          OFFB_EIP
   1477       )
   1478    );
   1479 
   1480    /* otherwise, here's the translated result. */
   1481    return unop(Iop_64to32, mkexpr(r64));
   1482 }
   1483 
   1484 
   1485 /* Generate IR to calculate an address indicated by a ModRM and
   1486    following SIB bytes.  The expression, and the number of bytes in
   1487    the address mode, are returned.  Note that this fn should not be
   1488    called if the R/M part of the address denotes a register instead of
   1489    memory.  If print_codegen is true, text of the addressing mode is
   1490    placed in buf.
   1491 
   1492    The computed address is stored in a new tempreg, and the
   1493    identity of the tempreg is returned.  */
   1494 
   1495 static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
   1496 {
   1497    IRTemp tmp = newTemp(Ity_I32);
   1498    assign( tmp, addr32 );
   1499    return tmp;
   1500 }
   1501 
   1502 static
   1503 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
   1504 {
   1505    UChar mod_reg_rm = getIByte(delta);
   1506    delta++;
   1507 
   1508    buf[0] = (UChar)0;
   1509 
   1510    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1511       jump table seems a bit excessive.
   1512    */
   1513    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   1514    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1515                                             /* is now XX0XXYYY */
   1516    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   1517    switch (mod_reg_rm) {
   1518 
   1519       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
   1520          --> GET %reg, t
   1521       */
   1522       case 0x00: case 0x01: case 0x02: case 0x03:
   1523       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1524          { UChar rm = mod_reg_rm;
   1525            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
   1526            *len = 1;
   1527            return disAMode_copy2tmp(
   1528                   handleSegOverride(sorb, getIReg(4,rm)));
   1529          }
   1530 
   1531       /* d8(%eax) ... d8(%edi), not including d8(%esp)
   1532          --> GET %reg, t ; ADDL d8, t
   1533       */
   1534       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1535       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1536          { UChar rm = toUChar(mod_reg_rm & 7);
   1537            UInt  d  = getSDisp8(delta);
   1538            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1539            *len = 2;
   1540            return disAMode_copy2tmp(
   1541                   handleSegOverride(sorb,
   1542                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1543          }
   1544 
   1545       /* d32(%eax) ... d32(%edi), not including d32(%esp)
   1546          --> GET %reg, t ; ADDL d8, t
   1547       */
   1548       case 0x10: case 0x11: case 0x12: case 0x13:
   1549       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1550          { UChar rm = toUChar(mod_reg_rm & 7);
   1551            UInt  d  = getUDisp32(delta);
   1552            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1553            *len = 5;
   1554            return disAMode_copy2tmp(
   1555                   handleSegOverride(sorb,
   1556                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1557          }
   1558 
   1559       /* a register, %eax .. %edi.  This shouldn't happen. */
   1560       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1561       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1562          vpanic("disAMode(x86): not an addr!");
   1563 
   1564       /* a 32-bit literal address
   1565          --> MOV d32, tmp
   1566       */
   1567       case 0x05:
   1568          { UInt d = getUDisp32(delta);
   1569            *len = 5;
   1570            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
   1571            return disAMode_copy2tmp(
   1572                      handleSegOverride(sorb, mkU32(d)));
   1573          }
   1574 
   1575       case 0x04: {
   1576          /* SIB, with no displacement.  Special cases:
   1577             -- %esp cannot act as an index value.
   1578                If index_r indicates %esp, zero is used for the index.
   1579             -- when mod is zero and base indicates EBP, base is instead
   1580                a 32-bit literal.
   1581             It's all madness, I tell you.  Extract %index, %base and
   1582             scale from the SIB byte.  The value denoted is then:
   1583                | %index == %ESP && %base == %EBP
   1584                = d32 following SIB byte
   1585                | %index == %ESP && %base != %EBP
   1586                = %base
   1587                | %index != %ESP && %base == %EBP
   1588                = d32 following SIB byte + (%index << scale)
   1589                | %index != %ESP && %base != %EBP
   1590                = %base + (%index << scale)
   1591 
   1592             What happens to the souls of CPU architects who dream up such
   1593             horrendous schemes, do you suppose?
   1594          */
   1595          UChar sib     = getIByte(delta);
   1596          UChar scale   = toUChar((sib >> 6) & 3);
   1597          UChar index_r = toUChar((sib >> 3) & 7);
   1598          UChar base_r  = toUChar(sib & 7);
   1599          delta++;
   1600 
   1601          if (index_r != R_ESP && base_r != R_EBP) {
   1602             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
   1603                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1604             *len = 2;
   1605             return
   1606                disAMode_copy2tmp(
   1607                handleSegOverride(sorb,
   1608                   binop(Iop_Add32,
   1609                         getIReg(4,base_r),
   1610                         binop(Iop_Shl32, getIReg(4,index_r),
   1611                               mkU8(scale)))));
   1612          }
   1613 
   1614          if (index_r != R_ESP && base_r == R_EBP) {
   1615             UInt d = getUDisp32(delta);
   1616             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
   1617                       nameIReg(4,index_r), 1<<scale);
   1618             *len = 6;
   1619             return
   1620                disAMode_copy2tmp(
   1621                handleSegOverride(sorb,
   1622                   binop(Iop_Add32,
   1623                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
   1624                         mkU32(d))));
   1625          }
   1626 
   1627          if (index_r == R_ESP && base_r != R_EBP) {
   1628             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
   1629             *len = 2;
   1630             return disAMode_copy2tmp(
   1631                    handleSegOverride(sorb, getIReg(4,base_r)));
   1632          }
   1633 
   1634          if (index_r == R_ESP && base_r == R_EBP) {
   1635             UInt d = getUDisp32(delta);
   1636             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
   1637             *len = 6;
   1638             return disAMode_copy2tmp(
   1639                    handleSegOverride(sorb, mkU32(d)));
   1640          }
   1641          /*NOTREACHED*/
   1642          vassert(0);
   1643       }
   1644 
   1645       /* SIB, with 8-bit displacement.  Special cases:
   1646          -- %esp cannot act as an index value.
   1647             If index_r indicates %esp, zero is used for the index.
   1648          Denoted value is:
   1649             | %index == %ESP
   1650             = d8 + %base
   1651             | %index != %ESP
   1652             = d8 + %base + (%index << scale)
   1653       */
   1654       case 0x0C: {
   1655          UChar sib     = getIByte(delta);
   1656          UChar scale   = toUChar((sib >> 6) & 3);
   1657          UChar index_r = toUChar((sib >> 3) & 7);
   1658          UChar base_r  = toUChar(sib & 7);
   1659          UInt  d       = getSDisp8(delta+1);
   1660 
   1661          if (index_r == R_ESP) {
   1662             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1663                                    (Int)d, nameIReg(4,base_r));
   1664             *len = 3;
   1665             return disAMode_copy2tmp(
   1666                    handleSegOverride(sorb,
   1667                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1668          } else {
   1669             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1670                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1671             *len = 3;
   1672             return
   1673                 disAMode_copy2tmp(
   1674                 handleSegOverride(sorb,
   1675                   binop(Iop_Add32,
   1676                         binop(Iop_Add32,
   1677                               getIReg(4,base_r),
   1678                               binop(Iop_Shl32,
   1679                                     getIReg(4,index_r), mkU8(scale))),
   1680                         mkU32(d))));
   1681          }
   1682 	 /*NOTREACHED*/
   1683          vassert(0);
   1684       }
   1685 
   1686       /* SIB, with 32-bit displacement.  Special cases:
   1687          -- %esp cannot act as an index value.
   1688             If index_r indicates %esp, zero is used for the index.
   1689          Denoted value is:
   1690             | %index == %ESP
   1691             = d32 + %base
   1692             | %index != %ESP
   1693             = d32 + %base + (%index << scale)
   1694       */
   1695       case 0x14: {
   1696          UChar sib     = getIByte(delta);
   1697          UChar scale   = toUChar((sib >> 6) & 3);
   1698          UChar index_r = toUChar((sib >> 3) & 7);
   1699          UChar base_r  = toUChar(sib & 7);
   1700          UInt d        = getUDisp32(delta+1);
   1701 
   1702          if (index_r == R_ESP) {
   1703             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1704                                    (Int)d, nameIReg(4,base_r));
   1705             *len = 6;
   1706             return disAMode_copy2tmp(
   1707                    handleSegOverride(sorb,
   1708                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1709          } else {
   1710             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1711                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1712             *len = 6;
   1713             return
   1714                 disAMode_copy2tmp(
   1715                 handleSegOverride(sorb,
   1716                   binop(Iop_Add32,
   1717                         binop(Iop_Add32,
   1718                               getIReg(4,base_r),
   1719                               binop(Iop_Shl32,
   1720                                     getIReg(4,index_r), mkU8(scale))),
   1721                         mkU32(d))));
   1722          }
   1723 	 /*NOTREACHED*/
   1724          vassert(0);
   1725       }
   1726 
   1727       default:
   1728          vpanic("disAMode(x86)");
   1729          return 0; /*notreached*/
   1730    }
   1731 }
   1732 
   1733 
   1734 /* Figure out the number of (insn-stream) bytes constituting the amode
   1735    beginning at delta.  Is useful for getting hold of literals beyond
   1736    the end of the amode before it has been disassembled.

           delta is the offset of the modRM byte.  The value returned counts
           the modRM byte itself plus the SIB byte and displacement bytes
           implied by the encoding:
              1  register-indirect with no displacement, or a register operand
              2  d8 + register, or SIB with no displacement
              3  SIB + d8
              5  d32 + register, or a bare 32-bit literal address
              6  SIB + d32 (including the mod==0, SIB-base==EBP form)
           No IR is generated here; only the modRM byte and, for the
           no-displacement SIB form, the SIB byte are read.  */
   1737 
   1738 static UInt lengthAMode ( Int delta )
   1739 {
   1740    UChar mod_reg_rm = getIByte(delta); delta++;
   1741 
   1742    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1743       jump table seems a bit excessive.
              What remains is a 5-bit selector: mod in bits 4..3 and rm in
              bits 2..0.
   1744    */
   1745    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
   1746    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1747                                      /* is now XX0XXYYY */
   1748    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
   1749    switch (mod_reg_rm) {
   1750 
   1751       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
   1752       case 0x00: case 0x01: case 0x02: case 0x03:
   1753       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1754          return 1;
   1755 
   1756       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
   1757       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1758       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1759          return 2;
   1760 
   1761       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
   1762       case 0x10: case 0x11: case 0x12: case 0x13:
   1763       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1764          return 5;
   1765 
   1766       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
   1767       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1768       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1769          return 1;
   1770 
   1771       /* a 32-bit literal address. */
   1772       case 0x05: return 5;
   1773 
   1774       /* SIB, no displacement.  */
   1775       case 0x04: {
                 /* delta has already been stepped past the modRM byte, so
                    this reads the SIB byte.  Only its base field matters:
                    with mod == 0, base == EBP means a d32 follows the SIB
                    byte (1 + 1 + 4 bytes); otherwise it is modRM + SIB.  */
   1776          UChar sib    = getIByte(delta);
   1777          UChar base_r = toUChar(sib & 7);
   1778          if (base_r == R_EBP) return 6; else return 2;
   1779       }
   1780       /* SIB, with 8-bit displacement.  */
   1781       case 0x0C: return 3;
   1782 
   1783       /* SIB, with 32-bit displacement.  */
   1784       case 0x14: return 6;
   1785 
   1786       default:
                 /* The 5-bit selector has 32 values and all of them have a
                    case above, so this is not expected to be reached.  */
   1787          vpanic("lengthAMode");
   1788          return 0; /*notreached*/
   1789    }
   1790 }
   1791 
   1792 /*------------------------------------------------------------*/
   1793 /*--- Disassembling common idioms                          ---*/
   1794 /*------------------------------------------------------------*/
   1795 
   1796 /* Handle binary integer instructions of the form
   1797       op E, G  meaning
   1798       op reg-or-mem, reg
   1799    Is passed a ptr to the modRM byte, the actual operation, and the
   1800    data size.  Returns the address advanced completely over this
   1801    instruction.
   1802 
   1803    E(src) is reg-or-mem
   1804    G(dst) is reg.
   1805 
   1806    If E is reg, -->    GET %G,  tmp
   1807                        OP %E,   tmp
   1808                        PUT tmp, %G
   1809 
   1810    If E is mem and OP is not reversible,
   1811                 -->    (getAddr E) -> tmpa
   1812                        LD (tmpa), tmpa
   1813                        GET %G, tmp2
   1814                        OP tmpa, tmp2
   1815                        PUT tmp2, %G
   1816 
   1817    If E is mem and OP is reversible
   1818                 -->    (getAddr E) -> tmpa
   1819                        LD (tmpa), tmpa
   1820                        OP %G, tmpa
   1821                        PUT tmpa, %G

           Arguments:
              sorb         segment override, handed on to disAMode
              addSubCarry  True for add-with-carry / subtract-with-borrow;
                           op8 must then be Iop_Add8 or Iop_Sub8 and keep
                           must be True (both are asserted)
              op8          the 8-bit variant of the operation; mkSizedOp
                           is used to get the variant for the operand type
              keep         if True the result is written back to G; if
                           False only the setFlags_* call is made
              size         operand size, converted to an IRType by szToITy
              delta0       offset of the modRM byte
              t_x86opc     mnemonic text, used only in the DIP trace

           Returns delta0+1 when E is a register, or delta0+len (len as
           reported by disAMode) when E is memory.

           NOTE(review): the code below makes no distinction between
           reversible and non-reversible OPs; the memory case always loads
           E into one temp, reads G into another and combines the two.
   1822 */
   1823 static
   1824 UInt dis_op2_E_G ( UChar       sorb,
   1825                    Bool        addSubCarry,
   1826                    IROp        op8,
   1827                    Bool        keep,
   1828                    Int         size,
   1829                    Int         delta0,
   1830                    const HChar* t_x86opc )
   1831 {
   1832    HChar   dis_buf[50];
   1833    Int     len;
   1834    IRType  ty   = szToITy(size);
   1835    IRTemp  dst1 = newTemp(ty);     /* result of the operation */
   1836    IRTemp  src  = newTemp(ty);     /* value of E */
   1837    IRTemp  dst0 = newTemp(ty);     /* value of G before the operation */
   1838    UChar   rm   = getUChar(delta0);
   1839    IRTemp  addr = IRTemp_INVALID;
   1840 
   1841    /* addSubCarry == True indicates the intended operation is
   1842       add-with-carry or subtract-with-borrow. */
   1843    if (addSubCarry) {
   1844       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1845       vassert(keep);
   1846    }
   1847 
   1848    if (epartIsReg(rm)) {
   1849       /* Specially handle XOR reg,reg, because that doesn't really
   1850          depend on reg, and doing the obvious thing potentially
   1851          generates a spurious value check failure due to the bogus
   1852          dependency.  Ditto SBB reg,reg. */
   1853       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1854           && gregOfRM(rm) == eregOfRM(rm)) {
                 /* Zero the register first, so that the two reads just
                    below pick up a constant instead of the old value. */
   1855          putIReg(size, gregOfRM(rm), mkU(ty,0));
   1856       }
   1857       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1858       assign( src,  getIReg(size,eregOfRM(rm)) );
   1859 
              /* ADC and SBB go through their helpers; no setFlags_* call is
                 made here for them, so presumably the helpers deal with the
                 flags themselves (not visible in this function). */
   1860       if (addSubCarry && op8 == Iop_Add8) {
   1861          helper_ADC( size, dst1, dst0, src,
   1862                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1863          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1864       } else
   1865       if (addSubCarry && op8 == Iop_Sub8) {
   1866          helper_SBB( size, dst1, dst0, src,
   1867                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1868          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1869       } else {
                 /* Everything else: compute, set flags from the operands
                    (add/sub) or from the result (other ops), and write the
                    result back only if asked to. */
   1870          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1871          if (isAddSub(op8))
   1872             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1873          else
   1874             setFlags_DEP1(op8, dst1, ty);
   1875          if (keep)
   1876             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1877       }
   1878 
   1879       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1880                           nameIReg(size,eregOfRM(rm)),
   1881                           nameIReg(size,gregOfRM(rm)));
              /* Register form: only the modRM byte is consumed. */
   1882       return 1+delta0;
   1883    } else {
   1884       /* E refers to memory */
   1885       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1886       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1887       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   1888 
              /* Same three-way split as the register case; the destination
                 is still the register G, so nothing is stored to memory. */
   1889       if (addSubCarry && op8 == Iop_Add8) {
   1890          helper_ADC( size, dst1, dst0, src,
   1891                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1892          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1893       } else
   1894       if (addSubCarry && op8 == Iop_Sub8) {
   1895          helper_SBB( size, dst1, dst0, src,
   1896                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1897          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1898       } else {
   1899          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1900          if (isAddSub(op8))
   1901             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1902          else
   1903             setFlags_DEP1(op8, dst1, ty);
   1904          if (keep)
   1905             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1906       }
   1907 
   1908       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1909                           dis_buf,nameIReg(size,gregOfRM(rm)));
              /* len is the amode length reported by disAMode. */
   1910       return len+delta0;
   1911    }
   1912 }
   1913 
   1914 
   1915 
   1916 /* Handle binary integer instructions of the form
   1917       op G, E  meaning
   1918       op reg, reg-or-mem
   1919    Is passed a ptr to the modRM byte, the actual operation, and the
   1920    data size.  Returns the address advanced completely over this
   1921    instruction.
   1922 
   1923    G(src) is reg.
   1924    E(dst) is reg-or-mem
   1925 
   1926    If E is reg, -->    GET %E,  tmp
   1927                        OP %G,   tmp
   1928                        PUT tmp, %E
   1929 
   1930    If E is mem, -->    (getAddr E) -> tmpa
   1931                        LD (tmpa), tmpv
   1932                        OP %G, tmpv
   1933                        ST tmpv, (tmpa)

           Arguments:
              sorb         segment override, handed on to disAMode
              locked       only consulted when E is memory: if True the
                           result is written with a compare-and-swap style
                           store whose expected value is the value just
                           loaded, instead of a plain store.  Presumably
                           set for LOCK-prefixed instructions -- confirm at
                           the call sites.
              addSubCarry  True for add-with-carry / subtract-with-borrow;
                           op8 must then be Iop_Add8 or Iop_Sub8 and keep
                           must be True (both are asserted)
              op8          the 8-bit variant of the operation; mkSizedOp
                           is used to get the variant for the operand type
              keep         if True the result is written back to E; if
                           False only the setFlags_* call is made
              size         operand size, converted to an IRType by szToITy
              delta0       offset of the modRM byte
              t_x86opc     mnemonic text, used only in the DIP trace

           Returns delta0+1 when E is a register, or delta0+len (len as
           reported by disAMode) when E is memory.
   1934 */
   1935 static
   1936 UInt dis_op2_G_E ( UChar       sorb,
   1937                    Bool        locked,
   1938                    Bool        addSubCarry,
   1939                    IROp        op8,
   1940                    Bool        keep,
   1941                    Int         size,
   1942                    Int         delta0,
   1943                    const HChar* t_x86opc )
   1944 {
   1945    HChar   dis_buf[50];
   1946    Int     len;
   1947    IRType  ty   = szToITy(size);
   1948    IRTemp  dst1 = newTemp(ty);     /* result of the operation */
   1949    IRTemp  src  = newTemp(ty);     /* value of G */
   1950    IRTemp  dst0 = newTemp(ty);     /* value of E before the operation */
   1951    UChar   rm   = getIByte(delta0);
   1952    IRTemp  addr = IRTemp_INVALID;
   1953 
   1954    /* addSubCarry == True indicates the intended operation is
   1955       add-with-carry or subtract-with-borrow. */
   1956    if (addSubCarry) {
   1957       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1958       vassert(keep);
   1959    }
   1960 
   1961    if (epartIsReg(rm)) {
   1962       /* Specially handle XOR reg,reg, because that doesn't really
   1963          depend on reg, and doing the obvious thing potentially
   1964          generates a spurious value check failure due to the bogus
   1965          dependency.  Ditto SBB reg,reg.*/
   1966       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1967           && gregOfRM(rm) == eregOfRM(rm)) {
                 /* Zero the register first, so that the two reads just
                    below pick up a constant instead of the old value. */
   1968          putIReg(size, eregOfRM(rm), mkU(ty,0));
   1969       }
   1970       assign(dst0, getIReg(size,eregOfRM(rm)));
   1971       assign(src,  getIReg(size,gregOfRM(rm)));
   1972 
              /* 'locked' plays no part in the register case. */
   1973       if (addSubCarry && op8 == Iop_Add8) {
   1974          helper_ADC( size, dst1, dst0, src,
   1975                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1976          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1977       } else
   1978       if (addSubCarry && op8 == Iop_Sub8) {
   1979          helper_SBB( size, dst1, dst0, src,
   1980                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1981          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1982       } else {
   1983          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   1984          if (isAddSub(op8))
   1985             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1986          else
   1987             setFlags_DEP1(op8, dst1, ty);
   1988          if (keep)
   1989             putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1990       }
   1991 
   1992       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1993                           nameIReg(size,gregOfRM(rm)),
   1994                           nameIReg(size,eregOfRM(rm)));
              /* Register form: only the modRM byte is consumed. */
   1995       return 1+delta0;
   1996    }
   1997 
   1998    /* E refers to memory */
   1999    {
   2000       addr = disAMode ( &len, sorb, delta0, dis_buf);
   2001       assign(dst0, loadLE(ty,mkexpr(addr)));
   2002       assign(src,  getIReg(size,gregOfRM(rm)));
   2003 
              /* For ADC/SBB the store to memory is delegated to the helper:
                 it is given the address and, when locked, the expected old
                 value (dst0) plus guest_EIP_curr_instr.  What the helper
                 does with the latter is not visible here. */
   2004       if (addSubCarry && op8 == Iop_Add8) {
   2005          if (locked) {
   2006             /* cas-style store */
   2007             helper_ADC( size, dst1, dst0, src,
   2008                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2009          } else {
   2010             /* normal store */
   2011             helper_ADC( size, dst1, dst0, src,
   2012                         /*store*/addr, IRTemp_INVALID, 0 );
   2013          }
   2014       } else
   2015       if (addSubCarry && op8 == Iop_Sub8) {
   2016          if (locked) {
   2017             /* cas-style store */
   2018             helper_SBB( size, dst1, dst0, src,
   2019                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2020          } else {
   2021             /* normal store */
   2022             helper_SBB( size, dst1, dst0, src,
   2023                         /*store*/addr, IRTemp_INVALID, 0 );
   2024          }
   2025       } else {
   2026          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
                 /* The write to memory (if any) is emitted before the
                    flag-setting statements. */
   2027          if (keep) {
   2028             if (locked) {
   2029                if (0) vex_printf("locked case\n" );
   2030                casLE( mkexpr(addr),
   2031                       mkexpr(dst0)/*expval*/,
   2032                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
   2033             } else {
   2034                if (0) vex_printf("nonlocked case\n");
   2035                storeLE(mkexpr(addr), mkexpr(dst1));
   2036             }
   2037          }
   2038          if (isAddSub(op8))
   2039             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2040          else
   2041             setFlags_DEP1(op8, dst1, ty);
   2042       }
   2043 
   2044       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   2045                           nameIReg(size,gregOfRM(rm)), dis_buf);
              /* len is the amode length reported by disAMode. */
   2046       return len+delta0;
   2047    }
   2048 }
   2049 
   2050 
   2051 /* Handle move instructions of the form
   2052       mov E, G  meaning
   2053       mov reg-or-mem, reg
   2054    Is passed a ptr to the modRM byte, and the data size.  Returns
   2055    the address advanced completely over this instruction.
   2056 
   2057    E(src) is reg-or-mem
   2058    G(dst) is reg.
   2059 
   2060    If E is reg, -->    GET %E,  tmpv
   2061                        PUT tmpv, %G
   2062 
   2063    If E is mem  -->    (getAddr E) -> tmpa
   2064                        LD (tmpa), tmpb
   2065                        PUT tmpb, %G

           Arguments:
              sorb    segment override, handed on to disAMode
              size    operand size, converted to an IRType by szToITy
              delta0  offset of the modRM byte

           Returns delta0+1 when E is a register, or delta0+len (len as
           reported by disAMode) when E is memory.  No flag-setting calls
           are made.
   2066 */
   2067 static
   2068 UInt dis_mov_E_G ( UChar       sorb,
   2069                    Int         size,
   2070                    Int         delta0 )
   2071 {
   2072    Int len;
   2073    UChar rm = getIByte(delta0);
   2074    HChar dis_buf[50];
   2075 
   2076    if (epartIsReg(rm)) {
              /* Register to register: copy E straight into G. */
   2077       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
   2078       DIP("mov%c %s,%s\n", nameISize(size),
   2079                            nameIReg(size,eregOfRM(rm)),
   2080                            nameIReg(size,gregOfRM(rm)));
   2081       return 1+delta0;
   2082    }
   2083 
   2084    /* E refers to memory */
   2085    {
              /* Compute the effective address, then load from it into G. */
   2086       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   2087       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
   2088       DIP("mov%c %s,%s\n", nameISize(size),
   2089                            dis_buf,nameIReg(size,gregOfRM(rm)));
   2090       return delta0+len;
   2091    }
   2092 }
   2093 
   2094 
   2095 /* Handle move instructions of the form
   2096       mov G, E  meaning
   2097       mov reg, reg-or-mem
   2098    Is passed a ptr to the modRM byte, and the data size.  Returns
   2099    the address advanced completely over this instruction.
   2100 
   2101    G(src) is reg.
   2102    E(dst) is reg-or-mem
   2103 
   2104    If E is reg, -->    GET %G,  tmp
   2105                        PUT tmp, %E
   2106 
   2107    If E is mem, -->    (getAddr E) -> tmpa
   2108                        GET %G, tmpv
   2109                        ST tmpv, (tmpa)

           Arguments:
              sorb    segment override, handed on to disAMode
              size    operand size, used to select the register width
              delta0  offset of the modRM byte

           Returns delta0+1 when E is a register, or delta0+len (len as
           reported by disAMode) when E is memory.  No flag-setting calls
           are made.
   2110 */
   2111 static
   2112 UInt dis_mov_G_E ( UChar       sorb,
   2113                    Int         size,
   2114                    Int         delta0 )
   2115 {
   2116    Int len;
   2117    UChar rm = getIByte(delta0);
   2118    HChar dis_buf[50];
   2119 
   2120    if (epartIsReg(rm)) {
              /* Register to register: copy G straight into E. */
   2121       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
   2122       DIP("mov%c %s,%s\n", nameISize(size),
   2123                            nameIReg(size,gregOfRM(rm)),
   2124                            nameIReg(size,eregOfRM(rm)));
   2125       return 1+delta0;
   2126    }
   2127 
   2128    /* E refers to memory */
   2129    {
              /* Compute the effective address, then store G to it. */
   2130       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
   2131       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
   2132       DIP("mov%c %s,%s\n", nameISize(size),
   2133                            nameIReg(size,gregOfRM(rm)), dis_buf);
   2134       return len+delta0;
   2135    }
   2136 }
   2137 
   2138 
   2139 /* op $immediate, AL/AX/EAX.

           Arguments:
              size      operand size; also the number of immediate bytes
                        read (getUDisp(size,delta)) and skipped on return
              carrying  True for the with-carry / with-borrow variants;
                        only accepted together with Iop_Add8 or Iop_Sub8
              op8       the 8-bit variant of the operation
              keep      if True the result is written back to the
                        accumulator; if False it is discarded
              delta     offset of the immediate
              t_x86opc  mnemonic text, used only in the DIP trace

           Returns delta+size.  Any (op8, carrying) combination not matched
           by the tests below ends in vpanic.  */
   2140 static
   2141 UInt dis_op_imm_A ( Int    size,
   2142                     Bool   carrying,
   2143                     IROp   op8,
   2144                     Bool   keep,
   2145                     Int    delta,
   2146                     const HChar* t_x86opc )
   2147 {
   2148    IRType ty   = szToITy(size);
   2149    IRTemp dst0 = newTemp(ty);      /* accumulator before the operation */
   2150    IRTemp src  = newTemp(ty);      /* the immediate */
   2151    IRTemp dst1 = newTemp(ty);      /* result of the operation */
   2152    UInt lit    = getUDisp(size,delta);
   2153    assign(dst0, getIReg(size,R_EAX));
   2154    assign(src,  mkU(ty,lit));
   2155 
           /* Plain add/sub: flags are set from the two operands. */
   2156    if (isAddSub(op8) && !carrying) {
   2157       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2158       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2159    }
   2160    else
           /* Logical ops: flags are set from the result. */
   2161    if (isLogic(op8)) {
   2162       vassert(!carrying);
   2163       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2164       setFlags_DEP1(op8, dst1, ty);
   2165    }
   2166    else
           /* ADC / SBB: handed to the helpers, with no memory store. */
   2167    if (op8 == Iop_Add8 && carrying) {
   2168       helper_ADC( size, dst1, dst0, src,
   2169                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2170    }
   2171    else
   2172    if (op8 == Iop_Sub8 && carrying) {
   2173       helper_SBB( size, dst1, dst0, src,
   2174                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2175    }
   2176    else
   2177       vpanic("dis_op_imm_A(x86,guest)");
   2178 
   2179    if (keep)
   2180       putIReg(size, R_EAX, mkexpr(dst1));
   2181 
   2182    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
   2183                            lit, nameIReg(size,R_EAX));
   2184    return delta+size;
   2185 }
   2186 
   2187 
   2188 /* Sign- and Zero-extending moves.

           Moves E (register or memory, szs wide) into register G (szd
           wide), widening with the op chosen by mkWidenOp.

           Arguments:
              sorb         segment override, handed on to disAMode
              delta        offset of the modRM byte
              szs          source (E) operand size
              szd          destination (G) operand size
              sign_extend  True to sign-extend, False to zero-extend

           Returns delta+1 when E is a register, or delta+len (len as
           reported by disAMode) when E is memory.  */
   2189 static
   2190 UInt dis_movx_E_G ( UChar      sorb,
   2191                     Int delta, Int szs, Int szd, Bool sign_extend )
   2192 {
   2193    UChar rm = getIByte(delta);
   2194    if (epartIsReg(rm)) {
   2195       if (szd == szs) {
   2196          // mutant case.  See #250799
                 /* Equal sizes: nothing to widen, so this is emitted as a
                    plain register move. */
   2197          putIReg(szd, gregOfRM(rm),
   2198                            getIReg(szs,eregOfRM(rm)));
   2199       } else {
   2200          // normal case
   2201          putIReg(szd, gregOfRM(rm),
   2202                       unop(mkWidenOp(szs,szd,sign_extend),
   2203                            getIReg(szs,eregOfRM(rm))));
   2204       }
   2205       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2206                                nameISize(szs), nameISize(szd),
   2207                                nameIReg(szs,eregOfRM(rm)),
   2208                                nameIReg(szd,gregOfRM(rm)));
   2209       return 1+delta;
   2210    }
   2211 
   2212    /* E refers to memory */
   2213    {
   2214       Int    len;
   2215       HChar  dis_buf[50];
   2216       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
   2217       if (szd == szs) {
   2218          // mutant case.  See #250799
                 /* Equal sizes: a plain load into G, with no widening. */
   2219          putIReg(szd, gregOfRM(rm),
   2220                            loadLE(szToITy(szs),mkexpr(addr)));
   2221       } else {
   2222          // normal case
   2223          putIReg(szd, gregOfRM(rm),
   2224                       unop(mkWidenOp(szs,szd,sign_extend),
   2225                            loadLE(szToITy(szs),mkexpr(addr))));
   2226       }
   2227       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2228                                nameISize(szs), nameISize(szd),
   2229                                dis_buf, nameIReg(szd,gregOfRM(rm)));
   2230       return len+delta;
   2231    }
   2232 }
   2233 
   2234 
   2235 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   2236    16 / 8 bit quantity in the given IRTemp.

           Arguments:
              sz             operand size: 4, 2 or 1; anything else panics
              t              temp holding the divisor, of the width that
                             goes with sz
              signed_divide  selects the signed or unsigned DivMod op, and
                             likewise sign- or zero-extension when the
                             narrower operands are widened

           All three sizes use the same 64-by-32 DivMod op.  The low half
           of its 64-bit result is written to EAX / AX / AL and the high
           half to EDX / DX / AH, which on x86 are the registers that
           receive the quotient and the remainder respectively.

           NOTE(review): no divide-by-zero or quotient-overflow handling
           is visible in this function -- confirm where that is dealt with.
        */
   2237 static
   2238 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   2239 {
   2240    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   2241    IRTemp src64 = newTemp(Ity_I64);     /* dividend, as 64 bits */
   2242    IRTemp dst64 = newTemp(Ity_I64);     /* both result halves */
   2243    switch (sz) {
   2244       case 4:
                 /* Dividend is EDX:EAX; the divisor is used as it is. */
   2245          assign( src64, binop(Iop_32HLto64,
   2246                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
   2247          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
   2248          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
   2249          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
   2250          break;
   2251       case 2: {
                 /* Dividend is DX:AX, widened 32 -> 64; the divisor is
                    widened 16 -> 32.  Both result halves are narrowed
                    back to 16 bits. */
   2252          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2253          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2254          assign( src64, unop(widen3264,
   2255                              binop(Iop_16HLto32,
   2256                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
   2257          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   2258          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   2259          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   2260          break;
   2261       }
   2262       case 1: {
                 /* Dividend is AX, widened 16 -> 32 -> 64; the divisor is
                    widened 8 -> 16 -> 32.  Both result halves are narrowed
                    back to 8 bits and go to AL and AH. */
   2263          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2264          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2265          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   2266          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
   2267          assign( dst64,
   2268                  binop(op, mkexpr(src64),
   2269                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   2270          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
   2271                            unop(Iop_64to32,mkexpr(dst64)))) );
   2272          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
   2273                            unop(Iop_64HIto32,mkexpr(dst64)))) );
   2274          break;
   2275       }
   2276       default: vpanic("codegen_div(x86)");
   2277    }
   2278 }
   2279 
   2280 
   2281 static
   2282 UInt dis_Grp1 ( UChar sorb, Bool locked,
   2283                 Int delta, UChar modrm,
   2284                 Int am_sz, Int d_sz, Int sz, UInt d32 )
   2285 {
           /* Group 1 extended opcodes: op $immediate, E, where the reg
              field of modrm picks the operation:
                 0 ADD   1 OR   2 ADC   3 SBB   4 AND   5 SUB   6 XOR
                 7 a subtract whose result is never written back (the
                   '< 7' tests below), i.e. a compare.

              Arguments:
                 sorb    segment override, handed on to disAMode
                 locked  only consulted when E is memory: if True the
                         result is written with a compare-and-swap style
                         store whose expected value is the value just
                         loaded.  Presumably set for LOCK-prefixed
                         instructions -- confirm at the call sites.
                 delta   offset of the modRM byte
                 modrm   the modRM byte itself
                 am_sz   amode length supplied by the caller; asserted to
                         be 1 in the register case and not used in the
                         memory case, where disAMode's len is used
                 d_sz    number of immediate bytes to step over
                 sz      operand size, converted to an IRType by szToITy
                 d32     the immediate; it is masked to the operand size
                         before use, but printed unmasked by DIP

              Returns delta advanced by am_sz+d_sz (register case) or by
              len+d_sz (memory case).  */
   2286    Int     len;
   2287    HChar   dis_buf[50];
   2288    IRType  ty   = szToITy(sz);
   2289    IRTemp  dst1 = newTemp(ty);     /* result of the operation */
   2290    IRTemp  src  = newTemp(ty);     /* the masked immediate */
   2291    IRTemp  dst0 = newTemp(ty);     /* value of E before the operation */
   2292    IRTemp  addr = IRTemp_INVALID;
   2293    IROp    op8  = Iop_INVALID;
   2294    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
   2295 
           /* ADC and SBB leave op8 as Iop_INVALID; they are dispatched on
              gregOfRM(modrm) further down and go through helper_ADC /
              helper_SBB instead of a binop. */
   2296    switch (gregOfRM(modrm)) {
   2297       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   2298       case 2: break;  // ADC
   2299       case 3: break;  // SBB
   2300       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   2301       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   2302       /*NOTREACHED*/
   2303       default: vpanic("dis_Grp1: unhandled case");
   2304    }
   2305 
   2306    if (epartIsReg(modrm)) {
   2307       vassert(am_sz == 1);
   2308 
   2309       assign(dst0, getIReg(sz,eregOfRM(modrm)));
   2310       assign(src,  mkU(ty,d32 & mask));
   2311 
   2312       if (gregOfRM(modrm) == 2 /* ADC */) {
   2313          helper_ADC( sz, dst1, dst0, src,
   2314                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2315       } else
   2316       if (gregOfRM(modrm) == 3 /* SBB */) {
   2317          helper_SBB( sz, dst1, dst0, src,
   2318                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2319       } else {
   2320          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2321          if (isAddSub(op8))
   2322             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2323          else
   2324             setFlags_DEP1(op8, dst1, ty);
   2325       }
   2326 
              /* Sub-opcode 7 only sets flags; every other one writes the
                 result back to the register. */
   2327       if (gregOfRM(modrm) < 7)
   2328          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2329 
   2330       delta += (am_sz + d_sz);
   2331       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
   2332                               nameIReg(sz,eregOfRM(modrm)));
   2333    } else {
   2334       addr = disAMode ( &len, sorb, delta, dis_buf);
   2335 
   2336       assign(dst0, loadLE(ty,mkexpr(addr)));
   2337       assign(src, mkU(ty,d32 & mask));
   2338 
              /* For ADC/SBB the store to memory is delegated to the helper:
                 it is given the address and, when locked, the expected old
                 value (dst0) plus guest_EIP_curr_instr. */
   2339       if (gregOfRM(modrm) == 2 /* ADC */) {
   2340          if (locked) {
   2341             /* cas-style store */
   2342             helper_ADC( sz, dst1, dst0, src,
   2343                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2344          } else {
   2345             /* normal store */
   2346             helper_ADC( sz, dst1, dst0, src,
   2347                         /*store*/addr, IRTemp_INVALID, 0 );
   2348          }
   2349       } else
   2350       if (gregOfRM(modrm) == 3 /* SBB */) {
   2351          if (locked) {
   2352             /* cas-style store */
   2353             helper_SBB( sz, dst1, dst0, src,
   2354                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2355          } else {
   2356             /* normal store */
   2357             helper_SBB( sz, dst1, dst0, src,
   2358                         /*store*/addr, IRTemp_INVALID, 0 );
   2359          }
   2360       } else {
   2361          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
                 /* Sub-opcode 7 writes nothing to memory.  For the others
                    the write is emitted before the flag-setting
                    statements. */
   2362          if (gregOfRM(modrm) < 7) {
   2363             if (locked) {
   2364                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   2365                                     mkexpr(dst1)/*newVal*/,
   2366                                     guest_EIP_curr_instr );
   2367             } else {
   2368                storeLE(mkexpr(addr), mkexpr(dst1));
   2369             }
   2370          }
   2371          if (isAddSub(op8))
   2372             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2373          else
   2374             setFlags_DEP1(op8, dst1, ty);
   2375       }
   2376 
              /* Step over the amode (length from disAMode) and the
                 immediate. */
   2377       delta += (len+d_sz);
   2378       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
   2379                               d32, dis_buf);
   2380    }
   2381    return delta;
   2382 }
   2383 
   2384 
   2385 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   2386    expression. */
   2387 
   2388 static
   2389 UInt dis_Grp2 ( UChar sorb,
   2390                 Int delta, UChar modrm,
   2391                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   2392                 const HChar* shift_expr_txt, Bool* decode_OK )
   2393 {
   2394    /* delta on entry points at the modrm byte. */
   2395    HChar  dis_buf[50];
   2396    Int    len;
   2397    Bool   isShift, isRotate, isRotateC;
   2398    IRType ty    = szToITy(sz);
   2399    IRTemp dst0  = newTemp(ty);
   2400    IRTemp dst1  = newTemp(ty);
   2401    IRTemp addr  = IRTemp_INVALID;
   2402 
   2403    *decode_OK = True;
   2404 
   2405    vassert(sz == 1 || sz == 2 || sz == 4);
   2406 
   2407    /* Put value to shift/rotate in dst0. */
   2408    if (epartIsReg(modrm)) {
   2409       assign(dst0, getIReg(sz, eregOfRM(modrm)));
   2410       delta += (am_sz + d_sz);
   2411    } else {
   2412       addr = disAMode ( &len, sorb, delta, dis_buf);
   2413       assign(dst0, loadLE(ty,mkexpr(addr)));
   2414       delta += len + d_sz;
   2415    }
   2416 
   2417    isShift = False;
   2418    switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
   2419 
   2420    isRotate = False;
   2421    switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
   2422 
   2423    isRotateC = False;
   2424    switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
   2425 
   2426    if (!isShift && !isRotate && !isRotateC) {
   2427       /*NOTREACHED*/
   2428       vpanic("dis_Grp2(Reg): unhandled case(x86)");
   2429    }
   2430 
   2431    if (isRotateC) {
   2432       /* call a helper; these insns are so ridiculous they do not
   2433          deserve better */
   2434       Bool     left = toBool(gregOfRM(modrm) == 2);
   2435       IRTemp   r64  = newTemp(Ity_I64);
   2436       IRExpr** args
   2437          = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
   2438                           widenUto32(shift_expr),   /* rotate amount */
   2439                           widenUto32(mk_x86g_calculate_eflags_all()),
   2440                           mkU32(sz) );
   2441       assign( r64, mkIRExprCCall(
   2442                       Ity_I64,
   2443                       0/*regparm*/,
   2444                       left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
   2445                       left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
   2446                       args
   2447                    )
   2448             );
   2449       /* new eflags in hi half r64; new value in lo half r64 */
   2450       assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
   2451       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2452       stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
   2453       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2454       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2455          elimination of previous stores to this field work better. */
   2456       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2457    }
   2458 
   2459    if (isShift) {
   2460 
   2461       IRTemp pre32     = newTemp(Ity_I32);
   2462       IRTemp res32     = newTemp(Ity_I32);
   2463       IRTemp res32ss   = newTemp(Ity_I32);
   2464       IRTemp shift_amt = newTemp(Ity_I8);
   2465       IROp   op32;
   2466 
   2467       switch (gregOfRM(modrm)) {
   2468          case 4: op32 = Iop_Shl32; break;
   2469          case 5: op32 = Iop_Shr32; break;
   2470          case 6: op32 = Iop_Shl32; break;
   2471          case 7: op32 = Iop_Sar32; break;
   2472          /*NOTREACHED*/
   2473          default: vpanic("dis_Grp2:shift"); break;
   2474       }
   2475 
   2476       /* Widen the value to be shifted to 32 bits, do the shift, and
   2477          narrow back down.  This seems surprisingly long-winded, but
   2478          unfortunately the Intel semantics requires that 8/16-bit
   2479          shifts give defined results for shift values all the way up
   2480          to 31, and this seems the simplest way to do it.  It has the
   2481          advantage that the only IR level shifts generated are of 32
   2482          bit values, and the shift amount is guaranteed to be in the
   2483          range 0 .. 31, thereby observing the IR semantics requiring
   2484          all shift values to be in the range 0 .. 2^word_size-1. */
   2485 
   2486       /* shift_amt = shift_expr & 31, regardless of operation size */
   2487       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
   2488 
   2489       /* suitably widen the value to be shifted to 32 bits. */
   2490       assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
   2491                                      : widenUto32(mkexpr(dst0)) );
   2492 
   2493       /* res32 = pre32 `shift` shift_amt */
   2494       assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
   2495 
   2496       /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
   2497       assign( res32ss,
   2498               binop(op32,
   2499                     mkexpr(pre32),
   2500                     binop(Iop_And8,
   2501                           binop(Iop_Sub8,
   2502                                 mkexpr(shift_amt), mkU8(1)),
   2503                           mkU8(31))) );
   2504 
   2505       /* Build the flags thunk. */
   2506       setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
   2507 
   2508       /* Narrow the result back down. */
   2509       assign( dst1, narrowTo(ty, mkexpr(res32)) );
   2510 
   2511    } /* if (isShift) */
   2512 
   2513    else
   2514    if (isRotate) {
   2515       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   2516       Bool   left      = toBool(gregOfRM(modrm) == 0);
   2517       IRTemp rot_amt   = newTemp(Ity_I8);
   2518       IRTemp rot_amt32 = newTemp(Ity_I8);
   2519       IRTemp oldFlags  = newTemp(Ity_I32);
   2520 
   2521       /* rot_amt = shift_expr & mask */
   2522       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   2523          expressions never shift beyond the word size and thus remain
   2524          well defined. */
   2525       assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
   2526 
   2527       if (ty == Ity_I32)
   2528          assign(rot_amt, mkexpr(rot_amt32));
   2529       else
   2530          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
   2531 
   2532       if (left) {
   2533 
   2534          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   2535          assign(dst1,
   2536             binop( mkSizedOp(ty,Iop_Or8),
   2537                    binop( mkSizedOp(ty,Iop_Shl8),
   2538                           mkexpr(dst0),
   2539                           mkexpr(rot_amt)
   2540                    ),
   2541                    binop( mkSizedOp(ty,Iop_Shr8),
   2542                           mkexpr(dst0),
   2543                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2544                    )
   2545             )
   2546          );
   2547          ccOp += X86G_CC_OP_ROLB;
   2548 
   2549       } else { /* right */
   2550 
   2551          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   2552          assign(dst1,
   2553             binop( mkSizedOp(ty,Iop_Or8),
   2554                    binop( mkSizedOp(ty,Iop_Shr8),
   2555                           mkexpr(dst0),
   2556                           mkexpr(rot_amt)
   2557                    ),
   2558                    binop( mkSizedOp(ty,Iop_Shl8),
   2559                           mkexpr(dst0),
   2560                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2561                    )
   2562             )
   2563          );
   2564          ccOp += X86G_CC_OP_RORB;
   2565 
   2566       }
   2567 
   2568       /* dst1 now holds the rotated value.  Build flag thunk.  We
   2569          need the resulting value for this, and the previous flags.
   2570          Except don't set it if the rotate count is zero. */
   2571 
   2572       assign(oldFlags, mk_x86g_calculate_eflags_all());
   2573 
   2574       /* rot_amt32 :: Ity_I8.  We need to convert it to I1. */
   2575       IRTemp rot_amt32b = newTemp(Ity_I1);
   2576       assign(rot_amt32b, binop(Iop_CmpNE8, mkexpr(rot_amt32), mkU8(0)) );
   2577 
   2578       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   2579       stmt( IRStmt_Put( OFFB_CC_OP,
   2580                         IRExpr_ITE( mkexpr(rot_amt32b),
   2581                                     mkU32(ccOp),
   2582                                     IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
   2583       stmt( IRStmt_Put( OFFB_CC_DEP1,
   2584                         IRExpr_ITE( mkexpr(rot_amt32b),
   2585                                     widenUto32(mkexpr(dst1)),
   2586                                     IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
   2587       stmt( IRStmt_Put( OFFB_CC_DEP2,
   2588                         IRExpr_ITE( mkexpr(rot_amt32b),
   2589                                     mkU32(0),
   2590                                     IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   2591       stmt( IRStmt_Put( OFFB_CC_NDEP,
   2592                         IRExpr_ITE( mkexpr(rot_amt32b),
   2593                                     mkexpr(oldFlags),
   2594                                     IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   2595    } /* if (isRotate) */
   2596 
   2597    /* Save result, and finish up. */
   2598    if (epartIsReg(modrm)) {
   2599       putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2600       if (vex_traceflags & VEX_TRACE_FE) {
   2601          vex_printf("%s%c ",
   2602                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2603          if (shift_expr_txt)
   2604             vex_printf("%s", shift_expr_txt);
   2605          else
   2606             ppIRExpr(shift_expr);
   2607          vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
   2608       }
   2609    } else {
   2610       storeLE(mkexpr(addr), mkexpr(dst1));
   2611       if (vex_traceflags & VEX_TRACE_FE) {
   2612          vex_printf("%s%c ",
   2613                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2614          if (shift_expr_txt)
   2615             vex_printf("%s", shift_expr_txt);
   2616          else
   2617             ppIRExpr(shift_expr);
   2618          vex_printf(", %s\n", dis_buf);
   2619       }
   2620    }
   2621    return delta;
   2622 }
   2623 
   2624 
   2625 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   2626 static
   2627 UInt dis_Grp8_Imm ( UChar sorb,
   2628                     Bool locked,
   2629                     Int delta, UChar modrm,
   2630                     Int am_sz, Int sz, UInt src_val,
   2631                     Bool* decode_OK )
   2632 {
   2633    /* src_val denotes a d8.
   2634       And delta on entry points at the modrm byte. */
   2635 
   2636    IRType ty     = szToITy(sz);
   2637    IRTemp t2     = newTemp(Ity_I32);
   2638    IRTemp t2m    = newTemp(Ity_I32);
   2639    IRTemp t_addr = IRTemp_INVALID;
   2640    HChar  dis_buf[50];
   2641    UInt   mask;
   2642 
   2643    /* we're optimists :-) */
   2644    *decode_OK = True;
   2645 
   2646    /* Limit src_val -- the bit offset -- to something within a word.
   2647       The Intel docs say that literal offsets larger than a word are
   2648       masked in this way. */
   2649    switch (sz) {
   2650       case 2:  src_val &= 15; break;
   2651       case 4:  src_val &= 31; break;
   2652       default: *decode_OK = False; return delta;
   2653    }
   2654 
   2655    /* Invent a mask suitable for the operation. */
   2656    switch (gregOfRM(modrm)) {
   2657       case 4: /* BT */  mask = 0;               break;
   2658       case 5: /* BTS */ mask = 1 << src_val;    break;
   2659       case 6: /* BTR */ mask = ~(1 << src_val); break;
   2660       case 7: /* BTC */ mask = 1 << src_val;    break;
   2661          /* If this needs to be extended, probably simplest to make a
   2662             new function to handle the other cases (0 .. 3).  The
   2663             Intel docs do however not indicate any use for 0 .. 3, so
   2664             we don't expect this to happen. */
   2665       default: *decode_OK = False; return delta;
   2666    }
   2667 
   2668    /* Fetch the value to be tested and modified into t2, which is
   2669       32-bits wide regardless of sz. */
   2670    if (epartIsReg(modrm)) {
   2671       vassert(am_sz == 1);
   2672       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
   2673       delta += (am_sz + 1);
   2674       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2675                               src_val, nameIReg(sz,eregOfRM(modrm)));
   2676    } else {
   2677       Int len;
   2678       t_addr = disAMode ( &len, sorb, delta, dis_buf);
   2679       delta  += (len+1);
   2680       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
   2681       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2682                               src_val, dis_buf);
   2683    }
   2684 
   2685    /* Compute the new value into t2m, if non-BT. */
   2686    switch (gregOfRM(modrm)) {
   2687       case 4: /* BT */
   2688          break;
   2689       case 5: /* BTS */
   2690          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
   2691          break;
   2692       case 6: /* BTR */
   2693          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
   2694          break;
   2695       case 7: /* BTC */
   2696          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
   2697          break;
   2698       default:
   2699          /*NOTREACHED*/ /*the previous switch guards this*/
   2700          vassert(0);
   2701    }
   2702 
   2703    /* Write the result back, if non-BT.  If the CAS fails then we
   2704       side-exit from the trace at this point, and so the flag state is
   2705       not affected.  This is of course as required. */
   2706    if (gregOfRM(modrm) != 4 /* BT */) {
   2707       if (epartIsReg(modrm)) {
   2708          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
   2709       } else {
   2710          if (locked) {
   2711             casLE( mkexpr(t_addr),
   2712                    narrowTo(ty, mkexpr(t2))/*expd*/,
   2713                    narrowTo(ty, mkexpr(t2m))/*new*/,
   2714                    guest_EIP_curr_instr );
   2715          } else {
   2716             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   2717          }
   2718       }
   2719    }
   2720 
   2721    /* Copy relevant bit from t2 into the carry flag. */
   2722    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   2723    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2724    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2725    stmt( IRStmt_Put(
   2726             OFFB_CC_DEP1,
   2727             binop(Iop_And32,
   2728                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
   2729                   mkU32(1))
   2730        ));
   2731    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2732       elimination of previous stores to this field work better. */
   2733    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2734 
   2735    return delta;
   2736 }
   2737 
   2738 
   2739 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   2740    value in EAX/AX/AL by the given IRTemp, and park the result in
   2741    EDX:EAX/DX:AX/AX.
   2742 */
   2743 static void codegen_mulL_A_D ( Int sz, Bool syned,
   2744                                IRTemp tmp, const HChar* tmp_txt )
   2745 {
   2746    IRType ty = szToITy(sz);
   2747    IRTemp t1 = newTemp(ty);
   2748 
   2749    assign( t1, getIReg(sz, R_EAX) );
   2750 
   2751    switch (ty) {
   2752       case Ity_I32: {
   2753          IRTemp res64   = newTemp(Ity_I64);
   2754          IRTemp resHi   = newTemp(Ity_I32);
   2755          IRTemp resLo   = newTemp(Ity_I32);
   2756          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   2757          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2758          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   2759          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2760          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   2761          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   2762          putIReg(4, R_EDX, mkexpr(resHi));
   2763          putIReg(4, R_EAX, mkexpr(resLo));
   2764          break;
   2765       }
   2766       case Ity_I16: {
   2767          IRTemp res32   = newTemp(Ity_I32);
   2768          IRTemp resHi   = newTemp(Ity_I16);
   2769          IRTemp resLo   = newTemp(Ity_I16);
   2770          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   2771          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2772          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   2773          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2774          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   2775          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   2776          putIReg(2, R_EDX, mkexpr(resHi));
   2777          putIReg(2, R_EAX, mkexpr(resLo));
   2778          break;
   2779       }
   2780       case Ity_I8: {
   2781          IRTemp res16   = newTemp(Ity_I16);
   2782          IRTemp resHi   = newTemp(Ity_I8);
   2783          IRTemp resLo   = newTemp(Ity_I8);
   2784          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   2785          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2786          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   2787          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2788          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   2789          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   2790          putIReg(2, R_EAX, mkexpr(res16));
   2791          break;
   2792       }
   2793       default:
   2794          vpanic("codegen_mulL_A_D(x86)");
   2795    }
   2796    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   2797 }
   2798 
   2799 
   2800 /* Group 3 extended opcodes. */
   2801 static
   2802 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
   2803 {
   2804    UInt    d32;
   2805    UChar   modrm;
   2806    HChar   dis_buf[50];
   2807    Int     len;
   2808    IRTemp  addr;
   2809    IRType  ty = szToITy(sz);
   2810    IRTemp  t1 = newTemp(ty);
   2811    IRTemp dst1, src, dst0;
   2812 
   2813    *decode_OK = True; /* may change this later */
   2814 
   2815    modrm = getIByte(delta);
   2816 
   2817    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
   2818       /* LOCK prefix only allowed with not and neg subopcodes */
   2819       *decode_OK = False;
   2820       return delta;
   2821    }
   2822 
   2823    if (epartIsReg(modrm)) {
   2824       switch (gregOfRM(modrm)) {
   2825          case 0: { /* TEST */
   2826             delta++; d32 = getUDisp(sz, delta); delta += sz;
   2827             dst1 = newTemp(ty);
   2828             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2829                                getIReg(sz,eregOfRM(modrm)),
   2830                                mkU(ty,d32)));
   2831             setFlags_DEP1( Iop_And8, dst1, ty );
   2832             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
   2833                                       nameIReg(sz, eregOfRM(modrm)));
   2834             break;
   2835          }
   2836          case 1: /* UNDEFINED */
   2837            /* The Intel docs imply this insn is undefined and binutils
   2838               agrees.  Unfortunately Core 2 will run it (with who
   2839               knows what result?)  sandpile.org reckons it's an alias
   2840               for case 0.  We play safe. */
   2841            *decode_OK = False;
   2842            break;
   2843          case 2: /* NOT */
   2844             delta++;
   2845             putIReg(sz, eregOfRM(modrm),
   2846                         unop(mkSizedOp(ty,Iop_Not8),
   2847                              getIReg(sz, eregOfRM(modrm))));
   2848             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2849             break;
   2850          case 3: /* NEG */
   2851             delta++;
   2852             dst0 = newTemp(ty);
   2853             src  = newTemp(ty);
   2854             dst1 = newTemp(ty);
   2855             assign(dst0, mkU(ty,0));
   2856             assign(src,  getIReg(sz,eregOfRM(modrm)));
   2857             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
   2858             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2859             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2860             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2861             break;
   2862          case 4: /* MUL (unsigned widening) */
   2863             delta++;
   2864             src = newTemp(ty);
   2865             assign(src, getIReg(sz,eregOfRM(modrm)));
   2866             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
   2867             break;
   2868          case 5: /* IMUL (signed widening) */
   2869             delta++;
   2870             src = newTemp(ty);
   2871             assign(src, getIReg(sz,eregOfRM(modrm)));
   2872             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
   2873             break;
   2874          case 6: /* DIV */
   2875             delta++;
   2876             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2877             codegen_div ( sz, t1, False );
   2878             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2879             break;
   2880          case 7: /* IDIV */
   2881             delta++;
   2882             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2883             codegen_div ( sz, t1, True );
   2884             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2885             break;
   2886          default:
   2887             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2888             vpanic("Grp3(x86)");
   2889       }
   2890    } else {
   2891       addr = disAMode ( &len, sorb, delta, dis_buf );
   2892       t1   = newTemp(ty);
   2893       delta += len;
   2894       assign(t1, loadLE(ty,mkexpr(addr)));
   2895       switch (gregOfRM(modrm)) {
   2896          case 0: { /* TEST */
   2897             d32 = getUDisp(sz, delta); delta += sz;
   2898             dst1 = newTemp(ty);
   2899             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2900                                mkexpr(t1), mkU(ty,d32)));
   2901             setFlags_DEP1( Iop_And8, dst1, ty );
   2902             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   2903             break;
   2904          }
   2905          case 1: /* UNDEFINED */
   2906            /* See comment above on R case */
   2907            *decode_OK = False;
   2908            break;
   2909          case 2: /* NOT */
   2910             dst1 = newTemp(ty);
   2911             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   2912             if (locked) {
   2913                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2914                                     guest_EIP_curr_instr );
   2915             } else {
   2916                storeLE( mkexpr(addr), mkexpr(dst1) );
   2917             }
   2918             DIP("not%c %s\n", nameISize(sz), dis_buf);
   2919             break;
   2920          case 3: /* NEG */
   2921             dst0 = newTemp(ty);
   2922             src  = newTemp(ty);
   2923             dst1 = newTemp(ty);
   2924             assign(dst0, mkU(ty,0));
   2925             assign(src,  mkexpr(t1));
   2926             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
   2927                                mkexpr(dst0), mkexpr(src)));
   2928             if (locked) {
   2929                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2930                                     guest_EIP_curr_instr );
   2931             } else {
   2932                storeLE( mkexpr(addr), mkexpr(dst1) );
   2933             }
   2934             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2935             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   2936             break;
   2937          case 4: /* MUL */
   2938             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   2939             break;
   2940          case 5: /* IMUL */
   2941             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   2942             break;
   2943          case 6: /* DIV */
   2944             codegen_div ( sz, t1, False );
   2945             DIP("div%c %s\n", nameISize(sz), dis_buf);
   2946             break;
   2947          case 7: /* IDIV */
   2948             codegen_div ( sz, t1, True );
   2949             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   2950             break;
   2951          default:
   2952             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2953             vpanic("Grp3(x86)");
   2954       }
   2955    }
   2956    return delta;
   2957 }
   2958 
   2959 
   2960 /* Group 4 extended opcodes. */
   2961 static
   2962 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
   2963 {
   2964    Int   alen;
   2965    UChar modrm;
   2966    HChar dis_buf[50];
   2967    IRType ty = Ity_I8;
   2968    IRTemp t1 = newTemp(ty);
   2969    IRTemp t2 = newTemp(ty);
   2970 
   2971    *decode_OK = True;
   2972 
   2973    modrm = getIByte(delta);
   2974 
   2975    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   2976       /* LOCK prefix only allowed with inc and dec subopcodes */
   2977       *decode_OK = False;
   2978       return delta;
   2979    }
   2980 
   2981    if (epartIsReg(modrm)) {
   2982       assign(t1, getIReg(1, eregOfRM(modrm)));
   2983       switch (gregOfRM(modrm)) {
   2984          case 0: /* INC */
   2985             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2986             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2987             setFlags_INC_DEC( True, t2, ty );
   2988             break;
   2989          case 1: /* DEC */
   2990             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2991             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2992             setFlags_INC_DEC( False, t2, ty );
   2993             break;
   2994          default:
   2995             *decode_OK = False;
   2996             return delta;
   2997       }
   2998       delta++;
   2999       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
   3000                       nameIReg(1, eregOfRM(modrm)));
   3001    } else {
   3002       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
   3003       assign( t1, loadLE(ty, mkexpr(addr)) );
   3004       switch (gregOfRM(modrm)) {
   3005          case 0: /* INC */
   3006             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   3007             if (locked) {
   3008                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3009                       guest_EIP_curr_instr );
   3010             } else {
   3011                storeLE( mkexpr(addr), mkexpr(t2) );
   3012             }
   3013             setFlags_INC_DEC( True, t2, ty );
   3014             break;
   3015          case 1: /* DEC */
   3016             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   3017             if (locked) {
   3018                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3019                       guest_EIP_curr_instr );
   3020             } else {
   3021                storeLE( mkexpr(addr), mkexpr(t2) );
   3022             }
   3023             setFlags_INC_DEC( False, t2, ty );
   3024             break;
   3025          default:
   3026             *decode_OK = False;
   3027             return delta;
   3028       }
   3029       delta += alen;
   3030       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   3031    }
   3032    return delta;
   3033 }
   3034 
   3035 
   3036 /* Group 5 extended opcodes. */
   3037 static
   3038 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
   3039                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   3040 {
   3041    Int     len;
   3042    UChar   modrm;
   3043    HChar   dis_buf[50];
   3044    IRTemp  addr = IRTemp_INVALID;
   3045    IRType  ty = szToITy(sz);
   3046    IRTemp  t1 = newTemp(ty);
   3047    IRTemp  t2 = IRTemp_INVALID;
   3048 
   3049    *decode_OK = True;
   3050 
   3051    modrm = getIByte(delta);
   3052 
   3053    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   3054       /* LOCK prefix only allowed with inc and dec subopcodes */
   3055       *decode_OK = False;
   3056       return delta;
   3057    }
   3058 
   3059    if (epartIsReg(modrm)) {
   3060       assign(t1, getIReg(sz,eregOfRM(modrm)));
   3061       switch (gregOfRM(modrm)) {
   3062          case 0: /* INC */
   3063             vassert(sz == 2 || sz == 4);
   3064             t2 = newTemp(ty);
   3065             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3066                              mkexpr(t1), mkU(ty,1)));
   3067             setFlags_INC_DEC( True, t2, ty );
   3068             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3069             break;
   3070          case 1: /* DEC */
   3071             vassert(sz == 2 || sz == 4);
   3072             t2 = newTemp(ty);
   3073             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3074                              mkexpr(t1), mkU(ty,1)));
   3075             setFlags_INC_DEC( False, t2, ty );
   3076             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3077             break;
   3078          case 2: /* call Ev */
   3079             vassert(sz == 4);
   3080             t2 = newTemp(Ity_I32);
   3081             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3082             putIReg(4, R_ESP, mkexpr(t2));
   3083             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
   3084             jmp_treg(dres, Ijk_Call, t1);
   3085             vassert(dres->whatNext == Dis_StopHere);
   3086             break;
   3087          case 4: /* jmp Ev */
   3088             vassert(sz == 4);
   3089             jmp_treg(dres, Ijk_Boring, t1);
   3090             vassert(dres->whatNext == Dis_StopHere);
   3091             break;
   3092          case 6: /* PUSH Ev */
   3093             vassert(sz == 4 || sz == 2);
   3094             t2 = newTemp(Ity_I32);
   3095             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3096             putIReg(4, R_ESP, mkexpr(t2) );
   3097             storeLE( mkexpr(t2), mkexpr(t1) );
   3098             break;
   3099          default:
   3100             *decode_OK = False;
   3101             return delta;
   3102       }
   3103       delta++;
   3104       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3105                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   3106    } else {
   3107       addr = disAMode ( &len, sorb, delta, dis_buf );
   3108       assign(t1, loadLE(ty,mkexpr(addr)));
   3109       switch (gregOfRM(modrm)) {
   3110          case 0: /* INC */
   3111             t2 = newTemp(ty);
   3112             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3113                              mkexpr(t1), mkU(ty,1)));
   3114             if (locked) {
   3115                casLE( mkexpr(addr),
   3116                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3117             } else {
   3118                storeLE(mkexpr(addr),mkexpr(t2));
   3119             }
   3120             setFlags_INC_DEC( True, t2, ty );
   3121             break;
   3122          case 1: /* DEC */
   3123             t2 = newTemp(ty);
   3124             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3125                              mkexpr(t1), mkU(ty,1)));
   3126             if (locked) {
   3127                casLE( mkexpr(addr),
   3128                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3129             } else {
   3130                storeLE(mkexpr(addr),mkexpr(t2));
   3131             }
   3132             setFlags_INC_DEC( False, t2, ty );
   3133             break;
   3134          case 2: /* call Ev */
   3135             vassert(sz == 4);
   3136             t2 = newTemp(Ity_I32);
   3137             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3138             putIReg(4, R_ESP, mkexpr(t2));
   3139             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
   3140             jmp_treg(dres, Ijk_Call, t1);
   3141             vassert(dres->whatNext == Dis_StopHere);
   3142             break;
   3143          case 4: /* JMP Ev */
   3144             vassert(sz == 4);
   3145             jmp_treg(dres, Ijk_Boring, t1);
   3146             vassert(dres->whatNext == Dis_StopHere);
   3147             break;
   3148          case 6: /* PUSH Ev */
   3149             vassert(sz == 4 || sz == 2);
   3150             t2 = newTemp(Ity_I32);
   3151             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3152             putIReg(4, R_ESP, mkexpr(t2) );
   3153             storeLE( mkexpr(t2), mkexpr(t1) );
   3154             break;
   3155          default:
   3156             *decode_OK = False;
   3157             return delta;
   3158       }
   3159       delta += len;
   3160       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3161                        nameISize(sz), dis_buf);
   3162    }
   3163    return delta;
   3164 }
   3165 
   3166 
   3167 /*------------------------------------------------------------*/
   3168 /*--- Disassembling string ops (including REP prefixes)    ---*/
   3169 /*------------------------------------------------------------*/
   3170 
   3171 /* Code shared by all the string ops */
   3172 static
   3173 void dis_string_op_increment(Int sz, Int t_inc)
   3174 {
   3175    if (sz == 4 || sz == 2) {
   3176       assign( t_inc,
   3177               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
   3178                                mkU8(sz/2) ) );
   3179    } else {
   3180       assign( t_inc,
   3181               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
   3182    }
   3183 }
   3184 
   3185 static
   3186 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   3187                     Int sz, const HChar* name, UChar sorb )
   3188 {
   3189    IRTemp t_inc = newTemp(Ity_I32);
   3190    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
   3191    dis_string_op_increment(sz, t_inc);
   3192    dis_OP( sz, t_inc );
   3193    DIP("%s%c\n", name, nameISize(sz));
   3194 }
   3195 
   3196 static
   3197 void dis_MOVS ( Int sz, IRTemp t_inc )
   3198 {
   3199    IRType ty = szToITy(sz);
   3200    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3201    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3202 
   3203    assign( td, getIReg(4, R_EDI) );
   3204    assign( ts, getIReg(4, R_ESI) );
   3205 
   3206    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   3207 
   3208    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3209    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3210 }
   3211 
   3212 static
   3213 void dis_LODS ( Int sz, IRTemp t_inc )
   3214 {
   3215    IRType ty = szToITy(sz);
   3216    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3217 
   3218    assign( ts, getIReg(4, R_ESI) );
   3219 
   3220    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
   3221 
   3222    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3223 }
   3224 
   3225 static
   3226 void dis_STOS ( Int sz, IRTemp t_inc )
   3227 {
   3228    IRType ty = szToITy(sz);
   3229    IRTemp ta = newTemp(ty);        /* EAX */
   3230    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3231 
   3232    assign( ta, getIReg(sz, R_EAX) );
   3233    assign( td, getIReg(4, R_EDI) );
   3234 
   3235    storeLE( mkexpr(td), mkexpr(ta) );
   3236 
   3237    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3238 }
   3239 
   3240 static
   3241 void dis_CMPS ( Int sz, IRTemp t_inc )
   3242 {
   3243    IRType ty  = szToITy(sz);
   3244    IRTemp tdv = newTemp(ty);      /* (EDI) */
   3245    IRTemp tsv = newTemp(ty);      /* (ESI) */
   3246    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
   3247    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
   3248 
   3249    assign( td, getIReg(4, R_EDI) );
   3250    assign( ts, getIReg(4, R_ESI) );
   3251 
   3252    assign( tdv, loadLE(ty,mkexpr(td)) );
   3253    assign( tsv, loadLE(ty,mkexpr(ts)) );
   3254 
   3255    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   3256 
   3257    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3258    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3259 }
   3260 
   3261 static
   3262 void dis_SCAS ( Int sz, IRTemp t_inc )
   3263 {
   3264    IRType ty  = szToITy(sz);
   3265    IRTemp ta  = newTemp(ty);       /*  EAX  */
   3266    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
   3267    IRTemp tdv = newTemp(ty);       /* (EDI) */
   3268 
   3269    assign( ta, getIReg(sz, R_EAX) );
   3270    assign( td, getIReg(4, R_EDI) );
   3271 
   3272    assign( tdv, loadLE(ty,mkexpr(td)) );
   3273    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   3274 
   3275    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3276 }
   3277 
   3278 
   3279 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
   3280    We assume the insn is the last one in the basic block, and so emit a jump
   3281    to the next insn, rather than just falling through. */
   3282 static
   3283 void dis_REP_op ( /*MOD*/DisResult* dres,
   3284                   X86Condcode cond,
   3285                   void (*dis_OP)(Int, IRTemp),
   3286                   Int sz, Addr32 eip, Addr32 eip_next, const HChar* name )
   3287 {
   3288    IRTemp t_inc = newTemp(Ity_I32);
   3289    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
   3290 
   3291    assign( tc, getIReg(4,R_ECX) );
   3292 
   3293    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
   3294                       Ijk_Boring,
   3295                       IRConst_U32(eip_next), OFFB_EIP ) );
   3296 
   3297    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   3298 
   3299    dis_string_op_increment(sz, t_inc);
   3300    dis_OP (sz, t_inc);
   3301 
   3302    if (cond == X86CondAlways) {
   3303       jmp_lit(dres, Ijk_Boring, eip);
   3304       vassert(dres->whatNext == Dis_StopHere);
   3305    } else {
   3306       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
   3307                          Ijk_Boring,
   3308                          IRConst_U32(eip), OFFB_EIP ) );
   3309       jmp_lit(dres, Ijk_Boring, eip_next);
   3310       vassert(dres->whatNext == Dis_StopHere);
   3311    }
   3312    DIP("%s%c\n", name, nameISize(sz));
   3313 }
   3314 
   3315 
   3316 /*------------------------------------------------------------*/
   3317 /*--- Arithmetic, etc.                                     ---*/
   3318 /*------------------------------------------------------------*/
   3319 
   3320 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   3321 static
   3322 UInt dis_mul_E_G ( UChar       sorb,
   3323                    Int         size,
   3324                    Int         delta0 )
   3325 {
   3326    Int    alen;
   3327    HChar  dis_buf[50];
   3328    UChar  rm = getIByte(delta0);
   3329    IRType ty = szToITy(size);
   3330    IRTemp te = newTemp(ty);
   3331    IRTemp tg = newTemp(ty);
   3332    IRTemp resLo = newTemp(ty);
   3333 
   3334    assign( tg, getIReg(size, gregOfRM(rm)) );
   3335    if (epartIsReg(rm)) {
   3336       assign( te, getIReg(size, eregOfRM(rm)) );
   3337    } else {
   3338       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
   3339       assign( te, loadLE(ty,mkexpr(addr)) );
   3340    }
   3341 
   3342    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
   3343 
   3344    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   3345 
   3346    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
   3347 
   3348    if (epartIsReg(rm)) {
   3349       DIP("imul%c %s, %s\n", nameISize(size),
   3350                              nameIReg(size,eregOfRM(rm)),
   3351                              nameIReg(size,gregOfRM(rm)));
   3352       return 1+delta0;
   3353    } else {
   3354       DIP("imul%c %s, %s\n", nameISize(size),
   3355                              dis_buf, nameIReg(size,gregOfRM(rm)));
   3356       return alen+delta0;
   3357    }
   3358 }
   3359 
   3360 
   3361 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
   3362 static
   3363 UInt dis_imul_I_E_G ( UChar       sorb,
   3364                       Int         size,
   3365                       Int         delta,
   3366                       Int         litsize )
   3367 {
   3368    Int    d32, alen;
   3369    HChar  dis_buf[50];
   3370    UChar  rm = getIByte(delta);
   3371    IRType ty = szToITy(size);
   3372    IRTemp te = newTemp(ty);
   3373    IRTemp tl = newTemp(ty);
   3374    IRTemp resLo = newTemp(ty);
   3375 
   3376    vassert(size == 1 || size == 2 || size == 4);
   3377 
   3378    if (epartIsReg(rm)) {
   3379       assign(te, getIReg(size, eregOfRM(rm)));
   3380       delta++;
   3381    } else {
   3382       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
   3383       assign(te, loadLE(ty, mkexpr(addr)));
   3384       delta += alen;
   3385    }
   3386    d32 = getSDisp(litsize,delta);
   3387    delta += litsize;
   3388 
   3389    if (size == 1) d32 &= 0xFF;
   3390    if (size == 2) d32 &= 0xFFFF;
   3391 
   3392    assign(tl, mkU(ty,d32));
   3393 
   3394    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   3395 
   3396    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
   3397 
   3398    putIReg(size, gregOfRM(rm), mkexpr(resLo));
   3399 
   3400    DIP("imul %d, %s, %s\n", d32,
   3401        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
   3402        nameIReg(size,gregOfRM(rm)) );
   3403    return delta;
   3404 }
   3405 
   3406 
   3407 /* Generate an IR sequence to do a count-leading-zeroes operation on
   3408    the supplied IRTemp, and return a new IRTemp holding the result.
   3409    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
   3410    argument is zero, return the number of bits in the word (the
   3411    natural semantics). */
   3412 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   3413 {
   3414    vassert(ty == Ity_I32 || ty == Ity_I16);
   3415 
   3416    IRTemp src32 = newTemp(Ity_I32);
   3417    assign(src32, widenUto32( mkexpr(src) ));
   3418 
   3419    IRTemp src32x = newTemp(Ity_I32);
   3420    assign(src32x,
   3421           binop(Iop_Shl32, mkexpr(src32),
   3422                            mkU8(32 - 8 * sizeofIRType(ty))));
   3423 
   3424    // Clz32 has undefined semantics when its input is zero, so
   3425    // special-case around that.
   3426    IRTemp res32 = newTemp(Ity_I32);
   3427    assign(res32,
   3428           IRExpr_ITE(
   3429              binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0)),
   3430              mkU32(8 * sizeofIRType(ty)),
   3431              unop(Iop_Clz32, mkexpr(src32x))
   3432    ));
   3433 
   3434    IRTemp res = newTemp(ty);
   3435    assign(res, narrowTo(ty, mkexpr(res32)));
   3436    return res;
   3437 }
   3438 
   3439 
   3440 /*------------------------------------------------------------*/
   3441 /*---                                                      ---*/
   3442 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   3443 /*---                                                      ---*/
   3444 /*------------------------------------------------------------*/
   3445 
   3446 /* --- Helper functions for dealing with the register stack. --- */
   3447 
   3448 /* --- Set the emulation-warning pseudo-register. --- */
   3449 
   3450 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   3451 {
   3452    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3453    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
   3454 }
   3455 
   3456 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   3457 
   3458 static IRExpr* mkQNaN64 ( void )
   3459 {
   3460   /* QNaN is 0 2047 1 0(51times)
   3461      == 0b 11111111111b 1 0(51times)
   3462      == 0x7FF8 0000 0000 0000
   3463    */
   3464    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   3465 }
   3466 
   3467 /* --------- Get/put the top-of-stack pointer. --------- */
   3468 
   3469 static IRExpr* get_ftop ( void )
   3470 {
   3471    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   3472 }
   3473 
   3474 static void put_ftop ( IRExpr* e )
   3475 {
   3476    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3477    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   3478 }
   3479 
   3480 /* --------- Get/put the C3210 bits. --------- */
   3481 
   3482 static IRExpr* get_C3210 ( void )
   3483 {
   3484    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
   3485 }
   3486 
   3487 static void put_C3210 ( IRExpr* e )
   3488 {
   3489    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   3490 }
   3491 
   3492 /* --------- Get/put the FPU rounding mode. --------- */
   3493 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   3494 {
   3495    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
   3496 }
   3497 
   3498 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   3499 {
   3500    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
   3501 }
   3502 
   3503 
   3504 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   3505 /* Produces a value in 0 .. 3, which is encoded as per the type
   3506    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   3507    per IRRoundingMode, we merely need to get it and mask it for
   3508    safety.
   3509 */
   3510 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   3511 {
   3512    return binop( Iop_And32, get_fpround(), mkU32(3) );
   3513 }
   3514 
   3515 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   3516 {
   3517    return mkU32(Irrm_NEAREST);
   3518 }
   3519 
   3520 
   3521 /* --------- Get/set FP register tag bytes. --------- */
   3522 
   3523 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   3524 
   3525 static void put_ST_TAG ( Int i, IRExpr* value )
   3526 {
   3527    IRRegArray* descr;
   3528    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   3529    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3530    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3531 }
   3532 
   3533 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   3534    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   3535 
   3536 static IRExpr* get_ST_TAG ( Int i )
   3537 {
   3538    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3539    return IRExpr_GetI( descr, get_ftop(), i );
   3540 }
   3541 
   3542 
   3543 /* --------- Get/set FP registers. --------- */
   3544 
   3545 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   3546    register's tag to indicate the register is full.  The previous
   3547    state of the register is not checked. */
   3548 
   3549 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   3550 {
   3551    IRRegArray* descr;
   3552    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   3553    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3554    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3555    /* Mark the register as in-use. */
   3556    put_ST_TAG(i, mkU8(1));
   3557 }
   3558 
   3559 /* Given i, and some expression e, emit
   3560       ST(i) = is_full(i) ? NaN : e
   3561    and set the tag accordingly.
   3562 */
   3563 
   3564 static void put_ST ( Int i, IRExpr* value )
   3565 {
   3566    put_ST_UNCHECKED(
   3567       i,
   3568       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   3569                   /* non-0 means full */
   3570                   mkQNaN64(),
   3571                   /* 0 means empty */
   3572                   value
   3573       )
   3574    );
   3575 }
   3576 
   3577 
   3578 /* Given i, generate an expression yielding 'ST(i)'. */
   3579 
   3580 static IRExpr* get_ST_UNCHECKED ( Int i )
   3581 {
   3582    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3583    return IRExpr_GetI( descr, get_ftop(), i );
   3584 }
   3585 
   3586 
   3587 /* Given i, generate an expression yielding
   3588   is_full(i) ? ST(i) : NaN
   3589 */
   3590 
   3591 static IRExpr* get_ST ( Int i )
   3592 {
   3593    return
   3594       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   3595                   /* non-0 means full */
   3596                   get_ST_UNCHECKED(i),
   3597                   /* 0 means empty */
   3598                   mkQNaN64());
   3599 }
   3600 
   3601 
   3602 /* Given i, and some expression e, and a condition cond, generate IR
   3603    which has the same effect as put_ST(i,e) when cond is true and has
   3604    no effect when cond is false.  Given the lack of proper
   3605    if-then-else in the IR, this is pretty tricky.
   3606 */
   3607 
   3608 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   3609 {
   3610    // new_tag = if cond then FULL else old_tag
   3611    // new_val = if cond then (if old_tag==FULL then NaN else val)
   3612    //                   else old_val
   3613 
   3614    IRTemp old_tag = newTemp(Ity_I8);
   3615    assign(old_tag, get_ST_TAG(i));
   3616    IRTemp new_tag = newTemp(Ity_I8);
   3617    assign(new_tag,
   3618           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   3619 
   3620    IRTemp old_val = newTemp(Ity_F64);
   3621    assign(old_val, get_ST_UNCHECKED(i));
   3622    IRTemp new_val = newTemp(Ity_F64);
   3623    assign(new_val,
   3624           IRExpr_ITE(mkexpr(cond),
   3625                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   3626                                 /* non-0 means full */
   3627                                 mkQNaN64(),
   3628                                 /* 0 means empty */
   3629                                 value),
   3630                      mkexpr(old_val)));
   3631 
   3632    put_ST_UNCHECKED(i, mkexpr(new_val));
   3633    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   3634    // now set it to new_tag instead.
   3635    put_ST_TAG(i, mkexpr(new_tag));
   3636 }
   3637 
   3638 /* Adjust FTOP downwards by one register. */
   3639 
   3640 static void fp_push ( void )
   3641 {
   3642    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   3643 }
   3644 
   3645 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   3646    don't change it. */
   3647 
   3648 static void maybe_fp_push ( IRTemp cond )
   3649 {
   3650    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
   3651 }
   3652 
   3653 /* Adjust FTOP upwards by one register, and mark the vacated register
   3654    as empty.  */
   3655 
   3656 static void fp_pop ( void )
   3657 {
   3658    put_ST_TAG(0, mkU8(0));
   3659    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   3660 }
   3661 
   3662 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   3663    e[31:1] == 0.
   3664 */
   3665 static void set_C2 ( IRExpr* e )
   3666 {
   3667    IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2));
   3668    put_C3210( binop(Iop_Or32,
   3669                     cleared,
   3670                     binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) );
   3671 }
   3672 
   3673 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   3674    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   3675    test is simple, but the derivation of it is not so simple.
   3676 
   3677    The exponent field for an IEEE754 double is 11 bits.  That means it
   3678    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   3679    the number is either a NaN or an Infinity and so is not finite.
   3680    Furthermore, a finite value of exactly 2^63 is the smallest value
   3681    that has exponent value 0x43E.  Hence, what we need to do is
   3682    extract the exponent, ignoring the sign bit and mantissa, and check
   3683    it is < 0x43E, or <= 0x43D.
   3684 
   3685    To make this easily applicable to 32- and 64-bit targets, a
   3686    roundabout approach is used.  First the number is converted to I64,
   3687    then the top 32 bits are taken.  Shifting them right by 20 bits
   3688    places the sign bit and exponent in the bottom 12 bits.  Anding
   3689    with 0x7FF gets rid of the sign bit, leaving just the exponent
   3690    available for comparison.
   3691 */
   3692 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
   3693 {
   3694    IRTemp i64 = newTemp(Ity_I64);
   3695    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   3696    IRTemp exponent = newTemp(Ity_I32);
   3697    assign(exponent,
   3698           binop(Iop_And32,
   3699                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
   3700                 mkU32(0x7FF)));
   3701    IRTemp in_range_and_finite = newTemp(Ity_I1);
   3702    assign(in_range_and_finite,
   3703           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   3704    return in_range_and_finite;
   3705 }
   3706 
   3707 /* Invent a plausible-looking FPU status word value:
   3708       ((ftop & 7) << 11) | (c3210 & 0x4700)
   3709  */
   3710 static IRExpr* get_FPU_sw ( void )
   3711 {
   3712    return
   3713       unop(Iop_32to16,
   3714            binop(Iop_Or32,
   3715                  binop(Iop_Shl32,
   3716                        binop(Iop_And32, get_ftop(), mkU32(7)),
   3717                              mkU8(11)),
   3718                        binop(Iop_And32, get_C3210(), mkU32(0x4700))
   3719       ));
   3720 }
   3721 
   3722 
   3723 /* ------------------------------------------------------- */
   3724 /* Given all that stack-mangling junk, we can now go ahead
   3725    and describe FP instructions.
   3726 */
   3727 
   3728 /* ST(0) = ST(0) `op` mem64/32(addr)
   3729    Need to check ST(0)'s tag on read, but not on write.
   3730 */
   3731 static
   3732 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   3733                          IROp op, Bool dbl )
   3734 {
   3735    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3736    if (dbl) {
   3737       put_ST_UNCHECKED(0,
   3738          triop( op,
   3739                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3740                 get_ST(0),
   3741                 loadLE(Ity_F64,mkexpr(addr))
   3742          ));
   3743    } else {
   3744       put_ST_UNCHECKED(0,
   3745          triop( op,
   3746                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3747                 get_ST(0),
   3748                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   3749          ));
   3750    }
   3751 }
   3752 
   3753 
   3754 /* ST(0) = mem64/32(addr) `op` ST(0)
   3755    Need to check ST(0)'s tag on read, but not on write.
   3756 */
   3757 static
   3758 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   3759                             IROp op, Bool dbl )
   3760 {
   3761    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3762    if (dbl) {
   3763       put_ST_UNCHECKED(0,
   3764          triop( op,
   3765                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3766                 loadLE(Ity_F64,mkexpr(addr)),
   3767                 get_ST(0)
   3768          ));
   3769    } else {
   3770       put_ST_UNCHECKED(0,
   3771          triop( op,
   3772                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3773                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   3774                 get_ST(0)
   3775          ));
   3776    }
   3777 }
   3778 
   3779 
   3780 /* ST(dst) = ST(dst) `op` ST(src).
   3781    Check dst and src tags when reading but not on write.
   3782 */
   3783 static
   3784 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3785                       Bool pop_after )
   3786 {
   3787    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3788                                  (Int)st_src, (Int)st_dst );
   3789    put_ST_UNCHECKED(
   3790       st_dst,
   3791       triop( op,
   3792              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3793              get_ST(st_dst),
   3794              get_ST(st_src) )
   3795    );
   3796    if (pop_after)
   3797       fp_pop();
   3798 }
   3799 
   3800 /* ST(dst) = ST(src) `op` ST(dst).
   3801    Check dst and src tags when reading but not on write.
   3802 */
   3803 static
   3804 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src,
   3805                          UInt st_dst, Bool pop_after )
   3806 {
   3807    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3808                                  (Int)st_src, (Int)st_dst );
   3809    put_ST_UNCHECKED(
   3810       st_dst,
   3811       triop( op,
   3812              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3813              get_ST(st_src),
   3814              get_ST(st_dst) )
   3815    );
   3816    if (pop_after)
   3817       fp_pop();
   3818 }
   3819 
   3820 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   3821 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   3822 {
   3823    DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
   3824    /* This is a bit of a hack (and isn't really right).  It sets
   3825       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   3826       documentation implies A and S are unchanged.
   3827    */
   3828    /* It's also fishy in that it is used both for COMIP and
   3829       UCOMIP, and they aren't the same (although similar). */
   3830    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   3831    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   3832    stmt( IRStmt_Put( OFFB_CC_DEP1,
   3833                      binop( Iop_And32,
   3834                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
   3835                             mkU32(0x45)
   3836        )));
   3837    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3838       elimination of previous stores to this field work better. */
   3839    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   3840    if (pop_after)
   3841       fp_pop();
   3842 }
   3843 
   3844 
   3845 static
   3846 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
   3847 {
   3848    Int    len;
   3849    UInt   r_src, r_dst;
   3850    HChar  dis_buf[50];
   3851    IRTemp t1, t2;
   3852 
   3853    /* On entry, delta points at the second byte of the insn (the modrm
   3854       byte).*/
   3855    UChar first_opcode = getIByte(delta-1);
   3856    UChar modrm        = getIByte(delta+0);
   3857 
   3858    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   3859 
   3860    if (first_opcode == 0xD8) {
   3861       if (modrm < 0xC0) {
   3862 
   3863          /* bits 5,4,3 are an opcode extension, and the modRM also
   3864            specifies an address. */
   3865          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3866          delta += len;
   3867 
   3868          switch (gregOfRM(modrm)) {
   3869 
   3870             case 0: /* FADD single-real */
   3871                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   3872                break;
   3873 
   3874             case 1: /* FMUL single-real */
   3875                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   3876                break;
   3877 
   3878             case 2: /* FCOM single-real */
   3879                DIP("fcoms %s\n", dis_buf);
   3880                /* This forces C1 to zero, which isn't right. */
   3881                put_C3210(
   3882                    binop( Iop_And32,
   3883                           binop(Iop_Shl32,
   3884                                 binop(Iop_CmpF64,
   3885                                       get_ST(0),
   3886                                       unop(Iop_F32toF64,
   3887                                            loadLE(Ity_F32,mkexpr(addr)))),
   3888                                 mkU8(8)),
   3889                           mkU32(0x4500)
   3890                    ));
   3891                break;
   3892 
   3893             case 3: /* FCOMP single-real */
   3894                DIP("fcomps %s\n", dis_buf);
   3895                /* This forces C1 to zero, which isn't right. */
   3896                put_C3210(
   3897                    binop( Iop_And32,
   3898                           binop(Iop_Shl32,
   3899                                 binop(Iop_CmpF64,
   3900                                       get_ST(0),
   3901                                       unop(Iop_F32toF64,
   3902                                            loadLE(Ity_F32,mkexpr(addr)))),
   3903                                 mkU8(8)),
   3904                           mkU32(0x4500)
   3905                    ));
   3906                fp_pop();
   3907                break;
   3908 
   3909             case 4: /* FSUB single-real */
   3910                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   3911                break;
   3912 
   3913             case 5: /* FSUBR single-real */
   3914                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   3915                break;
   3916 
   3917             case 6: /* FDIV single-real */
   3918                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   3919                break;
   3920 
   3921             case 7: /* FDIVR single-real */
   3922                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   3923                break;
   3924 
   3925             default:
   3926                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   3927                vex_printf("first_opcode == 0xD8\n");
   3928                goto decode_fail;
   3929          }
   3930       } else {
   3931          delta++;
   3932          switch (modrm) {
   3933 
   3934             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   3935                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   3936                break;
   3937 
   3938             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   3939                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   3940                break;
   3941 
   3942             /* Dunno if this is right */
   3943             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   3944                r_dst = (UInt)modrm - 0xD0;
   3945                DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
   3946                /* This forces C1 to zero, which isn't right. */
   3947                put_C3210(
   3948                    binop( Iop_And32,
   3949                           binop(Iop_Shl32,
   3950                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3951                                 mkU8(8)),
   3952                           mkU32(0x4500)
   3953                    ));
   3954                break;
   3955 
   3956             /* Dunno if this is right */
   3957             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   3958                r_dst = (UInt)modrm - 0xD8;
   3959                DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
   3960                /* This forces C1 to zero, which isn't right. */
   3961                put_C3210(
   3962                    binop( Iop_And32,
   3963                           binop(Iop_Shl32,
   3964                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3965                                 mkU8(8)),
   3966                           mkU32(0x4500)
   3967                    ));
   3968                fp_pop();
   3969                break;
   3970 
   3971             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   3972                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   3973                break;
   3974 
   3975             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   3976                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   3977                break;
   3978 
   3979             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   3980                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   3981                break;
   3982 
   3983             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   3984                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   3985                break;
   3986 
   3987             default:
   3988                goto decode_fail;
   3989          }
   3990       }
   3991    }
   3992 
   3993    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   3994    else
   3995    if (first_opcode == 0xD9) {
   3996       if (modrm < 0xC0) {
   3997 
   3998          /* bits 5,4,3 are an opcode extension, and the modRM also
   3999             specifies an address. */
   4000          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4001          delta += len;
   4002 
   4003          switch (gregOfRM(modrm)) {
   4004 
   4005             case 0: /* FLD single-real */
   4006                DIP("flds %s\n", dis_buf);
   4007                fp_push();
   4008                put_ST(0, unop(Iop_F32toF64,
   4009                               loadLE(Ity_F32, mkexpr(addr))));
   4010                break;
   4011 
   4012             case 2: /* FST single-real */
   4013                DIP("fsts %s\n", dis_buf);
   4014                storeLE(mkexpr(addr),
   4015                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4016                break;
   4017 
   4018             case 3: /* FSTP single-real */
   4019                DIP("fstps %s\n", dis_buf);
   4020                storeLE(mkexpr(addr),
   4021                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4022                fp_pop();
   4023                break;
   4024 
   4025             case 4: { /* FLDENV m28 */
   4026                /* Uses dirty helper:
   4027                      VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
   4028                IRTemp   ew = newTemp(Ity_I32);
   4029                IRDirty* d  = unsafeIRDirty_0_N (
   4030                                 0/*regparms*/,
   4031                                 "x86g_dirtyhelper_FLDENV",
   4032                                 &x86g_dirtyhelper_FLDENV,
   4033                                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   4034                              );
   4035                d->tmp   = ew;
   4036                /* declare we're reading memory */
   4037                d->mFx   = Ifx_Read;
   4038                d->mAddr = mkexpr(addr);
   4039                d->mSize = 28;
   4040 
   4041                /* declare we're writing guest state */
   4042                d->nFxState = 4;
   4043                vex_bzero(&d->fxState, sizeof(d->fxState));
   4044 
   4045                d->fxState[0].fx     = Ifx_Write;
   4046                d->fxState[0].offset = OFFB_FTOP;
   4047                d->fxState[0].size   = sizeof(UInt);
   4048 
   4049                d->fxState[1].fx     = Ifx_Write;
   4050                d->fxState[1].offset = OFFB_FPTAGS;
   4051                d->fxState[1].size   = 8 * sizeof(UChar);
   4052 
   4053                d->fxState[2].fx     = Ifx_Write;
   4054                d->fxState[2].offset = OFFB_FPROUND;
   4055                d->fxState[2].size   = sizeof(UInt);
   4056 
   4057                d->fxState[3].fx     = Ifx_Write;
   4058                d->fxState[3].offset = OFFB_FC3210;
   4059                d->fxState[3].size   = sizeof(UInt);
   4060 
   4061                stmt( IRStmt_Dirty(d) );
   4062 
   4063                /* ew contains any emulation warning we may need to
   4064                   issue.  If needed, side-exit to the next insn,
   4065                   reporting the warning, so that Valgrind's dispatcher
   4066                   sees the warning. */
   4067                put_emwarn( mkexpr(ew) );
   4068                stmt(
   4069                   IRStmt_Exit(
   4070                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4071                      Ijk_EmWarn,
   4072                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4073                      OFFB_EIP
   4074                   )
   4075                );
   4076 
   4077                DIP("fldenv %s\n", dis_buf);
   4078                break;
   4079             }
   4080 
   4081             case 5: {/* FLDCW */
   4082                /* The only thing we observe in the control word is the
   4083                   rounding mode.  Therefore, pass the 16-bit value
   4084                   (x87 native-format control word) to a clean helper,
   4085                   getting back a 64-bit value, the lower half of which
   4086                   is the FPROUND value to store, and the upper half of
   4087                   which is the emulation-warning token which may be
   4088                   generated.
   4089                */
   4090                /* ULong x86g_check_fldcw ( UInt ); */
   4091                IRTemp t64 = newTemp(Ity_I64);
   4092                IRTemp ew = newTemp(Ity_I32);
   4093                DIP("fldcw %s\n", dis_buf);
   4094                assign( t64, mkIRExprCCall(
   4095                                Ity_I64, 0/*regparms*/,
   4096                                "x86g_check_fldcw",
   4097                                &x86g_check_fldcw,
   4098                                mkIRExprVec_1(
   4099                                   unop( Iop_16Uto32,
   4100                                         loadLE(Ity_I16, mkexpr(addr)))
   4101                                )
   4102                             )
   4103                      );
   4104 
   4105                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   4106                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   4107                put_emwarn( mkexpr(ew) );
   4108                /* Finally, if an emulation warning was reported,
   4109                   side-exit to the next insn, reporting the warning,
   4110                   so that Valgrind's dispatcher sees the warning. */
   4111                stmt(
   4112                   IRStmt_Exit(
   4113                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4114                      Ijk_EmWarn,
   4115                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4116                      OFFB_EIP
   4117                   )
   4118                );
   4119                break;
   4120             }
   4121 
   4122             case 6: { /* FNSTENV m28 */
   4123                /* Uses dirty helper:
   4124                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
   4125                IRDirty* d = unsafeIRDirty_0_N (
   4126                                0/*regparms*/,
   4127                                "x86g_dirtyhelper_FSTENV",
   4128                                &x86g_dirtyhelper_FSTENV,
   4129                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   4130                             );
   4131                /* declare we're writing memory */
   4132                d->mFx   = Ifx_Write;
   4133                d->mAddr = mkexpr(addr);
   4134                d->mSize = 28;
   4135 
   4136                /* declare we're reading guest state */
   4137                d->nFxState = 4;
   4138                vex_bzero(&d->fxState, sizeof(d->fxState));
   4139 
   4140                d->fxState[0].fx     = Ifx_Read;
   4141                d->fxState[0].offset = OFFB_FTOP;
   4142                d->fxState[0].size   = sizeof(UInt);
   4143 
   4144                d->fxState[1].fx     = Ifx_Read;
   4145                d->fxState[1].offset = OFFB_FPTAGS;
   4146                d->fxState[1].size   = 8 * sizeof(UChar);
   4147 
   4148                d->fxState[2].fx     = Ifx_Read;
   4149                d->fxState[2].offset = OFFB_FPROUND;
   4150                d->fxState[2].size   = sizeof(UInt);
   4151 
   4152                d->fxState[3].fx     = Ifx_Read;
   4153                d->fxState[3].offset = OFFB_FC3210;
   4154                d->fxState[3].size   = sizeof(UInt);
   4155 
   4156                stmt( IRStmt_Dirty(d) );
   4157 
   4158                DIP("fnstenv %s\n", dis_buf);
   4159                break;
   4160             }
   4161 
   4162             case 7: /* FNSTCW */
   4163               /* Fake up a native x87 FPU control word.  The only
   4164                  thing it depends on is FPROUND[1:0], so call a clean
   4165                  helper to cook it up. */
   4166                /* UInt x86g_create_fpucw ( UInt fpround ) */
   4167                DIP("fnstcw %s\n", dis_buf);
   4168                storeLE(
   4169                   mkexpr(addr),
   4170                   unop( Iop_32to16,
   4171                         mkIRExprCCall(
   4172                            Ity_I32, 0/*regp*/,
   4173                            "x86g_create_fpucw", &x86g_create_fpucw,
   4174                            mkIRExprVec_1( get_fpround() )
   4175                         )
   4176                   )
   4177                );
   4178                break;
   4179 
   4180             default:
   4181                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4182                vex_printf("first_opcode == 0xD9\n");
   4183                goto decode_fail;
   4184          }
   4185 
   4186       } else {
   4187          delta++;
   4188          switch (modrm) {
   4189 
   4190             case 0xC0 ... 0xC7: /* FLD %st(?) */
   4191                r_src = (UInt)modrm - 0xC0;
   4192                DIP("fld %%st(%d)\n", (Int)r_src);
   4193                t1 = newTemp(Ity_F64);
   4194                assign(t1, get_ST(r_src));
   4195                fp_push();
   4196                put_ST(0, mkexpr(t1));
   4197                break;
   4198 
   4199             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   4200                r_src = (UInt)modrm - 0xC8;
   4201                DIP("fxch %%st(%d)\n", (Int)r_src);
   4202                t1 = newTemp(Ity_F64);
   4203                t2 = newTemp(Ity_F64);
   4204                assign(t1, get_ST(0));
   4205                assign(t2, get_ST(r_src));
   4206                put_ST_UNCHECKED(0, mkexpr(t2));
   4207                put_ST_UNCHECKED(r_src, mkexpr(t1));
   4208                break;
   4209 
   4210             case 0xE0: /* FCHS */
   4211                DIP("fchs\n");
   4212                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   4213                break;
   4214 
   4215             case 0xE1: /* FABS */
   4216                DIP("fabs\n");
   4217                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   4218                break;
   4219 
   4220             case 0xE4: /* FTST */
   4221                DIP("ftst\n");
   4222                /* This forces C1 to zero, which isn't right. */
   4223                /* Well, in fact the Intel docs say (bizarrely): "C1 is
   4224                   set to 0 if stack underflow occurred; otherwise, set
   4225                   to 0" which is pretty nonsensical.  I guess it's a
   4226                    typo. */
   4227                put_C3210(
   4228                    binop( Iop_And32,
   4229                           binop(Iop_Shl32,
   4230                                 binop(Iop_CmpF64,
   4231                                       get_ST(0),
   4232                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
   4233                                 mkU8(8)),
   4234                           mkU32(0x4500)
   4235                    ));
   4236                break;
   4237 
   4238             case 0xE5: { /* FXAM */
   4239                /* This is an interesting one.  It examines %st(0),
   4240                   regardless of whether the tag says it's empty or not.
   4241                   Here, just pass both the tag (in our format) and the
   4242                   value (as a double, actually a ULong) to a helper
   4243                   function. */
   4244                IRExpr** args
   4245                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
   4246                                    unop(Iop_ReinterpF64asI64,
   4247                                         get_ST_UNCHECKED(0)) );
   4248                put_C3210(mkIRExprCCall(
   4249                             Ity_I32,
   4250                             0/*regparm*/,
   4251                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
   4252                             args
   4253                         ));
   4254                DIP("fxam\n");
   4255                break;
   4256             }
   4257 
   4258             case 0xE8: /* FLD1 */
   4259                DIP("fld1\n");
   4260                fp_push();
   4261                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   4262                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   4263                break;
   4264 
   4265             case 0xE9: /* FLDL2T */
   4266                DIP("fldl2t\n");
   4267                fp_push();
   4268                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   4269                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   4270                break;
   4271 
   4272             case 0xEA: /* FLDL2E */
   4273                DIP("fldl2e\n");
   4274                fp_push();
   4275                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   4276                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   4277                break;
   4278 
   4279             case 0xEB: /* FLDPI */
   4280                DIP("fldpi\n");
   4281                fp_push();
   4282                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   4283                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   4284                break;
   4285 
   4286             case 0xEC: /* FLDLG2 */
   4287                DIP("fldlg2\n");
   4288                fp_push();
   4289                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   4290                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   4291                break;
   4292 
   4293             case 0xED: /* FLDLN2 */
   4294                DIP("fldln2\n");
   4295                fp_push();
   4296                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   4297                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   4298                break;
   4299 
   4300             case 0xEE: /* FLDZ */
   4301                DIP("fldz\n");
   4302                fp_push();
   4303                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   4304                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   4305                break;
   4306 
   4307             case 0xF0: /* F2XM1 */
   4308                DIP("f2xm1\n");
   4309                put_ST_UNCHECKED(0,
   4310                   binop(Iop_2xm1F64,
   4311                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4312                         get_ST(0)));
   4313                break;
   4314 
   4315             case 0xF1: /* FYL2X */
   4316                DIP("fyl2x\n");
   4317                put_ST_UNCHECKED(1,
   4318                   triop(Iop_Yl2xF64,
   4319                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4320                         get_ST(1),
   4321                         get_ST(0)));
   4322                fp_pop();
   4323                break;
   4324 
   4325             case 0xF2: { /* FPTAN */
   4326                DIP("fptan\n");
   4327                IRTemp argD = newTemp(Ity_F64);
   4328                assign(argD, get_ST(0));
   4329                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4330                IRTemp resD = newTemp(Ity_F64);
   4331                assign(resD,
   4332                   IRExpr_ITE(
   4333                      mkexpr(argOK),
   4334                      binop(Iop_TanF64,
   4335                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4336                            mkexpr(argD)),
   4337                      mkexpr(argD))
   4338                );
   4339                put_ST_UNCHECKED(0, mkexpr(resD));
   4340                /* Conditionally push 1.0 on the stack, if the arg is
   4341                   in range */
   4342                maybe_fp_push(argOK);
   4343                maybe_put_ST(argOK, 0,
   4344                             IRExpr_Const(IRConst_F64(1.0)));
   4345                set_C2( binop(Iop_Xor32,
   4346                              unop(Iop_1Uto32, mkexpr(argOK)),
   4347                              mkU32(1)) );
   4348                break;
   4349             }
   4350 
   4351             case 0xF3: /* FPATAN */
   4352                DIP("fpatan\n");
   4353                put_ST_UNCHECKED(1,
   4354                   triop(Iop_AtanF64,
   4355                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4356                         get_ST(1),
   4357                         get_ST(0)));
   4358                fp_pop();
   4359                break;
   4360 
   4361             case 0xF4: { /* FXTRACT */
   4362                IRTemp argF = newTemp(Ity_F64);
   4363                IRTemp sigF = newTemp(Ity_F64);
   4364                IRTemp expF = newTemp(Ity_F64);
   4365                IRTemp argI = newTemp(Ity_I64);
   4366                IRTemp sigI = newTemp(Ity_I64);
   4367                IRTemp expI = newTemp(Ity_I64);
   4368                DIP("fxtract\n");
   4369                assign( argF, get_ST(0) );
   4370                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   4371                assign( sigI,
   4372                        mkIRExprCCall(
   4373                           Ity_I64, 0/*regparms*/,
   4374                           "x86amd64g_calculate_FXTRACT",
   4375                           &x86amd64g_calculate_FXTRACT,
   4376                           mkIRExprVec_2( mkexpr(argI),
   4377                                          mkIRExpr_HWord(0)/*sig*/ ))
   4378                );
   4379                assign( expI,
   4380                        mkIRExprCCall(
   4381                           Ity_I64, 0/*regparms*/,
   4382                           "x86amd64g_calculate_FXTRACT",
   4383                           &x86amd64g_calculate_FXTRACT,
   4384                           mkIRExprVec_2( mkexpr(argI),
   4385                                          mkIRExpr_HWord(1)/*exp*/ ))
   4386                );
   4387                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   4388                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   4389                /* exponent */
   4390                put_ST_UNCHECKED(0, mkexpr(expF) );
   4391                fp_push();
   4392                /* significand */
   4393                put_ST(0, mkexpr(sigF) );
   4394                break;
   4395             }
   4396 
   4397             case 0xF5: { /* FPREM1 -- IEEE compliant */
   4398                IRTemp a1 = newTemp(Ity_F64);
   4399                IRTemp a2 = newTemp(Ity_F64);
   4400                DIP("fprem1\n");
   4401                /* Do FPREM1 twice, once to get the remainder, and once
   4402                   to get the C3210 flag values. */
   4403                assign( a1, get_ST(0) );
   4404                assign( a2, get_ST(1) );
   4405                put_ST_UNCHECKED(0,
   4406                   triop(Iop_PRem1F64,
   4407                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4408                         mkexpr(a1),
   4409                         mkexpr(a2)));
   4410                put_C3210(
   4411                   triop(Iop_PRem1C3210F64,
   4412                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4413                         mkexpr(a1),
   4414                         mkexpr(a2)) );
   4415                break;
   4416             }
   4417 
   4418             case 0xF7: /* FINCSTP: add 1 to FTOP; only FTOP is written here */
   4419                DIP("fincstp\n");
   4420                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   4421                break;
   4422 
   4423             case 0xF8: { /* FPREM -- not IEEE compliant */
   4424                IRTemp a1 = newTemp(Ity_F64);
   4425                IRTemp a2 = newTemp(Ity_F64);
   4426                DIP("fprem\n");
   4427                /* Do FPREM twice, once to get the remainder, and once
   4428                   to get the C3210 flag values. */
   4429                assign( a1, get_ST(0) );
   4430                assign( a2, get_ST(1) );
   4431                put_ST_UNCHECKED(0,
   4432                   triop(Iop_PRemF64,
   4433                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4434                         mkexpr(a1),
   4435                         mkexpr(a2)));
   4436                put_C3210(
   4437                   triop(Iop_PRemC3210F64,
   4438                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4439                         mkexpr(a1),
   4440                         mkexpr(a2)) );
   4441                break;
   4442             }
   4443 
   4444             case 0xF9: /* FYL2XP1 */
   4445                DIP("fyl2xp1\n");
   4446                put_ST_UNCHECKED(1,
   4447                   triop(Iop_Yl2xp1F64,
   4448                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4449                         get_ST(1),
   4450                         get_ST(0)));
   4451                fp_pop();
   4452                break;
   4453 
   4454             case 0xFA: /* FSQRT */
   4455                DIP("fsqrt\n");
   4456                put_ST_UNCHECKED(0,
   4457                   binop(Iop_SqrtF64,
   4458                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4459                         get_ST(0)));
   4460                break;
   4461 
   4462             case 0xFB: { /* FSINCOS */
   4463                DIP("fsincos\n");
   4464                IRTemp argD = newTemp(Ity_F64);
   4465                assign(argD, get_ST(0));
   4466                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4467                IRTemp resD = newTemp(Ity_F64);
   4468                assign(resD,
   4469                   IRExpr_ITE(
   4470                      mkexpr(argOK),
   4471                      binop(Iop_SinF64,
   4472                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4473                            mkexpr(argD)),
   4474                      mkexpr(argD))
   4475                );
   4476                put_ST_UNCHECKED(0, mkexpr(resD));
   4477                /* Conditionally push the cos value on the stack, if
   4478                   the arg is in range */
   4479                maybe_fp_push(argOK);
   4480                maybe_put_ST(argOK, 0,
   4481                   binop(Iop_CosF64,
   4482                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4483                         mkexpr(argD)));
   4484                set_C2( binop(Iop_Xor32,
   4485                              unop(Iop_1Uto32, mkexpr(argOK)),
   4486                              mkU32(1)) );
   4487                break;
   4488             }
   4489 
   4490             case 0xFC: /* FRNDINT */
   4491                DIP("frndint\n");
   4492                put_ST_UNCHECKED(0,
   4493                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   4494                break;
   4495 
   4496             case 0xFD: /* FSCALE */
   4497                DIP("fscale\n");
   4498                put_ST_UNCHECKED(0,
   4499                   triop(Iop_ScaleF64,
   4500                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4501                         get_ST(0),
   4502                         get_ST(1)));
   4503                break;
   4504 
   4505             case 0xFE:   /* FSIN */
   4506             case 0xFF: { /* FCOS */
   4507                Bool isSIN = modrm == 0xFE;
   4508                DIP("%s\n", isSIN ? "fsin" : "fcos");
   4509                IRTemp argD = newTemp(Ity_F64);
   4510                assign(argD, get_ST(0));
   4511                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4512                IRTemp resD = newTemp(Ity_F64);
   4513                assign(resD,
   4514                   IRExpr_ITE(
   4515                      mkexpr(argOK),
   4516                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   4517                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4518                            mkexpr(argD)),
   4519                      mkexpr(argD))
   4520                );
   4521                put_ST_UNCHECKED(0, mkexpr(resD));
   4522                set_C2( binop(Iop_Xor32,
   4523                              unop(Iop_1Uto32, mkexpr(argOK)),
   4524                              mkU32(1)) );
   4525                break;
   4526             }
   4527 
   4528             default:
   4529                goto decode_fail;
   4530          }
   4531       }
   4532    }
   4533 
   4534    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   4535    else
   4536    if (first_opcode == 0xDA) {
   4537 
   4538       if (modrm < 0xC0) {
   4539 
   4540          /* bits 5,4,3 are an opcode extension, and the modRM also
   4541             specifies an address. */
   4542          IROp   fop;
   4543          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4544          delta += len;
   4545          switch (gregOfRM(modrm)) {
   4546 
   4547             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   4548                DIP("fiaddl %s\n", dis_buf);
   4549                fop = Iop_AddF64;
   4550                goto do_fop_m32;
   4551 
   4552             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   4553                DIP("fimull %s\n", dis_buf);
   4554                fop = Iop_MulF64;
   4555                goto do_fop_m32;
   4556 
   4557             case 2: /* FICOM m32int */
   4558                DIP("ficoml %s\n", dis_buf);
   4559                /* This forces C1 to zero, which isn't right. */
   4560                put_C3210(
   4561                    binop( Iop_And32,
   4562                           binop(Iop_Shl32,
   4563                                 binop(Iop_CmpF64,
   4564                                       get_ST(0),
   4565                                       unop(Iop_I32StoF64,
   4566                                            loadLE(Ity_I32,mkexpr(addr)))),
   4567                                 mkU8(8)),
   4568                           mkU32(0x4500)
   4569                    ));
   4570                break;
   4571 
   4572             case 3: /* FICOMP m32int */
   4573                DIP("ficompl %s\n", dis_buf);
   4574                /* This forces C1 to zero, which isn't right. */
   4575                put_C3210(
   4576                    binop( Iop_And32,
   4577                           binop(Iop_Shl32,
   4578                                 binop(Iop_CmpF64,
   4579                                       get_ST(0),
   4580                                       unop(Iop_I32StoF64,
   4581                                            loadLE(Ity_I32,mkexpr(addr)))),
   4582                                 mkU8(8)),
   4583                           mkU32(0x4500)
   4584                    ));
   4585                fp_pop();
   4586                break;
   4587 
   4588             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   4589                DIP("fisubl %s\n", dis_buf);
   4590                fop = Iop_SubF64;
   4591                goto do_fop_m32;
   4592 
   4593             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   4594                DIP("fisubrl %s\n", dis_buf);
   4595                fop = Iop_SubF64;
   4596                goto do_foprev_m32;
   4597 
   4598             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   4599                DIP("fidivl %s\n", dis_buf);
   4600                fop = Iop_DivF64;
   4601                goto do_fop_m32;
   4602 
   4603             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   4604                DIP("fidivrl %s\n", dis_buf);
   4605                fop = Iop_DivF64;
   4606                goto do_foprev_m32;
   4607 
   4608             do_fop_m32:
   4609                put_ST_UNCHECKED(0,
   4610                   triop(fop,
   4611                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4612                         get_ST(0),
   4613                         unop(Iop_I32StoF64,
   4614                              loadLE(Ity_I32, mkexpr(addr)))));
   4615                break;
   4616 
   4617             do_foprev_m32:
   4618                put_ST_UNCHECKED(0,
   4619                   triop(fop,
   4620                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4621                         unop(Iop_I32StoF64,
   4622                              loadLE(Ity_I32, mkexpr(addr))),
   4623                         get_ST(0)));
   4624                break;
   4625 
   4626             default:
   4627                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4628                vex_printf("first_opcode == 0xDA\n");
   4629                goto decode_fail;
   4630          }
   4631 
   4632       } else {
   4633 
   4634          delta++;
   4635          switch (modrm) {
   4636 
   4637             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   4638                r_src = (UInt)modrm - 0xC0;
   4639                DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
   4640                put_ST_UNCHECKED(0,
   4641                                 IRExpr_ITE(
   4642                                     mk_x86g_calculate_condition(X86CondB),
   4643                                     get_ST(r_src), get_ST(0)) );
   4644                break;
   4645 
   4646             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   4647                r_src = (UInt)modrm - 0xC8;
   4648                DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
   4649                put_ST_UNCHECKED(0,
   4650                                 IRExpr_ITE(
   4651                                     mk_x86g_calculate_condition(X86CondZ),
   4652                                     get_ST(r_src), get_ST(0)) );
   4653                break;
   4654 
   4655             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   4656                r_src = (UInt)modrm - 0xD0;
   4657                DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
   4658                put_ST_UNCHECKED(0,
   4659                                 IRExpr_ITE(
   4660                                     mk_x86g_calculate_condition(X86CondBE),
   4661                                     get_ST(r_src), get_ST(0)) );
   4662                break;
   4663 
   4664             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   4665                r_src = (UInt)modrm - 0xD8;
   4666                DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
   4667                put_ST_UNCHECKED(0,
   4668                                 IRExpr_ITE(
   4669                                     mk_x86g_calculate_condition(X86CondP),
   4670                                     get_ST(r_src), get_ST(0)) );
   4671                break;
   4672 
   4673             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   4674                DIP("fucompp %%st(0),%%st(1)\n");
   4675                /* This forces C1 to zero, which isn't right. */
   4676                put_C3210(
   4677                    binop( Iop_And32,
   4678                           binop(Iop_Shl32,
   4679                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   4680                                 mkU8(8)),
   4681                           mkU32(0x4500)
   4682                    ));
   4683                fp_pop();
   4684                fp_pop();
   4685                break;
   4686 
   4687             default:
   4688                goto decode_fail;
   4689          }
   4690 
   4691       }
   4692    }
   4693 
   4694    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   4695    else
   4696    if (first_opcode == 0xDB) {
   4697       if (modrm < 0xC0) {
   4698 
   4699          /* bits 5,4,3 are an opcode extension, and the modRM also
   4700             specifies an address. */
   4701          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4702          delta += len;
   4703 
   4704          switch (gregOfRM(modrm)) {
   4705 
   4706             case 0: /* FILD m32int */
   4707                DIP("fildl %s\n", dis_buf);
   4708                fp_push();
   4709                put_ST(0, unop(Iop_I32StoF64,
   4710                               loadLE(Ity_I32, mkexpr(addr))));
   4711                break;
   4712 
   4713             case 1: /* FISTTPL m32 (SSE3) */
   4714                DIP("fisttpl %s\n", dis_buf);
   4715                storeLE( mkexpr(addr),
   4716                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   4717                fp_pop();
   4718                break;
   4719 
   4720             case 2: /* FIST m32 */
   4721                DIP("fistl %s\n", dis_buf);
   4722                storeLE( mkexpr(addr),
   4723                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4724                break;
   4725 
   4726             case 3: /* FISTP m32 */
   4727                DIP("fistpl %s\n", dis_buf);
   4728                storeLE( mkexpr(addr),
   4729                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4730                fp_pop();
   4731                break;
   4732 
   4733             case 5: { /* FLD extended-real */
   4734                /* Uses dirty helper:
   4735                      ULong x86g_dirtyhelper_loadF80le ( UInt )
   4736                   addr holds the address.  First, do a dirty call to
   4737                   get hold of the data. */
   4738                IRTemp   val  = newTemp(Ity_I64);
   4739                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   4740 
   4741                IRDirty* d = unsafeIRDirty_1_N (
   4742                                val,
   4743                                0/*regparms*/,
   4744                                "x86g_dirtyhelper_loadF80le",
   4745                                &x86g_dirtyhelper_loadF80le,
   4746                                args
   4747                             );
   4748                /* declare that we're reading memory */
   4749                d->mFx   = Ifx_Read;
   4750                d->mAddr = mkexpr(addr);
   4751                d->mSize = 10;
   4752 
   4753                /* execute the dirty call, dumping the result in val. */
   4754                stmt( IRStmt_Dirty(d) );
   4755                fp_push();
   4756                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   4757 
   4758                DIP("fldt %s\n", dis_buf);
   4759                break;
   4760             }
   4761 
   4762             case 7: { /* FSTP extended-real */
   4763                /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
   4764                IRExpr** args
   4765                   = mkIRExprVec_2( mkexpr(addr),
   4766                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   4767 
   4768                IRDirty* d = unsafeIRDirty_0_N (
   4769                                0/*regparms*/,
   4770                                "x86g_dirtyhelper_storeF80le",
   4771                                &x86g_dirtyhelper_storeF80le,
   4772                                args
   4773                             );
   4774                /* declare we're writing memory: 10 bytes starting at addr */
   4775                d->mFx   = Ifx_Write;
   4776                d->mAddr = mkexpr(addr);
   4777                d->mSize = 10;
   4778 
   4779                /* execute the dirty call. */
   4780                stmt( IRStmt_Dirty(d) );
   4781                fp_pop();
   4782 
   4783                DIP("fstpt %s\n", dis_buf); /* operand first, then newline, as for fldt */
   4784                break;
   4785             }
   4786 
   4787             default:
   4788                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4789                vex_printf("first_opcode == 0xDB\n");
   4790                goto decode_fail;
   4791          }
   4792 
   4793       } else {
   4794 
   4795          delta++;
   4796          switch (modrm) {
   4797 
   4798             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   4799                r_src = (UInt)modrm - 0xC0;
   4800                DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
   4801                put_ST_UNCHECKED(0,
   4802                                 IRExpr_ITE(
   4803                                     mk_x86g_calculate_condition(X86CondNB),
   4804                                     get_ST(r_src), get_ST(0)) );
   4805                break;
   4806 
   4807             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   4808                r_src = (UInt)modrm - 0xC8;
   4809                DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
   4810                put_ST_UNCHECKED(0,
   4811                                 IRExpr_ITE(
   4812                                     mk_x86g_calculate_condition(X86CondNZ),
   4813                                     get_ST(r_src), get_ST(0)) );
   4814                break;
   4815 
   4816             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   4817                r_src = (UInt)modrm - 0xD0;
   4818                DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
   4819                put_ST_UNCHECKED(0,
   4820                                 IRExpr_ITE(
   4821                                     mk_x86g_calculate_condition(X86CondNBE),
   4822                                     get_ST(r_src), get_ST(0)) );
   4823                break;
   4824 
   4825             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   4826                r_src = (UInt)modrm - 0xD8;
   4827                DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
   4828                put_ST_UNCHECKED(0,
   4829                                 IRExpr_ITE(
   4830                                     mk_x86g_calculate_condition(X86CondNP),
   4831                                     get_ST(r_src), get_ST(0)) );
   4832                break;
   4833 
   4834             case 0xE2:
   4835                DIP("fnclex\n");
   4836                break;
   4837 
   4838             case 0xE3: {
   4839                /* Uses dirty helper:
   4840                      void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
   4841                IRDirty* d  = unsafeIRDirty_0_N (
   4842                                 0/*regparms*/,
   4843                                 "x86g_dirtyhelper_FINIT",
   4844                                 &x86g_dirtyhelper_FINIT,
   4845                                 mkIRExprVec_1(IRExpr_BBPTR())
   4846                              );
   4847 
   4848                /* declare we're writing guest state */
   4849                d->nFxState = 5;
   4850                vex_bzero(&d->fxState, sizeof(d->fxState));
   4851 
   4852                d->fxState[0].fx     = Ifx_Write;
   4853                d->fxState[0].offset = OFFB_FTOP;
   4854                d->fxState[0].size   = sizeof(UInt);
   4855 
   4856                d->fxState[1].fx     = Ifx_Write;
   4857                d->fxState[1].offset = OFFB_FPREGS;
   4858                d->fxState[1].size   = 8 * sizeof(ULong);
   4859 
   4860                d->fxState[2].fx     = Ifx_Write;
   4861                d->fxState[2].offset = OFFB_FPTAGS;
   4862                d->fxState[2].size   = 8 * sizeof(UChar);
   4863 
   4864                d->fxState[3].fx     = Ifx_Write;
   4865                d->fxState[3].offset = OFFB_FPROUND;
   4866                d->fxState[3].size   = sizeof(UInt);
   4867 
   4868                d->fxState[4].fx     = Ifx_Write;
   4869                d->fxState[4].offset = OFFB_FC3210;
   4870                d->fxState[4].size   = sizeof(UInt);
   4871 
   4872                stmt( IRStmt_Dirty(d) );
   4873 
   4874                DIP("fninit\n");
   4875                break;
   4876             }
   4877 
   4878             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   4879                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   4880                break;
   4881 
   4882             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   4883                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   4884                break;
   4885 
   4886             default:
   4887                goto decode_fail;
   4888          }
   4889       }
   4890    }
   4891 
   4892    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   4893    else
   4894    if (first_opcode == 0xDC) {
   4895       if (modrm < 0xC0) {
   4896 
   4897          /* bits 5,4,3 are an opcode extension, and the modRM also
   4898             specifies an address. */
   4899          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4900          delta += len;
   4901 
   4902          switch (gregOfRM(modrm)) {
   4903 
   4904             case 0: /* FADD double-real */
   4905                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   4906                break;
   4907 
   4908             case 1: /* FMUL double-real */
   4909                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   4910                break;
   4911 
   4912             case 2: /* FCOM double-real */
   4913                DIP("fcoml %s\n", dis_buf);
   4914                /* This forces C1 to zero, which isn't right. */
   4915                put_C3210(
   4916                    binop( Iop_And32,
   4917                           binop(Iop_Shl32,
   4918                                 binop(Iop_CmpF64,
   4919                                       get_ST(0),
   4920                                       loadLE(Ity_F64,mkexpr(addr))),
   4921                                 mkU8(8)),
   4922                           mkU32(0x4500)
   4923                    ));
   4924                break;
   4925 
   4926             case 3: /* FCOMP double-real */
   4927                DIP("fcompl %s\n", dis_buf);
   4928                /* This forces C1 to zero, which isn't right. */
   4929                put_C3210(
   4930                    binop( Iop_And32,
   4931                           binop(Iop_Shl32,
   4932                                 binop(Iop_CmpF64,
   4933                                       get_ST(0),
   4934                                       loadLE(Ity_F64,mkexpr(addr))),
   4935                                 mkU8(8)),
   4936                           mkU32(0x4500)
   4937                    ));
   4938                fp_pop();
   4939                break;
   4940 
   4941             case 4: /* FSUB double-real */
   4942                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   4943                break;
   4944 
   4945             case 5: /* FSUBR double-real */
   4946                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   4947                break;
   4948 
   4949             case 6: /* FDIV double-real */
   4950                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   4951                break;
   4952 
   4953             case 7: /* FDIVR double-real */
   4954                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   4955                break;
   4956 
   4957             default:
   4958                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4959                vex_printf("first_opcode == 0xDC\n");
   4960                goto decode_fail;
   4961          }
   4962 
   4963       } else {
   4964 
   4965          delta++;
   4966          switch (modrm) {
   4967 
   4968             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   4969                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   4970                break;
   4971 
   4972             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   4973                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   4974                break;
   4975 
   4976             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   4977                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   4978                break;
   4979 
   4980             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   4981                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   4982                break;
   4983 
   4984             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   4985                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   4986                break;
   4987 
   4988             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   4989                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   4990                break;
   4991 
   4992             default:
   4993                goto decode_fail;
   4994          }
   4995 
   4996       }
   4997    }
   4998 
   4999    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   5000    else
   5001    if (first_opcode == 0xDD) {
   5002 
   5003       if (modrm < 0xC0) {
   5004 
   5005          /* bits 5,4,3 are an opcode extension, and the modRM also
   5006             specifies an address. */
   5007          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5008          delta += len;
   5009 
   5010          switch (gregOfRM(modrm)) {
   5011 
   5012             case 0: /* FLD double-real */
   5013                DIP("fldl %s\n", dis_buf);
   5014                fp_push();
   5015                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   5016                break;
   5017 
   5018             case 1: /* FISTTPQ m64 (SSE3): store ST(0) as I64 using Irrm_ZERO, then pop */
   5019                DIP("fisttpll %s\n", dis_buf);
   5020                storeLE( mkexpr(addr),
   5021                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   5022                fp_pop();
   5023                break;
   5024 
   5025             case 2: /* FST double-real */
   5026                DIP("fstl %s\n", dis_buf);
   5027                storeLE(mkexpr(addr), get_ST(0));
   5028                break;
   5029 
   5030             case 3: /* FSTP double-real */
   5031                DIP("fstpl %s\n", dis_buf);
   5032                storeLE(mkexpr(addr), get_ST(0));
   5033                fp_pop();
   5034                break;
   5035 
   5036             case 4: { /* FRSTOR m108 */
   5037                /* Uses dirty helper:
   5038                      VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
   5039                IRTemp   ew = newTemp(Ity_I32);
   5040                IRDirty* d  = unsafeIRDirty_0_N (
   5041                                 0/*regparms*/,
   5042                                 "x86g_dirtyhelper_FRSTOR",
   5043                                 &x86g_dirtyhelper_FRSTOR,
   5044                                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5045                              );
   5046                d->tmp   = ew;
   5047                /* declare we're reading memory */
   5048                d->mFx   = Ifx_Read;
   5049                d->mAddr = mkexpr(addr);
   5050                d->mSize = 108;
   5051 
   5052                /* declare we're writing guest state */
   5053                d->nFxState = 5;
   5054                vex_bzero(&d->fxState, sizeof(d->fxState));
   5055 
   5056                d->fxState[0].fx     = Ifx_Write;
   5057                d->fxState[0].offset = OFFB_FTOP;
   5058                d->fxState[0].size   = sizeof(UInt);
   5059 
   5060                d->fxState[1].fx     = Ifx_Write;
   5061                d->fxState[1].offset = OFFB_FPREGS;
   5062                d->fxState[1].size   = 8 * sizeof(ULong);
   5063 
   5064                d->fxState[2].fx     = Ifx_Write;
   5065                d->fxState[2].offset = OFFB_FPTAGS;
   5066                d->fxState[2].size   = 8 * sizeof(UChar);
   5067 
   5068                d->fxState[3].fx     = Ifx_Write;
   5069                d->fxState[3].offset = OFFB_FPROUND;
   5070                d->fxState[3].size   = sizeof(UInt);
   5071 
   5072                d->fxState[4].fx     = Ifx_Write;
   5073                d->fxState[4].offset = OFFB_FC3210;
   5074                d->fxState[4].size   = sizeof(UInt);
   5075 
   5076                stmt( IRStmt_Dirty(d) );
   5077 
   5078                /* ew contains any emulation warning we may need to
   5079                   issue.  If needed, side-exit to the next insn,
   5080                   reporting the warning, so that Valgrind's dispatcher
   5081                   sees the warning. */
   5082                put_emwarn( mkexpr(ew) );
   5083                stmt(
   5084                   IRStmt_Exit(
   5085                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5086                      Ijk_EmWarn,
   5087                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   5088                      OFFB_EIP
   5089                   )
   5090                );
   5091 
   5092                DIP("frstor %s\n", dis_buf);
   5093                break;
   5094             }
   5095 
   5096             case 6: { /* FNSAVE m108 */
   5097                /* Uses dirty helper:
   5098                      void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
   5099                IRDirty* d = unsafeIRDirty_0_N (
   5100                                0/*regparms*/,
   5101                                "x86g_dirtyhelper_FSAVE",
   5102                                &x86g_dirtyhelper_FSAVE,
   5103                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5104                             );
   5105                /* declare we're writing memory */
   5106                d->mFx   = Ifx_Write;
   5107                d->mAddr = mkexpr(addr);
   5108                d->mSize = 108;
   5109 
   5110                /* declare we're reading guest state */
   5111                d->nFxState = 5;
   5112                vex_bzero(&d->fxState, sizeof(d->fxState));
   5113 
   5114                d->fxState[0].fx     = Ifx_Read;
   5115                d->fxState[0].offset = OFFB_FTOP;
   5116                d->fxState[0].size   = sizeof(UInt);
   5117 
   5118                d->fxState[1].fx     = Ifx_Read;
   5119                d->fxState[1].offset = OFFB_FPREGS;
   5120                d->fxState[1].size   = 8 * sizeof(ULong);
   5121 
   5122                d->fxState[2].fx     = Ifx_Read;
   5123                d->fxState[2].offset = OFFB_FPTAGS;
   5124                d->fxState[2].size   = 8 * sizeof(UChar);
   5125 
   5126                d->fxState[3].fx     = Ifx_Read;
   5127                d->fxState[3].offset = OFFB_FPROUND;
   5128                d->fxState[3].size   = sizeof(UInt);
   5129 
   5130                d->fxState[4].fx     = Ifx_Read;
   5131                d->fxState[4].offset = OFFB_FC3210;
   5132                d->fxState[4].size   = sizeof(UInt);
   5133 
   5134                stmt( IRStmt_Dirty(d) );
   5135 
   5136                DIP("fnsave %s\n", dis_buf);
   5137                break;
   5138             }
   5139 
   5140             case 7: { /* FNSTSW m16 */
   5141                IRExpr* sw = get_FPU_sw();
   5142                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   5143                storeLE( mkexpr(addr), sw );
   5144                DIP("fnstsw %s\n", dis_buf);
   5145                break;
   5146             }
   5147 
   5148             default:
   5149                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5150                vex_printf("first_opcode == 0xDD\n");
   5151                goto decode_fail;
   5152          }
   5153       } else {
   5154          delta++;
   5155          switch (modrm) {
   5156 
   5157             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   5158                r_dst = (UInt)modrm - 0xC0;
   5159                DIP("ffree %%st(%d)\n", (Int)r_dst);
   5160                put_ST_TAG ( r_dst, mkU8(0) );
   5161                break;
   5162 
   5163             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   5164                r_dst = (UInt)modrm - 0xD0;
   5165                DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
   5166                /* P4 manual says: "If the destination operand is a
   5167                   non-empty register, the invalid-operation exception
   5168                   is not generated."  Hence put_ST_UNCHECKED. */
   5169                put_ST_UNCHECKED(r_dst, get_ST(0));
   5170                break;
   5171 
   5172             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   5173                r_dst = (UInt)modrm - 0xD8;
   5174                DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
   5175                /* P4 manual says: "If the destination operand is a
   5176                   non-empty register, the invalid-operation exception
   5177                   is not generated."  Hence put_ST_UNCHECKED. */
   5178                put_ST_UNCHECKED(r_dst, get_ST(0));
   5179                fp_pop();
   5180                break;
   5181 
   5182             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   5183                r_dst = (UInt)modrm - 0xE0;
   5184                DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
   5185                /* This forces C1 to zero, which isn't right. */
   5186                put_C3210(
   5187                    binop( Iop_And32,
   5188                           binop(Iop_Shl32,
   5189                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5190                                 mkU8(8)),
   5191                           mkU32(0x4500)
   5192                    ));
   5193                break;
   5194 
   5195             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   5196                r_dst = (UInt)modrm - 0xE8;
   5197                DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
   5198                /* This forces C1 to zero, which isn't right. */
   5199                put_C3210(
   5200                    binop( Iop_And32,
   5201                           binop(Iop_Shl32,
   5202                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5203                                 mkU8(8)),
   5204                           mkU32(0x4500)
   5205                    ));
   5206                fp_pop();
   5207                break;
   5208 
   5209             default:
   5210                goto decode_fail;
   5211          }
   5212       }
   5213    }
   5214 
   5215    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   5216    else
   5217    if (first_opcode == 0xDE) {
   5218 
   5219       if (modrm < 0xC0) {
   5220 
   5221          /* bits 5,4,3 are an opcode extension, and the modRM also
   5222             specifies an address. */
   5223          IROp   fop;
   5224          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5225          delta += len;
   5226 
   5227          switch (gregOfRM(modrm)) {
   5228 
   5229             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   5230                DIP("fiaddw %s\n", dis_buf);
   5231                fop = Iop_AddF64;
   5232                goto do_fop_m16;
   5233 
   5234             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   5235                DIP("fimulw %s\n", dis_buf);
   5236                fop = Iop_MulF64;
   5237                goto do_fop_m16;
   5238 
   5239             case 2: /* FICOM m16int */
   5240                DIP("ficomw %s\n", dis_buf);
   5241                /* This forces C1 to zero, which isn't right. */
   5242                put_C3210(
   5243                    binop( Iop_And32,
   5244                           binop(Iop_Shl32,
   5245                                 binop(Iop_CmpF64,
   5246                                       get_ST(0),
   5247                                       unop(Iop_I32StoF64,
   5248                                          unop(Iop_16Sto32,
   5249                                            loadLE(Ity_I16,mkexpr(addr))))),
   5250                                 mkU8(8)),
   5251                           mkU32(0x4500)
   5252                    ));
   5253                break;
   5254 
   5255             case 3: /* FICOMP m16int */
   5256                DIP("ficompw %s\n", dis_buf);
   5257                /* This forces C1 to zero, which isn't right. */
   5258                put_C3210(
   5259                    binop( Iop_And32,
   5260                           binop(Iop_Shl32,
   5261                                 binop(Iop_CmpF64,
   5262                                       get_ST(0),
   5263                                       unop(Iop_I32StoF64,
   5264                                          unop(Iop_16Sto32,
   5265                                               loadLE(Ity_I16,mkexpr(addr))))),
   5266                                 mkU8(8)),
   5267                           mkU32(0x4500)
   5268                    ));
   5269                fp_pop();
   5270                break;
   5271 
   5272             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   5273                DIP("fisubw %s\n", dis_buf);
   5274                fop = Iop_SubF64;
   5275                goto do_fop_m16;
   5276 
   5277             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   5278                DIP("fisubrw %s\n", dis_buf);
   5279                fop = Iop_SubF64;
   5280                goto do_foprev_m16;
   5281 
   5282             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   5283                DIP("fidivw %s\n", dis_buf); /* was mis-printed as fisubw */
   5284                fop = Iop_DivF64;
   5285                goto do_fop_m16;
   5286 
   5287             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   5288                DIP("fidivrw %s\n", dis_buf);
   5289                fop = Iop_DivF64;
   5290                goto do_foprev_m16;
   5291 
   5292             do_fop_m16:
   5293                put_ST_UNCHECKED(0,
   5294                   triop(fop,
   5295                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5296                         get_ST(0),
   5297                         unop(Iop_I32StoF64,
   5298                              unop(Iop_16Sto32,
   5299                                   loadLE(Ity_I16, mkexpr(addr))))));
   5300                break;
   5301 
   5302             do_foprev_m16:
   5303                put_ST_UNCHECKED(0,
   5304                   triop(fop,
   5305                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5306                         unop(Iop_I32StoF64,
   5307                              unop(Iop_16Sto32,
   5308                                   loadLE(Ity_I16, mkexpr(addr)))),
   5309                         get_ST(0)));
   5310                break;
   5311 
   5312             default:
   5313                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5314                vex_printf("first_opcode == 0xDE\n");
   5315                goto decode_fail;
   5316          }
   5317 
   5318       } else {
   5319 
   5320          delta++;
   5321          switch (modrm) {
   5322 
   5323             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   5324                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   5325                break;
   5326 
   5327             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   5328                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   5329                break;
   5330 
   5331             case 0xD9: /* FCOMPP %st(0),%st(1) */
   5332                DIP("fcompp %%st(0),%%st(1)\n");
   5333                /* This forces C1 to zero, which isn't right. */
   5334                put_C3210(
   5335                    binop( Iop_And32,
   5336                           binop(Iop_Shl32,
   5337                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5338                                 mkU8(8)),
   5339                           mkU32(0x4500)
   5340                    ));
   5341                fp_pop(); /* two pops: both compared operands are removed */
   5342                fp_pop();
   5343                break;
   5344 
   5345             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   5346                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   5347                break;
   5348 
   5349             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   5350                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   5351                break;
   5352 
   5353             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   5354                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   5355                break;
   5356 
   5357             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   5358                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   5359                break;
   5360 
   5361             default:
   5362                goto decode_fail;
   5363          }
   5364 
   5365       }
   5366    }
   5367 
   5368    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   5369    else
   5370    if (first_opcode == 0xDF) {
   5371 
   5372       if (modrm < 0xC0) {
   5373 
   5374          /* bits 5,4,3 are an opcode extension, and the modRM also
   5375             specifies an address. */
   5376          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5377          delta += len;
   5378 
   5379          switch (gregOfRM(modrm)) {
   5380 
   5381             case 0: /* FILD m16int */
   5382                DIP("fildw %s\n", dis_buf);
   5383                fp_push();
   5384                put_ST(0, unop(Iop_I32StoF64,
   5385                               unop(Iop_16Sto32,
   5386                                    loadLE(Ity_I16, mkexpr(addr)))));
   5387                break;
   5388 
   5389             case 1: /* FISTTPS m16 (SSE3) */
   5390                DIP("fisttps %s\n", dis_buf);
   5391                storeLE( mkexpr(addr),
   5392                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
   5393                fp_pop();
   5394                break;
   5395 
   5396             case 2: /* FIST m16 */
   5397                DIP("fistp %s\n", dis_buf);
   5398                storeLE( mkexpr(addr),
   5399                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5400                break;
   5401 
   5402             case 3: /* FISTP m16 */
   5403                DIP("fistps %s\n", dis_buf);
   5404                storeLE( mkexpr(addr),
   5405                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5406                fp_pop();
   5407                break;
   5408 
   5409             case 5: /* FILD m64 */
   5410                DIP("fildll %s\n", dis_buf);
   5411                fp_push();
   5412                put_ST(0, binop(Iop_I64StoF64,
   5413                                get_roundingmode(),
   5414                                loadLE(Ity_I64, mkexpr(addr))));
   5415                break;
   5416 
   5417             case 7: /* FISTP m64 */
   5418                DIP("fistpll %s\n", dis_buf);
   5419                storeLE( mkexpr(addr),
   5420                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   5421                fp_pop();
   5422                break;
   5423 
   5424             default:
   5425                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5426                vex_printf("first_opcode == 0xDF\n");
   5427                goto decode_fail;
   5428          }
   5429 
   5430       } else {
   5431 
   5432          delta++;
   5433          switch (modrm) {
   5434 
   5435             case 0xC0: /* FFREEP %st(0) */
   5436                DIP("ffreep %%st(%d)\n", 0);
   5437                put_ST_TAG ( 0, mkU8(0) );
   5438                fp_pop();
   5439                break;
   5440 
   5441             case 0xE0: /* FNSTSW %ax */
   5442                DIP("fnstsw %%ax\n");
   5443                /* Get the FPU status word value and dump it in %AX. */
   5444                if (0) {
   5445                   /* The obvious thing to do is simply dump the 16-bit
   5446                      status word value in %AX.  However, due to a
   5447                      limitation in Memcheck's origin tracking
   5448                      machinery, this causes Memcheck not to track the
   5449                      origin of any undefinedness into %AH (only into
   5450                      %AL/%AX/%EAX), which means origins are lost in
   5451                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
   5452                   putIReg(2, R_EAX, get_FPU_sw());
   5453                } else {
   5454                   /* So a somewhat lame kludge is to make it very
   5455                      clear to Memcheck that the value is written to
   5456                      both %AH and %AL.  This generates marginally
   5457                      worse code, but I don't think it matters much. */
   5458                   IRTemp t16 = newTemp(Ity_I16);
   5459                   assign(t16, get_FPU_sw());
   5460                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
   5461                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
   5462                }
   5463                break;
   5464 
   5465             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   5466                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   5467                break;
   5468 
   5469             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   5470                /* not really right since COMIP != UCOMIP */
   5471                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   5472                break;
   5473 
   5474             default:
   5475                goto decode_fail;
   5476          }
   5477       }
   5478 
   5479    }
   5480 
   5481    else
   5482    vpanic("dis_FPU(x86): invalid primary opcode");
   5483 
   5484    *decode_ok = True;
   5485    return delta;
   5486 
   5487   decode_fail:
   5488    *decode_ok = False;
   5489    return delta;
   5490 }
   5491 
   5492 
   5493 /*------------------------------------------------------------*/
   5494 /*---                                                      ---*/
   5495 /*--- MMX INSTRUCTIONS                                     ---*/
   5496 /*---                                                      ---*/
   5497 /*------------------------------------------------------------*/
   5498 
   5499 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   5500    IA32 arch manual, volume 3):
   5501 
   5502    Read from, or write to MMX register (viz, any insn except EMMS):
   5503    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   5504    * FP stack pointer set to zero
   5505 
   5506    EMMS:
   5507    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   5508    * FP stack pointer set to zero
   5509 */
   5510 
   5511 static void do_MMX_preamble ( void )
   5512 {
   5513    Int         i;
   5514    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5515    IRExpr*     zero  = mkU32(0);
   5516    IRExpr*     tag1  = mkU8(1);
   5517    put_ftop(zero);
   5518    for (i = 0; i < 8; i++)
   5519       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   5520 }
   5521 
   5522 static void do_EMMS_preamble ( void )
   5523 {
   5524    Int         i;
   5525    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5526    IRExpr*     zero  = mkU32(0);
   5527    IRExpr*     tag0  = mkU8(0);
   5528    put_ftop(zero);
   5529    for (i = 0; i < 8; i++)
   5530       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   5531 }
   5532 
   5533 
   5534 static IRExpr* getMMXReg ( UInt archreg )
   5535 {
   5536    vassert(archreg < 8);
   5537    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   5538 }
   5539 
   5540 
   5541 static void putMMXReg ( UInt archreg, IRExpr* e )
   5542 {
   5543    vassert(archreg < 8);
   5544    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   5545    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   5546 }
   5547 
   5548 
   5549 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   5550    sense that it does not first call do_MMX_preamble() -- that is the
   5551    responsibility of its caller. */
   5552 
   5553 static
   5554 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
   5555                                Int    delta,
   5556                                UChar  opc,
   5557                                const HChar* name,
   5558                                Bool   show_granularity )
   5559 {
   5560    HChar   dis_buf[50];
   5561    UChar   modrm = getIByte(delta);
   5562    Bool    isReg = epartIsReg(modrm);
   5563    IRExpr* argL  = NULL;
   5564    IRExpr* argR  = NULL;
   5565    IRExpr* argG  = NULL;
   5566    IRExpr* argE  = NULL;
   5567    IRTemp  res   = newTemp(Ity_I64);
   5568 
   5569    Bool    invG  = False;
   5570    IROp    op    = Iop_INVALID;
   5571    void*   hAddr = NULL;
   5572    Bool    eLeft = False;
   5573    const HChar*  hName = NULL;
   5574 
   5575 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   5576 
   5577    switch (opc) {
   5578       /* Original MMX ones */
   5579       case 0xFC: op = Iop_Add8x8; break;
   5580       case 0xFD: op = Iop_Add16x4; break;
   5581       case 0xFE: op = Iop_Add32x2; break;
   5582 
   5583       case 0xEC: op = Iop_QAdd8Sx8; break;
   5584       case 0xED: op = Iop_QAdd16Sx4; break;
   5585 
   5586       case 0xDC: op = Iop_QAdd8Ux8; break;
   5587       case 0xDD: op = Iop_QAdd16Ux4; break;
   5588 
   5589       case 0xF8: op = Iop_Sub8x8;  break;
   5590       case 0xF9: op = Iop_Sub16x4; break;
   5591       case 0xFA: op = Iop_Sub32x2; break;
   5592 
   5593       case 0xE8: op = Iop_QSub8Sx8; break;
   5594       case 0xE9: op = Iop_QSub16Sx4; break;
   5595 
   5596       case 0xD8: op = Iop_QSub8Ux8; break;
   5597       case 0xD9: op = Iop_QSub16Ux4; break;
   5598 
   5599       case 0xE5: op = Iop_MulHi16Sx4; break;
   5600       case 0xD5: op = Iop_Mul16x4; break;
   5601       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
   5602 
   5603       case 0x74: op = Iop_CmpEQ8x8; break;
   5604       case 0x75: op = Iop_CmpEQ16x4; break;
   5605       case 0x76: op = Iop_CmpEQ32x2; break;
   5606 
   5607       case 0x64: op = Iop_CmpGT8Sx8; break;
   5608       case 0x65: op = Iop_CmpGT16Sx4; break;
   5609       case 0x66: op = Iop_CmpGT32Sx2; break;
   5610 
   5611       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   5612       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   5613       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   5614 
   5615       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   5616       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   5617       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   5618 
   5619       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   5620       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   5621       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   5622 
   5623       case 0xDB: op = Iop_And64; break;
   5624       case 0xDF: op = Iop_And64; invG = True; break;
   5625       case 0xEB: op = Iop_Or64; break;
   5626       case 0xEF: /* Possibly do better here if argL and argR are the
   5627                     same reg */
   5628                  op = Iop_Xor64; break;
   5629 
   5630       /* Introduced in SSE1 */
   5631       case 0xE0: op = Iop_Avg8Ux8;    break;
   5632       case 0xE3: op = Iop_Avg16Ux4;   break;
   5633       case 0xEE: op = Iop_Max16Sx4;   break;
   5634       case 0xDE: op = Iop_Max8Ux8;    break;
   5635       case 0xEA: op = Iop_Min16Sx4;   break;
   5636       case 0xDA: op = Iop_Min8Ux8;    break;
   5637       case 0xE4: op = Iop_MulHi16Ux4; break;
   5638       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
   5639 
   5640       /* Introduced in SSE2 */
   5641       case 0xD4: op = Iop_Add64; break;
   5642       case 0xFB: op = Iop_Sub64; break;
   5643 
   5644       default:
   5645          vex_printf("\n0x%x\n", (Int)opc);
   5646          vpanic("dis_MMXop_regmem_to_reg");
   5647    }
   5648 
   5649 #  undef XXX
   5650 
   5651    argG = getMMXReg(gregOfRM(modrm));
   5652    if (invG)
   5653       argG = unop(Iop_Not64, argG);
   5654 
   5655    if (isReg) {
   5656       delta++;
   5657       argE = getMMXReg(eregOfRM(modrm));
   5658    } else {
   5659       Int    len;
   5660       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5661       delta += len;
   5662       argE = loadLE(Ity_I64, mkexpr(addr));
   5663    }
   5664 
   5665    if (eLeft) {
   5666       argL = argE;
   5667       argR = argG;
   5668    } else {
   5669       argL = argG;
   5670       argR = argE;
   5671    }
   5672 
   5673    if (op != Iop_INVALID) {
   5674       vassert(hName == NULL);
   5675       vassert(hAddr == NULL);
   5676       assign(res, binop(op, argL, argR));
   5677    } else {
   5678       vassert(hName != NULL);
   5679       vassert(hAddr != NULL);
   5680       assign( res,
   5681               mkIRExprCCall(
   5682                  Ity_I64,
   5683                  0/*regparms*/, hName, hAddr,
   5684                  mkIRExprVec_2( argL, argR )
   5685               )
   5686             );
   5687    }
   5688 
   5689    putMMXReg( gregOfRM(modrm), mkexpr(res) );
   5690 
   5691    DIP("%s%s %s, %s\n",
   5692        name, show_granularity ? nameMMXGran(opc & 3) : "",
   5693        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
   5694        nameMMXReg(gregOfRM(modrm)) );
   5695 
   5696    return delta;
   5697 }
   5698 
   5699 
   5700 /* Vector by scalar shift of G by the amount specified at the bottom
   5701    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   5702 
   5703 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
   5704                                  const HChar* opname, IROp op )
   5705 {
   5706    HChar   dis_buf[50];
   5707    Int     alen, size;
   5708    IRTemp  addr;
   5709    Bool    shl, shr, sar;
   5710    UChar   rm   = getIByte(delta);
   5711    IRTemp  g0   = newTemp(Ity_I64);
   5712    IRTemp  g1   = newTemp(Ity_I64);
   5713    IRTemp  amt  = newTemp(Ity_I32);
   5714    IRTemp  amt8 = newTemp(Ity_I8);
   5715 
   5716    if (epartIsReg(rm)) {
   5717       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
   5718       DIP("%s %s,%s\n", opname,
   5719                         nameMMXReg(eregOfRM(rm)),
   5720                         nameMMXReg(gregOfRM(rm)) );
   5721       delta++;
   5722    } else {
   5723       addr = disAMode ( &alen, sorb, delta, dis_buf );
   5724       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   5725       DIP("%s %s,%s\n", opname,
   5726                         dis_buf,
   5727                         nameMMXReg(gregOfRM(rm)) );
   5728       delta += alen;
   5729    }
   5730    assign( g0,   getMMXReg(gregOfRM(rm)) );
   5731    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   5732 
   5733    shl = shr = sar = False;
   5734    size = 0;
   5735    switch (op) {
   5736       case Iop_ShlN16x4: shl = True; size = 32; break;
   5737       case Iop_ShlN32x2: shl = True; size = 32; break;
   5738       case Iop_Shl64:    shl = True; size = 64; break;
   5739       case Iop_ShrN16x4: shr = True; size = 16; break;
   5740       case Iop_ShrN32x2: shr = True; size = 32; break;
   5741       case Iop_Shr64:    shr = True; size = 64; break;
   5742       case Iop_SarN16x4: sar = True; size = 16; break;
   5743       case Iop_SarN32x2: sar = True; size = 32; break;
   5744       default: vassert(0);
   5745    }
   5746 
   5747    if (shl || shr) {
   5748      assign(
   5749         g1,
   5750         IRExpr_ITE(
   5751            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   5752            binop(op, mkexpr(g0), mkexpr(amt8)),
   5753            mkU64(0)
   5754         )
   5755      );
   5756    } else
   5757    if (sar) {
   5758      assign(
   5759         g1,
   5760         IRExpr_ITE(
   5761            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   5762            binop(op, mkexpr(g0), mkexpr(amt8)),
   5763            binop(op, mkexpr(g0), mkU8(size-1))
   5764         )
   5765      );
   5766    } else {
   5767       /*NOTREACHED*/
   5768       vassert(0);
   5769    }
   5770 
   5771    putMMXReg( gregOfRM(rm), mkexpr(g1) );
   5772    return delta;
   5773 }
   5774 
   5775 
   5776 /* Vector by scalar shift of E by an immediate byte.  This is a
   5777    straight copy of dis_SSE_shiftE_imm. */
   5778 
   5779 static
   5780 UInt dis_MMX_shiftE_imm ( Int delta, const HChar* opname, IROp op )
   5781 {
   5782    Bool    shl, shr, sar;
   5783    UChar   rm   = getIByte(delta);
   5784    IRTemp  e0   = newTemp(Ity_I64);
   5785    IRTemp  e1   = newTemp(Ity_I64);
   5786    UChar   amt, size;
   5787    vassert(epartIsReg(rm));
   5788    vassert(gregOfRM(rm) == 2
   5789            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   5790    amt = getIByte(delta+1);
   5791    delta += 2;
   5792    DIP("%s $%d,%s\n", opname,
   5793                       (Int)amt,
   5794                       nameMMXReg(eregOfRM(rm)) );
   5795 
   5796    assign( e0, getMMXReg(eregOfRM(rm)) );
   5797 
   5798    shl = shr = sar = False;
   5799    size = 0;
   5800    switch (op) {
   5801       case Iop_ShlN16x4: shl = True; size = 16; break;
   5802       case Iop_ShlN32x2: shl = True; size = 32; break;
   5803       case Iop_Shl64:    shl = True; size = 64; break;
   5804       case Iop_SarN16x4: sar = True; size = 16; break;
   5805       case Iop_SarN32x2: sar = True; size = 32; break;
   5806       case Iop_ShrN16x4: shr = True; size = 16; break;
   5807       case Iop_ShrN32x2: shr = True; size = 32; break;
   5808       case Iop_Shr64:    shr = True; size = 64; break;
   5809       default: vassert(0);
   5810    }
   5811 
   5812    if (shl || shr) {
   5813       assign( e1, amt >= size
   5814                      ? mkU64(0)
   5815                      : binop(op, mkexpr(e0), mkU8(amt))
   5816       );
   5817    } else
   5818    if (sar) {
   5819       assign( e1, amt >= size
   5820                      ? binop(op, mkexpr(e0), mkU8(size-1))
   5821                      : binop(op, mkexpr(e0), mkU8(amt))
   5822       );
   5823    } else {
   5824       /*NOTREACHED*/
   5825       vassert(0);
   5826    }
   5827 
   5828    putMMXReg( eregOfRM(rm), mkexpr(e1) );
   5829    return delta;
   5830 }
   5831 
   5832 
   5833 /* Completely handle all MMX instructions except emms. */
   5834 
   5835 static
   5836 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
   5837 {
   5838    Int   len;
   5839    UChar modrm;
   5840    HChar dis_buf[50];
   5841    UChar opc = getIByte(delta);
   5842    delta++;
   5843 
   5844    /* dis_MMX handles all insns except emms. */
   5845    do_MMX_preamble();
   5846 
   5847    switch (opc) {
   5848 
   5849       case 0x6E:
   5850          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
   5851          if (sz != 4)
   5852             goto mmx_decode_failure;
   5853          modrm = getIByte(delta);
   5854          if (epartIsReg(modrm)) {
   5855             delta++;
   5856             putMMXReg(
   5857                gregOfRM(modrm),
   5858                binop( Iop_32HLto64,
   5859                       mkU32(0),
   5860                       getIReg(4, eregOfRM(modrm)) ) );
   5861             DIP("movd %s, %s\n",
   5862                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5863          } else {
   5864             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5865             delta += len;
   5866             putMMXReg(
   5867                gregOfRM(modrm),
   5868                binop( Iop_32HLto64,
   5869                       mkU32(0),
   5870                       loadLE(Ity_I32, mkexpr(addr)) ) );
   5871             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
   5872          }
   5873          break;
   5874 
   5875       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
   5876          if (sz != 4)
   5877             goto mmx_decode_failure;
   5878          modrm = getIByte(delta);
   5879          if (epartIsReg(modrm)) {
   5880             delta++;
   5881             putIReg( 4, eregOfRM(modrm),
   5882                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5883             DIP("movd %s, %s\n",
   5884                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   5885          } else {
   5886             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5887             delta += len;
   5888             storeLE( mkexpr(addr),
   5889                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5890             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
   5891          }
   5892          break;
   5893 
   5894       case 0x6F:
   5895          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   5896          if (sz != 4)
   5897             goto mmx_decode_failure;
   5898          modrm = getIByte(delta);
   5899          if (epartIsReg(modrm)) {
   5900             delta++;
   5901             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
   5902             DIP("movq %s, %s\n",
   5903                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5904          } else {
   5905             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5906             delta += len;
   5907             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   5908             DIP("movq %s, %s\n",
   5909                 dis_buf, nameMMXReg(gregOfRM(modrm)));
   5910          }
   5911          break;
   5912 
   5913       case 0x7F:
   5914          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   5915          if (sz != 4)
   5916             goto mmx_decode_failure;
   5917          modrm = getIByte(delta);
   5918          if (epartIsReg(modrm)) {
   5919             delta++;
   5920             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
   5921             DIP("movq %s, %s\n",
   5922                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
   5923          } else {
   5924             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5925             delta += len;
   5926             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   5927             DIP("mov(nt)q %s, %s\n",
   5928                 nameMMXReg(gregOfRM(modrm)), dis_buf);
   5929          }
   5930          break;
   5931 
   5932       case 0xFC:
   5933       case 0xFD:
   5934       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   5935          if (sz != 4)
   5936             goto mmx_decode_failure;
   5937          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
   5938          break;
   5939 
   5940       case 0xEC:
   5941       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5942          if (sz != 4)
   5943             goto mmx_decode_failure;
   5944          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
   5945          break;
   5946 
   5947       case 0xDC:
   5948       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5949          if (sz != 4)
   5950             goto mmx_decode_failure;
   5951          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
   5952          break;
   5953 
   5954       case 0xF8:
   5955       case 0xF9:
   5956       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   5957          if (sz != 4)
   5958             goto mmx_decode_failure;
   5959          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
   5960          break;
   5961 
   5962       case 0xE8:
   5963       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5964          if (sz != 4)
   5965             goto mmx_decode_failure;
   5966          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
   5967          break;
   5968 
   5969       case 0xD8:
   5970       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5971          if (sz != 4)
   5972             goto mmx_decode_failure;
   5973          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
   5974          break;
   5975 
   5976       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   5977          if (sz != 4)
   5978             goto mmx_decode_failure;
   5979          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
   5980          break;
   5981 
   5982       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   5983          if (sz != 4)
   5984             goto mmx_decode_failure;
   5985          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
   5986          break;
   5987 
   5988       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   5989          vassert(sz == 4);
   5990          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
   5991          break;
   5992 
   5993       case 0x74:
   5994       case 0x75:
   5995       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   5996          if (sz != 4)
   5997             goto mmx_decode_failure;
   5998          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
   5999          break;
   6000 
   6001       case 0x64:
   6002       case 0x65:
   6003       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   6004          if (sz != 4)
   6005             goto mmx_decode_failure;
   6006          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
   6007          break;
   6008 
   6009       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   6010          if (sz != 4)
   6011             goto mmx_decode_failure;
   6012          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
   6013          break;
   6014 
   6015       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6016          if (sz != 4)
   6017             goto mmx_decode_failure;
   6018          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
   6019          break;
   6020 
   6021       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6022          if (sz != 4)
   6023             goto mmx_decode_failure;
   6024          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
   6025          break;
   6026 
   6027       case 0x68:
   6028       case 0x69:
   6029       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   6030          if (sz != 4)
   6031             goto mmx_decode_failure;
   6032          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
   6033          break;
   6034 
   6035       case 0x60:
   6036       case 0x61:
   6037       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6038          if (sz != 4)
   6039             goto mmx_decode_failure;
   6040          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
   6041          break;
   6042 
   6043       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   6044          if (sz != 4)
   6045             goto mmx_decode_failure;
   6046          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
   6047          break;
   6048 
   6049       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   6050          if (sz != 4)
   6051             goto mmx_decode_failure;
   6052          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
   6053          break;
   6054 
   6055       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   6056          if (sz != 4)
   6057             goto mmx_decode_failure;
   6058          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
   6059          break;
   6060 
   6061       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   6062          if (sz != 4)
   6063             goto mmx_decode_failure;
   6064          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
   6065          break;
   6066 
   6067 #     define SHIFT_BY_REG(_name,_op)                                 \
   6068                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
   6069                 break;
   6070 
   6071       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6072       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   6073       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   6074       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   6075 
   6076       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6077       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   6078       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   6079       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   6080 
   6081       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   6082       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   6083       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   6084 
   6085 #     undef SHIFT_BY_REG
   6086 
   6087       case 0x71:
   6088       case 0x72:
   6089       case 0x73: {
   6090          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   6091          UChar byte2, subopc;
   6092          if (sz != 4)
   6093             goto mmx_decode_failure;
   6094          byte2  = getIByte(delta);           /* amode / sub-opcode */
   6095          subopc = toUChar( (byte2 >> 3) & 7 );
   6096 
   6097 #        define SHIFT_BY_IMM(_name,_op)                         \
   6098              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   6099              } while (0)
   6100 
   6101               if (subopc == 2 /*SRL*/ && opc == 0x71)
   6102                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   6103          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   6104                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   6105          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   6106                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   6107 
   6108          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   6109                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   6110          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   6111                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   6112 
   6113          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   6114                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   6115          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   6116                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   6117          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   6118                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   6119 
   6120          else goto mmx_decode_failure;
   6121 
   6122 #        undef SHIFT_BY_IMM
   6123          break;
   6124       }
   6125 
   6126       case 0xF7: {
   6127          IRTemp addr    = newTemp(Ity_I32);
   6128          IRTemp regD    = newTemp(Ity_I64);
   6129          IRTemp regM    = newTemp(Ity_I64);
   6130          IRTemp mask    = newTemp(Ity_I64);
   6131          IRTemp olddata = newTemp(Ity_I64);
   6132          IRTemp newdata = newTemp(Ity_I64);
   6133 
   6134          modrm = getIByte(delta);
   6135          if (sz != 4 || (!epartIsReg(modrm)))
   6136             goto mmx_decode_failure;
   6137          delta++;
   6138 
   6139          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   6140          assign( regM, getMMXReg( eregOfRM(modrm) ));
   6141          assign( regD, getMMXReg( gregOfRM(modrm) ));
   6142          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   6143          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   6144          assign( newdata,
   6145                  binop(Iop_Or64,
   6146                        binop(Iop_And64,
   6147                              mkexpr(regD),
   6148                              mkexpr(mask) ),
   6149                        binop(Iop_And64,
   6150                              mkexpr(olddata),
   6151                              unop(Iop_Not64, mkexpr(mask)))) );
   6152          storeLE( mkexpr(addr), mkexpr(newdata) );
   6153          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
   6154                                  nameMMXReg( gregOfRM(modrm) ) );
   6155          break;
   6156       }
   6157 
   6158       /* --- MMX decode failure --- */
   6159       default:
   6160       mmx_decode_failure:
   6161          *decode_ok = False;
   6162          return delta; /* ignored */
   6163 
   6164    }
   6165 
   6166    *decode_ok = True;
   6167    return delta;
   6168 }
   6169 
   6170 
   6171 /*------------------------------------------------------------*/
   6172 /*--- More misc arithmetic and other obscure insns.        ---*/
   6173 /*------------------------------------------------------------*/
   6174 
   6175 /* Double length left and right shifts.  Apparently only required in
   6176    v-size (no b- variant). */
   6177 static
   6178 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
   6179                        Int delta, UChar modrm,
   6180                        Int sz,
   6181                        IRExpr* shift_amt,
   6182                        Bool amt_is_literal,
   6183                        const HChar* shift_amt_txt,
   6184                        Bool left_shift )
   6185 {
   6186    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   6187       for printing it.   And eip on entry points at the modrm byte. */
   6188    Int len;
   6189    HChar dis_buf[50];
   6190 
   6191    IRType ty       = szToITy(sz);
   6192    IRTemp gsrc     = newTemp(ty);
   6193    IRTemp esrc     = newTemp(ty);
   6194    IRTemp addr     = IRTemp_INVALID;
   6195    IRTemp tmpSH    = newTemp(Ity_I8);
   6196    IRTemp tmpL     = IRTemp_INVALID;
   6197    IRTemp tmpRes   = IRTemp_INVALID;
   6198    IRTemp tmpSubSh = IRTemp_INVALID;
   6199    IROp   mkpair;
   6200    IROp   getres;
   6201    IROp   shift;
   6202    IRExpr* mask = NULL;
   6203 
   6204    vassert(sz == 2 || sz == 4);
   6205 
   6206    /* The E-part is the destination; this is shifted.  The G-part
   6207       supplies bits to be shifted into the E-part, but is not
   6208       changed.
   6209 
   6210       If shifting left, form a double-length word with E at the top
   6211       and G at the bottom, and shift this left.  The result is then in
   6212       the high part.
   6213 
   6214       If shifting right, form a double-length word with G at the top
   6215       and E at the bottom, and shift this right.  The result is then
   6216       at the bottom.  */
   6217 
   6218    /* Fetch the operands. */
   6219 
   6220    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
   6221 
   6222    if (epartIsReg(modrm)) {
   6223       delta++;
   6224       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
   6225       DIP("sh%cd%c %s, %s, %s\n",
   6226           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6227           shift_amt_txt,
   6228           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   6229    } else {
   6230       addr = disAMode ( &len, sorb, delta, dis_buf );
   6231       delta += len;
   6232       assign( esrc, loadLE(ty, mkexpr(addr)) );
   6233       DIP("sh%cd%c %s, %s, %s\n",
   6234           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6235           shift_amt_txt,
   6236           nameIReg(sz, gregOfRM(modrm)), dis_buf);
   6237    }
   6238 
   6239    /* Round up the relevant primops. */
   6240 
   6241    if (sz == 4) {
   6242       tmpL     = newTemp(Ity_I64);
   6243       tmpRes   = newTemp(Ity_I32);
   6244       tmpSubSh = newTemp(Ity_I32);
   6245       mkpair   = Iop_32HLto64;
   6246       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
   6247       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
   6248       mask     = mkU8(31);
   6249    } else {
   6250       /* sz == 2 */
   6251       tmpL     = newTemp(Ity_I32);
   6252       tmpRes   = newTemp(Ity_I16);
   6253       tmpSubSh = newTemp(Ity_I16);
   6254       mkpair   = Iop_16HLto32;
   6255       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
   6256       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
   6257       mask     = mkU8(15);
   6258    }
   6259 
   6260    /* Do the shift, calculate the subshift value, and set
   6261       the flag thunk. */
   6262 
   6263    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
   6264 
   6265    if (left_shift)
   6266       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   6267    else
   6268       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
   6269 
   6270    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
   6271    assign( tmpSubSh,
   6272            unop(getres,
   6273                 binop(shift,
   6274                       mkexpr(tmpL),
   6275                       binop(Iop_And8,
   6276                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   6277                             mask))) );
   6278 
   6279    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
   6280                               tmpRes, tmpSubSh, ty, tmpSH );
   6281 
   6282    /* Put result back. */
   6283 
   6284    if (epartIsReg(modrm)) {
   6285       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   6286    } else {
   6287       storeLE( mkexpr(addr), mkexpr(tmpRes) );
   6288    }
   6289 
   6290    if (amt_is_literal) delta++;
   6291    return delta;
   6292 }
   6293 
   6294 
   6295 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   6296    required. */
   6297 
   6298 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   6299 
   6300 static const HChar* nameBtOp ( BtOp op )
   6301 {
   6302    switch (op) {
   6303       case BtOpNone:  return "";
   6304       case BtOpSet:   return "s";
   6305       case BtOpReset: return "r";
   6306       case BtOpComp:  return "c";
   6307       default: vpanic("nameBtOp(x86)");
   6308    }
   6309 }
   6310 
   6311 
   6312 static
   6313 UInt dis_bt_G_E ( VexAbiInfo* vbi,
   6314                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
   6315 {
   6316    HChar  dis_buf[50];
   6317    UChar  modrm;
   6318    Int    len;
   6319    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   6320           t_addr1, t_esp, t_mask, t_new;
   6321 
   6322    vassert(sz == 2 || sz == 4);
   6323 
   6324    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   6325              = t_addr0 = t_addr1 = t_esp
   6326              = t_mask = t_new = IRTemp_INVALID;
   6327 
   6328    t_fetched = newTemp(Ity_I8);
   6329    t_new     = newTemp(Ity_I8);
   6330    t_bitno0  = newTemp(Ity_I32);
   6331    t_bitno1  = newTemp(Ity_I32);
   6332    t_bitno2  = newTemp(Ity_I8);
   6333    t_addr1   = newTemp(Ity_I32);
   6334    modrm     = getIByte(delta);
   6335 
   6336    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
   6337 
   6338    if (epartIsReg(modrm)) {
   6339       delta++;
   6340       /* Get it onto the client's stack. */
   6341       t_esp = newTemp(Ity_I32);
   6342       t_addr0 = newTemp(Ity_I32);
   6343 
   6344       /* For the choice of the value 128, see comment in dis_bt_G_E in
   6345          guest_amd64_toIR.c.  We point out here only that 128 is
   6346          fast-cased in Memcheck and is > 0, so seems like a good
   6347          choice. */
   6348       vassert(vbi->guest_stack_redzone_size == 0);
   6349       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
   6350       putIReg(4, R_ESP, mkexpr(t_esp));
   6351 
   6352       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
   6353 
   6354       /* Make t_addr0 point at it. */
   6355       assign( t_addr0, mkexpr(t_esp) );
   6356 
   6357       /* Mask out upper bits of the shift amount, since we're doing a
   6358          reg. */
   6359       assign( t_bitno1, binop(Iop_And32,
   6360                               mkexpr(t_bitno0),
   6361                               mkU32(sz == 4 ? 31 : 15)) );
   6362 
   6363    } else {
   6364       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
   6365       delta += len;
   6366       assign( t_bitno1, mkexpr(t_bitno0) );
   6367    }
   6368 
   6369    /* At this point: t_addr0 is the address being operated on.  If it
   6370       was a reg, we will have pushed it onto the client's stack.
   6371       t_bitno1 is the bit number, suitably masked in the case of a
   6372       reg.  */
   6373 
   6374    /* Now the main sequence. */
   6375    assign( t_addr1,
   6376            binop(Iop_Add32,
   6377                  mkexpr(t_addr0),
   6378                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
   6379 
   6380    /* t_addr1 now holds effective address */
   6381 
   6382    assign( t_bitno2,
   6383            unop(Iop_32to8,
   6384                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
   6385 
   6386    /* t_bitno2 contains offset of bit within byte */
   6387 
   6388    if (op != BtOpNone) {
   6389       t_mask = newTemp(Ity_I8);
   6390       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   6391    }
   6392 
   6393    /* t_mask is now a suitable byte mask */
   6394 
   6395    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   6396 
   6397    if (op != BtOpNone) {
   6398       switch (op) {
   6399          case BtOpSet:
   6400             assign( t_new,
   6401                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6402             break;
   6403          case BtOpComp:
   6404             assign( t_new,
   6405                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6406             break;
   6407          case BtOpReset:
   6408             assign( t_new,
   6409                     binop(Iop_And8, mkexpr(t_fetched),
   6410                                     unop(Iop_Not8, mkexpr(t_mask))) );
   6411             break;
   6412          default:
   6413             vpanic("dis_bt_G_E(x86)");
   6414       }
   6415       if (locked && !epartIsReg(modrm)) {
   6416          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   6417                                  mkexpr(t_new)/*new*/,
   6418                                  guest_EIP_curr_instr );
   6419       } else {
   6420          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   6421       }
   6422    }
   6423 
   6424    /* Side effect done; now get selected bit into Carry flag */
   6425    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   6426    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6427    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6428    stmt( IRStmt_Put(
   6429             OFFB_CC_DEP1,
   6430             binop(Iop_And32,
   6431                   binop(Iop_Shr32,
   6432                         unop(Iop_8Uto32, mkexpr(t_fetched)),
   6433                         mkexpr(t_bitno2)),
   6434                   mkU32(1)))
   6435        );
   6436    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6437       elimination of previous stores to this field work better. */
   6438    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6439 
   6440    /* Move reg operand from stack back to reg */
   6441    if (epartIsReg(modrm)) {
   6442       /* t_esp still points at it. */
   6443       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
   6444       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   6445    }
   6446 
   6447    DIP("bt%s%c %s, %s\n",
   6448        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
   6449        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
   6450 
   6451    return delta;
   6452 }
   6453 
   6454 
   6455 
   6456 /* Handle BSF/BSR.  Only v-size seems necessary. */
   6457 static
   6458 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
   6459 {
   6460    Bool   isReg;
   6461    UChar  modrm;
   6462    HChar  dis_buf[50];
   6463 
   6464    IRType ty  = szToITy(sz);
   6465    IRTemp src = newTemp(ty);
   6466    IRTemp dst = newTemp(ty);
   6467 
   6468    IRTemp src32 = newTemp(Ity_I32);
   6469    IRTemp dst32 = newTemp(Ity_I32);
   6470    IRTemp srcB  = newTemp(Ity_I1);
   6471 
   6472    vassert(sz == 4 || sz == 2);
   6473 
   6474    modrm = getIByte(delta);
   6475 
   6476    isReg = epartIsReg(modrm);
   6477    if (isReg) {
   6478       delta++;
   6479       assign( src, getIReg(sz, eregOfRM(modrm)) );
   6480    } else {
   6481       Int    len;
   6482       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   6483       delta += len;
   6484       assign( src, loadLE(ty, mkexpr(addr)) );
   6485    }
   6486 
   6487    DIP("bs%c%c %s, %s\n",
   6488        fwds ? 'f' : 'r', nameISize(sz),
   6489        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
   6490        nameIReg(sz, gregOfRM(modrm)));
   6491 
   6492    /* Generate a bool expression which is zero iff the original is
   6493       zero, and nonzero otherwise.  Ask for a CmpNE version which, if
   6494       instrumented by Memcheck, is instrumented expensively, since
   6495       this may be used on the output of a preceding movmskb insn,
   6496       which has been known to be partially defined, and in need of
   6497       careful handling. */
   6498    assign( srcB, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
   6499                        mkexpr(src), mkU(ty,0)) );
   6500 
   6501    /* Flags: Z is 1 iff source value is zero.  All others
   6502       are undefined -- we force them to zero. */
   6503    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6504    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6505    stmt( IRStmt_Put(
   6506             OFFB_CC_DEP1,
   6507             IRExpr_ITE( mkexpr(srcB),
   6508                         /* src!=0 */
   6509                         mkU32(0),
   6510                         /* src==0 */
   6511                         mkU32(X86G_CC_MASK_Z)
   6512                         )
   6513        ));
   6514    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6515       elimination of previous stores to this field work better. */
   6516    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6517 
   6518    /* Result: iff source value is zero, we can't use
   6519       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
   6520       But anyway, Intel x86 semantics say the result is undefined in
   6521       such situations.  Hence handle the zero case specially. */
   6522 
   6523    /* Bleh.  What we compute:
   6524 
   6525           bsf32:  if src == 0 then 0 else  Ctz32(src)
   6526           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
   6527 
   6528           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
   6529           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
   6530 
   6531       First, widen src to 32 bits if it is not already.
   6532 
   6533       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
   6534       dst register unchanged when src == 0.  Hence change accordingly.
   6535    */
   6536    if (sz == 2)
   6537       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   6538    else
   6539       assign( src32, mkexpr(src) );
   6540 
   6541    /* The main computation, guarding against zero. */
   6542    assign( dst32,
   6543            IRExpr_ITE(
   6544               mkexpr(srcB),
   6545               /* src != 0 */
   6546               fwds ? unop(Iop_Ctz32, mkexpr(src32))
   6547                    : binop(Iop_Sub32,
   6548                            mkU32(31),
   6549                            unop(Iop_Clz32, mkexpr(src32))),
   6550               /* src == 0 -- leave dst unchanged */
   6551               widenUto32( getIReg( sz, gregOfRM(modrm) ) )
   6552            )
   6553          );
   6554 
   6555    if (sz == 2)
   6556       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   6557    else
   6558       assign( dst, mkexpr(dst32) );
   6559 
   6560    /* dump result back */
   6561    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
   6562 
   6563    return delta;
   6564 }
   6565 
   6566 
   6567 static
   6568 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
   6569 {
   6570    IRType ty = szToITy(sz);
   6571    IRTemp t1 = newTemp(ty);
   6572    IRTemp t2 = newTemp(ty);
   6573    vassert(sz == 2 || sz == 4);
   6574    assign( t1, getIReg(sz, R_EAX) );
   6575    assign( t2, getIReg(sz, reg) );
   6576    putIReg( sz, R_EAX, mkexpr(t2) );
   6577    putIReg( sz, reg, mkexpr(t1) );
   6578    DIP("xchg%c %s, %s\n",
   6579        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
   6580 }
   6581 
   6582 
   6583 static
   6584 void codegen_SAHF ( void )
   6585 {
   6586    /* Set the flags to:
   6587       (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
   6588       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6589                 |X86G_CC_MASK_P|X86G_CC_MASK_C)
   6590    */
   6591    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6592                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6593    IRTemp oldflags   = newTemp(Ity_I32);
   6594    assign( oldflags, mk_x86g_calculate_eflags_all() );
   6595    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6596    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6597    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6598    stmt( IRStmt_Put( OFFB_CC_DEP1,
   6599          binop(Iop_Or32,
   6600                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
   6601                binop(Iop_And32,
   6602                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
   6603                      mkU32(mask_SZACP))
   6604               )
   6605    ));
   6606    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6607       elimination of previous stores to this field work better. */
   6608    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6609 }
   6610 
   6611 
   6612 static
   6613 void codegen_LAHF ( void  )
   6614 {
   6615    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   6616    IRExpr* eax_with_hole;
   6617    IRExpr* new_byte;
   6618    IRExpr* new_eax;
   6619    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6620                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6621 
   6622    IRTemp  flags = newTemp(Ity_I32);
   6623    assign( flags, mk_x86g_calculate_eflags_all() );
   6624 
   6625    eax_with_hole
   6626       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
   6627    new_byte
   6628       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
   6629                         mkU32(1<<1));
   6630    new_eax
   6631       = binop(Iop_Or32, eax_with_hole,
   6632                         binop(Iop_Shl32, new_byte, mkU8(8)));
   6633    putIReg(4, R_EAX, new_eax);
   6634 }
   6635 
   6636 
   6637 static
   6638 UInt dis_cmpxchg_G_E ( UChar       sorb,
   6639                        Bool        locked,
   6640                        Int         size,
   6641                        Int         delta0 )
   6642 {
   6643    HChar dis_buf[50];
   6644    Int   len;
   6645 
   6646    IRType ty    = szToITy(size);
   6647    IRTemp acc   = newTemp(ty);
   6648    IRTemp src   = newTemp(ty);
   6649    IRTemp dest  = newTemp(ty);
   6650    IRTemp dest2 = newTemp(ty);
   6651    IRTemp acc2  = newTemp(ty);
   6652    IRTemp cond  = newTemp(Ity_I1);
   6653    IRTemp addr  = IRTemp_INVALID;
   6654    UChar  rm    = getUChar(delta0);
   6655 
   6656    /* There are 3 cases to consider:
   6657 
   6658       reg-reg: ignore any lock prefix, generate sequence based
   6659                on ITE
   6660 
   6661       reg-mem, not locked: ignore any lock prefix, generate sequence
   6662                            based on ITE
   6663 
   6664       reg-mem, locked: use IRCAS
   6665    */
   6666    if (epartIsReg(rm)) {
   6667       /* case 1 */
   6668       assign( dest, getIReg(size, eregOfRM(rm)) );
   6669       delta0++;
   6670       assign( src, getIReg(size, gregOfRM(rm)) );
   6671       assign( acc, getIReg(size, R_EAX) );
   6672       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6673       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6674       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   6675       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6676       putIReg(size, R_EAX, mkexpr(acc2));
   6677       putIReg(size, eregOfRM(rm), mkexpr(dest2));
   6678       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6679                                nameIReg(size,gregOfRM(rm)),
   6680                                nameIReg(size,eregOfRM(rm)) );
   6681    }
   6682    else if (!epartIsReg(rm) && !locked) {
   6683       /* case 2 */
   6684       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6685       assign( dest, loadLE(ty, mkexpr(addr)) );
   6686       delta0 += len;
   6687       assign( src, getIReg(size, gregOfRM(rm)) );
   6688       assign( acc, getIReg(size, R_EAX) );
   6689       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6690       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6691       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   6692       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6693       putIReg(size, R_EAX, mkexpr(acc2));
   6694       storeLE( mkexpr(addr), mkexpr(dest2) );
   6695       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6696                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6697    }
   6698    else if (!epartIsReg(rm) && locked) {
   6699       /* case 3 */
   6700       /* src is new value.  acc is expected value.  dest is old value.
   6701          Compute success from the output of the IRCAS, and steer the
   6702          new value for EAX accordingly: in case of success, EAX is
   6703          unchanged. */
   6704       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6705       delta0 += len;
   6706       assign( src, getIReg(size, gregOfRM(rm)) );
   6707       assign( acc, getIReg(size, R_EAX) );
   6708       stmt( IRStmt_CAS(
   6709          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   6710                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   6711       ));
   6712       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6713       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6714       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6715       putIReg(size, R_EAX, mkexpr(acc2));
   6716       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6717                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6718    }
   6719    else vassert(0);
   6720 
   6721    return delta0;
   6722 }
   6723 
   6724 
   6725 /* Handle conditional move instructions of the form
   6726       cmovcc E(reg-or-mem), G(reg)
   6727 
   6728    E(src) is reg-or-mem
   6729    G(dst) is reg.
   6730 
   6731    If E is reg, -->    GET %E, tmps
   6732                        GET %G, tmpd
   6733                        CMOVcc tmps, tmpd
   6734                        PUT tmpd, %G
   6735 
   6736    If E is mem  -->    (getAddr E) -> tmpa
   6737                        LD (tmpa), tmps
   6738                        GET %G, tmpd
   6739                        CMOVcc tmps, tmpd
   6740                        PUT tmpd, %G
   6741 */
   6742 static
   6743 UInt dis_cmov_E_G ( UChar       sorb,
   6744                     Int         sz,
   6745                     X86Condcode cond,
   6746                     Int         delta0 )
   6747 {
   6748    UChar rm  = getIByte(delta0);
   6749    HChar dis_buf[50];
   6750    Int   len;
   6751 
   6752    IRType ty   = szToITy(sz);
   6753    IRTemp tmps = newTemp(ty);
   6754    IRTemp tmpd = newTemp(ty);
   6755 
   6756    if (epartIsReg(rm)) {
   6757       assign( tmps, getIReg(sz, eregOfRM(rm)) );
   6758       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6759 
   6760       putIReg(sz, gregOfRM(rm),
   6761                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
   6762                               mkexpr(tmps),
   6763                               mkexpr(tmpd) )
   6764              );
   6765       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6766                               name_X86Condcode(cond),
   6767                               nameIReg(sz,eregOfRM(rm)),
   6768                               nameIReg(sz,gregOfRM(rm)));
   6769       return 1+delta0;
   6770    }
   6771 
   6772    /* E refers to memory */
   6773    {
   6774       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6775       assign( tmps, loadLE(ty, mkexpr(addr)) );
   6776       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6777 
   6778       putIReg(sz, gregOfRM(rm),
   6779                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
   6780                               mkexpr(tmps),
   6781                               mkexpr(tmpd) )
   6782              );
   6783 
   6784       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6785                               name_X86Condcode(cond),
   6786                               dis_buf,
   6787                               nameIReg(sz,gregOfRM(rm)));
   6788       return len+delta0;
   6789    }
   6790 }
   6791 
   6792 
   6793 static
   6794 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
   6795                     Bool* decodeOK )
   6796 {
   6797    Int   len;
   6798    UChar rm = getIByte(delta0);
   6799    HChar dis_buf[50];
   6800 
   6801    IRType ty    = szToITy(sz);
   6802    IRTemp tmpd  = newTemp(ty);
   6803    IRTemp tmpt0 = newTemp(ty);
   6804    IRTemp tmpt1 = newTemp(ty);
   6805 
   6806    /* There are 3 cases to consider:
   6807 
   6808       reg-reg: ignore any lock prefix,
   6809                generate 'naive' (non-atomic) sequence
   6810 
   6811       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   6812                            (non-atomic) sequence
   6813 
   6814       reg-mem, locked: use IRCAS
   6815    */
   6816 
   6817    if (epartIsReg(rm)) {
   6818       /* case 1 */
   6819       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
   6820       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6821       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6822                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6823       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6824       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
   6825       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6826       DIP("xadd%c %s, %s\n",
   6827           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
   6828           				 nameIReg(sz,eregOfRM(rm)));
   6829       *decodeOK = True;
   6830       return 1+delta0;
   6831    }
   6832    else if (!epartIsReg(rm) && !locked) {
   6833       /* case 2 */
   6834       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6835       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6836       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6837       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6838                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6839       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   6840       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6841       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6842       DIP("xadd%c %s, %s\n",
   6843           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6844       *decodeOK = True;
   6845       return len+delta0;
   6846    }
   6847    else if (!epartIsReg(rm) && locked) {
   6848       /* case 3 */
   6849       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6850       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6851       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6852       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6853                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6854       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   6855                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
   6856       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6857       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6858       DIP("xadd%c %s, %s\n",
   6859           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6860       *decodeOK = True;
   6861       return len+delta0;
   6862    }
   6863    /*UNREACHED*/
   6864    vassert(0);
   6865 }
   6866 
   6867 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   6868 
   6869 static
   6870 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
   6871 {
   6872    Int    len;
   6873    IRTemp addr;
   6874    UChar  rm  = getIByte(delta0);
   6875    HChar  dis_buf[50];
   6876 
   6877    if (epartIsReg(rm)) {
   6878       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   6879       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   6880       return 1+delta0;
   6881    } else {
   6882       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6883       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   6884       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   6885       return len+delta0;
   6886    }
   6887 }
   6888 
   6889 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   6890    dst is ireg and sz==4, zero out top half of it.  */
   6891 
   6892 static
   6893 UInt dis_mov_Sw_Ew ( UChar sorb,
   6894                      Int   sz,
   6895                      Int   delta0 )
   6896 {
   6897    Int    len;
   6898    IRTemp addr;
   6899    UChar  rm  = getIByte(delta0);
   6900    HChar  dis_buf[50];
   6901 
   6902    vassert(sz == 2 || sz == 4);
   6903 
   6904    if (epartIsReg(rm)) {
   6905       if (sz == 4)
   6906          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   6907       else
   6908          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   6909 
   6910       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   6911       return 1+delta0;
   6912    } else {
   6913       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6914       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   6915       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   6916       return len+delta0;
   6917    }
   6918 }
   6919 
   6920 
   6921 static
   6922 void dis_push_segreg ( UInt sreg, Int sz )
   6923 {
   6924     IRTemp t1 = newTemp(Ity_I16);
   6925     IRTemp ta = newTemp(Ity_I32);
   6926     vassert(sz == 2 || sz == 4);
   6927 
   6928     assign( t1, getSReg(sreg) );
   6929     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   6930     putIReg(4, R_ESP, mkexpr(ta));
   6931     storeLE( mkexpr(ta), mkexpr(t1) );
   6932 
   6933     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6934 }
   6935 
   6936 static
   6937 void dis_pop_segreg ( UInt sreg, Int sz )
   6938 {
   6939     IRTemp t1 = newTemp(Ity_I16);
   6940     IRTemp ta = newTemp(Ity_I32);
   6941     vassert(sz == 2 || sz == 4);
   6942 
   6943     assign( ta, getIReg(4, R_ESP) );
   6944     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   6945 
   6946     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   6947     putSReg( sreg, mkexpr(t1) );
   6948     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6949 }
   6950 
   6951 static
   6952 void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
   6953 {
   6954    IRTemp t1 = newTemp(Ity_I32);
   6955    IRTemp t2 = newTemp(Ity_I32);
   6956    assign(t1, getIReg(4,R_ESP));
   6957    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   6958    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
   6959    jmp_treg(dres, Ijk_Ret, t2);
   6960    vassert(dres->whatNext == Dis_StopHere);
   6961 }
   6962 
   6963 /*------------------------------------------------------------*/
   6964 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   6965 /*------------------------------------------------------------*/
   6966 
   6967 /* Indicates whether the op requires a rounding-mode argument.  Note
   6968    that this covers only vector floating point arithmetic ops, and
   6969    omits the scalar ones that need rounding modes.  Note also that
   6970    inconsistencies here will get picked up later by the IR sanity
   6971    checker, so this isn't correctness-critical. */
   6972 static Bool requiresRMode ( IROp op )
   6973 {
   6974    switch (op) {
   6975       /* 128 bit ops */
   6976       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   6977       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   6978       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   6979       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   6980          return True;
   6981       default:
   6982          break;
   6983    }
   6984    return False;
   6985 }
   6986 
   6987 
   6988 /* Worker function; do not call directly.
   6989    Handles full width G = G `op` E   and   G = (not G) `op` E.
   6990 */
   6991 
   6992 static UInt dis_SSE_E_to_G_all_wrk (
   6993                UChar sorb, Int delta,
   6994                const HChar* opname, IROp op,
   6995                Bool   invertG
   6996             )
   6997 {
   6998    HChar   dis_buf[50];
   6999    Int     alen;
   7000    IRTemp  addr;
   7001    UChar   rm = getIByte(delta);
   7002    IRExpr* gpart
   7003       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
   7004                 : getXMMReg(gregOfRM(rm));
   7005    if (epartIsReg(rm)) {
   7006       putXMMReg(
   7007          gregOfRM(rm),
   7008          requiresRMode(op)
   7009             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   7010                         gpart,
   7011                         getXMMReg(eregOfRM(rm)))
   7012             : binop(op, gpart,
   7013                         getXMMReg(eregOfRM(rm)))
   7014       );
   7015       DIP("%s %s,%s\n", opname,
   7016                         nameXMMReg(eregOfRM(rm)),
   7017                         nameXMMReg(gregOfRM(rm)) );
   7018       return delta+1;
   7019    } else {
   7020       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7021       putXMMReg(
   7022          gregOfRM(rm),
   7023          requiresRMode(op)
   7024             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   7025                         gpart,
   7026                         loadLE(Ity_V128, mkexpr(addr)))
   7027             : binop(op, gpart,
   7028                         loadLE(Ity_V128, mkexpr(addr)))
   7029       );
   7030       DIP("%s %s,%s\n", opname,
   7031                         dis_buf,
   7032                         nameXMMReg(gregOfRM(rm)) );
   7033       return delta+alen;
   7034    }
   7035 }
   7036 
   7037 
   7038 /* All lanes SSE binary operation, G = G `op` E. */
   7039 
   7040 static
   7041 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, const HChar* opname, IROp op )
   7042 {
   7043    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
   7044 }
   7045 
   7046 /* All lanes SSE binary operation, G = (not G) `op` E. */
   7047 
   7048 static
   7049 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
   7050                                const HChar* opname, IROp op )
   7051 {
   7052    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
   7053 }
   7054 
   7055 
   7056 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   7057 
   7058 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
   7059                                   const HChar* opname, IROp op )
   7060 {
   7061    HChar   dis_buf[50];
   7062    Int     alen;
   7063    IRTemp  addr;
   7064    UChar   rm = getIByte(delta);
   7065    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7066    if (epartIsReg(rm)) {
   7067       putXMMReg( gregOfRM(rm),
   7068                  binop(op, gpart,
   7069                            getXMMReg(eregOfRM(rm))) );
   7070       DIP("%s %s,%s\n", opname,
   7071                         nameXMMReg(eregOfRM(rm)),
   7072                         nameXMMReg(gregOfRM(rm)) );
   7073       return delta+1;
   7074    } else {
   7075       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   7076          E operand needs to be made simply of zeroes. */
   7077       IRTemp epart = newTemp(Ity_V128);
   7078       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7079       assign( epart, unop( Iop_32UtoV128,
   7080                            loadLE(Ity_I32, mkexpr(addr))) );
   7081       putXMMReg( gregOfRM(rm),
   7082                  binop(op, gpart, mkexpr(epart)) );
   7083       DIP("%s %s,%s\n", opname,
   7084                         dis_buf,
   7085                         nameXMMReg(gregOfRM(rm)) );
   7086       return delta+alen;
   7087    }
   7088 }
   7089 
   7090 
   7091 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   7092 
   7093 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
   7094                                   const HChar* opname, IROp op )
   7095 {
   7096    HChar   dis_buf[50];
   7097    Int     alen;
   7098    IRTemp  addr;
   7099    UChar   rm = getIByte(delta);
   7100    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7101    if (epartIsReg(rm)) {
   7102       putXMMReg( gregOfRM(rm),
   7103                  binop(op, gpart,
   7104                            getXMMReg(eregOfRM(rm))) );
   7105       DIP("%s %s,%s\n", opname,
   7106                         nameXMMReg(eregOfRM(rm)),
   7107                         nameXMMReg(gregOfRM(rm)) );
   7108       return delta+1;
   7109    } else {
   7110       /* We can only do a 64-bit memory read, so the upper half of the
   7111          E operand needs to be made simply of zeroes. */
   7112       IRTemp epart = newTemp(Ity_V128);
   7113       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7114       assign( epart, unop( Iop_64UtoV128,
   7115                            loadLE(Ity_I64, mkexpr(addr))) );
   7116       putXMMReg( gregOfRM(rm),
   7117                  binop(op, gpart, mkexpr(epart)) );
   7118       DIP("%s %s,%s\n", opname,
   7119                         dis_buf,
   7120                         nameXMMReg(gregOfRM(rm)) );
   7121       return delta+alen;
   7122    }
   7123 }
   7124 
   7125 
   7126 /* All lanes unary SSE operation, G = op(E). */
   7127 
   7128 static UInt dis_SSE_E_to_G_unary_all (
   7129                UChar sorb, Int delta,
   7130                const HChar* opname, IROp op
   7131             )
   7132 {
   7133    HChar   dis_buf[50];
   7134    Int     alen;
   7135    IRTemp  addr;
   7136    UChar   rm = getIByte(delta);
   7137    if (epartIsReg(rm)) {
   7138       putXMMReg( gregOfRM(rm),
   7139                  unop(op, getXMMReg(eregOfRM(rm))) );
   7140       DIP("%s %s,%s\n", opname,
   7141                         nameXMMReg(eregOfRM(rm)),
   7142                         nameXMMReg(gregOfRM(rm)) );
   7143       return delta+1;
   7144    } else {
   7145       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7146       putXMMReg( gregOfRM(rm),
   7147                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   7148       DIP("%s %s,%s\n", opname,
   7149                         dis_buf,
   7150                         nameXMMReg(gregOfRM(rm)) );
   7151       return delta+alen;
   7152    }
   7153 }
   7154 
   7155 
   7156 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   7157 
   7158 static UInt dis_SSE_E_to_G_unary_lo32 (
   7159                UChar sorb, Int delta,
   7160                const HChar* opname, IROp op
   7161             )
   7162 {
   7163    /* First we need to get the old G value and patch the low 32 bits
   7164       of the E operand into it.  Then apply op and write back to G. */
   7165    HChar   dis_buf[50];
   7166    Int     alen;
   7167    IRTemp  addr;
   7168    UChar   rm = getIByte(delta);
   7169    IRTemp  oldG0 = newTemp(Ity_V128);
   7170    IRTemp  oldG1 = newTemp(Ity_V128);
   7171 
   7172    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7173 
   7174    if (epartIsReg(rm)) {
   7175       assign( oldG1,
   7176               binop( Iop_SetV128lo32,
   7177                      mkexpr(oldG0),
   7178                      getXMMRegLane32(eregOfRM(rm), 0)) );
   7179       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7180       DIP("%s %s,%s\n", opname,
   7181                         nameXMMReg(eregOfRM(rm)),
   7182                         nameXMMReg(gregOfRM(rm)) );
   7183       return delta+1;
   7184    } else {
   7185       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7186       assign( oldG1,
   7187               binop( Iop_SetV128lo32,
   7188                      mkexpr(oldG0),
   7189                      loadLE(Ity_I32, mkexpr(addr)) ));
   7190       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7191       DIP("%s %s,%s\n", opname,
   7192                         dis_buf,
   7193                         nameXMMReg(gregOfRM(rm)) );
   7194       return delta+alen;
   7195    }
   7196 }
   7197 
   7198 
   7199 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   7200 
   7201 static UInt dis_SSE_E_to_G_unary_lo64 (
   7202                UChar sorb, Int delta,
   7203                const HChar* opname, IROp op
   7204             )
   7205 {
   7206    /* First we need to get the old G value and patch the low 64 bits
   7207       of the E operand into it.  Then apply op and write back to G. */
   7208    HChar   dis_buf[50];
   7209    Int     alen;
   7210    IRTemp  addr;
   7211    UChar   rm = getIByte(delta);
   7212    IRTemp  oldG0 = newTemp(Ity_V128);
   7213    IRTemp  oldG1 = newTemp(Ity_V128);
   7214 
   7215    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7216 
   7217    if (epartIsReg(rm)) {
   7218       assign( oldG1,
   7219               binop( Iop_SetV128lo64,
   7220                      mkexpr(oldG0),
   7221                      getXMMRegLane64(eregOfRM(rm), 0)) );
   7222       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7223       DIP("%s %s,%s\n", opname,
   7224                         nameXMMReg(eregOfRM(rm)),
   7225                         nameXMMReg(gregOfRM(rm)) );
   7226       return delta+1;
   7227    } else {
   7228       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7229       assign( oldG1,
   7230               binop( Iop_SetV128lo64,
   7231                      mkexpr(oldG0),
   7232                      loadLE(Ity_I64, mkexpr(addr)) ));
   7233       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7234       DIP("%s %s,%s\n", opname,
   7235                         dis_buf,
   7236                         nameXMMReg(gregOfRM(rm)) );
   7237       return delta+alen;
   7238    }
   7239 }
   7240 
   7241 
   7242 /* SSE integer binary operation:
   7243       G = G `op` E   (eLeft == False)
   7244       G = E `op` G   (eLeft == True)
   7245 */
   7246 static UInt dis_SSEint_E_to_G(
   7247                UChar sorb, Int delta,
   7248                const HChar* opname, IROp op,
   7249                Bool   eLeft
   7250             )
   7251 {
   7252    HChar   dis_buf[50];
   7253    Int     alen;
   7254    IRTemp  addr;
   7255    UChar   rm = getIByte(delta);
   7256    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7257    IRExpr* epart = NULL;
   7258    if (epartIsReg(rm)) {
   7259       epart = getXMMReg(eregOfRM(rm));
   7260       DIP("%s %s,%s\n", opname,
   7261                         nameXMMReg(eregOfRM(rm)),
   7262                         nameXMMReg(gregOfRM(rm)) );
   7263       delta += 1;
   7264    } else {
   7265       addr  = disAMode ( &alen, sorb, delta, dis_buf );
   7266       epart = loadLE(Ity_V128, mkexpr(addr));
   7267       DIP("%s %s,%s\n", opname,
   7268                         dis_buf,
   7269                         nameXMMReg(gregOfRM(rm)) );
   7270       delta += alen;
   7271    }
   7272    putXMMReg( gregOfRM(rm),
   7273               eLeft ? binop(op, epart, gpart)
   7274 	            : binop(op, gpart, epart) );
   7275    return delta;
   7276 }
   7277 
   7278 
   7279 /* Helper for doing SSE FP comparisons. */
   7280 
   7281 static void findSSECmpOp ( Bool* needNot, IROp* op,
   7282                            Int imm8, Bool all_lanes, Int sz )
   7283 {
   7284    imm8 &= 7;
   7285    *needNot = False;
   7286    *op      = Iop_INVALID;
   7287    if (imm8 >= 4) {
   7288       *needNot = True;
   7289       imm8 -= 4;
   7290    }
   7291 
   7292    if (sz == 4 && all_lanes) {
   7293       switch (imm8) {
   7294          case 0: *op = Iop_CmpEQ32Fx4; return;
   7295          case 1: *op = Iop_CmpLT32Fx4; return;
   7296          case 2: *op = Iop_CmpLE32Fx4; return;
   7297          case 3: *op = Iop_CmpUN32Fx4; return;
   7298          default: break;
   7299       }
   7300    }
   7301    if (sz == 4 && !all_lanes) {
   7302       switch (imm8) {
   7303          case 0: *op = Iop_CmpEQ32F0x4; return;
   7304          case 1: *op = Iop_CmpLT32F0x4; return;
   7305          case 2: *op = Iop_CmpLE32F0x4; return;
   7306          case 3: *op = Iop_CmpUN32F0x4; return;
   7307          default: break;
   7308       }
   7309    }
   7310    if (sz == 8 && all_lanes) {
   7311       switch (imm8) {
   7312          case 0: *op = Iop_CmpEQ64Fx2; return;
   7313          case 1: *op = Iop_CmpLT64Fx2; return;
   7314          case 2: *op = Iop_CmpLE64Fx2; return;
   7315          case 3: *op = Iop_CmpUN64Fx2; return;
   7316          default: break;
   7317       }
   7318    }
   7319    if (sz == 8 && !all_lanes) {
   7320       switch (imm8) {
   7321          case 0: *op = Iop_CmpEQ64F0x2; return;
   7322          case 1: *op = Iop_CmpLT64F0x2; return;
   7323          case 2: *op = Iop_CmpLE64F0x2; return;
   7324          case 3: *op = Iop_CmpUN64F0x2; return;
   7325          default: break;
   7326       }
   7327    }
   7328    vpanic("findSSECmpOp(x86,guest)");
   7329 }
   7330 
   7331 /* Handles SSE 32F/64F comparisons. */
   7332 
   7333 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
   7334 				const HChar* opname, Bool all_lanes, Int sz )
   7335 {
   7336    HChar   dis_buf[50];
   7337    Int     alen, imm8;
   7338    IRTemp  addr;
   7339    Bool    needNot = False;
   7340    IROp    op      = Iop_INVALID;
   7341    IRTemp  plain   = newTemp(Ity_V128);
   7342    UChar   rm      = getIByte(delta);
   7343    UShort  mask    = 0;
   7344    vassert(sz == 4 || sz == 8);
   7345    if (epartIsReg(rm)) {
   7346       imm8 = getIByte(delta+1);
   7347       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7348       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
   7349                                getXMMReg(eregOfRM(rm))) );
   7350       delta += 2;
   7351       DIP("%s $%d,%s,%s\n", opname,
   7352                             (Int)imm8,
   7353                             nameXMMReg(eregOfRM(rm)),
   7354                             nameXMMReg(gregOfRM(rm)) );
   7355    } else {
   7356       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7357       imm8 = getIByte(delta+alen);
   7358       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7359       assign( plain,
   7360               binop(
   7361                  op,
   7362                  getXMMReg(gregOfRM(rm)),
   7363                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   7364                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   7365                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   7366              )
   7367       );
   7368       delta += alen+1;
   7369       DIP("%s $%d,%s,%s\n", opname,
   7370                             (Int)imm8,
   7371                             dis_buf,
   7372                             nameXMMReg(gregOfRM(rm)) );
   7373    }
   7374 
   7375    if (needNot && all_lanes) {
   7376       putXMMReg( gregOfRM(rm),
   7377                  unop(Iop_NotV128, mkexpr(plain)) );
   7378    }
   7379    else
   7380    if (needNot && !all_lanes) {
   7381       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
   7382       putXMMReg( gregOfRM(rm),
   7383                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   7384    }
   7385    else {
   7386       putXMMReg( gregOfRM(rm), mkexpr(plain) );
   7387    }
   7388 
   7389    return delta;
   7390 }
   7391 
   7392 
   7393 /* Vector by scalar shift of G by the amount specified at the bottom
   7394    of E. */
   7395 
   7396 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
   7397                                  const HChar* opname, IROp op )
   7398 {
   7399    HChar   dis_buf[50];
   7400    Int     alen, size;
   7401    IRTemp  addr;
   7402    Bool    shl, shr, sar;
   7403    UChar   rm   = getIByte(delta);
   7404    IRTemp  g0   = newTemp(Ity_V128);
   7405    IRTemp  g1   = newTemp(Ity_V128);
   7406    IRTemp  amt  = newTemp(Ity_I32);
   7407    IRTemp  amt8 = newTemp(Ity_I8);
   7408    if (epartIsReg(rm)) {
   7409       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
   7410       DIP("%s %s,%s\n", opname,
   7411                         nameXMMReg(eregOfRM(rm)),
   7412                         nameXMMReg(gregOfRM(rm)) );
   7413       delta++;
   7414    } else {
   7415       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7416       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   7417       DIP("%s %s,%s\n", opname,
   7418                         dis_buf,
   7419                         nameXMMReg(gregOfRM(rm)) );
   7420       delta += alen;
   7421    }
   7422    assign( g0,   getXMMReg(gregOfRM(rm)) );
   7423    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   7424 
   7425    shl = shr = sar = False;
   7426    size = 0;
   7427    switch (op) {
   7428       case Iop_ShlN16x8: shl = True; size = 32; break;
   7429       case Iop_ShlN32x4: shl = True; size = 32; break;
   7430       case Iop_ShlN64x2: shl = True; size = 64; break;
   7431       case Iop_SarN16x8: sar = True; size = 16; break;
   7432       case Iop_SarN32x4: sar = True; size = 32; break;
   7433       case Iop_ShrN16x8: shr = True; size = 16; break;
   7434       case Iop_ShrN32x4: shr = True; size = 32; break;
   7435       case Iop_ShrN64x2: shr = True; size = 64; break;
   7436       default: vassert(0);
   7437    }
   7438 
   7439    if (shl || shr) {
   7440      assign(
   7441         g1,
   7442         IRExpr_ITE(
   7443            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   7444            binop(op, mkexpr(g0), mkexpr(amt8)),
   7445            mkV128(0x0000)
   7446         )
   7447      );
   7448    } else
   7449    if (sar) {
   7450      assign(
   7451         g1,
   7452         IRExpr_ITE(
   7453            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   7454            binop(op, mkexpr(g0), mkexpr(amt8)),
   7455            binop(op, mkexpr(g0), mkU8(size-1))
   7456         )
   7457      );
   7458    } else {
   7459       /*NOTREACHED*/
   7460       vassert(0);
   7461    }
   7462 
   7463    putXMMReg( gregOfRM(rm), mkexpr(g1) );
   7464    return delta;
   7465 }
   7466 
   7467 
   7468 /* Vector by scalar shift of E by an immediate byte. */
   7469 
   7470 static
   7471 UInt dis_SSE_shiftE_imm ( Int delta, const HChar* opname, IROp op )
   7472 {
   7473    Bool    shl, shr, sar;
   7474    UChar   rm   = getIByte(delta);
   7475    IRTemp  e0   = newTemp(Ity_V128);
   7476    IRTemp  e1   = newTemp(Ity_V128);
   7477    UChar   amt, size;
   7478    vassert(epartIsReg(rm));
   7479    vassert(gregOfRM(rm) == 2
   7480            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   7481    amt = getIByte(delta+1);
   7482    delta += 2;
   7483    DIP("%s $%d,%s\n", opname,
   7484                       (Int)amt,
   7485                       nameXMMReg(eregOfRM(rm)) );
   7486    assign( e0, getXMMReg(eregOfRM(rm)) );
   7487 
   7488    shl = shr = sar = False;
   7489    size = 0;
   7490    switch (op) {
   7491       case Iop_ShlN16x8: shl = True; size = 16; break;
   7492       case Iop_ShlN32x4: shl = True; size = 32; break;
   7493       case Iop_ShlN64x2: shl = True; size = 64; break;
   7494       case Iop_SarN16x8: sar = True; size = 16; break;
   7495       case Iop_SarN32x4: sar = True; size = 32; break;
   7496       case Iop_ShrN16x8: shr = True; size = 16; break;
   7497       case Iop_ShrN32x4: shr = True; size = 32; break;
   7498       case Iop_ShrN64x2: shr = True; size = 64; break;
   7499       default: vassert(0);
   7500    }
   7501 
   7502    if (shl || shr) {
   7503       assign( e1, amt >= size
   7504                      ? mkV128(0x0000)
   7505                      : binop(op, mkexpr(e0), mkU8(amt))
   7506       );
   7507    } else
   7508    if (sar) {
   7509       assign( e1, amt >= size
   7510                      ? binop(op, mkexpr(e0), mkU8(size-1))
   7511                      : binop(op, mkexpr(e0), mkU8(amt))
   7512       );
   7513    } else {
   7514       /*NOTREACHED*/
   7515       vassert(0);
   7516    }
   7517 
   7518    putXMMReg( eregOfRM(rm), mkexpr(e1) );
   7519    return delta;
   7520 }
   7521 
   7522 
   7523 /* Get the current SSE rounding mode. */
   7524 
   7525 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   7526 {
   7527    return binop( Iop_And32,
   7528                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
   7529                  mkU32(3) );
   7530 }
   7531 
   7532 static void put_sse_roundingmode ( IRExpr* sseround )
   7533 {
   7534    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   7535    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
   7536 }
   7537 
   7538 /* Break a 128-bit value up into four 32-bit ints. */
   7539 
   7540 static void breakup128to32s ( IRTemp t128,
   7541 			      /*OUTs*/
   7542                               IRTemp* t3, IRTemp* t2,
   7543                               IRTemp* t1, IRTemp* t0 )
   7544 {
   7545    IRTemp hi64 = newTemp(Ity_I64);
   7546    IRTemp lo64 = newTemp(Ity_I64);
   7547    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   7548    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   7549 
   7550    vassert(t0 && *t0 == IRTemp_INVALID);
   7551    vassert(t1 && *t1 == IRTemp_INVALID);
   7552    vassert(t2 && *t2 == IRTemp_INVALID);
   7553    vassert(t3 && *t3 == IRTemp_INVALID);
   7554 
   7555    *t0 = newTemp(Ity_I32);
   7556    *t1 = newTemp(Ity_I32);
   7557    *t2 = newTemp(Ity_I32);
   7558    *t3 = newTemp(Ity_I32);
   7559    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   7560    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   7561    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   7562    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   7563 }
   7564 
   7565 /* Construct a 128-bit value from four 32-bit ints. */
   7566 
   7567 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   7568                               IRTemp t1, IRTemp t0 )
   7569 {
   7570    return
   7571       binop( Iop_64HLtoV128,
   7572              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   7573              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   7574    );
   7575 }
   7576 
   7577 /* Break a 64-bit value up into four 16-bit ints. */
   7578 
   7579 static void breakup64to16s ( IRTemp t64,
   7580                              /*OUTs*/
   7581                              IRTemp* t3, IRTemp* t2,
   7582                              IRTemp* t1, IRTemp* t0 )
   7583 {
   7584    IRTemp hi32 = newTemp(Ity_I32);
   7585    IRTemp lo32 = newTemp(Ity_I32);
   7586    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   7587    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   7588 
   7589    vassert(t0 && *t0 == IRTemp_INVALID);
   7590    vassert(t1 && *t1 == IRTemp_INVALID);
   7591    vassert(t2 && *t2 == IRTemp_INVALID);
   7592    vassert(t3 && *t3 == IRTemp_INVALID);
   7593 
   7594    *t0 = newTemp(Ity_I16);
   7595    *t1 = newTemp(Ity_I16);
   7596    *t2 = newTemp(Ity_I16);
   7597    *t3 = newTemp(Ity_I16);
   7598    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   7599    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   7600    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   7601    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   7602 }
   7603 
   7604 /* Construct a 64-bit value from four 16-bit ints. */
   7605 
   7606 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   7607                              IRTemp t1, IRTemp t0 )
   7608 {
   7609    return
   7610       binop( Iop_32HLto64,
   7611              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   7612              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   7613    );
   7614 }
   7615 
   7616 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
   7617    in the given 32-bit temporary.  The flags that are set are: O S Z A
   7618    C P D ID AC.
   7619 
   7620    In all cases, code to set AC is generated.  However, VEX actually
   7621    ignores the AC value and so can optionally emit an emulation
   7622    warning when it is enabled.  In this routine, an emulation warning
   7623    is only emitted if emit_AC_emwarn is True, in which case
   7624    next_insn_EIP must be correct (this allows for correct code
   7625    generation for popfl/popfw).  If emit_AC_emwarn is False,
   7626    next_insn_EIP is unimportant (this allows for easy if kludgey code
   7627    generation for IRET.) */
   7628 
   7629 static
   7630 void set_EFLAGS_from_value ( IRTemp t1,
   7631                              Bool   emit_AC_emwarn,
   7632                              Addr32 next_insn_EIP )
   7633 {
   7634    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
   7635 
   7636    /* t1 is the flag word.  Mask out everything except OSZACP and set
   7637       the flags thunk to X86G_CC_OP_COPY. */
   7638    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   7639    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   7640    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7641                      binop(Iop_And32,
   7642                            mkexpr(t1),
   7643                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   7644                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
   7645                                   | X86G_CC_MASK_S| X86G_CC_MASK_O )
   7646                           )
   7647                     )
   7648        );
   7649    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   7650       elimination of previous stores to this field work better. */
   7651    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   7652 
   7653    /* Also need to set the D flag, which is held in bit 10 of t1.
   7654       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   7655    stmt( IRStmt_Put(
   7656             OFFB_DFLAG,
   7657             IRExpr_ITE(
   7658                unop(Iop_32to1,
   7659                     binop(Iop_And32,
   7660                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
   7661                           mkU32(1))),
   7662                mkU32(0xFFFFFFFF),
   7663                mkU32(1)))
   7664        );
   7665 
   7666    /* Set the ID flag */
   7667    stmt( IRStmt_Put(
   7668             OFFB_IDFLAG,
   7669             IRExpr_ITE(
   7670                unop(Iop_32to1,
   7671                     binop(Iop_And32,
   7672                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
   7673                           mkU32(1))),
   7674                mkU32(1),
   7675                mkU32(0)))
   7676        );
   7677 
   7678    /* And set the AC flag.  If setting it 1 to, possibly emit an
   7679       emulation warning. */
   7680    stmt( IRStmt_Put(
   7681             OFFB_ACFLAG,
   7682             IRExpr_ITE(
   7683                unop(Iop_32to1,
   7684                     binop(Iop_And32,
   7685                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
   7686                           mkU32(1))),
   7687                mkU32(1),
   7688                mkU32(0)))
   7689        );
   7690 
   7691    if (emit_AC_emwarn) {
   7692       put_emwarn( mkU32(EmWarn_X86_acFlag) );
   7693       stmt(
   7694          IRStmt_Exit(
   7695             binop( Iop_CmpNE32,
   7696                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
   7697                    mkU32(0) ),
   7698             Ijk_EmWarn,
   7699             IRConst_U32( next_insn_EIP ),
   7700             OFFB_EIP
   7701          )
   7702       );
   7703    }
   7704 }
   7705 
   7706 
   7707 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   7708    values (aa,bb), computes, for each of the 4 16-bit lanes:
   7709 
   7710    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   7711 */
   7712 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   7713 {
   7714    IRTemp aa      = newTemp(Ity_I64);
   7715    IRTemp bb      = newTemp(Ity_I64);
   7716    IRTemp aahi32s = newTemp(Ity_I64);
   7717    IRTemp aalo32s = newTemp(Ity_I64);
   7718    IRTemp bbhi32s = newTemp(Ity_I64);
   7719    IRTemp bblo32s = newTemp(Ity_I64);
   7720    IRTemp rHi     = newTemp(Ity_I64);
   7721    IRTemp rLo     = newTemp(Ity_I64);
   7722    IRTemp one32x2 = newTemp(Ity_I64);
   7723    assign(aa, aax);
   7724    assign(bb, bbx);
   7725    assign( aahi32s,
   7726            binop(Iop_SarN32x2,
   7727                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   7728                  mkU8(16) ));
   7729    assign( aalo32s,
   7730            binop(Iop_SarN32x2,
   7731                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   7732                  mkU8(16) ));
   7733    assign( bbhi32s,
   7734            binop(Iop_SarN32x2,
   7735                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   7736                  mkU8(16) ));
   7737    assign( bblo32s,
   7738            binop(Iop_SarN32x2,
   7739                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   7740                  mkU8(16) ));
   7741    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   7742    assign(
   7743       rHi,
   7744       binop(
   7745          Iop_ShrN32x2,
   7746          binop(
   7747             Iop_Add32x2,
   7748             binop(
   7749                Iop_ShrN32x2,
   7750                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   7751                mkU8(14)
   7752             ),
   7753             mkexpr(one32x2)
   7754          ),
   7755          mkU8(1)
   7756       )
   7757    );
   7758    assign(
   7759       rLo,
   7760       binop(
   7761          Iop_ShrN32x2,
   7762          binop(
   7763             Iop_Add32x2,
   7764             binop(
   7765                Iop_ShrN32x2,
   7766                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   7767                mkU8(14)
   7768             ),
   7769             mkexpr(one32x2)
   7770          ),
   7771          mkU8(1)
   7772       )
   7773    );
   7774    return
   7775       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   7776 }
   7777 
   7778 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   7779    values (aa,bb), computes, for each lane:
   7780 
   7781           if aa_lane < 0 then - bb_lane
   7782      else if aa_lane > 0 then bb_lane
   7783      else 0
   7784 */
   7785 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   7786 {
   7787    IRTemp aa       = newTemp(Ity_I64);
   7788    IRTemp bb       = newTemp(Ity_I64);
   7789    IRTemp zero     = newTemp(Ity_I64);
   7790    IRTemp bbNeg    = newTemp(Ity_I64);
   7791    IRTemp negMask  = newTemp(Ity_I64);
   7792    IRTemp posMask  = newTemp(Ity_I64);
   7793    IROp   opSub    = Iop_INVALID;
   7794    IROp   opCmpGTS = Iop_INVALID;
   7795 
   7796    switch (laneszB) {
   7797       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   7798       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   7799       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   7800       default: vassert(0);
   7801    }
   7802 
   7803    assign( aa,      aax );
   7804    assign( bb,      bbx );
   7805    assign( zero,    mkU64(0) );
   7806    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   7807    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   7808    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   7809 
   7810    return
   7811       binop(Iop_Or64,
   7812             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   7813             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   7814 
   7815 }
   7816 
   7817 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   7818    value aa, computes, for each lane
   7819 
   7820    if aa < 0 then -aa else aa
   7821 
   7822    Note that the result is interpreted as unsigned, so that the
   7823    absolute value of the most negative signed input can be
   7824    represented.
   7825 */
   7826 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   7827 {
   7828    IRTemp aa      = newTemp(Ity_I64);
   7829    IRTemp zero    = newTemp(Ity_I64);
   7830    IRTemp aaNeg   = newTemp(Ity_I64);
   7831    IRTemp negMask = newTemp(Ity_I64);
   7832    IRTemp posMask = newTemp(Ity_I64);
   7833    IROp   opSub   = Iop_INVALID;
   7834    IROp   opSarN  = Iop_INVALID;
   7835 
   7836    switch (laneszB) {
   7837       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   7838       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   7839       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   7840       default: vassert(0);
   7841    }
   7842 
   7843    assign( aa,      aax );
   7844    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   7845    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   7846    assign( zero,    mkU64(0) );
   7847    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   7848    return
   7849       binop(Iop_Or64,
   7850             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   7851             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   7852 }
   7853 
   7854 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   7855                                         IRTemp lo64, Int byteShift )
   7856 {
   7857    vassert(byteShift >= 1 && byteShift <= 7);
   7858    return
   7859       binop(Iop_Or64,
   7860             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   7861             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   7862       );
   7863 }
   7864 
   7865 /* Generate a SIGSEGV followed by a restart of the current instruction
   7866    if effective_addr is not 16-aligned.  This is required behaviour
   7867    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   7868    This assumes that guest_RIP_curr_instr is set correctly! */
   7869 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
   7870 {
   7871    stmt(
   7872       IRStmt_Exit(
   7873          binop(Iop_CmpNE32,
   7874                binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
   7875                mkU32(0)),
   7876          Ijk_SigSEGV,
   7877          IRConst_U32(guest_EIP_curr_instr),
   7878          OFFB_EIP
   7879       )
   7880    );
   7881 }
   7882 
   7883 
   7884 /* Helper for deciding whether a given insn (starting at the opcode
   7885    byte) may validly be used with a LOCK prefix.  The following insns
   7886    may be used with LOCK when their destination operand is in memory.
   7887    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   7888 
   7889    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   7890    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   7891    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   7892    SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   7893    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   7894    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   7895    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   7896 
   7897    DEC        FE /1,  FF /1
   7898    INC        FE /0,  FF /0
   7899 
   7900    NEG        F6 /3,  F7 /3
   7901    NOT        F6 /2,  F7 /2
   7902 
   7903    XCHG       86, 87
   7904 
   7905    BTC        0F BB,  0F BA /7
   7906    BTR        0F B3,  0F BA /6
   7907    BTS        0F AB,  0F BA /5
   7908 
   7909    CMPXCHG    0F B0,  0F B1
   7910    CMPXCHG8B  0F C7 /1
   7911 
   7912    XADD       0F C0,  0F C1
   7913 
   7914    ------------------------------
   7915 
   7916    80 /0  =  addb $imm8,  rm8
   7917    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   7918    82 /0  =  addb $imm8,  rm8
   7919    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   7920 
   7921    00     =  addb r8,  rm8
   7922    01     =  addl r32, rm32  and  addw r16, rm16
   7923 
   7924    Same for ADD OR ADC SBB AND SUB XOR
   7925 
   7926    FE /1  = dec rm8
   7927    FF /1  = dec rm32  and  dec rm16
   7928 
   7929    FE /0  = inc rm8
   7930    FF /0  = inc rm32  and  inc rm16
   7931 
   7932    F6 /3  = neg rm8
   7933    F7 /3  = neg rm32  and  neg rm16
   7934 
   7935    F6 /2  = not rm8
   7936    F7 /2  = not rm32  and  not rm16
   7937 
   7938    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   7939    0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   7940 
   7941    Same for BTS, BTR
   7942 */
   7943 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   7944 {
   7945    switch (opc[0]) {
   7946       case 0x00: case 0x01: case 0x08: case 0x09:
   7947       case 0x10: case 0x11: case 0x18: case 0x19:
   7948       case 0x20: case 0x21: case 0x28: case 0x29:
   7949       case 0x30: case 0x31:
   7950          if (!epartIsReg(opc[1]))
   7951             return True;
   7952          break;
   7953 
   7954       case 0x80: case 0x81: case 0x82: case 0x83:
   7955          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
   7956              && !epartIsReg(opc[1]))
   7957             return True;
   7958          break;
   7959 
   7960       case 0xFE: case 0xFF:
   7961          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
   7962              && !epartIsReg(opc[1]))
   7963             return True;
   7964          break;
   7965 
   7966       case 0xF6: case 0xF7:
   7967          if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
   7968              && !epartIsReg(opc[1]))
   7969             return True;
   7970          break;
   7971 
   7972       case 0x86: case 0x87:
   7973          if (!epartIsReg(opc[1]))
   7974             return True;
   7975          break;
   7976 
   7977       case 0x0F: {
   7978          switch (opc[1]) {
   7979             case 0xBB: case 0xB3: case 0xAB:
   7980                if (!epartIsReg(opc[2]))
   7981                   return True;
   7982                break;
   7983             case 0xBA:
   7984                if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
   7985                    && !epartIsReg(opc[2]))
   7986                   return True;
   7987                break;
   7988             case 0xB0: case 0xB1:
   7989                if (!epartIsReg(opc[2]))
   7990                   return True;
   7991                break;
   7992             case 0xC7:
   7993                if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   7994                   return True;
   7995                break;
   7996             case 0xC0: case 0xC1:
   7997                if (!epartIsReg(opc[2]))
   7998                   return True;
   7999                break;
   8000             default:
   8001                break;
   8002          } /* switch (opc[1]) */
   8003          break;
   8004       }
   8005 
   8006       default:
   8007          break;
   8008    } /* switch (opc[0]) */
   8009 
   8010    return False;
   8011 }
   8012 
   8013 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   8014 {
   8015    IRTemp t2 = newTemp(ty);
   8016    if (ty == Ity_I32) {
   8017       assign( t2,
   8018          binop(
   8019             Iop_Or32,
   8020             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   8021             binop(
   8022                Iop_Or32,
   8023                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   8024                                 mkU32(0x00FF0000)),
   8025                binop(Iop_Or32,
   8026                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   8027                                       mkU32(0x0000FF00)),
   8028                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   8029                                       mkU32(0x000000FF) )
   8030             )))
   8031       );
   8032       return t2;
   8033    }
   8034    if (ty == Ity_I16) {
   8035       assign(t2,
   8036              binop(Iop_Or16,
   8037                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   8038                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   8039       return t2;
   8040    }
   8041    vassert(0);
   8042    /*NOTREACHED*/
   8043    return IRTemp_INVALID;
   8044 }
   8045 
   8046 /*------------------------------------------------------------*/
   8047 /*--- Disassemble a single instruction                     ---*/
   8048 /*------------------------------------------------------------*/
   8049 
   8050 /* Disassemble a single instruction into IR.  The instruction is
   8051    located in host memory at &guest_code[delta].  *expect_CAS is set
   8052    to True if the resulting IR is expected to contain an IRCAS
   8053    statement, and False if it's not expected to.  This makes it
   8054    possible for the caller of disInstr_X86_WRK to check that
   8055    LOCK-prefixed instructions are at least plausibly translated, in
   8056    that it becomes possible to check that a (validly) LOCK-prefixed
   8057    instruction generates a translation containing an IRCAS, and
   8058    instructions without LOCK prefixes don't generate translations
   8059    containing an IRCAS.
   8060 */
   8061 static
   8062 DisResult disInstr_X86_WRK (
   8063              /*OUT*/Bool* expect_CAS,
   8064              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   8065              Bool         resteerCisOk,
   8066              void*        callback_opaque,
   8067              Long         delta64,
   8068              VexArchInfo* archinfo,
   8069              VexAbiInfo*  vbi,
   8070              Bool         sigill_diag
   8071           )
   8072 {
   8073    IRType    ty;
   8074    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   8075    Int       alen;
   8076    UChar     opc, modrm, abyte, pre;
   8077    UInt      d32;
   8078    HChar     dis_buf[50];
   8079    Int       am_sz, d_sz, n_prefixes;
   8080    DisResult dres;
   8081    UChar*    insn; /* used in SSE decoders */
   8082 
   8083    /* The running delta */
   8084    Int delta = (Int)delta64;
   8085 
   8086    /* Holds eip at the start of the insn, so that we can print
   8087       consistent error messages for unimplemented insns. */
   8088    Int delta_start = delta;
   8089 
   8090    /* sz denotes the nominal data-op size of the insn; we change it to
   8091       2 if an 0x66 prefix is seen */
   8092    Int sz = 4;
   8093 
   8094    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
   8095       prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
   8096       indicating the prefix.  */
   8097    UChar sorb = 0;
   8098 
   8099    /* Gets set to True if a LOCK prefix is seen. */
   8100    Bool pfx_lock = False;
   8101 
   8102    /* Set result defaults. */
   8103    dres.whatNext    = Dis_Continue;
   8104    dres.len         = 0;
   8105    dres.continueAt  = 0;
   8106    dres.jk_StopHere = Ijk_INVALID;
   8107 
   8108    *expect_CAS = False;
   8109 
   8110    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   8111 
   8112    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
   8113    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
   8114 
   8115    /* Spot "Special" instructions (see comment at top of file). */
   8116    {
   8117       UChar* code = (UChar*)(guest_code + delta);
   8118       /* Spot the 12-byte preamble:
   8119          C1C703   roll $3,  %edi
   8120          C1C70D   roll $13, %edi
   8121          C1C71D   roll $29, %edi
   8122          C1C713   roll $19, %edi
   8123       */
   8124       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
   8125           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
   8126           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
   8127           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
   8128          /* Got a "Special" instruction preamble.  Which one is it? */
   8129          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
   8130             /* %EDX = client_request ( %EAX ) */
   8131             DIP("%%edx = client_request ( %%eax )\n");
   8132             delta += 14;
   8133             jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
   8134             vassert(dres.whatNext == Dis_StopHere);
   8135             goto decode_success;
   8136          }
   8137          else
   8138          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
   8139             /* %EAX = guest_NRADDR */
   8140             DIP("%%eax = guest_NRADDR\n");
   8141             delta += 14;
   8142             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
   8143             goto decode_success;
   8144          }
   8145          else
   8146          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
   8147             /* call-noredir *%EAX */
   8148             DIP("call-noredir *%%eax\n");
   8149             delta += 14;
   8150             t1 = newTemp(Ity_I32);
   8151             assign(t1, getIReg(4,R_EAX));
   8152             t2 = newTemp(Ity_I32);
   8153             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   8154             putIReg(4, R_ESP, mkexpr(t2));
   8155             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
   8156             jmp_treg(&dres, Ijk_NoRedir, t1);
   8157             vassert(dres.whatNext == Dis_StopHere);
   8158             goto decode_success;
   8159          }
   8160          else
   8161          if (code[12] == 0x87 && code[13] == 0xFF /* xchgl %edi,%edi */) {
   8162             /* IR injection */
   8163             DIP("IR injection\n");
   8164             vex_inject_ir(irsb, Iend_LE);
   8165 
   8166             // Invalidate the current insn. The reason is that the IRop we're
   8167             // injecting here can change. In which case the translation has to
   8168             // be redone. For ease of handling, we simply invalidate all the
   8169             // time.
   8170             stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr)));
   8171             stmt(IRStmt_Put(OFFB_CMLEN,   mkU32(14)));
   8172 
   8173             delta += 14;
   8174 
   8175             stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   8176             dres.whatNext    = Dis_StopHere;
   8177             dres.jk_StopHere = Ijk_InvalICache;
   8178             goto decode_success;
   8179          }
   8180          /* We don't know what it is. */
   8181          goto decode_failure;
   8182          /*NOTREACHED*/
   8183       }
   8184    }
   8185 
   8186    /* Handle a couple of weird-ass NOPs that have been observed in the
   8187       wild. */
   8188    {
   8189       UChar* code = (UChar*)(guest_code + delta);
   8190       /* Sun's JVM 1.5.0 uses the following as a NOP:
   8191          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
   8192       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
   8193           && code[3] == 0x65 && code[4] == 0x90) {
   8194          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
   8195          delta += 5;
   8196          goto decode_success;
   8197       }
   8198       /* Don't barf on recent binutils padding,
   8199          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
   8200          66 2e 0f 1f 84 00 00 00 00 00
   8201          66 66 2e 0f 1f 84 00 00 00 00 00
   8202          66 66 66 2e 0f 1f 84 00 00 00 00 00
   8203          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8204          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8205          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8206       */
   8207       if (code[0] == 0x66) {
   8208          Int data16_cnt;
   8209          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
   8210             if (code[data16_cnt] != 0x66)
   8211                break;
   8212          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
   8213              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
   8214              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
   8215              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
   8216              && code[data16_cnt + 8] == 0x00 ) {
   8217             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
   8218             delta += 9 + data16_cnt;
   8219             goto decode_success;
   8220          }
   8221       }
   8222    }
   8223 
   8224    /* Normal instruction handling starts here. */
   8225 
   8226    /* Deal with some but not all prefixes:
   8227          66(oso)
   8228          F0(lock)
   8229          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
   8230       Not dealt with (left in place):
   8231          F2 F3
   8232    */
   8233    n_prefixes = 0;
   8234    while (True) {
   8235       if (n_prefixes > 7) goto decode_failure;
   8236       pre = getUChar(delta);
   8237       switch (pre) {
   8238          case 0x66:
   8239             sz = 2;
   8240             break;
   8241          case 0xF0:
   8242             pfx_lock = True;
   8243             *expect_CAS = True;
   8244             break;
   8245          case 0x3E: /* %DS: */
   8246          case 0x26: /* %ES: */
   8247          case 0x64: /* %FS: */
   8248          case 0x65: /* %GS: */
   8249             if (sorb != 0)
   8250                goto decode_failure; /* only one seg override allowed */
   8251             sorb = pre;
   8252             break;
   8253          case 0x2E: { /* %CS: */
   8254             /* 2E prefix on a conditional branch instruction is a
   8255                branch-prediction hint, which can safely be ignored.  */
   8256             UChar op1 = getIByte(delta+1);
   8257             UChar op2 = getIByte(delta+2);
   8258             if ((op1 >= 0x70 && op1 <= 0x7F)
   8259                 || (op1 == 0xE3)
   8260                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
   8261                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
   8262             } else {
   8263                /* All other CS override cases are not handled */
   8264                goto decode_failure;
   8265             }
   8266             break;
   8267          }
   8268          case 0x36: /* %SS: */
   8269             /* SS override cases are not handled */
   8270             goto decode_failure;
   8271          default:
   8272             goto not_a_prefix;
   8273       }
   8274       n_prefixes++;
   8275       delta++;
   8276    }
   8277 
   8278    not_a_prefix:
   8279 
   8280    /* Now we should be looking at the primary opcode byte or the
   8281       leading F2 or F3.  Check that any LOCK prefix is actually
   8282       allowed. */
   8283 
   8284    if (pfx_lock) {
   8285       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   8286          DIP("lock ");
   8287       } else {
   8288          *expect_CAS = False;
   8289          goto decode_failure;
   8290       }
   8291    }
   8292 
   8293 
   8294    /* ---------------------------------------------------- */
   8295    /* --- The SSE decoder.                             --- */
   8296    /* ---------------------------------------------------- */
   8297 
   8298    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   8299       previous life? */
   8300 
   8301    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
   8302       later section, further on. */
   8303 
   8304    insn = (UChar*)&guest_code[delta];
   8305 
   8306    /* Treat fxsave specially.  It should be doable even on an SSE0
   8307       (Pentium-II class) CPU.  Hence be prepared to handle it on
   8308       any subarchitecture variant.
   8309    */
   8310 
   8311    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   8312    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8313        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
   8314       IRDirty* d;
   8315       modrm = getIByte(delta+2);
   8316       vassert(sz == 4);
   8317       vassert(!epartIsReg(modrm));
   8318 
   8319       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8320       delta += 2+alen;
   8321       gen_SEGV_if_not_16_aligned(addr);
   8322 
   8323       DIP("fxsave %s\n", dis_buf);
   8324 
   8325       /* Uses dirty helper:
   8326             void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
   8327       d = unsafeIRDirty_0_N (
   8328              0/*regparms*/,
   8329              "x86g_dirtyhelper_FXSAVE",
   8330              &x86g_dirtyhelper_FXSAVE,
   8331              mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   8332           );
   8333 
   8334       /* declare we're writing memory */
   8335       d->mFx   = Ifx_Write;
   8336       d->mAddr = mkexpr(addr);
   8337       d->mSize = 464; /* according to recent Intel docs */
   8338 
   8339       /* declare we're reading guest state */
   8340       d->nFxState = 7;
   8341       vex_bzero(&d->fxState, sizeof(d->fxState));
   8342 
   8343       d->fxState[0].fx     = Ifx_Read;
   8344       d->fxState[0].offset = OFFB_FTOP;
   8345       d->fxState[0].size   = sizeof(UInt);
   8346 
   8347       d->fxState[1].fx     = Ifx_Read;
   8348       d->fxState[1].offset = OFFB_FPREGS;
   8349       d->fxState[1].size   = 8 * sizeof(ULong);
   8350 
   8351       d->fxState[2].fx     = Ifx_Read;
   8352       d->fxState[2].offset = OFFB_FPTAGS;
   8353       d->fxState[2].size   = 8 * sizeof(UChar);
   8354 
   8355       d->fxState[3].fx     = Ifx_Read;
   8356       d->fxState[3].offset = OFFB_FPROUND;
   8357       d->fxState[3].size   = sizeof(UInt);
   8358 
   8359       d->fxState[4].fx     = Ifx_Read;
   8360       d->fxState[4].offset = OFFB_FC3210;
   8361       d->fxState[4].size   = sizeof(UInt);
   8362 
   8363       d->fxState[5].fx     = Ifx_Read;
   8364       d->fxState[5].offset = OFFB_XMM0;
   8365       d->fxState[5].size   = 8 * sizeof(U128);
   8366 
   8367       d->fxState[6].fx     = Ifx_Read;
   8368       d->fxState[6].offset = OFFB_SSEROUND;
   8369       d->fxState[6].size   = sizeof(UInt);
   8370 
   8371       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8372 	 images are packed back-to-back.  If not, the value of
   8373 	 d->fxState[5].size is wrong. */
   8374       vassert(16 == sizeof(U128));
   8375       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8376 
   8377       stmt( IRStmt_Dirty(d) );
   8378 
   8379       goto decode_success;
   8380    }
   8381 
   8382    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   8383    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8384        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
   8385       IRDirty* d;
   8386       modrm = getIByte(delta+2);
   8387       vassert(sz == 4);
   8388       vassert(!epartIsReg(modrm));
   8389 
   8390       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8391       delta += 2+alen;
   8392       gen_SEGV_if_not_16_aligned(addr);
   8393 
   8394       DIP("fxrstor %s\n", dis_buf);
   8395 
   8396       /* Uses dirty helper:
   8397             VexEmNote x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
   8398          NOTE:
   8399             the VexEmNote value is simply ignored (unlike for FRSTOR)
   8400       */
   8401       d = unsafeIRDirty_0_N (
   8402              0/*regparms*/,
   8403              "x86g_dirtyhelper_FXRSTOR",
   8404              &x86g_dirtyhelper_FXRSTOR,
   8405              mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   8406           );
   8407 
   8408       /* declare we're reading memory */
   8409       d->mFx   = Ifx_Read;
   8410       d->mAddr = mkexpr(addr);
   8411       d->mSize = 464; /* according to recent Intel docs */
   8412 
   8413       /* declare we're writing guest state */
   8414       d->nFxState = 7;
   8415       vex_bzero(&d->fxState, sizeof(d->fxState));
   8416 
   8417       d->fxState[0].fx     = Ifx_Write;
   8418       d->fxState[0].offset = OFFB_FTOP;
   8419       d->fxState[0].size   = sizeof(UInt);
   8420 
   8421       d->fxState[1].fx     = Ifx_Write;
   8422       d->fxState[1].offset = OFFB_FPREGS;
   8423       d->fxState[1].size   = 8 * sizeof(ULong);
   8424 
   8425       d->fxState[2].fx     = Ifx_Write;
   8426       d->fxState[2].offset = OFFB_FPTAGS;
   8427       d->fxState[2].size   = 8 * sizeof(UChar);
   8428 
   8429       d->fxState[3].fx     = Ifx_Write;
   8430       d->fxState[3].offset = OFFB_FPROUND;
   8431       d->fxState[3].size   = sizeof(UInt);
   8432 
   8433       d->fxState[4].fx     = Ifx_Write;
   8434       d->fxState[4].offset = OFFB_FC3210;
   8435       d->fxState[4].size   = sizeof(UInt);
   8436 
   8437       d->fxState[5].fx     = Ifx_Write;
   8438       d->fxState[5].offset = OFFB_XMM0;
   8439       d->fxState[5].size   = 8 * sizeof(U128);
   8440 
   8441       d->fxState[6].fx     = Ifx_Write;
   8442       d->fxState[6].offset = OFFB_SSEROUND;
   8443       d->fxState[6].size   = sizeof(UInt);
   8444 
   8445       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8446 	 images are packed back-to-back.  If not, the value of
   8447 	 d->fxState[5].size is wrong. */
   8448       vassert(16 == sizeof(U128));
   8449       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8450 
   8451       stmt( IRStmt_Dirty(d) );
   8452 
   8453       goto decode_success;
   8454    }
   8455 
   8456    /* ------ SSE decoder main ------ */
   8457 
   8458    /* Skip parts of the decoder which don't apply given the stated
   8459       guest subarchitecture. */
   8460    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
   8461       goto after_sse_decoders;
   8462 
   8463    /* With mmxext only some extended MMX instructions are recognized.
   8464       The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
   8465       PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
   8466       PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
   8467 
   8468       http://support.amd.com/us/Embedded_TechDocs/22466.pdf
   8469       https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
   8470 
   8471    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
   8472       goto mmxext;
   8473 
   8474    /* Otherwise we must be doing sse1 or sse2, so we can at least try
   8475       for SSE1 here. */
   8476 
   8477    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   8478    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
   8479       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
   8480       goto decode_success;
   8481    }
   8482 
   8483    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   8484    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
   8485       vassert(sz == 4);
   8486       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
   8487       goto decode_success;
   8488    }
   8489 
   8490    /* 0F 55 = ANDNPS -- G = (not G) and E */
   8491    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
   8492       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
   8493       goto decode_success;
   8494    }
   8495 
   8496    /* 0F 54 = ANDPS -- G = G and E */
   8497    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
   8498       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
   8499       goto decode_success;
   8500    }
   8501 
   8502    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   8503    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
   8504       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
   8505       goto decode_success;
   8506    }
   8507 
   8508    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   8509    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
   8510       vassert(sz == 4);
   8511       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
   8512       goto decode_success;
   8513    }
   8514 
   8515    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   8516    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   8517    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   8518       IRTemp argL = newTemp(Ity_F32);
   8519       IRTemp argR = newTemp(Ity_F32);
   8520       modrm = getIByte(delta+2);
   8521       if (epartIsReg(modrm)) {
   8522          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   8523          delta += 2+1;
   8524          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8525                                   nameXMMReg(gregOfRM(modrm)) );
   8526       } else {
   8527          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8528 	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   8529          delta += 2+alen;
   8530          DIP("[u]comiss %s,%s\n", dis_buf,
   8531                                   nameXMMReg(gregOfRM(modrm)) );
   8532       }
   8533       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   8534 
   8535       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   8536       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   8537       stmt( IRStmt_Put(
   8538                OFFB_CC_DEP1,
   8539                binop( Iop_And32,
   8540                       binop(Iop_CmpF64,
   8541                             unop(Iop_F32toF64,mkexpr(argL)),
   8542                             unop(Iop_F32toF64,mkexpr(argR))),
   8543                       mkU32(0x45)
   8544           )));
   8545       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8546          elimination of previous stores to this field work better. */
   8547       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   8548       goto decode_success;
   8549    }
   8550 
   8551    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   8552       half xmm */
   8553    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
   8554       IRTemp arg64 = newTemp(Ity_I64);
   8555       IRTemp rmode = newTemp(Ity_I32);
   8556       vassert(sz == 4);
   8557 
   8558       modrm = getIByte(delta+2);
   8559       do_MMX_preamble();
   8560       if (epartIsReg(modrm)) {
   8561          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   8562          delta += 2+1;
   8563          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8564                                  nameXMMReg(gregOfRM(modrm)));
   8565       } else {
   8566          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8567 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   8568          delta += 2+alen;
   8569          DIP("cvtpi2ps %s,%s\n", dis_buf,
   8570                                  nameXMMReg(gregOfRM(modrm)) );
   8571       }
   8572 
   8573       assign( rmode, get_sse_roundingmode() );
   8574 
   8575       putXMMRegLane32F(
   8576          gregOfRM(modrm), 0,
   8577          binop(Iop_F64toF32,
   8578                mkexpr(rmode),
   8579                unop(Iop_I32StoF64,
   8580                     unop(Iop_64to32, mkexpr(arg64)) )) );
   8581 
   8582       putXMMRegLane32F(
   8583          gregOfRM(modrm), 1,
   8584          binop(Iop_F64toF32,
   8585                mkexpr(rmode),
   8586                unop(Iop_I32StoF64,
   8587                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
   8588 
   8589       goto decode_success;
   8590    }
   8591 
   8592    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
   8593       quarter xmm */
   8594    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
   8595       IRTemp arg32 = newTemp(Ity_I32);
   8596       IRTemp rmode = newTemp(Ity_I32);
   8597       vassert(sz == 4);
   8598 
   8599       modrm = getIByte(delta+3);
   8600       if (epartIsReg(modrm)) {
   8601          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   8602          delta += 3+1;
   8603          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   8604                                  nameXMMReg(gregOfRM(modrm)));
   8605       } else {
   8606          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8607 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   8608          delta += 3+alen;
   8609          DIP("cvtsi2ss %s,%s\n", dis_buf,
   8610                                  nameXMMReg(gregOfRM(modrm)) );
   8611       }
   8612 
   8613       assign( rmode, get_sse_roundingmode() );
   8614 
   8615       putXMMRegLane32F(
   8616          gregOfRM(modrm), 0,
   8617          binop(Iop_F64toF32,
   8618                mkexpr(rmode),
   8619                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   8620 
   8621       goto decode_success;
   8622    }
   8623 
   8624    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8625       I32 in mmx, according to prevailing SSE rounding mode */
   8626    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8627       I32 in mmx, rounding towards zero */
   8628    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   8629       IRTemp dst64  = newTemp(Ity_I64);
   8630       IRTemp rmode  = newTemp(Ity_I32);
   8631       IRTemp f32lo  = newTemp(Ity_F32);
   8632       IRTemp f32hi  = newTemp(Ity_F32);
   8633       Bool   r2zero = toBool(insn[1] == 0x2C);
   8634 
   8635       do_MMX_preamble();
   8636       modrm = getIByte(delta+2);
   8637 
   8638       if (epartIsReg(modrm)) {
   8639          delta += 2+1;
   8640 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8641 	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
   8642          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8643                                    nameXMMReg(eregOfRM(modrm)),
   8644                                    nameMMXReg(gregOfRM(modrm)));
   8645       } else {
   8646          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8647 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8648 	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
   8649                                               mkexpr(addr),
   8650                                               mkU32(4) )));
   8651          delta += 2+alen;
   8652          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8653                                    dis_buf,
   8654                                    nameMMXReg(gregOfRM(modrm)));
   8655       }
   8656 
   8657       if (r2zero) {
   8658          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   8659       } else {
   8660          assign( rmode, get_sse_roundingmode() );
   8661       }
   8662 
   8663       assign(
   8664          dst64,
   8665          binop( Iop_32HLto64,
   8666                 binop( Iop_F64toI32S,
   8667                        mkexpr(rmode),
   8668                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   8669                 binop( Iop_F64toI32S,
   8670                        mkexpr(rmode),
   8671                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8672               )
   8673       );
   8674 
   8675       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   8676       goto decode_success;
   8677    }
   8678 
   8679    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
   8680       I32 in ireg, according to prevailing SSE rounding mode */
   8681    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
   8682       I32 in ireg, rounding towards zero */
   8683    if (insn[0] == 0xF3 && insn[1] == 0x0F
   8684        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   8685       IRTemp rmode = newTemp(Ity_I32);
   8686       IRTemp f32lo = newTemp(Ity_F32);
   8687       Bool   r2zero = toBool(insn[2] == 0x2C);
   8688       vassert(sz == 4);
   8689 
   8690       modrm = getIByte(delta+3);
   8691       if (epartIsReg(modrm)) {
   8692          delta += 3+1;
   8693 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8694          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8695                                    nameXMMReg(eregOfRM(modrm)),
   8696                                    nameIReg(4, gregOfRM(modrm)));
   8697       } else {
   8698          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8699 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8700          delta += 3+alen;
   8701          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8702                                    dis_buf,
   8703                                    nameIReg(4, gregOfRM(modrm)));
   8704       }
   8705 
   8706       if (r2zero) {
   8707          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   8708       } else {
   8709          assign( rmode, get_sse_roundingmode() );
   8710       }
   8711 
   8712       putIReg(4, gregOfRM(modrm),
   8713                  binop( Iop_F64toI32S,
   8714                         mkexpr(rmode),
   8715                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8716       );
   8717 
   8718       goto decode_success;
   8719    }
   8720 
   8721    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   8722    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
   8723       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
   8724       goto decode_success;
   8725    }
   8726 
   8727    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   8728    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
   8729       vassert(sz == 4);
   8730       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
   8731       goto decode_success;
   8732    }
   8733 
   8734    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   8735    if (insn[0] == 0x0F && insn[1] == 0xAE
   8736        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
   8737 
   8738       IRTemp t64 = newTemp(Ity_I64);
   8739       IRTemp ew = newTemp(Ity_I32);
   8740 
   8741       modrm = getIByte(delta+2);
   8742       vassert(!epartIsReg(modrm));
   8743       vassert(sz == 4);
   8744 
   8745       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8746       delta += 2+alen;
   8747       DIP("ldmxcsr %s\n", dis_buf);
   8748 
   8749       /* The only thing we observe in %mxcsr is the rounding mode.
   8750          Therefore, pass the 32-bit value (SSE native-format control
   8751          word) to a clean helper, getting back a 64-bit value, the
   8752          lower half of which is the SSEROUND value to store, and the
   8753          upper half of which is the emulation-warning token which may
   8754          be generated.
   8755       */
   8756       /* ULong x86g_check_ldmxcsr ( UInt ); */
   8757       assign( t64, mkIRExprCCall(
   8758                       Ity_I64, 0/*regparms*/,
   8759                       "x86g_check_ldmxcsr",
   8760                       &x86g_check_ldmxcsr,
   8761                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
   8762                    )
   8763             );
   8764 
   8765       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   8766       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   8767       put_emwarn( mkexpr(ew) );
   8768       /* Finally, if an emulation warning was reported, side-exit to
   8769          the next insn, reporting the warning, so that Valgrind's
   8770          dispatcher sees the warning. */
   8771       stmt(
   8772          IRStmt_Exit(
   8773             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   8774             Ijk_EmWarn,
   8775             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   8776             OFFB_EIP
   8777          )
   8778       );
   8779       goto decode_success;
   8780    }
   8781 
   8782 
   8783    /* mmxext sse1 subset starts here. mmxext only arches will parse
   8784       only this subset of the sse1 instructions. */
   8785   mmxext:
   8786 
   8787    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8788    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   8789    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
   8790       Bool ok = False;
   8791       delta = dis_MMX( &ok, sorb, sz, delta+1 );
   8792       if (!ok)
   8793          goto decode_failure;
   8794       goto decode_success;
   8795    }
   8796 
   8797    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8798    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   8799       Intel manual does not say anything about the usual business of
   8800       the FP reg tags getting trashed whenever an MMX insn happens.
   8801       So we just leave them alone.
   8802    */
   8803    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   8804       modrm = getIByte(delta+2);
   8805       if (sz == 4 && !epartIsReg(modrm)) {
   8806          /* do_MMX_preamble(); Intel docs don't specify this */
   8807          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8808          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   8809          DIP("movntq %s,%s\n", dis_buf,
   8810                                nameMMXReg(gregOfRM(modrm)));
   8811          delta += 2+alen;
   8812          goto decode_success;
   8813       }
   8814       /* else fall through */
   8815    }
   8816 
   8817    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8818    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   8819    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
   8820       do_MMX_preamble();
   8821       delta = dis_MMXop_regmem_to_reg (
   8822                 sorb, delta+2, insn[1], "pavgb", False );
   8823       goto decode_success;
   8824    }
   8825 
   8826    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8827    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   8828    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
   8829       do_MMX_preamble();
   8830       delta = dis_MMXop_regmem_to_reg (
   8831                 sorb, delta+2, insn[1], "pavgw", False );
   8832       goto decode_success;
   8833    }
   8834 
   8835    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8836    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   8837       zero-extend of it in ireg(G). */
   8838    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   8839       modrm = insn[2];
   8840       if (sz == 4 && epartIsReg(modrm)) {
   8841          IRTemp sV = newTemp(Ity_I64);
   8842          t5 = newTemp(Ity_I16);
   8843          do_MMX_preamble();
   8844          assign(sV, getMMXReg(eregOfRM(modrm)));
   8845          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   8846          switch (insn[3] & 3) {
   8847             case 0:  assign(t5, mkexpr(t0)); break;
   8848             case 1:  assign(t5, mkexpr(t1)); break;
   8849             case 2:  assign(t5, mkexpr(t2)); break;
   8850             case 3:  assign(t5, mkexpr(t3)); break;
   8851             default: vassert(0); /*NOTREACHED*/
   8852          }
   8853          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
   8854          DIP("pextrw $%d,%s,%s\n",
   8855              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
   8856                            nameIReg(4,gregOfRM(modrm)));
   8857          delta += 4;
   8858          goto decode_success;
   8859       }
   8860       /* else fall through */
   8861    }
   8862 
   8863    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8864    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   8865       put it into the specified lane of mmx(G). */
   8866    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
   8867       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   8868          mmx reg.  t4 is the new lane value.  t5 is the original
   8869          mmx value. t6 is the new mmx value. */
   8870       Int lane;
   8871       t4 = newTemp(Ity_I16);
   8872       t5 = newTemp(Ity_I64);
   8873       t6 = newTemp(Ity_I64);
   8874       modrm = insn[2];
   8875       do_MMX_preamble();
   8876 
   8877       assign(t5, getMMXReg(gregOfRM(modrm)));
   8878       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   8879 
   8880       if (epartIsReg(modrm)) {
   8881          assign(t4, getIReg(2, eregOfRM(modrm)));
   8882          delta += 3+1;
   8883          lane = insn[3+1-1];
   8884          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8885                                    nameIReg(2,eregOfRM(modrm)),
   8886                                    nameMMXReg(gregOfRM(modrm)));
   8887       } else {
   8888          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8889          delta += 3+alen;
   8890          lane = insn[3+alen-1];
   8891          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   8892          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8893                                    dis_buf,
   8894                                    nameMMXReg(gregOfRM(modrm)));
   8895       }
   8896 
   8897       switch (lane & 3) {
   8898          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   8899          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   8900          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   8901          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   8902          default: vassert(0); /*NOTREACHED*/
   8903       }
   8904       putMMXReg(gregOfRM(modrm), mkexpr(t6));
   8905       goto decode_success;
   8906    }
   8907 
   8908    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8909    /* 0F EE = PMAXSW -- 16x4 signed max */
   8910    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
   8911       do_MMX_preamble();
   8912       delta = dis_MMXop_regmem_to_reg (
   8913                 sorb, delta+2, insn[1], "pmaxsw", False );
   8914       goto decode_success;
   8915    }
   8916 
   8917    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8918    /* 0F DE = PMAXUB -- 8x8 unsigned max */
   8919    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
   8920       do_MMX_preamble();
   8921       delta = dis_MMXop_regmem_to_reg (
   8922                 sorb, delta+2, insn[1], "pmaxub", False );
   8923       goto decode_success;
   8924    }
   8925 
   8926    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8927    /* 0F EA = PMINSW -- 16x4 signed min */
   8928    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
   8929       do_MMX_preamble();
   8930       delta = dis_MMXop_regmem_to_reg (
   8931                 sorb, delta+2, insn[1], "pminsw", False );
   8932       goto decode_success;
   8933    }
   8934 
   8935    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8936    /* 0F DA = PMINUB -- 8x8 unsigned min */
   8937    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
   8938       do_MMX_preamble();
   8939       delta = dis_MMXop_regmem_to_reg (
   8940                 sorb, delta+2, insn[1], "pminub", False );
   8941       goto decode_success;
   8942    }
   8943 
   8944    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8945    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   8946       mmx(E), turn them into a byte, and put zero-extend of it in
   8947       ireg(G). */
   8948    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
   8949       modrm = insn[2];
   8950       if (epartIsReg(modrm)) {
   8951          do_MMX_preamble();
   8952          t0 = newTemp(Ity_I64);
   8953          t1 = newTemp(Ity_I32);
   8954          assign(t0, getMMXReg(eregOfRM(modrm)));
   8955          assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
   8956          putIReg(4, gregOfRM(modrm), mkexpr(t1));
   8957          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8958                                  nameIReg(4,gregOfRM(modrm)));
   8959          delta += 3;
   8960          goto decode_success;
   8961       }
   8962       /* else fall through */
   8963    }
   8964 
   8965    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8966    /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   8967    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
   8968       do_MMX_preamble();
   8969       delta = dis_MMXop_regmem_to_reg (
   8970                 sorb, delta+2, insn[1], "pmuluh", False );
   8971       goto decode_success;
   8972    }
   8973 
   8974    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   8975    /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   8976    /* 0F 18 /2 = PREFETCHT1 */
   8977    /* 0F 18 /3 = PREFETCHT2 */
   8978    if (insn[0] == 0x0F && insn[1] == 0x18
   8979        && !epartIsReg(insn[2])
   8980        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
   8981       const HChar* hintstr = "??";
   8982 
   8983       modrm = getIByte(delta+2);
   8984       vassert(!epartIsReg(modrm));
   8985 
   8986       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8987       delta += 2+alen;
   8988 
   8989       switch (gregOfRM(modrm)) {
   8990          case 0: hintstr = "nta"; break;
   8991          case 1: hintstr = "t0"; break;
   8992          case 2: hintstr = "t1"; break;
   8993          case 3: hintstr = "t2"; break;
   8994          default: vassert(0); /*NOTREACHED*/
   8995       }
   8996 
   8997       DIP("prefetch%s %s\n", hintstr, dis_buf);
   8998       goto decode_success;
   8999    }
   9000 
   9001    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   9002    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   9003    if (insn[0] == 0x0F && insn[1] == 0x0D
   9004        && !epartIsReg(insn[2])
   9005        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
   9006       const HChar* hintstr = "??";
   9007 
   9008       modrm = getIByte(delta+2);
   9009       vassert(!epartIsReg(modrm));
   9010 
   9011       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9012       delta += 2+alen;
   9013 
   9014       switch (gregOfRM(modrm)) {
   9015          case 0: hintstr = ""; break;
   9016          case 1: hintstr = "w"; break;
   9017          default: vassert(0); /*NOTREACHED*/
   9018       }
   9019 
   9020       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9021       goto decode_success;
   9022    }
   9023 
   9024    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9025    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   9026    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
   9027       do_MMX_preamble();
   9028       delta = dis_MMXop_regmem_to_reg (
   9029                  sorb, delta+2, insn[1], "psadbw", False );
   9030       goto decode_success;
   9031    }
   9032 
   9033    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9034    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   9035    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
   9036       Int order;
   9037       IRTemp sV, dV, s3, s2, s1, s0;
   9038       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   9039       sV = newTemp(Ity_I64);
   9040       dV = newTemp(Ity_I64);
   9041       do_MMX_preamble();
   9042       modrm = insn[2];
   9043       if (epartIsReg(modrm)) {
   9044          assign( sV, getMMXReg(eregOfRM(modrm)) );
   9045          order = (Int)insn[3];
   9046          delta += 2+2;
   9047          DIP("pshufw $%d,%s,%s\n", order,
   9048                                    nameMMXReg(eregOfRM(modrm)),
   9049                                    nameMMXReg(gregOfRM(modrm)));
   9050       } else {
   9051          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9052          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   9053 	 order = (Int)insn[2+alen];
   9054          delta += 3+alen;
   9055          DIP("pshufw $%d,%s,%s\n", order,
   9056                                    dis_buf,
   9057                                    nameMMXReg(gregOfRM(modrm)));
   9058       }
   9059       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   9060 
   9061 #     define SEL(n) \
   9062                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9063       assign(dV,
   9064 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   9065                           SEL((order>>2)&3), SEL((order>>0)&3) )
   9066       );
   9067       putMMXReg(gregOfRM(modrm), mkexpr(dV));
   9068 #     undef SEL
   9069       goto decode_success;
   9070    }
   9071 
   9072    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   9073    if (insn[0] == 0x0F && insn[1] == 0xAE
   9074        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   9075       vassert(sz == 4);
   9076       delta += 3;
   9077       /* Insert a memory fence.  It's sometimes important that these
   9078          are carried through to the generated code. */
   9079       stmt( IRStmt_MBE(Imbe_Fence) );
   9080       DIP("sfence\n");
   9081       goto decode_success;
   9082    }
   9083 
   9084    /* End of mmxext sse1 subset. No more sse parsing for mmxext only arches. */
   9085    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
   9086       goto after_sse_decoders;
   9087 
   9088 
   9089    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   9090    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
   9091       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
   9092       goto decode_success;
   9093    }
   9094 
   9095    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   9096    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
   9097       vassert(sz == 4);
   9098       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
   9099       goto decode_success;
   9100    }
   9101 
   9102    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   9103    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
   9104       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
   9105       goto decode_success;
   9106    }
   9107 
   9108    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   9109    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
   9110       vassert(sz == 4);
   9111       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
   9112       goto decode_success;
   9113    }
   9114 
   9115    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   9116    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   9117    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
   9118       modrm = getIByte(delta+2);
   9119       if (epartIsReg(modrm)) {
   9120          putXMMReg( gregOfRM(modrm),
   9121                     getXMMReg( eregOfRM(modrm) ));
   9122          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9123                                   nameXMMReg(gregOfRM(modrm)));
   9124          delta += 2+1;
   9125       } else {
   9126          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9127          if (insn[1] == 0x28/*movaps*/)
   9128             gen_SEGV_if_not_16_aligned( addr );
   9129          putXMMReg( gregOfRM(modrm),
   9130                     loadLE(Ity_V128, mkexpr(addr)) );
   9131          DIP("mov[ua]ps %s,%s\n", dis_buf,
   9132                                   nameXMMReg(gregOfRM(modrm)));
   9133          delta += 2+alen;
   9134       }
   9135       goto decode_success;
   9136    }
   9137 
   9138    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   9139    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   9140    if (sz == 4 && insn[0] == 0x0F
   9141        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   9142       modrm = getIByte(delta+2);
   9143       if (epartIsReg(modrm)) {
   9144          /* fall through; awaiting test case */
   9145       } else {
   9146          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9147          if (insn[1] == 0x29/*movaps*/)
   9148             gen_SEGV_if_not_16_aligned( addr );
   9149          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9150          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   9151                                   dis_buf );
   9152          delta += 2+alen;
   9153          goto decode_success;
   9154       }
   9155    }
   9156 
   9157    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   9158    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   9159    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
   9160       modrm = getIByte(delta+2);
   9161       if (epartIsReg(modrm)) {
   9162          delta += 2+1;
   9163          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   9164                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
   9165          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9166                                nameXMMReg(gregOfRM(modrm)));
   9167       } else {
   9168          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9169          delta += 2+alen;
   9170          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   9171                           loadLE(Ity_I64, mkexpr(addr)) );
   9172          DIP("movhps %s,%s\n", dis_buf,
   9173                                nameXMMReg( gregOfRM(modrm) ));
   9174       }
   9175       goto decode_success;
   9176    }
   9177 
   9178    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   9179    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
   9180       if (!epartIsReg(insn[2])) {
   9181          delta += 2;
   9182          addr = disAMode ( &alen, sorb, delta, dis_buf );
   9183          delta += alen;
   9184          storeLE( mkexpr(addr),
   9185                   getXMMRegLane64( gregOfRM(insn[2]),
   9186                                    1/*upper lane*/ ) );
   9187          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   9188                                dis_buf);
   9189          goto decode_success;
   9190       }
   9191       /* else fall through */
   9192    }
   9193 
   9194    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   9195    /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   9196    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
   9197       modrm = getIByte(delta+2);
   9198       if (epartIsReg(modrm)) {
   9199          delta += 2+1;
   9200          putXMMRegLane64( gregOfRM(modrm),
   9201                           0/*lower lane*/,
   9202                           getXMMRegLane64( eregOfRM(modrm), 1 ));
   9203          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
   9204                                  nameXMMReg(gregOfRM(modrm)));
   9205       } else {
   9206          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9207          delta += 2+alen;
   9208          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   9209                           loadLE(Ity_I64, mkexpr(addr)) );
   9210          DIP("movlps %s, %s\n",
   9211              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   9212       }
   9213       goto decode_success;
   9214    }
   9215 
   9216    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   9217    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
   9218       if (!epartIsReg(insn[2])) {
   9219          delta += 2;
   9220          addr = disAMode ( &alen, sorb, delta, dis_buf );
   9221          delta += alen;
   9222          storeLE( mkexpr(addr),
   9223                   getXMMRegLane64( gregOfRM(insn[2]),
   9224                                    0/*lower lane*/ ) );
   9225          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   9226                                 dis_buf);
   9227          goto decode_success;
   9228       }
   9229       /* else fall through */
   9230    }
   9231 
   9232    /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   9233       to 4 lowest bits of ireg(G) */
   9234    if (insn[0] == 0x0F && insn[1] == 0x50) {
   9235       modrm = getIByte(delta+2);
   9236       if (sz == 4 && epartIsReg(modrm)) {
   9237          Int src;
   9238          t0 = newTemp(Ity_I32);
   9239          t1 = newTemp(Ity_I32);
   9240          t2 = newTemp(Ity_I32);
   9241          t3 = newTemp(Ity_I32);
   9242          delta += 2+1;
   9243          src = eregOfRM(modrm);
   9244          assign( t0, binop( Iop_And32,
   9245                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
   9246                             mkU32(1) ));
   9247          assign( t1, binop( Iop_And32,
   9248                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
   9249                             mkU32(2) ));
   9250          assign( t2, binop( Iop_And32,
   9251                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
   9252                             mkU32(4) ));
   9253          assign( t3, binop( Iop_And32,
   9254                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
   9255                             mkU32(8) ));
   9256          putIReg(4, gregOfRM(modrm),
   9257                     binop(Iop_Or32,
   9258                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   9259                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
   9260                          )
   9261                  );
   9262          DIP("movmskps %s,%s\n", nameXMMReg(src),
   9263                                  nameIReg(4, gregOfRM(modrm)));
   9264          goto decode_success;
   9265       }
   9266       /* else fall through */
   9267    }
   9268 
   9269    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   9270    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   9271    if (insn[0] == 0x0F && insn[1] == 0x2B) {
   9272       modrm = getIByte(delta+2);
   9273       if (!epartIsReg(modrm)) {
   9274          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9275          gen_SEGV_if_not_16_aligned( addr );
   9276          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9277          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   9278                                  dis_buf,
   9279                                  nameXMMReg(gregOfRM(modrm)));
   9280          delta += 2+alen;
   9281          goto decode_success;
   9282       }
   9283       /* else fall through */
   9284    }
   9285 
   9286    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   9287       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   9288    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
   9289       vassert(sz == 4);
   9290       modrm = getIByte(delta+3);
   9291       if (epartIsReg(modrm)) {
   9292          putXMMRegLane32( gregOfRM(modrm), 0,
   9293                           getXMMRegLane32( eregOfRM(modrm), 0 ));
   9294          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9295                               nameXMMReg(gregOfRM(modrm)));
   9296          delta += 3+1;
   9297       } else {
   9298          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9299          /* zero bits 127:64 */
   9300          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   9301          /* zero bits 63:32 */
   9302          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
   9303          /* write bits 31:0 */
   9304          putXMMRegLane32( gregOfRM(modrm), 0,
   9305                           loadLE(Ity_I32, mkexpr(addr)) );
   9306          DIP("movss %s,%s\n", dis_buf,
   9307                               nameXMMReg(gregOfRM(modrm)));
   9308          delta += 3+alen;
   9309       }
   9310       goto decode_success;
   9311    }
   9312 
   9313    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   9314       or lo 1/4 xmm). */
   9315    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
   9316       vassert(sz == 4);
   9317       modrm = getIByte(delta+3);
   9318       if (epartIsReg(modrm)) {
   9319          /* fall through, we don't yet have a test case */
   9320       } else {
   9321          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9322          storeLE( mkexpr(addr),
   9323                   getXMMRegLane32(gregOfRM(modrm), 0) );
   9324          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   9325                               dis_buf);
   9326          delta += 3+alen;
   9327          goto decode_success;
   9328       }
   9329    }
   9330 
   9331    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   9332    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
   9333       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
   9334       goto decode_success;
   9335    }
   9336 
   9337    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   9338    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
   9339       vassert(sz == 4);
   9340       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
   9341       goto decode_success;
   9342    }
   9343 
   9344    /* 0F 56 = ORPS -- G = G or E */
   9345    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
   9346       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
   9347       goto decode_success;
   9348    }
   9349 
   9350    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   9351    if (insn[0] == 0x0F && insn[1] == 0x53) {
   9352       vassert(sz == 4);
   9353       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9354                                         "rcpps", Iop_Recip32Fx4 );
   9355       goto decode_success;
   9356    }
   9357 
   9358    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   9359    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
   9360       vassert(sz == 4);
   9361       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9362                                          "rcpss", Iop_Recip32F0x4 );
   9363       goto decode_success;
   9364    }
   9365 
   9366    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   9367    if (insn[0] == 0x0F && insn[1] == 0x52) {
   9368       vassert(sz == 4);
   9369       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9370                                         "rsqrtps", Iop_RSqrt32Fx4 );
   9371       goto decode_success;
   9372    }
   9373 
   9374    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   9375    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
   9376       vassert(sz == 4);
   9377       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9378                                          "rsqrtss", Iop_RSqrt32F0x4 );
   9379       goto decode_success;
   9380    }
   9381 
   9382    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   9383    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
   9384       Int    select;
   9385       IRTemp sV, dV;
   9386       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9387       sV = newTemp(Ity_V128);
   9388       dV = newTemp(Ity_V128);
   9389       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9390       modrm = insn[2];
   9391       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9392 
   9393       if (epartIsReg(modrm)) {
   9394          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9395          select = (Int)insn[3];
   9396          delta += 2+2;
   9397          DIP("shufps $%d,%s,%s\n", select,
   9398                                    nameXMMReg(eregOfRM(modrm)),
   9399                                    nameXMMReg(gregOfRM(modrm)));
   9400       } else {
   9401          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9402          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9403          select = (Int)insn[2+alen];
   9404          delta += 3+alen;
   9405          DIP("shufps $%d,%s,%s\n", select,
   9406                                    dis_buf,
   9407                                    nameXMMReg(gregOfRM(modrm)));
   9408       }
   9409 
   9410       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9411       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9412 
   9413 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   9414 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9415 
   9416       putXMMReg(
   9417          gregOfRM(modrm),
   9418          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
   9419                        SELD((select>>2)&3), SELD((select>>0)&3) )
   9420       );
   9421 
   9422 #     undef SELD
   9423 #     undef SELS
   9424 
   9425       goto decode_success;
   9426    }
   9427 
   9428    /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
   9429    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
   9430       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9431                                         "sqrtps", Iop_Sqrt32Fx4 );
   9432       goto decode_success;
   9433    }
   9434 
   9435    /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
   9436    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
   9437       vassert(sz == 4);
   9438       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9439                                          "sqrtss", Iop_Sqrt32F0x4 );
   9440       goto decode_success;
   9441    }
   9442 
   9443    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   9444    if (insn[0] == 0x0F && insn[1] == 0xAE
   9445        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
   9446       modrm = getIByte(delta+2);
   9447       vassert(sz == 4);
   9448       vassert(!epartIsReg(modrm));
   9449 
   9450       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9451       delta += 2+alen;
   9452 
   9453       /* Fake up a native SSE mxcsr word.  The only thing it depends
   9454          on is SSEROUND[1:0], so call a clean helper to cook it up.
   9455       */
   9456       /* UInt x86g_create_mxcsr ( UInt sseround ) */
   9457       DIP("stmxcsr %s\n", dis_buf);
   9458       storeLE( mkexpr(addr),
   9459                mkIRExprCCall(
   9460                   Ity_I32, 0/*regp*/,
   9461                   "x86g_create_mxcsr", &x86g_create_mxcsr,
   9462                   mkIRExprVec_1( get_sse_roundingmode() )
   9463                )
   9464              );
   9465       goto decode_success;
   9466    }
   9467 
   9468    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   9469    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
   9470       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
   9471       goto decode_success;
   9472    }
   9473 
   9474    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   9475    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
   9476       vassert(sz == 4);
   9477       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
   9478       goto decode_success;
   9479    }
   9480 
   9481    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   9482    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   9483    /* These just appear to be special cases of SHUFPS */
   9484    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   9485       IRTemp sV, dV;
   9486       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9487       Bool hi = toBool(insn[1] == 0x15);
   9488       sV = newTemp(Ity_V128);
   9489       dV = newTemp(Ity_V128);
   9490       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9491       modrm = insn[2];
   9492       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9493 
   9494       if (epartIsReg(modrm)) {
   9495          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9496          delta += 2+1;
   9497          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9498                                   nameXMMReg(eregOfRM(modrm)),
   9499                                   nameXMMReg(gregOfRM(modrm)));
   9500       } else {
   9501          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9502          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9503          delta += 2+alen;
   9504          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9505                                   dis_buf,
   9506                                   nameXMMReg(gregOfRM(modrm)));
   9507       }
   9508 
   9509       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9510       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9511 
   9512       if (hi) {
   9513          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
   9514       } else {
   9515          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
   9516       }
   9517 
   9518       goto decode_success;
   9519    }
   9520 
   9521    /* 0F 57 = XORPS -- G = G xor E */
   9522    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
   9523       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
   9524       goto decode_success;
   9525    }
   9526 
   9527    /* ---------------------------------------------------- */
   9528    /* --- end of the SSE decoder.                      --- */
   9529    /* ---------------------------------------------------- */
   9530 
   9531    /* ---------------------------------------------------- */
   9532    /* --- start of the SSE2 decoder.                   --- */
   9533    /* ---------------------------------------------------- */
   9534 
   9535    /* Skip parts of the decoder which don't apply given the stated
   9536       guest subarchitecture. */
   9537    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   9538       goto after_sse_decoders; /* no SSE2 capabilities */
   9539 
   9540    insn = (UChar*)&guest_code[delta];
   9541 
   9542    /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   9543    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
   9544       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
   9545       goto decode_success;
   9546    }
   9547 
   9548    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   9549    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
   9550       vassert(sz == 4);
   9551       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
   9552       goto decode_success;
   9553    }
   9554 
   9555    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   9556    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
   9557       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
   9558       goto decode_success;
   9559    }
   9560 
   9561    /* 66 0F 54 = ANDPD -- G = G and E */
   9562    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
   9563       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
   9564       goto decode_success;
   9565    }
   9566 
   9567    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   9568    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
   9569       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
   9570       goto decode_success;
   9571    }
   9572 
   9573    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   9574    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
   9575       vassert(sz == 4);
   9576       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
   9577       goto decode_success;
   9578    }
   9579 
   9580    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   9581    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   9582    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   9583       IRTemp argL = newTemp(Ity_F64);
   9584       IRTemp argR = newTemp(Ity_F64);
   9585       modrm = getIByte(delta+2);
   9586       if (epartIsReg(modrm)) {
   9587          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   9588          delta += 2+1;
   9589          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9590                                   nameXMMReg(gregOfRM(modrm)) );
   9591       } else {
   9592          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9593 	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   9594          delta += 2+alen;
   9595          DIP("[u]comisd %s,%s\n", dis_buf,
   9596                                   nameXMMReg(gregOfRM(modrm)) );
   9597       }
   9598       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   9599 
   9600       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   9601       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   9602       stmt( IRStmt_Put(
   9603                OFFB_CC_DEP1,
   9604                binop( Iop_And32,
   9605                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
   9606                       mkU32(0x45)
   9607           )));
   9608       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   9609          elimination of previous stores to this field work better. */
   9610       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   9611       goto decode_success;
   9612    }
   9613 
   9614    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   9615       F64 in xmm(G) */
   9616    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9617       IRTemp arg64 = newTemp(Ity_I64);
   9618       vassert(sz == 4);
   9619 
   9620       modrm = getIByte(delta+3);
   9621       if (epartIsReg(modrm)) {
   9622          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
   9623          delta += 3+1;
   9624          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9625                                  nameXMMReg(gregOfRM(modrm)));
   9626       } else {
   9627          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9628 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9629          delta += 3+alen;
   9630          DIP("cvtdq2pd %s,%s\n", dis_buf,
   9631                                  nameXMMReg(gregOfRM(modrm)) );
   9632       }
   9633 
   9634       putXMMRegLane64F(
   9635          gregOfRM(modrm), 0,
   9636          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   9637       );
   9638 
   9639       putXMMRegLane64F(
   9640          gregOfRM(modrm), 1,
   9641          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   9642       );
   9643 
   9644       goto decode_success;
   9645    }
   9646 
   9647    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   9648       xmm(G) */
   9649    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9650       IRTemp argV  = newTemp(Ity_V128);
   9651       IRTemp rmode = newTemp(Ity_I32);
   9652 
   9653       modrm = getIByte(delta+2);
   9654       if (epartIsReg(modrm)) {
   9655          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9656          delta += 2+1;
   9657          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9658                                  nameXMMReg(gregOfRM(modrm)));
   9659       } else {
   9660          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9661 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9662          delta += 2+alen;
   9663          DIP("cvtdq2ps %s,%s\n", dis_buf,
   9664                                  nameXMMReg(gregOfRM(modrm)) );
   9665       }
   9666 
   9667       assign( rmode, get_sse_roundingmode() );
   9668       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9669 
   9670 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9671                              mkexpr(rmode),                   \
   9672                              unop(Iop_I32StoF64,mkexpr(_t)))
   9673 
   9674       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
   9675       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
   9676       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9677       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9678 
   9679 #     undef CVT
   9680 
   9681       goto decode_success;
   9682    }
   9683 
   9684    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9685       lo half xmm(G), and zero upper half */
   9686    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9687       IRTemp argV  = newTemp(Ity_V128);
   9688       IRTemp rmode = newTemp(Ity_I32);
   9689       vassert(sz == 4);
   9690 
   9691       modrm = getIByte(delta+3);
   9692       if (epartIsReg(modrm)) {
   9693          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9694          delta += 3+1;
   9695          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9696                                  nameXMMReg(gregOfRM(modrm)));
   9697       } else {
   9698          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9699 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9700          delta += 3+alen;
   9701          DIP("cvtpd2dq %s,%s\n", dis_buf,
   9702                                  nameXMMReg(gregOfRM(modrm)) );
   9703       }
   9704 
   9705       assign( rmode, get_sse_roundingmode() );
   9706       t0 = newTemp(Ity_F64);
   9707       t1 = newTemp(Ity_F64);
   9708       assign( t0, unop(Iop_ReinterpI64asF64,
   9709                        unop(Iop_V128to64, mkexpr(argV))) );
   9710       assign( t1, unop(Iop_ReinterpI64asF64,
   9711                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9712 
   9713 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9714                              mkexpr(rmode),                   \
   9715                              mkexpr(_t) )
   9716 
   9717       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9718       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9719       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9720       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9721 
   9722 #     undef CVT
   9723 
   9724       goto decode_success;
   9725    }
   9726 
   9727    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9728       I32 in mmx, according to prevailing SSE rounding mode */
   9729    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9730       I32 in mmx, rounding towards zero */
   9731    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   9732       IRTemp dst64  = newTemp(Ity_I64);
   9733       IRTemp rmode  = newTemp(Ity_I32);
   9734       IRTemp f64lo  = newTemp(Ity_F64);
   9735       IRTemp f64hi  = newTemp(Ity_F64);
   9736       Bool   r2zero = toBool(insn[1] == 0x2C);
   9737 
   9738       do_MMX_preamble();
   9739       modrm = getIByte(delta+2);
   9740 
   9741       if (epartIsReg(modrm)) {
   9742          delta += 2+1;
   9743 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9744 	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
   9745          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9746                                    nameXMMReg(eregOfRM(modrm)),
   9747                                    nameMMXReg(gregOfRM(modrm)));
   9748       } else {
   9749          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9750 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9751 	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
   9752                                               mkexpr(addr),
   9753                                               mkU32(8) )));
   9754          delta += 2+alen;
   9755          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   9756                                    dis_buf,
   9757                                    nameMMXReg(gregOfRM(modrm)));
   9758       }
   9759 
   9760       if (r2zero) {
   9761          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   9762       } else {
   9763          assign( rmode, get_sse_roundingmode() );
   9764       }
   9765 
   9766       assign(
   9767          dst64,
   9768          binop( Iop_32HLto64,
   9769                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   9770                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   9771               )
   9772       );
   9773 
   9774       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   9775       goto decode_success;
   9776    }
   9777 
   9778    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   9779       lo half xmm(G), and zero upper half */
   9780    /* Note, this is practically identical to CVTPD2DQ.  It would have
   9781       been nicer to merge them together, but the insn[] offsets differ
   9782       by one. */
   9783    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9784       IRTemp argV  = newTemp(Ity_V128);
   9785       IRTemp rmode = newTemp(Ity_I32);
   9786 
   9787       modrm = getIByte(delta+2);
   9788       if (epartIsReg(modrm)) {
   9789          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9790          delta += 2+1;
   9791          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9792                                  nameXMMReg(gregOfRM(modrm)));
   9793       } else {
   9794          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9795 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9796          delta += 2+alen;
   9797          DIP("cvtpd2ps %s,%s\n", dis_buf,
   9798                                  nameXMMReg(gregOfRM(modrm)) );
   9799       }
   9800 
   9801       assign( rmode, get_sse_roundingmode() );
   9802       t0 = newTemp(Ity_F64);
   9803       t1 = newTemp(Ity_F64);
   9804       assign( t0, unop(Iop_ReinterpI64asF64,
   9805                        unop(Iop_V128to64, mkexpr(argV))) );
   9806       assign( t1, unop(Iop_ReinterpI64asF64,
   9807                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9808 
   9809 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9810                              mkexpr(rmode),                   \
   9811                              mkexpr(_t) )
   9812 
   9813       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
   9814       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
   9815       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9816       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9817 
   9818 #     undef CVT
   9819 
   9820       goto decode_success;
   9821    }
   9822 
   9823    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   9824       xmm(G) */
   9825    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
   9826       IRTemp arg64 = newTemp(Ity_I64);
   9827 
   9828       modrm = getIByte(delta+2);
   9829       if (epartIsReg(modrm)) {
   9830          /* Only switch to MMX mode if the source is a MMX register.
   9831             This is inconsistent with all other instructions which
   9832             convert between XMM and (M64 or MMX), which always switch
   9833             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   9834             least, that's what the Intel docs seem to me to say.
   9835             Fixes #210264. */
   9836          do_MMX_preamble();
   9837          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   9838          delta += 2+1;
   9839          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9840                                  nameXMMReg(gregOfRM(modrm)));
   9841       } else {
   9842          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9843 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9844          delta += 2+alen;
   9845          DIP("cvtpi2pd %s,%s\n", dis_buf,
   9846                                  nameXMMReg(gregOfRM(modrm)) );
   9847       }
   9848 
   9849       putXMMRegLane64F(
   9850          gregOfRM(modrm), 0,
   9851          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   9852       );
   9853 
   9854       putXMMRegLane64F(
   9855          gregOfRM(modrm), 1,
   9856          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   9857       );
   9858 
   9859       goto decode_success;
   9860    }
   9861 
   9862    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9863       xmm(G) */
   9864    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9865       IRTemp argV  = newTemp(Ity_V128);
   9866       IRTemp rmode = newTemp(Ity_I32);
   9867 
   9868       modrm = getIByte(delta+2);
   9869       if (epartIsReg(modrm)) {
   9870          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9871          delta += 2+1;
   9872          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9873                                  nameXMMReg(gregOfRM(modrm)));
   9874       } else {
   9875          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9876 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9877          delta += 2+alen;
   9878          DIP("cvtps2dq %s,%s\n", dis_buf,
   9879                                  nameXMMReg(gregOfRM(modrm)) );
   9880       }
   9881 
   9882       assign( rmode, get_sse_roundingmode() );
   9883       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9884 
   9885       /* This is less than ideal.  If it turns out to be a performance
   9886 	 bottleneck it can be improved. */
   9887 #     define CVT(_t)                            \
   9888         binop( Iop_F64toI32S,                   \
   9889                mkexpr(rmode),                   \
   9890                unop( Iop_F32toF64,              \
   9891                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9892 
   9893       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9894       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9895       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9896       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9897 
   9898 #     undef CVT
   9899 
   9900       goto decode_success;
   9901    }
   9902 
   9903    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   9904       F64 in xmm(G). */
   9905    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9906       IRTemp f32lo = newTemp(Ity_F32);
   9907       IRTemp f32hi = newTemp(Ity_F32);
   9908 
   9909       modrm = getIByte(delta+2);
   9910       if (epartIsReg(modrm)) {
   9911          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
   9912          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
   9913          delta += 2+1;
   9914          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9915                                  nameXMMReg(gregOfRM(modrm)));
   9916       } else {
   9917          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9918 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   9919 	 assign( f32hi, loadLE(Ity_F32,
   9920                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
   9921          delta += 2+alen;
   9922          DIP("cvtps2pd %s,%s\n", dis_buf,
   9923                                  nameXMMReg(gregOfRM(modrm)) );
   9924       }
   9925 
   9926       putXMMRegLane64F( gregOfRM(modrm), 1,
   9927                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   9928       putXMMRegLane64F( gregOfRM(modrm), 0,
   9929                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   9930 
   9931       goto decode_success;
   9932    }
   9933 
   9934    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
   9935       I32 in ireg, according to prevailing SSE rounding mode */
   9936    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
   9937       I32 in ireg, rounding towards zero */
   9938    if (insn[0] == 0xF2 && insn[1] == 0x0F
   9939        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   9940       IRTemp rmode = newTemp(Ity_I32);
   9941       IRTemp f64lo = newTemp(Ity_F64);
   9942       Bool   r2zero = toBool(insn[2] == 0x2C);
   9943       vassert(sz == 4);
   9944 
   9945       modrm = getIByte(delta+3);
   9946       if (epartIsReg(modrm)) {
   9947          delta += 3+1;
   9948 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9949          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9950                                    nameXMMReg(eregOfRM(modrm)),
   9951                                    nameIReg(4, gregOfRM(modrm)));
   9952       } else {
   9953          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9954 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9955          delta += 3+alen;
   9956          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9957                                    dis_buf,
   9958                                    nameIReg(4, gregOfRM(modrm)));
   9959       }
   9960 
   9961       if (r2zero) {
   9962          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9963       } else {
   9964          assign( rmode, get_sse_roundingmode() );
   9965       }
   9966 
   9967       putIReg(4, gregOfRM(modrm),
   9968                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   9969 
   9970       goto decode_success;
   9971    }
   9972 
   9973    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   9974       low 1/4 xmm(G), according to prevailing SSE rounding mode */
   9975    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9976       IRTemp rmode = newTemp(Ity_I32);
   9977       IRTemp f64lo = newTemp(Ity_F64);
   9978       vassert(sz == 4);
   9979 
   9980       modrm = getIByte(delta+3);
   9981       if (epartIsReg(modrm)) {
   9982          delta += 3+1;
   9983 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9984          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9985                                  nameXMMReg(gregOfRM(modrm)));
   9986       } else {
   9987          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9988 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9989          delta += 3+alen;
   9990          DIP("cvtsd2ss %s,%s\n", dis_buf,
   9991                                  nameXMMReg(gregOfRM(modrm)));
   9992       }
   9993 
   9994       assign( rmode, get_sse_roundingmode() );
   9995       putXMMRegLane32F(
   9996          gregOfRM(modrm), 0,
   9997          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   9998       );
   9999 
   10000       goto decode_success;
   10001    }
   10002 
   10003    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
   10004       half xmm */
   10005    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
   10006       IRTemp arg32 = newTemp(Ity_I32);
   10007       vassert(sz == 4);
   10008 
   10009       modrm = getIByte(delta+3);
   10010       if (epartIsReg(modrm)) {
   10011          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   10012          delta += 3+1;
   10013          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   10014                                  nameXMMReg(gregOfRM(modrm)));
   10015       } else {
   10016          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10017 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   10018          delta += 3+alen;
   10019          DIP("cvtsi2sd %s,%s\n", dis_buf,
   10020                                  nameXMMReg(gregOfRM(modrm)) );
   10021       }
   10022 
   10023       putXMMRegLane64F(
   10024          gregOfRM(modrm), 0,
   10025          unop(Iop_I32StoF64, mkexpr(arg32)) );
   10026 
   10027       goto decode_success;
   10028    }
   10029 
   10030    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   10031       low half xmm(G) */
   10032    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
   10033       IRTemp f32lo = newTemp(Ity_F32);
   10034       vassert(sz == 4);
   10035 
   10036       modrm = getIByte(delta+3);
   10037       if (epartIsReg(modrm)) {
   10038          delta += 3+1;
   10039 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   10040          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10041                                  nameXMMReg(gregOfRM(modrm)));
   10042       } else {
   10043          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10044 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   10045          delta += 3+alen;
   10046          DIP("cvtss2sd %s,%s\n", dis_buf,
   10047                                  nameXMMReg(gregOfRM(modrm)));
   10048       }
   10049 
   10050       putXMMRegLane64F( gregOfRM(modrm), 0,
   10051                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   10052 
   10053       goto decode_success;
   10054    }
   10055 
   10056    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   10057       lo half xmm(G), and zero upper half, rounding towards zero */
   10058    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
   10059       IRTemp argV  = newTemp(Ity_V128);
   10060       IRTemp rmode = newTemp(Ity_I32);
   10061 
   10062       modrm = getIByte(delta+2);
   10063       if (epartIsReg(modrm)) {
   10064          assign( argV, getXMMReg(eregOfRM(modrm)) );
   10065          delta += 2+1;
   10066          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10067                                   nameXMMReg(gregOfRM(modrm)));
   10068       } else {
   10069          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10070 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10071          delta += 2+alen;
   10072          DIP("cvttpd2dq %s,%s\n", dis_buf,
   10073                                   nameXMMReg(gregOfRM(modrm)) );
   10074       }
   10075 
   10076       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10077 
   10078       t0 = newTemp(Ity_F64);
   10079       t1 = newTemp(Ity_F64);
   10080       assign( t0, unop(Iop_ReinterpI64asF64,
   10081                        unop(Iop_V128to64, mkexpr(argV))) );
   10082       assign( t1, unop(Iop_ReinterpI64asF64,
   10083                        unop(Iop_V128HIto64, mkexpr(argV))) );
   10084 
   10085 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   10086                              mkexpr(rmode),                   \
   10087                              mkexpr(_t) )
   10088 
   10089       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   10090       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   10091       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   10092       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   10093 
   10094 #     undef CVT
   10095 
   10096       goto decode_success;
   10097    }
   10098 
   10099    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   10100       xmm(G), rounding towards zero */
   10101    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
   10102       IRTemp argV  = newTemp(Ity_V128);
   10103       IRTemp rmode = newTemp(Ity_I32);
   10104       vassert(sz == 4);
   10105 
   10106       modrm = getIByte(delta+3);
   10107       if (epartIsReg(modrm)) {
   10108          assign( argV, getXMMReg(eregOfRM(modrm)) );
   10109          delta += 3+1;
   10110          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10111                                   nameXMMReg(gregOfRM(modrm)));
   10112       } else {
   10113          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10114 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10115          delta += 3+alen;
   10116          DIP("cvttps2dq %s,%s\n", dis_buf,
   10117                                   nameXMMReg(gregOfRM(modrm)) );
   10118       }
   10119 
   10120       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10121       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   10122 
   10123       /* This is less than ideal.  If it turns out to be a performance
   10124 	 bottleneck it can be improved. */
   10125 #     define CVT(_t)                            \
   10126         binop( Iop_F64toI32S,                   \
   10127                mkexpr(rmode),                   \
   10128                unop( Iop_F32toF64,              \
   10129                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10130 
   10131       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   10132       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   10133       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   10134       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   10135 
   10136 #     undef CVT
   10137 
   10138       goto decode_success;
   10139    }
   10140 
   10141    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   10142    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
   10143       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
   10144       goto decode_success;
   10145    }
   10146 
   10147    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   10148    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
   10149       vassert(sz == 4);
   10150       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
   10151       goto decode_success;
   10152    }
   10153 
   10154    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   10155    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   10156    if (insn[0] == 0x0F && insn[1] == 0xAE
   10157        && epartIsReg(insn[2])
   10158        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
   10159       vassert(sz == 4);
   10160       delta += 3;
   10161       /* Insert a memory fence.  It's sometimes important that these
   10162          are carried through to the generated code. */
   10163       stmt( IRStmt_MBE(Imbe_Fence) );
   10164       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
   10165       goto decode_success;
   10166    }
   10167 
   10168    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   10169    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
   10170       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
   10171       goto decode_success;
   10172    }
   10173 
   10174    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   10175    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
   10176       vassert(sz == 4);
   10177       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
   10178       goto decode_success;
   10179    }
   10180 
   10181    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   10182    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
   10183       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
   10184       goto decode_success;
   10185    }
   10186 
   10187    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   10188    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
   10189       vassert(sz == 4);
   10190       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
   10191       goto decode_success;
   10192    }
   10193 
   10194    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   10195    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   10196    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   10197    if (sz == 2 && insn[0] == 0x0F
   10198        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
   10199       const HChar* wot = insn[1]==0x28 ? "apd" :
   10200                          insn[1]==0x10 ? "upd" : "dqa";
   10201       modrm = getIByte(delta+2);
   10202       if (epartIsReg(modrm)) {
   10203          putXMMReg( gregOfRM(modrm),
   10204                     getXMMReg( eregOfRM(modrm) ));
   10205          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
   10206                                    nameXMMReg(gregOfRM(modrm)));
   10207          delta += 2+1;
   10208       } else {
   10209          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10210          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   10211             gen_SEGV_if_not_16_aligned( addr );
   10212          putXMMReg( gregOfRM(modrm),
   10213                     loadLE(Ity_V128, mkexpr(addr)) );
   10214          DIP("mov%s %s,%s\n", wot, dis_buf,
   10215                                    nameXMMReg(gregOfRM(modrm)));
   10216          delta += 2+alen;
   10217       }
   10218       goto decode_success;
   10219    }
   10220 
   10221    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   10222    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   10223    if (sz == 2 && insn[0] == 0x0F
   10224        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   10225       const HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   10226       modrm = getIByte(delta+2);
   10227       if (epartIsReg(modrm)) {
   10228          /* fall through; awaiting test case */
   10229       } else {
   10230          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10231          if (insn[1] == 0x29/*movapd*/)
   10232             gen_SEGV_if_not_16_aligned( addr );
   10233          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10234          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
   10235                                    dis_buf );
   10236          delta += 2+alen;
   10237          goto decode_success;
   10238       }
   10239    }
   10240 
   10241    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
   10242    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
   10243       modrm = getIByte(delta+2);
   10244       if (epartIsReg(modrm)) {
   10245          delta += 2+1;
   10246          putXMMReg(
   10247             gregOfRM(modrm),
   10248             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
   10249          );
   10250          DIP("movd %s, %s\n",
   10251              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
   10252       } else {
   10253          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10254          delta += 2+alen;
   10255          putXMMReg(
   10256             gregOfRM(modrm),
   10257             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   10258          );
   10259          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
   10260       }
   10261       goto decode_success;
   10262    }
   10263 
   10264    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
   10265    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
   10266       modrm = getIByte(delta+2);
   10267       if (epartIsReg(modrm)) {
   10268          delta += 2+1;
   10269          putIReg( 4, eregOfRM(modrm),
   10270                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10271          DIP("movd %s, %s\n",
   10272              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   10273       } else {
   10274          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10275          delta += 2+alen;
   10276          storeLE( mkexpr(addr),
   10277                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10278          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10279       }
   10280       goto decode_success;
   10281    }
   10282 
   10283    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   10284    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
   10285       modrm = getIByte(delta+2);
   10286       if (epartIsReg(modrm)) {
   10287          delta += 2+1;
   10288          putXMMReg( eregOfRM(modrm),
   10289                     getXMMReg(gregOfRM(modrm)) );
   10290          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10291                                 nameXMMReg(eregOfRM(modrm)));
   10292       } else {
   10293          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10294          delta += 2+alen;
   10295          gen_SEGV_if_not_16_aligned( addr );
   10296          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10297          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10298       }
   10299       goto decode_success;
   10300    }
   10301 
   10302    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   10303    /* Unfortunately can't simply use the MOVDQA case since the
   10304       prefix lengths are different (66 vs F3) */
   10305    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
   10306       vassert(sz == 4);
   10307       modrm = getIByte(delta+3);
   10308       if (epartIsReg(modrm)) {
   10309          putXMMReg( gregOfRM(modrm),
   10310                     getXMMReg( eregOfRM(modrm) ));
   10311          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10312                                nameXMMReg(gregOfRM(modrm)));
   10313          delta += 3+1;
   10314       } else {
   10315          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10316          putXMMReg( gregOfRM(modrm),
   10317                     loadLE(Ity_V128, mkexpr(addr)) );
   10318          DIP("movdqu %s,%s\n", dis_buf,
   10319                                nameXMMReg(gregOfRM(modrm)));
   10320          delta += 3+alen;
   10321       }
   10322       goto decode_success;
   10323    }
   10324 
   10325    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   10326    /* Unfortunately can't simply use the MOVDQA case since the
   10327       prefix lengths are different (66 vs F3) */
   10328    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
   10329       vassert(sz == 4);
   10330       modrm = getIByte(delta+3);
   10331       if (epartIsReg(modrm)) {
   10332          delta += 3+1;
   10333          putXMMReg( eregOfRM(modrm),
   10334                     getXMMReg(gregOfRM(modrm)) );
   10335          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10336                                 nameXMMReg(eregOfRM(modrm)));
   10337       } else {
   10338          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   10339          delta += 3+alen;
   10340          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10341          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10342       }
   10343       goto decode_success;
   10344    }
   10345 
   10346    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   10347    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10348       vassert(sz == 4);
   10349       modrm = getIByte(delta+3);
   10350       if (epartIsReg(modrm)) {
   10351          do_MMX_preamble();
   10352          putMMXReg( gregOfRM(modrm),
   10353                     getXMMRegLane64( eregOfRM(modrm), 0 ));
   10354          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10355                                 nameMMXReg(gregOfRM(modrm)));
   10356          delta += 3+1;
   10357          goto decode_success;
   10358       } else {
   10359          /* fall through, apparently no mem case for this insn */
   10360       }
   10361    }
   10362 
   10363    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   10364    /* This seems identical to MOVHPS.  This instruction encoding is
   10365       completely crazy. */
   10366    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
   10367       modrm = getIByte(delta+2);
   10368       if (epartIsReg(modrm)) {
   10369          /* fall through; apparently reg-reg is not possible */
   10370       } else {
   10371          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10372          delta += 2+alen;
   10373          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   10374                           loadLE(Ity_I64, mkexpr(addr)) );
   10375          DIP("movhpd %s,%s\n", dis_buf,
   10376                                nameXMMReg( gregOfRM(modrm) ));
   10377          goto decode_success;
   10378       }
   10379    }
   10380 
   10381    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   10382    /* Again, this seems identical to MOVHPS. */
   10383    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
   10384       if (!epartIsReg(insn[2])) {
   10385          delta += 2;
   10386          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10387          delta += alen;
   10388          storeLE( mkexpr(addr),
   10389                   getXMMRegLane64( gregOfRM(insn[2]),
   10390                                    1/*upper lane*/ ) );
   10391          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10392                                dis_buf);
   10393          goto decode_success;
   10394       }
   10395       /* else fall through */
   10396    }
   10397 
   10398    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   10399    /* Identical to MOVLPS ? */
   10400    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
   10401       modrm = getIByte(delta+2);
   10402       if (epartIsReg(modrm)) {
   10403          /* fall through; apparently reg-reg is not possible */
   10404       } else {
   10405          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10406          delta += 2+alen;
   10407          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   10408                           loadLE(Ity_I64, mkexpr(addr)) );
   10409          DIP("movlpd %s, %s\n",
   10410              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   10411          goto decode_success;
   10412       }
   10413    }
   10414 
   10415    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   10416    /* Identical to MOVLPS ? */
   10417    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
   10418       if (!epartIsReg(insn[2])) {
   10419          delta += 2;
   10420          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10421          delta += alen;
   10422          storeLE( mkexpr(addr),
   10423                   getXMMRegLane64( gregOfRM(insn[2]),
   10424                                    0/*lower lane*/ ) );
   10425          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10426                                 dis_buf);
   10427          goto decode_success;
   10428       }
   10429       /* else fall through */
   10430    }
   10431 
   10432    /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   10433       2 lowest bits of ireg(G) */
   10434    if (insn[0] == 0x0F && insn[1] == 0x50) {
   10435       modrm = getIByte(delta+2);
   10436       if (sz == 2 && epartIsReg(modrm)) {
   10437          Int src;
   10438          t0 = newTemp(Ity_I32);
   10439          t1 = newTemp(Ity_I32);
   10440          delta += 2+1;
   10441          src = eregOfRM(modrm);
   10442          assign( t0, binop( Iop_And32,
   10443                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
   10444                             mkU32(1) ));
   10445          assign( t1, binop( Iop_And32,
   10446                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
   10447                             mkU32(2) ));
   10448          putIReg(4, gregOfRM(modrm),
   10449                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
   10450                  );
   10451          DIP("movmskpd %s,%s\n", nameXMMReg(src),
   10452                                  nameIReg(4, gregOfRM(modrm)));
   10453          goto decode_success;
   10454       }
   10455       /* else fall through */
   10456    }
   10457 
   10458    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   10459    if (insn[0] == 0x0F && insn[1] == 0xF7) {
   10460       modrm = getIByte(delta+2);
   10461       if (sz == 2 && epartIsReg(modrm)) {
   10462          IRTemp regD    = newTemp(Ity_V128);
   10463          IRTemp mask    = newTemp(Ity_V128);
   10464          IRTemp olddata = newTemp(Ity_V128);
   10465          IRTemp newdata = newTemp(Ity_V128);
   10466                 addr    = newTemp(Ity_I32);
   10467 
   10468          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   10469          assign( regD, getXMMReg( gregOfRM(modrm) ));
   10470 
   10471          /* Unfortunately can't do the obvious thing with SarN8x16
   10472             here since that can't be re-emitted as SSE2 code - no such
   10473             insn. */
   10474 	 assign(
   10475             mask,
   10476             binop(Iop_64HLtoV128,
   10477                   binop(Iop_SarN8x8,
   10478                         getXMMRegLane64( eregOfRM(modrm), 1 ),
   10479                         mkU8(7) ),
   10480                   binop(Iop_SarN8x8,
   10481                         getXMMRegLane64( eregOfRM(modrm), 0 ),
   10482                         mkU8(7) ) ));
   10483          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10484          assign( newdata,
   10485                  binop(Iop_OrV128,
   10486                        binop(Iop_AndV128,
   10487                              mkexpr(regD),
   10488                              mkexpr(mask) ),
   10489                        binop(Iop_AndV128,
   10490                              mkexpr(olddata),
   10491                              unop(Iop_NotV128, mkexpr(mask)))) );
   10492          storeLE( mkexpr(addr), mkexpr(newdata) );
   10493 
   10494          delta += 2+1;
   10495          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
   10496                                    nameXMMReg( gregOfRM(modrm) ) );
   10497          goto decode_success;
   10498       }
   10499       /* else fall through */
   10500    }
   10501 
   10502    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   10503    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   10504       modrm = getIByte(delta+2);
   10505       if (sz == 2 && !epartIsReg(modrm)) {
   10506          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10507          gen_SEGV_if_not_16_aligned( addr );
   10508          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10509          DIP("movntdq %s,%s\n", dis_buf,
   10510                                 nameXMMReg(gregOfRM(modrm)));
   10511          delta += 2+alen;
   10512          goto decode_success;
   10513       }
   10514       /* else fall through */
   10515    }
   10516 
   10517    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   10518    if (insn[0] == 0x0F && insn[1] == 0xC3) {
   10519       vassert(sz == 4);
   10520       modrm = getIByte(delta+2);
   10521       if (!epartIsReg(modrm)) {
   10522          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10523          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
   10524          DIP("movnti %s,%s\n", dis_buf,
   10525                                nameIReg(4, gregOfRM(modrm)));
   10526          delta += 2+alen;
   10527          goto decode_success;
   10528       }
   10529       /* else fall through */
   10530    }
   10531 
   10532    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   10533       or lo half xmm).  */
   10534    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
   10535       modrm = getIByte(delta+2);
   10536       if (epartIsReg(modrm)) {
   10537          /* fall through, awaiting test case */
   10538          /* dst: lo half copied, hi half zeroed */
   10539       } else {
   10540          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10541          storeLE( mkexpr(addr),
   10542                   getXMMRegLane64( gregOfRM(modrm), 0 ));
   10543          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
   10544          delta += 2+alen;
   10545          goto decode_success;
   10546       }
   10547    }
   10548 
   10549    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   10550       hi half). */
   10551    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10552       vassert(sz == 4);
   10553       modrm = getIByte(delta+3);
   10554       if (epartIsReg(modrm)) {
   10555          do_MMX_preamble();
   10556          putXMMReg( gregOfRM(modrm),
   10557                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
   10558          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10559                                 nameXMMReg(gregOfRM(modrm)));
   10560          delta += 3+1;
   10561          goto decode_success;
   10562       } else {
   10563          /* fall through, apparently no mem case for this insn */
   10564       }
   10565    }
   10566 
   10567    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   10568       G (lo half xmm).  Upper half of G is zeroed out. */
   10569    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   10570       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   10571       If E is reg, upper half of G is unchanged. */
   10572    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
   10573        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
   10574       vassert(sz == 4);
   10575       modrm = getIByte(delta+3);
   10576       if (epartIsReg(modrm)) {
   10577          putXMMRegLane64( gregOfRM(modrm), 0,
   10578                           getXMMRegLane64( eregOfRM(modrm), 0 ));
   10579          if (insn[0] == 0xF3/*MOVQ*/) {
   10580             /* zero bits 127:64 */
   10581             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10582          }
   10583          DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10584                               nameXMMReg(gregOfRM(modrm)));
   10585          delta += 3+1;
   10586       } else {
   10587          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10588          /* zero bits 127:64 */
   10589          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10590          /* write bits 63:0 */
   10591          putXMMRegLane64( gregOfRM(modrm), 0,
   10592                           loadLE(Ity_I64, mkexpr(addr)) );
   10593          DIP("movsd %s,%s\n", dis_buf,
   10594                               nameXMMReg(gregOfRM(modrm)));
   10595          delta += 3+alen;
   10596       }
   10597       goto decode_success;
   10598    }
   10599 
   10600    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   10601       or lo half xmm). */
   10602    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
   10603       vassert(sz == 4);
   10604       modrm = getIByte(delta+3);
   10605       if (epartIsReg(modrm)) {
   10606          putXMMRegLane64( eregOfRM(modrm), 0,
   10607                           getXMMRegLane64( gregOfRM(modrm), 0 ));
   10608          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10609                               nameXMMReg(eregOfRM(modrm)));
   10610          delta += 3+1;
   10611       } else {
   10612          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10613          storeLE( mkexpr(addr),
   10614                   getXMMRegLane64(gregOfRM(modrm), 0) );
   10615          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10616                               dis_buf);
   10617          delta += 3+alen;
   10618       }
   10619       goto decode_success;
   10620    }
   10621 
   10622    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   10623    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
   10624       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
   10625       goto decode_success;
   10626    }
   10627 
   10628    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   10629    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
   10630       vassert(sz == 4);
   10631       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
   10632       goto decode_success;
   10633    }
   10634 
   10635    /* 66 0F 56 = ORPD -- G = G or E */
   10636    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
   10637       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
   10638       goto decode_success;
   10639    }
   10640 
   10641    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   10642    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
   10643       Int    select;
   10644       IRTemp sV = newTemp(Ity_V128);
   10645       IRTemp dV = newTemp(Ity_V128);
   10646       IRTemp s1 = newTemp(Ity_I64);
   10647       IRTemp s0 = newTemp(Ity_I64);
   10648       IRTemp d1 = newTemp(Ity_I64);
   10649       IRTemp d0 = newTemp(Ity_I64);
   10650 
   10651       modrm = insn[2];
   10652       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10653 
   10654       if (epartIsReg(modrm)) {
   10655          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10656          select = (Int)insn[3];
   10657          delta += 2+2;
   10658          DIP("shufpd $%d,%s,%s\n", select,
   10659                                    nameXMMReg(eregOfRM(modrm)),
   10660                                    nameXMMReg(gregOfRM(modrm)));
   10661       } else {
   10662          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10663          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10664          select = (Int)insn[2+alen];
   10665          delta += 3+alen;
   10666          DIP("shufpd $%d,%s,%s\n", select,
   10667                                    dis_buf,
   10668                                    nameXMMReg(gregOfRM(modrm)));
   10669       }
   10670 
   10671       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10672       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10673       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10674       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10675 
   10676 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10677 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10678 
   10679       putXMMReg(
   10680          gregOfRM(modrm),
   10681          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
   10682       );
   10683 
   10684 #     undef SELD
   10685 #     undef SELS
   10686 
   10687       goto decode_success;
   10688    }
   10689 
   10690    /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
   10691    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
   10692       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   10693                                         "sqrtpd", Iop_Sqrt64Fx2 );
   10694       goto decode_success;
   10695    }
   10696 
   10697    /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
   10698    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
   10699       vassert(sz == 4);
   10700       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
   10701                                          "sqrtsd", Iop_Sqrt64F0x2 );
   10702       goto decode_success;
   10703    }
   10704 
   10705    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   10706    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
   10707       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
   10708       goto decode_success;
   10709    }
   10710 
   10711    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   10712    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
   10713       vassert(sz == 4);
   10714       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
   10715       goto decode_success;
   10716    }
   10717 
   10718    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   10719    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   10720    /* These just appear to be special cases of SHUFPD */
   10721    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   10722       IRTemp s1 = newTemp(Ity_I64);
   10723       IRTemp s0 = newTemp(Ity_I64);
   10724       IRTemp d1 = newTemp(Ity_I64);
   10725       IRTemp d0 = newTemp(Ity_I64);
   10726       IRTemp sV = newTemp(Ity_V128);
   10727       IRTemp dV = newTemp(Ity_V128);
   10728       Bool   hi = toBool(insn[1] == 0x15);
   10729 
   10730       modrm = insn[2];
   10731       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10732 
   10733       if (epartIsReg(modrm)) {
   10734          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10735          delta += 2+1;
   10736          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10737                                   nameXMMReg(eregOfRM(modrm)),
   10738                                   nameXMMReg(gregOfRM(modrm)));
   10739       } else {
   10740          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10741          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10742          delta += 2+alen;
   10743          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10744                                   dis_buf,
   10745                                   nameXMMReg(gregOfRM(modrm)));
   10746       }
   10747 
   10748       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10749       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10750       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10751       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10752 
   10753       if (hi) {
   10754          putXMMReg( gregOfRM(modrm),
   10755                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   10756       } else {
   10757          putXMMReg( gregOfRM(modrm),
   10758                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   10759       }
   10760 
   10761       goto decode_success;
   10762    }
   10763 
   10764    /* 66 0F 57 = XORPD -- G = G xor E */
   10765    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
   10766       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
   10767       goto decode_success;
   10768    }
   10769 
   10770    /* 66 0F 6B = PACKSSDW */
   10771    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
   10772       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10773                                  "packssdw",
   10774                                  Iop_QNarrowBin32Sto16Sx8, True );
   10775       goto decode_success;
   10776    }
   10777 
   10778    /* 66 0F 63 = PACKSSWB */
   10779    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
   10780       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10781                                  "packsswb",
   10782                                  Iop_QNarrowBin16Sto8Sx16, True );
   10783       goto decode_success;
   10784    }
   10785 
   10786    /* 66 0F 67 = PACKUSWB */
   10787    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
   10788       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10789                                  "packuswb",
   10790                                  Iop_QNarrowBin16Sto8Ux16, True );
   10791       goto decode_success;
   10792    }
   10793 
   10794    /* 66 0F FC = PADDB */
   10795    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
   10796       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10797                                  "paddb", Iop_Add8x16, False );
   10798       goto decode_success;
   10799    }
   10800 
   10801    /* 66 0F FE = PADDD */
   10802    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
   10803       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10804                                  "paddd", Iop_Add32x4, False );
   10805       goto decode_success;
   10806    }
   10807 
   10808    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10809    /* 0F D4 = PADDQ -- add 64x1 */
   10810    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10811       do_MMX_preamble();
   10812       delta = dis_MMXop_regmem_to_reg (
   10813                 sorb, delta+2, insn[1], "paddq", False );
   10814       goto decode_success;
   10815    }
   10816 
   10817    /* 66 0F D4 = PADDQ */
   10818    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10819       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10820                                  "paddq", Iop_Add64x2, False );
   10821       goto decode_success;
   10822    }
   10823 
   10824    /* 66 0F FD = PADDW */
   10825    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
   10826       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10827                                  "paddw", Iop_Add16x8, False );
   10828       goto decode_success;
   10829    }
   10830 
   10831    /* 66 0F EC = PADDSB */
   10832    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
   10833       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10834                                  "paddsb", Iop_QAdd8Sx16, False );
   10835       goto decode_success;
   10836    }
   10837 
   10838    /* 66 0F ED = PADDSW */
   10839    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
   10840       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10841                                  "paddsw", Iop_QAdd16Sx8, False );
   10842       goto decode_success;
   10843    }
   10844 
   10845    /* 66 0F DC = PADDUSB */
   10846    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
   10847       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10848                                  "paddusb", Iop_QAdd8Ux16, False );
   10849       goto decode_success;
   10850    }
   10851 
   10852    /* 66 0F DD = PADDUSW */
   10853    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
   10854       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10855                                  "paddusw", Iop_QAdd16Ux8, False );
   10856       goto decode_success;
   10857    }
   10858 
   10859    /* 66 0F DB = PAND */
   10860    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
   10861       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
   10862       goto decode_success;
   10863    }
   10864 
   10865    /* 66 0F DF = PANDN */
   10866    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
   10867       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
   10868       goto decode_success;
   10869    }
   10870 
   10871    /* 66 0F E0 = PAVGB */
   10872    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
   10873       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10874                                  "pavgb", Iop_Avg8Ux16, False );
   10875       goto decode_success;
   10876    }
   10877 
   10878    /* 66 0F E3 = PAVGW */
   10879    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
   10880       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10881                                  "pavgw", Iop_Avg16Ux8, False );
   10882       goto decode_success;
   10883    }
   10884 
   10885    /* 66 0F 74 = PCMPEQB */
   10886    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
   10887       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10888                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   10889       goto decode_success;
   10890    }
   10891 
   10892    /* 66 0F 76 = PCMPEQD */
   10893    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
   10894       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10895                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   10896       goto decode_success;
   10897    }
   10898 
   10899    /* 66 0F 75 = PCMPEQW */
   10900    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
   10901       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10902                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   10903       goto decode_success;
   10904    }
   10905 
   10906    /* 66 0F 64 = PCMPGTB */
   10907    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
   10908       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10909                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   10910       goto decode_success;
   10911    }
   10912 
   10913    /* 66 0F 66 = PCMPGTD */
   10914    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
   10915       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10916                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   10917       goto decode_success;
   10918    }
   10919 
   10920    /* 66 0F 65 = PCMPGTW */
   10921    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
   10922       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10923                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   10924       goto decode_success;
   10925    }
   10926 
   10927    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   10928       zero-extend of it in ireg(G). */
   10929    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   10930       modrm = insn[2];
   10931       if (sz == 2 && epartIsReg(modrm)) {
   10932          t5 = newTemp(Ity_V128);
   10933          t4 = newTemp(Ity_I16);
   10934          assign(t5, getXMMReg(eregOfRM(modrm)));
   10935          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
   10936          switch (insn[3] & 7) {
   10937             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
   10938             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
   10939             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
   10940             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
   10941             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
   10942             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
   10943             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
   10944             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
   10945             default: vassert(0); /*NOTREACHED*/
   10946          }
   10947          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
   10948          DIP("pextrw $%d,%s,%s\n",
   10949              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
   10950                            nameIReg(4,gregOfRM(modrm)));
   10951          delta += 4;
   10952          goto decode_success;
   10953       }
   10954       /* else fall through */
   10955    }
   10956 
   10957    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   10958       put it into the specified lane of xmm(G). */
   10959    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
   10960       Int lane;
   10961       t4 = newTemp(Ity_I16);
   10962       modrm = insn[2];
   10963 
   10964       if (epartIsReg(modrm)) {
   10965          assign(t4, getIReg(2, eregOfRM(modrm)));
   10966          delta += 3+1;
   10967          lane = insn[3+1-1];
   10968          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10969                                    nameIReg(2,eregOfRM(modrm)),
   10970                                    nameXMMReg(gregOfRM(modrm)));
   10971       } else {
   10972          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10973          delta += 3+alen;
   10974          lane = insn[3+alen-1];
   10975          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   10976          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10977                                    dis_buf,
   10978                                    nameXMMReg(gregOfRM(modrm)));
   10979       }
   10980 
   10981       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
   10982       goto decode_success;
   10983    }
   10984 
   10985    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   10986       E(xmm or mem) to G(xmm) */
   10987    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
   10988       IRTemp s1V  = newTemp(Ity_V128);
   10989       IRTemp s2V  = newTemp(Ity_V128);
   10990       IRTemp dV   = newTemp(Ity_V128);
   10991       IRTemp s1Hi = newTemp(Ity_I64);
   10992       IRTemp s1Lo = newTemp(Ity_I64);
   10993       IRTemp s2Hi = newTemp(Ity_I64);
   10994       IRTemp s2Lo = newTemp(Ity_I64);
   10995       IRTemp dHi  = newTemp(Ity_I64);
   10996       IRTemp dLo  = newTemp(Ity_I64);
   10997       modrm = insn[2];
   10998       if (epartIsReg(modrm)) {
   10999          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11000          delta += 2+1;
   11001          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11002                                 nameXMMReg(gregOfRM(modrm)));
   11003       } else {
   11004          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11005          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11006          delta += 2+alen;
   11007          DIP("pmaddwd %s,%s\n", dis_buf,
   11008                                 nameXMMReg(gregOfRM(modrm)));
   11009       }
   11010       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11011       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11012       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11013       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11014       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11015       assign( dHi, mkIRExprCCall(
   11016                       Ity_I64, 0/*regparms*/,
   11017                       "x86g_calculate_mmx_pmaddwd",
   11018                       &x86g_calculate_mmx_pmaddwd,
   11019                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11020                    ));
   11021       assign( dLo, mkIRExprCCall(
   11022                       Ity_I64, 0/*regparms*/,
   11023                       "x86g_calculate_mmx_pmaddwd",
   11024                       &x86g_calculate_mmx_pmaddwd,
   11025                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11026                    ));
   11027       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11028       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11029       goto decode_success;
   11030    }
   11031 
   11032    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   11033    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
   11034       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11035                                  "pmaxsw", Iop_Max16Sx8, False );
   11036       goto decode_success;
   11037    }
   11038 
   11039    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   11040    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
   11041       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11042                                  "pmaxub", Iop_Max8Ux16, False );
   11043       goto decode_success;
   11044    }
   11045 
   11046    /* 66 0F EA = PMINSW -- 16x8 signed min */
   11047    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
   11048       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11049                                  "pminsw", Iop_Min16Sx8, False );
   11050       goto decode_success;
   11051    }
   11052 
   11053    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   11054    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
   11055       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11056                                  "pminub", Iop_Min8Ux16, False );
   11057       goto decode_success;
   11058    }
   11059 
   11060    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
   11061       in xmm(E), turn them into a byte, and put zero-extend of it in
   11062       ireg(G). */
   11063    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
   11064       modrm = insn[2];
   11065       if (epartIsReg(modrm)) {
   11066          t0 = newTemp(Ity_I64);
   11067          t1 = newTemp(Ity_I64);
   11068          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
   11069          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
   11070          t5 = newTemp(Ity_I32);
   11071          assign(t5,
   11072                 unop(Iop_16Uto32,
   11073                      binop(Iop_8HLto16,
   11074                            unop(Iop_GetMSBs8x8, mkexpr(t1)),
   11075                            unop(Iop_GetMSBs8x8, mkexpr(t0)))));
   11076          putIReg(4, gregOfRM(modrm), mkexpr(t5));
   11077          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11078                                  nameIReg(4,gregOfRM(modrm)));
   11079          delta += 3;
   11080          goto decode_success;
   11081       }
   11082       /* else fall through */
   11083    }
   11084 
   11085    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   11086    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
   11087       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11088                                  "pmulhuw", Iop_MulHi16Ux8, False );
   11089       goto decode_success;
   11090    }
   11091 
   11092    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   11093    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
   11094       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11095                                  "pmulhw", Iop_MulHi16Sx8, False );
   11096       goto decode_success;
   11097    }
   11098 
   11099    /* 66 0F D5 = PMULLW -- 16x8 multiply */
   11100    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
   11101       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11102                                  "pmullw", Iop_Mul16x8, False );
   11103       goto decode_success;
   11104    }
   11105 
   11106    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11107    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   11108       0 to form 64-bit result */
   11109    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
   11110       IRTemp sV = newTemp(Ity_I64);
   11111       IRTemp dV = newTemp(Ity_I64);
   11112       t1 = newTemp(Ity_I32);
   11113       t0 = newTemp(Ity_I32);
   11114       modrm = insn[2];
   11115 
   11116       do_MMX_preamble();
   11117       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11118 
   11119       if (epartIsReg(modrm)) {
   11120          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11121          delta += 2+1;
   11122          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   11123                                 nameMMXReg(gregOfRM(modrm)));
   11124       } else {
   11125          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11126          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11127          delta += 2+alen;
   11128          DIP("pmuludq %s,%s\n", dis_buf,
   11129                                 nameMMXReg(gregOfRM(modrm)));
   11130       }
   11131 
   11132       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   11133       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   11134       putMMXReg( gregOfRM(modrm),
   11135                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   11136       goto decode_success;
   11137    }
   11138 
   11139    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   11140       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   11141       half */
   11142    /* This is a really poor translation -- could be improved if
   11143       performance critical */
   11144    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
   11145       IRTemp sV, dV;
   11146       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11147       sV = newTemp(Ity_V128);
   11148       dV = newTemp(Ity_V128);
   11149       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11150       t1 = newTemp(Ity_I64);
   11151       t0 = newTemp(Ity_I64);
   11152       modrm = insn[2];
   11153       assign( dV, getXMMReg(gregOfRM(modrm)) );
   11154 
   11155       if (epartIsReg(modrm)) {
   11156          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11157          delta += 2+1;
   11158          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11159                                 nameXMMReg(gregOfRM(modrm)));
   11160       } else {
   11161          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11162          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11163          delta += 2+alen;
   11164          DIP("pmuludq %s,%s\n", dis_buf,
   11165                                 nameXMMReg(gregOfRM(modrm)));
   11166       }
   11167 
   11168       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   11169       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11170 
   11171       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   11172       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
   11173       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   11174       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
   11175       goto decode_success;
   11176    }
   11177 
   11178    /* 66 0F EB = POR */
   11179    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
   11180       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
   11181       goto decode_success;
   11182    }
   11183 
   11184    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   11185       from E(xmm or mem) to G(xmm) */
   11186    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
   11187       IRTemp s1V  = newTemp(Ity_V128);
   11188       IRTemp s2V  = newTemp(Ity_V128);
   11189       IRTemp dV   = newTemp(Ity_V128);
   11190       IRTemp s1Hi = newTemp(Ity_I64);
   11191       IRTemp s1Lo = newTemp(Ity_I64);
   11192       IRTemp s2Hi = newTemp(Ity_I64);
   11193       IRTemp s2Lo = newTemp(Ity_I64);
   11194       IRTemp dHi  = newTemp(Ity_I64);
   11195       IRTemp dLo  = newTemp(Ity_I64);
   11196       modrm = insn[2];
   11197       if (epartIsReg(modrm)) {
   11198          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11199          delta += 2+1;
   11200          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11201                                nameXMMReg(gregOfRM(modrm)));
   11202       } else {
   11203          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11204          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11205          delta += 2+alen;
   11206          DIP("psadbw %s,%s\n", dis_buf,
   11207                                nameXMMReg(gregOfRM(modrm)));
   11208       }
   11209       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11210       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11211       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11212       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11213       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11214       assign( dHi, mkIRExprCCall(
   11215                       Ity_I64, 0/*regparms*/,
   11216                       "x86g_calculate_mmx_psadbw",
   11217                       &x86g_calculate_mmx_psadbw,
   11218                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11219                    ));
   11220       assign( dLo, mkIRExprCCall(
   11221                       Ity_I64, 0/*regparms*/,
   11222                       "x86g_calculate_mmx_psadbw",
   11223                       &x86g_calculate_mmx_psadbw,
   11224                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11225                    ));
   11226       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11227       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11228       goto decode_success;
   11229    }
   11230 
   11231    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   11232    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
   11233       Int order;
   11234       IRTemp sV, dV, s3, s2, s1, s0;
   11235       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11236       sV = newTemp(Ity_V128);
   11237       dV = newTemp(Ity_V128);
   11238       modrm = insn[2];
   11239       if (epartIsReg(modrm)) {
   11240          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11241          order = (Int)insn[3];
   11242          delta += 2+2;
   11243          DIP("pshufd $%d,%s,%s\n", order,
   11244                                    nameXMMReg(eregOfRM(modrm)),
   11245                                    nameXMMReg(gregOfRM(modrm)));
   11246       } else {
   11247          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11248          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11249 	 order = (Int)insn[2+alen];
   11250          delta += 3+alen;
   11251          DIP("pshufd $%d,%s,%s\n", order,
   11252                                    dis_buf,
   11253                                    nameXMMReg(gregOfRM(modrm)));
   11254       }
   11255       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11256 
   11257 #     define SEL(n) \
   11258                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11259       assign(dV,
   11260 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   11261                            SEL((order>>2)&3), SEL((order>>0)&3) )
   11262       );
   11263       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11264 #     undef SEL
   11265       goto decode_success;
   11266    }
   11267 
   11268    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   11269       mem) to G(xmm), and copy lower half */
   11270    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
   11271       Int order;
   11272       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   11273       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11274       sV   = newTemp(Ity_V128);
   11275       dV   = newTemp(Ity_V128);
   11276       sVhi = newTemp(Ity_I64);
   11277       dVhi = newTemp(Ity_I64);
   11278       modrm = insn[3];
   11279       if (epartIsReg(modrm)) {
   11280          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11281          order = (Int)insn[4];
   11282          delta += 4+1;
   11283          DIP("pshufhw $%d,%s,%s\n", order,
   11284                                     nameXMMReg(eregOfRM(modrm)),
   11285                                     nameXMMReg(gregOfRM(modrm)));
   11286       } else {
   11287          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11288          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11289 	 order = (Int)insn[3+alen];
   11290          delta += 4+alen;
   11291          DIP("pshufhw $%d,%s,%s\n", order,
   11292                                     dis_buf,
   11293                                     nameXMMReg(gregOfRM(modrm)));
   11294       }
   11295       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   11296       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   11297 
   11298 #     define SEL(n) \
   11299                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11300       assign(dVhi,
   11301 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11302                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11303       );
   11304       assign(dV, binop( Iop_64HLtoV128,
   11305                         mkexpr(dVhi),
   11306                         unop(Iop_V128to64, mkexpr(sV))) );
   11307       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11308 #     undef SEL
   11309       goto decode_success;
   11310    }
   11311 
   11312    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   11313       mem) to G(xmm), and copy upper half */
   11314    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
   11315       Int order;
   11316       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   11317       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11318       sV   = newTemp(Ity_V128);
   11319       dV   = newTemp(Ity_V128);
   11320       sVlo = newTemp(Ity_I64);
   11321       dVlo = newTemp(Ity_I64);
   11322       modrm = insn[3];
   11323       if (epartIsReg(modrm)) {
   11324          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11325          order = (Int)insn[4];
   11326          delta += 4+1;
   11327          DIP("pshuflw $%d,%s,%s\n", order,
   11328                                     nameXMMReg(eregOfRM(modrm)),
   11329                                     nameXMMReg(gregOfRM(modrm)));
   11330       } else {
   11331          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11332          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11333 	 order = (Int)insn[3+alen];
   11334          delta += 4+alen;
   11335          DIP("pshuflw $%d,%s,%s\n", order,
   11336                                     dis_buf,
   11337                                     nameXMMReg(gregOfRM(modrm)));
   11338       }
   11339       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   11340       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   11341 
   11342 #     define SEL(n) \
   11343                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11344       assign(dVlo,
   11345 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11346                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11347       );
   11348       assign(dV, binop( Iop_64HLtoV128,
   11349                         unop(Iop_V128HIto64, mkexpr(sV)),
   11350                         mkexpr(dVlo) ) );
   11351       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11352 #     undef SEL
   11353       goto decode_success;
   11354    }
   11355 
   11356    /* 66 0F 72 /6 ib = PSLLD by immediate */
   11357    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11358        && epartIsReg(insn[2])
   11359        && gregOfRM(insn[2]) == 6) {
   11360       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
   11361       goto decode_success;
   11362    }
   11363 
   11364    /* 66 0F F2 = PSLLD by E */
   11365    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
   11366       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
   11367       goto decode_success;
   11368    }
   11369 
   11370    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   11371    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11372        && epartIsReg(insn[2])
   11373        && gregOfRM(insn[2]) == 7) {
   11374       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11375       Int    imm = (Int)insn[3];
   11376       Int    reg = eregOfRM(insn[2]);
   11377       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   11378       vassert(imm >= 0 && imm <= 255);
   11379       delta += 4;
   11380 
   11381       sV    = newTemp(Ity_V128);
   11382       dV    = newTemp(Ity_V128);
   11383       hi64  = newTemp(Ity_I64);
   11384       lo64  = newTemp(Ity_I64);
   11385       hi64r = newTemp(Ity_I64);
   11386       lo64r = newTemp(Ity_I64);
   11387 
   11388       if (imm >= 16) {
   11389          putXMMReg(reg, mkV128(0x0000));
   11390          goto decode_success;
   11391       }
   11392 
   11393       assign( sV, getXMMReg(reg) );
   11394       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11395       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11396 
   11397       if (imm == 0) {
   11398          assign( lo64r, mkexpr(lo64) );
   11399          assign( hi64r, mkexpr(hi64) );
   11400       }
   11401       else
   11402       if (imm == 8) {
   11403          assign( lo64r, mkU64(0) );
   11404          assign( hi64r, mkexpr(lo64) );
   11405       }
   11406       else
   11407       if (imm > 8) {
   11408          assign( lo64r, mkU64(0) );
   11409          assign( hi64r, binop( Iop_Shl64,
   11410                                mkexpr(lo64),
   11411                                mkU8( 8*(imm-8) ) ));
   11412       } else {
   11413          assign( lo64r, binop( Iop_Shl64,
   11414                                mkexpr(lo64),
   11415                                mkU8(8 * imm) ));
   11416          assign( hi64r,
   11417                  binop( Iop_Or64,
   11418                         binop(Iop_Shl64, mkexpr(hi64),
   11419                                          mkU8(8 * imm)),
   11420                         binop(Iop_Shr64, mkexpr(lo64),
   11421                                          mkU8(8 * (8 - imm)) )
   11422                       )
   11423                );
   11424       }
   11425       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11426       putXMMReg(reg, mkexpr(dV));
   11427       goto decode_success;
   11428    }
   11429 
   11430    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   11431    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11432        && epartIsReg(insn[2])
   11433        && gregOfRM(insn[2]) == 6) {
   11434       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
   11435       goto decode_success;
   11436    }
   11437 
   11438    /* 66 0F F3 = PSLLQ by E */
   11439    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
   11440       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
   11441       goto decode_success;
   11442    }
   11443 
   11444    /* 66 0F 71 /6 ib = PSLLW by immediate */
   11445    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11446        && epartIsReg(insn[2])
   11447        && gregOfRM(insn[2]) == 6) {
   11448       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
   11449       goto decode_success;
   11450    }
   11451 
   11452    /* 66 0F F1 = PSLLW by E */
   11453    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
   11454       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
   11455       goto decode_success;
   11456    }
   11457 
   11458    /* 66 0F 72 /4 ib = PSRAD by immediate */
   11459    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11460        && epartIsReg(insn[2])
   11461        && gregOfRM(insn[2]) == 4) {
   11462       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
   11463       goto decode_success;
   11464    }
   11465 
   11466    /* 66 0F E2 = PSRAD by E */
   11467    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
   11468       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
   11469       goto decode_success;
   11470    }
   11471 
   11472    /* 66 0F 71 /4 ib = PSRAW by immediate */
   11473    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11474        && epartIsReg(insn[2])
   11475        && gregOfRM(insn[2]) == 4) {
   11476       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
   11477       goto decode_success;
   11478    }
   11479 
   11480    /* 66 0F E1 = PSRAW by E */
   11481    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
   11482       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
   11483       goto decode_success;
   11484    }
   11485 
   11486    /* 66 0F 72 /2 ib = PSRLD by immediate */
   11487    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11488        && epartIsReg(insn[2])
   11489        && gregOfRM(insn[2]) == 2) {
   11490       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
   11491       goto decode_success;
   11492    }
   11493 
   11494    /* 66 0F D2 = PSRLD by E */
   11495    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
   11496       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
   11497       goto decode_success;
   11498    }
   11499 
   11500    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   11501    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11502        && epartIsReg(insn[2])
   11503        && gregOfRM(insn[2]) == 3) {
   11504       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11505       Int    imm = (Int)insn[3];
   11506       Int    reg = eregOfRM(insn[2]);
   11507       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   11508       vassert(imm >= 0 && imm <= 255);
   11509       delta += 4;
   11510 
   11511       sV    = newTemp(Ity_V128);
   11512       dV    = newTemp(Ity_V128);
   11513       hi64  = newTemp(Ity_I64);
   11514       lo64  = newTemp(Ity_I64);
   11515       hi64r = newTemp(Ity_I64);
   11516       lo64r = newTemp(Ity_I64);
   11517 
   11518       if (imm >= 16) {
   11519          putXMMReg(reg, mkV128(0x0000));
   11520          goto decode_success;
   11521       }
   11522 
   11523       assign( sV, getXMMReg(reg) );
   11524       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11525       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11526 
   11527       if (imm == 0) {
   11528          assign( lo64r, mkexpr(lo64) );
   11529          assign( hi64r, mkexpr(hi64) );
   11530       }
   11531       else
   11532       if (imm == 8) {
   11533          assign( hi64r, mkU64(0) );
   11534          assign( lo64r, mkexpr(hi64) );
   11535       }
   11536       else
   11537       if (imm > 8) {
   11538          assign( hi64r, mkU64(0) );
   11539          assign( lo64r, binop( Iop_Shr64,
   11540                                mkexpr(hi64),
   11541                                mkU8( 8*(imm-8) ) ));
   11542       } else {
   11543          assign( hi64r, binop( Iop_Shr64,
   11544                                mkexpr(hi64),
   11545                                mkU8(8 * imm) ));
   11546          assign( lo64r,
   11547                  binop( Iop_Or64,
   11548                         binop(Iop_Shr64, mkexpr(lo64),
   11549                                          mkU8(8 * imm)),
   11550                         binop(Iop_Shl64, mkexpr(hi64),
   11551                                          mkU8(8 * (8 - imm)) )
   11552                       )
   11553                );
   11554       }
   11555 
   11556       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11557       putXMMReg(reg, mkexpr(dV));
   11558       goto decode_success;
   11559    }
   11560 
   11561    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   11562    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11563        && epartIsReg(insn[2])
   11564        && gregOfRM(insn[2]) == 2) {
   11565       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
   11566       goto decode_success;
   11567    }
   11568 
   11569    /* 66 0F D3 = PSRLQ by E */
   11570    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
   11571       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
   11572       goto decode_success;
   11573    }
   11574 
   11575    /* 66 0F 71 /2 ib = PSRLW by immediate */
   11576    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11577        && epartIsReg(insn[2])
   11578        && gregOfRM(insn[2]) == 2) {
   11579       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
   11580       goto decode_success;
   11581    }
   11582 
   11583    /* 66 0F D1 = PSRLW by E */
   11584    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
   11585       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
   11586       goto decode_success;
   11587    }
   11588 
   11589    /* 66 0F F8 = PSUBB */
   11590    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
   11591       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11592                                  "psubb", Iop_Sub8x16, False );
   11593       goto decode_success;
   11594    }
   11595 
   11596    /* 66 0F FA = PSUBD */
   11597    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
   11598       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11599                                  "psubd", Iop_Sub32x4, False );
   11600       goto decode_success;
   11601    }
   11602 
   11603    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11604    /* 0F FB = PSUBQ -- sub 64x1 */
   11605    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11606       do_MMX_preamble();
   11607       delta = dis_MMXop_regmem_to_reg (
   11608                 sorb, delta+2, insn[1], "psubq", False );
   11609       goto decode_success;
   11610    }
   11611 
   11612    /* 66 0F FB = PSUBQ */
   11613    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11614       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11615                                  "psubq", Iop_Sub64x2, False );
   11616       goto decode_success;
   11617    }
   11618 
   11619    /* 66 0F F9 = PSUBW */
   11620    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
   11621       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11622                                  "psubw", Iop_Sub16x8, False );
   11623       goto decode_success;
   11624    }
   11625 
   11626    /* 66 0F E8 = PSUBSB */
   11627    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
   11628       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11629                                  "psubsb", Iop_QSub8Sx16, False );
   11630       goto decode_success;
   11631    }
   11632 
   11633    /* 66 0F E9 = PSUBSW */
   11634    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
   11635       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11636                                  "psubsw", Iop_QSub16Sx8, False );
   11637       goto decode_success;
   11638    }
   11639 
   11640    /* 66 0F D8 = PSUBUSB */
   11641    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
   11642       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11643                                  "psubusb", Iop_QSub8Ux16, False );
   11644       goto decode_success;
   11645    }
   11646 
   11647    /* 66 0F D9 = PSUBUSW */
   11648    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
   11649       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11650                                  "psubusw", Iop_QSub16Ux8, False );
   11651       goto decode_success;
   11652    }
   11653 
   11654    /* 66 0F 68 = PUNPCKHBW */
   11655    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
   11656       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11657                                  "punpckhbw",
   11658                                  Iop_InterleaveHI8x16, True );
   11659       goto decode_success;
   11660    }
   11661 
   11662    /* 66 0F 6A = PUNPCKHDQ */
   11663    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
   11664       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11665                                  "punpckhdq",
   11666                                  Iop_InterleaveHI32x4, True );
   11667       goto decode_success;
   11668    }
   11669 
   11670    /* 66 0F 6D = PUNPCKHQDQ */
   11671    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
   11672       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11673                                  "punpckhqdq",
   11674                                  Iop_InterleaveHI64x2, True );
   11675       goto decode_success;
   11676    }
   11677 
   11678    /* 66 0F 69 = PUNPCKHWD */
   11679    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
   11680       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11681                                  "punpckhwd",
   11682                                  Iop_InterleaveHI16x8, True );
   11683       goto decode_success;
   11684    }
   11685 
   11686    /* 66 0F 60 = PUNPCKLBW */
   11687    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
   11688       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11689                                  "punpcklbw",
   11690                                  Iop_InterleaveLO8x16, True );
   11691       goto decode_success;
   11692    }
   11693 
   11694    /* 66 0F 62 = PUNPCKLDQ */
   11695    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
   11696       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11697                                  "punpckldq",
   11698                                  Iop_InterleaveLO32x4, True );
   11699       goto decode_success;
   11700    }
   11701 
   11702    /* 66 0F 6C = PUNPCKLQDQ */
   11703    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
   11704       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11705                                  "punpcklqdq",
   11706                                  Iop_InterleaveLO64x2, True );
   11707       goto decode_success;
   11708    }
   11709 
   11710    /* 66 0F 61 = PUNPCKLWD */
   11711    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
   11712       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11713                                  "punpcklwd",
   11714                                  Iop_InterleaveLO16x8, True );
   11715       goto decode_success;
   11716    }
   11717 
   11718    /* 66 0F EF = PXOR */
   11719    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
   11720       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
   11721       goto decode_success;
   11722    }
   11723 
   11724 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   11725 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   11726 //--        && (!epartIsReg(insn[2]))
   11727 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   11728 //--       Bool store = gregOfRM(insn[2]) == 0;
   11729 //--       vg_assert(sz == 4);
   11730 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   11731 //--       t1   = LOW24(pair);
   11732 //--       eip += 2+HI8(pair);
   11733 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   11734 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   11735 //--                   Lit16, (UShort)insn[2],
   11736 //--                   TempReg, t1 );
   11737 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   11738 //--       goto decode_success;
   11739 //--    }
   11740 
   11741    /* 0F AE /7 = CLFLUSH -- flush cache line */
   11742    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   11743        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   11744 
   11745       /* This is something of a hack.  We need to know the size of the
   11746          cache line containing addr.  Since we don't (easily), assume
   11747          256 on the basis that no real cache would have a line that
   11748          big.  It's safe to invalidate more stuff than we need, just
   11749          inefficient. */
   11750       UInt lineszB = 256;
   11751 
   11752       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11753       delta += 2+alen;
   11754 
   11755       /* Round addr down to the start of the containing block. */
   11756       stmt( IRStmt_Put(
   11757                OFFB_CMSTART,
   11758                binop( Iop_And32,
   11759                       mkexpr(addr),
   11760                       mkU32( ~(lineszB-1) ))) );
   11761 
   11762       stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) );
   11763 
   11764       jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta));
   11765 
   11766       DIP("clflush %s\n", dis_buf);
   11767       goto decode_success;
   11768    }
   11769 
   11770    /* ---------------------------------------------------- */
   11771    /* --- end of the SSE2 decoder.                     --- */
   11772    /* ---------------------------------------------------- */
   11773 
   11774    /* ---------------------------------------------------- */
   11775    /* --- start of the SSE3 decoder.                   --- */
   11776    /* ---------------------------------------------------- */
   11777 
   11778    /* Skip parts of the decoder which don't apply given the stated
   11779       guest subarchitecture. */
   11780    /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
   11781    /* In fact this is highly bogus; we accept SSE3 insns even on a
   11782       SSE2-only guest since they turn into IR which can be re-emitted
   11783       successfully on an SSE2 host. */
   11784    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   11785       goto after_sse_decoders; /* no SSE3 capabilities */
   11786 
   11787    insn = (UChar*)&guest_code[delta];
   11788 
   11789    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   11790       duplicating some lanes (2:2:0:0). */
   11791    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   11792       duplicating some lanes (3:3:1:1). */
   11793    if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
   11794        && (insn[2] == 0x12 || insn[2] == 0x16)) {
   11795       IRTemp s3, s2, s1, s0;
   11796       IRTemp sV  = newTemp(Ity_V128);
   11797       Bool   isH = insn[2] == 0x16;
   11798       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11799 
   11800       modrm = insn[3];
   11801       if (epartIsReg(modrm)) {
   11802          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11803          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11804                                   nameXMMReg(eregOfRM(modrm)),
   11805                                   nameXMMReg(gregOfRM(modrm)));
   11806          delta += 3+1;
   11807       } else {
   11808          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11809          gen_SEGV_if_not_16_aligned( addr );
   11810          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11811          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11812 	     dis_buf,
   11813              nameXMMReg(gregOfRM(modrm)));
   11814          delta += 3+alen;
   11815       }
   11816 
   11817       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11818       putXMMReg( gregOfRM(modrm),
   11819                  isH ? mk128from32s( s3, s3, s1, s1 )
   11820                      : mk128from32s( s2, s2, s0, s0 ) );
   11821       goto decode_success;
   11822    }
   11823 
   11824    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   11825       duplicating some lanes (0:1:0:1). */
   11826    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
   11827       IRTemp sV = newTemp(Ity_V128);
   11828       IRTemp d0 = newTemp(Ity_I64);
   11829 
   11830       modrm = insn[3];
   11831       if (epartIsReg(modrm)) {
   11832          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11833          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11834                                 nameXMMReg(gregOfRM(modrm)));
   11835          delta += 3+1;
   11836          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   11837       } else {
   11838          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11839          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   11840          DIP("movddup %s,%s\n", dis_buf,
   11841                                 nameXMMReg(gregOfRM(modrm)));
   11842          delta += 3+alen;
   11843       }
   11844 
   11845       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   11846       goto decode_success;
   11847    }
   11848 
   11849    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   11850    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
   11851       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11852       IRTemp eV   = newTemp(Ity_V128);
   11853       IRTemp gV   = newTemp(Ity_V128);
   11854       IRTemp addV = newTemp(Ity_V128);
   11855       IRTemp subV = newTemp(Ity_V128);
   11856       IRTemp rm     = newTemp(Ity_I32);
   11857       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11858 
   11859       modrm = insn[3];
   11860       if (epartIsReg(modrm)) {
   11861          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11862          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11863                                  nameXMMReg(gregOfRM(modrm)));
   11864          delta += 3+1;
   11865       } else {
   11866          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11867          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11868          DIP("addsubps %s,%s\n", dis_buf,
   11869                                  nameXMMReg(gregOfRM(modrm)));
   11870          delta += 3+alen;
   11871       }
   11872 
   11873       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11874 
   11875       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11876       assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11877       assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11878 
   11879       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   11880       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   11881 
   11882       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
   11883       goto decode_success;
   11884    }
   11885 
   11886    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   11887    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
   11888       IRTemp eV   = newTemp(Ity_V128);
   11889       IRTemp gV   = newTemp(Ity_V128);
   11890       IRTemp addV = newTemp(Ity_V128);
   11891       IRTemp subV = newTemp(Ity_V128);
   11892       IRTemp a1     = newTemp(Ity_I64);
   11893       IRTemp s0     = newTemp(Ity_I64);
   11894       IRTemp rm     = newTemp(Ity_I32);
   11895 
   11896       modrm = insn[2];
   11897       if (epartIsReg(modrm)) {
   11898          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11899          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11900                                  nameXMMReg(gregOfRM(modrm)));
   11901          delta += 2+1;
   11902       } else {
   11903          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11904          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11905          DIP("addsubpd %s,%s\n", dis_buf,
   11906                                  nameXMMReg(gregOfRM(modrm)));
   11907          delta += 2+alen;
   11908       }
   11909 
   11910       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11911 
   11912       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11913       assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11914       assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11915 
   11916       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11917       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11918 
   11919       putXMMReg( gregOfRM(modrm),
   11920                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11921       goto decode_success;
   11922    }
   11923 
   11924    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   11925    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   11926    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
   11927        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
   11928       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   11929       IRTemp eV     = newTemp(Ity_V128);
   11930       IRTemp gV     = newTemp(Ity_V128);
   11931       IRTemp leftV  = newTemp(Ity_V128);
   11932       IRTemp rightV = newTemp(Ity_V128);
   11933       IRTemp rm     = newTemp(Ity_I32);
   11934       Bool   isAdd  = insn[2] == 0x7C;
   11935       const HChar* str = isAdd ? "add" : "sub";
   11936       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   11937 
   11938       modrm = insn[3];
   11939       if (epartIsReg(modrm)) {
   11940          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11941          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11942                                    nameXMMReg(gregOfRM(modrm)));
   11943          delta += 3+1;
   11944       } else {
   11945          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11946          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11947          DIP("h%sps %s,%s\n", str, dis_buf,
   11948                                    nameXMMReg(gregOfRM(modrm)));
   11949          delta += 3+alen;
   11950       }
   11951 
   11952       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11953 
   11954       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   11955       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   11956 
   11957       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   11958       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   11959 
   11960       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11961       putXMMReg( gregOfRM(modrm),
   11962                  triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   11963                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   11964       goto decode_success;
   11965    }
   11966 
   11967    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   11968    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   11969    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   11970       IRTemp e1     = newTemp(Ity_I64);
   11971       IRTemp e0     = newTemp(Ity_I64);
   11972       IRTemp g1     = newTemp(Ity_I64);
   11973       IRTemp g0     = newTemp(Ity_I64);
   11974       IRTemp eV     = newTemp(Ity_V128);
   11975       IRTemp gV     = newTemp(Ity_V128);
   11976       IRTemp leftV  = newTemp(Ity_V128);
   11977       IRTemp rightV = newTemp(Ity_V128);
   11978       IRTemp rm     = newTemp(Ity_I32);
   11979       Bool   isAdd  = insn[1] == 0x7C;
   11980       const HChar* str = isAdd ? "add" : "sub";
   11981 
   11982       modrm = insn[2];
   11983       if (epartIsReg(modrm)) {
   11984          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11985          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11986                                    nameXMMReg(gregOfRM(modrm)));
   11987          delta += 2+1;
   11988       } else {
   11989          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11990          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11991          DIP("h%spd %s,%s\n", str, dis_buf,
   11992                               nameXMMReg(gregOfRM(modrm)));
   11993          delta += 2+alen;
   11994       }
   11995 
   11996       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11997 
   11998       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   11999       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   12000       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   12001       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   12002 
   12003       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   12004       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   12005 
   12006       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   12007       putXMMReg( gregOfRM(modrm),
   12008                  triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   12009                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   12010       goto decode_success;
   12011    }
   12012 
   12013    /* F2 0F F0 = LDDQU -- move from E (mem only) to G (xmm). */
   12014    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
   12015       modrm = getIByte(delta+3);
   12016       if (epartIsReg(modrm)) {
   12017          goto decode_failure;
   12018       } else {
   12019          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12020          putXMMReg( gregOfRM(modrm),
   12021                     loadLE(Ity_V128, mkexpr(addr)) );
   12022          DIP("lddqu %s,%s\n", dis_buf,
   12023                               nameXMMReg(gregOfRM(modrm)));
   12024          delta += 3+alen;
   12025       }
   12026       goto decode_success;
   12027    }
   12028 
   12029    /* ---------------------------------------------------- */
   12030    /* --- end of the SSE3 decoder.                     --- */
   12031    /* ---------------------------------------------------- */
   12032 
   12033    /* ---------------------------------------------------- */
   12034    /* --- start of the SSSE3 decoder.                  --- */
   12035    /* ---------------------------------------------------- */
   12036 
   12037    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   12038       Unsigned Bytes (MMX) */
   12039    if (sz == 4
   12040        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   12041       IRTemp sV        = newTemp(Ity_I64);
   12042       IRTemp dV        = newTemp(Ity_I64);
   12043       IRTemp sVoddsSX  = newTemp(Ity_I64);
   12044       IRTemp sVevensSX = newTemp(Ity_I64);
   12045       IRTemp dVoddsZX  = newTemp(Ity_I64);
   12046       IRTemp dVevensZX = newTemp(Ity_I64);
   12047 
   12048       modrm = insn[3];
   12049       do_MMX_preamble();
   12050       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12051 
   12052       if (epartIsReg(modrm)) {
   12053          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12054          delta += 3+1;
   12055          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12056                                   nameMMXReg(gregOfRM(modrm)));
   12057       } else {
   12058          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12059          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12060          delta += 3+alen;
   12061          DIP("pmaddubsw %s,%s\n", dis_buf,
   12062                                   nameMMXReg(gregOfRM(modrm)));
   12063       }
   12064 
   12065       /* compute dV unsigned x sV signed */
   12066       assign( sVoddsSX,
   12067               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   12068       assign( sVevensSX,
   12069               binop(Iop_SarN16x4,
   12070                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   12071                     mkU8(8)) );
   12072       assign( dVoddsZX,
   12073               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   12074       assign( dVevensZX,
   12075               binop(Iop_ShrN16x4,
   12076                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   12077                     mkU8(8)) );
   12078 
   12079       putMMXReg(
   12080          gregOfRM(modrm),
   12081          binop(Iop_QAdd16Sx4,
   12082                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   12083                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   12084          )
   12085       );
   12086       goto decode_success;
   12087    }
   12088 
   12089    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   12090       Unsigned Bytes (XMM) */
   12091    if (sz == 2
   12092        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   12093       IRTemp sV        = newTemp(Ity_V128);
   12094       IRTemp dV        = newTemp(Ity_V128);
   12095       IRTemp sVoddsSX  = newTemp(Ity_V128);
   12096       IRTemp sVevensSX = newTemp(Ity_V128);
   12097       IRTemp dVoddsZX  = newTemp(Ity_V128);
   12098       IRTemp dVevensZX = newTemp(Ity_V128);
   12099 
   12100       modrm = insn[3];
   12101       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12102 
   12103       if (epartIsReg(modrm)) {
   12104          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12105          delta += 3+1;
   12106          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12107                                   nameXMMReg(gregOfRM(modrm)));
   12108       } else {
   12109          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12110          gen_SEGV_if_not_16_aligned( addr );
   12111          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12112          delta += 3+alen;
   12113          DIP("pmaddubsw %s,%s\n", dis_buf,
   12114                                   nameXMMReg(gregOfRM(modrm)));
   12115       }
   12116 
   12117       /* compute dV unsigned x sV signed */
   12118       assign( sVoddsSX,
   12119               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   12120       assign( sVevensSX,
   12121               binop(Iop_SarN16x8,
   12122                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   12123                     mkU8(8)) );
   12124       assign( dVoddsZX,
   12125               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   12126       assign( dVevensZX,
   12127               binop(Iop_ShrN16x8,
   12128                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   12129                     mkU8(8)) );
   12130 
   12131       putXMMReg(
   12132          gregOfRM(modrm),
   12133          binop(Iop_QAdd16Sx8,
   12134                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   12135                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   12136          )
   12137       );
   12138       goto decode_success;
   12139    }
   12140 
   12141    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   12142    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   12143       mmx) and G to G (mmx). */
   12144    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   12145       mmx) and G to G (mmx). */
   12146    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   12147       to G (mmx). */
   12148    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   12149       to G (mmx). */
   12150    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   12151       to G (mmx). */
   12152    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   12153       to G (mmx). */
   12154 
   12155    if (sz == 4
   12156        && insn[0] == 0x0F && insn[1] == 0x38
   12157        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12158            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12159       const HChar* str = "???";
   12160       IROp   opV64  = Iop_INVALID;
   12161       IROp   opCatO = Iop_CatOddLanes16x4;
   12162       IROp   opCatE = Iop_CatEvenLanes16x4;
   12163       IRTemp sV     = newTemp(Ity_I64);
   12164       IRTemp dV     = newTemp(Ity_I64);
   12165 
   12166       modrm = insn[3];
   12167 
   12168       switch (insn[2]) {
   12169          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12170          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12171          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12172          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12173          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12174          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12175          default: vassert(0);
   12176       }
   12177       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12178          opCatO = Iop_InterleaveHI32x2;
   12179          opCatE = Iop_InterleaveLO32x2;
   12180       }
   12181 
   12182       do_MMX_preamble();
   12183       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12184 
   12185       if (epartIsReg(modrm)) {
   12186          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12187          delta += 3+1;
   12188          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12189                                   nameMMXReg(gregOfRM(modrm)));
   12190       } else {
   12191          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12192          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12193          delta += 3+alen;
   12194          DIP("ph%s %s,%s\n", str, dis_buf,
   12195                                   nameMMXReg(gregOfRM(modrm)));
   12196       }
   12197 
   12198       putMMXReg(
   12199          gregOfRM(modrm),
   12200          binop(opV64,
   12201                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   12202                binop(opCatO,mkexpr(sV),mkexpr(dV))
   12203          )
   12204       );
   12205       goto decode_success;
   12206    }
   12207 
   12208    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   12209       xmm) and G to G (xmm). */
   12210    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   12211       xmm) and G to G (xmm). */
   12212    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   12213       G to G (xmm). */
   12214    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   12215       G to G (xmm). */
   12216    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   12217       G to G (xmm). */
   12218    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   12219       G to G (xmm). */
   12220 
   12221    if (sz == 2
   12222        && insn[0] == 0x0F && insn[1] == 0x38
   12223        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12224            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12225       const HChar* str = "???";
   12226       IROp   opV64  = Iop_INVALID;
   12227       IROp   opCatO = Iop_CatOddLanes16x4;
   12228       IROp   opCatE = Iop_CatEvenLanes16x4;
   12229       IRTemp sV     = newTemp(Ity_V128);
   12230       IRTemp dV     = newTemp(Ity_V128);
   12231       IRTemp sHi    = newTemp(Ity_I64);
   12232       IRTemp sLo    = newTemp(Ity_I64);
   12233       IRTemp dHi    = newTemp(Ity_I64);
   12234       IRTemp dLo    = newTemp(Ity_I64);
   12235 
   12236       modrm = insn[3];
   12237 
   12238       switch (insn[2]) {
   12239          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12240          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12241          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12242          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12243          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12244          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12245          default: vassert(0);
   12246       }
   12247       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12248          opCatO = Iop_InterleaveHI32x2;
   12249          opCatE = Iop_InterleaveLO32x2;
   12250       }
   12251 
   12252       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12253 
   12254       if (epartIsReg(modrm)) {
   12255          assign( sV, getXMMReg( eregOfRM(modrm)) );
   12256          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12257                                   nameXMMReg(gregOfRM(modrm)));
   12258          delta += 3+1;
   12259       } else {
   12260          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12261          gen_SEGV_if_not_16_aligned( addr );
   12262          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12263          DIP("ph%s %s,%s\n", str, dis_buf,
   12264                              nameXMMReg(gregOfRM(modrm)));
   12265          delta += 3+alen;
   12266       }
   12267 
   12268       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12269       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12270       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12271       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12272 
   12273       /* This isn't a particularly efficient way to compute the
   12274          result, but at least it avoids a proliferation of IROps,
   12275          hence avoids complicating all the backends. */
   12276       putXMMReg(
   12277          gregOfRM(modrm),
   12278          binop(Iop_64HLtoV128,
   12279                binop(opV64,
   12280                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   12281                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   12282                ),
   12283                binop(opV64,
   12284                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   12285                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   12286                )
   12287          )
   12288       );
   12289       goto decode_success;
   12290    }
   12291 
   12292    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   12293       (MMX) */
   12294    if (sz == 4
   12295        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12296       IRTemp sV = newTemp(Ity_I64);
   12297       IRTemp dV = newTemp(Ity_I64);
   12298 
   12299       modrm = insn[3];
   12300       do_MMX_preamble();
   12301       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12302 
   12303       if (epartIsReg(modrm)) {
   12304          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12305          delta += 3+1;
   12306          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12307                                  nameMMXReg(gregOfRM(modrm)));
   12308       } else {
   12309          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12310          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12311          delta += 3+alen;
   12312          DIP("pmulhrsw %s,%s\n", dis_buf,
   12313                                  nameMMXReg(gregOfRM(modrm)));
   12314       }
   12315 
   12316       putMMXReg(
   12317          gregOfRM(modrm),
   12318          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   12319       );
   12320       goto decode_success;
   12321    }
   12322 
   12323    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   12324       Scale (XMM) */
   12325    if (sz == 2
   12326        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12327       IRTemp sV  = newTemp(Ity_V128);
   12328       IRTemp dV  = newTemp(Ity_V128);
   12329       IRTemp sHi = newTemp(Ity_I64);
   12330       IRTemp sLo = newTemp(Ity_I64);
   12331       IRTemp dHi = newTemp(Ity_I64);
   12332       IRTemp dLo = newTemp(Ity_I64);
   12333 
   12334       modrm = insn[3];
   12335       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12336 
   12337       if (epartIsReg(modrm)) {
   12338          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12339          delta += 3+1;
   12340          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12341                                  nameXMMReg(gregOfRM(modrm)));
   12342       } else {
   12343          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12344          gen_SEGV_if_not_16_aligned( addr );
   12345          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12346          delta += 3+alen;
   12347          DIP("pmulhrsw %s,%s\n", dis_buf,
   12348                                  nameXMMReg(gregOfRM(modrm)));
   12349       }
   12350 
   12351       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12352       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12353       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12354       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12355 
   12356       putXMMReg(
   12357          gregOfRM(modrm),
   12358          binop(Iop_64HLtoV128,
   12359                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   12360                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   12361          )
   12362       );
   12363       goto decode_success;
   12364    }
   12365 
   12366    /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   12367    /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   12368    /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   12369    if (sz == 4
   12370        && insn[0] == 0x0F && insn[1] == 0x38
   12371        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12372       IRTemp sV      = newTemp(Ity_I64);
   12373       IRTemp dV      = newTemp(Ity_I64);
   12374       const HChar* str = "???";
   12375       Int    laneszB = 0;
   12376 
   12377       switch (insn[2]) {
   12378          case 0x08: laneszB = 1; str = "b"; break;
   12379          case 0x09: laneszB = 2; str = "w"; break;
   12380          case 0x0A: laneszB = 4; str = "d"; break;
   12381          default: vassert(0);
   12382       }
   12383 
   12384       modrm = insn[3];
   12385       do_MMX_preamble();
   12386       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12387 
   12388       if (epartIsReg(modrm)) {
   12389          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12390          delta += 3+1;
   12391          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12392                                      nameMMXReg(gregOfRM(modrm)));
   12393       } else {
   12394          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12395          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12396          delta += 3+alen;
   12397          DIP("psign%s %s,%s\n", str, dis_buf,
   12398                                      nameMMXReg(gregOfRM(modrm)));
   12399       }
   12400 
   12401       putMMXReg(
   12402          gregOfRM(modrm),
   12403          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   12404       );
   12405       goto decode_success;
   12406    }
   12407 
   12408    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   12409    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   12410    /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   12411    if (sz == 2
   12412        && insn[0] == 0x0F && insn[1] == 0x38
   12413        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12414       IRTemp sV      = newTemp(Ity_V128);
   12415       IRTemp dV      = newTemp(Ity_V128);
   12416       IRTemp sHi     = newTemp(Ity_I64);
   12417       IRTemp sLo     = newTemp(Ity_I64);
   12418       IRTemp dHi     = newTemp(Ity_I64);
   12419       IRTemp dLo     = newTemp(Ity_I64);
   12420       const HChar* str = "???";
   12421       Int    laneszB = 0;
   12422 
   12423       switch (insn[2]) {
   12424          case 0x08: laneszB = 1; str = "b"; break;
   12425          case 0x09: laneszB = 2; str = "w"; break;
   12426          case 0x0A: laneszB = 4; str = "d"; break;
   12427          default: vassert(0);
   12428       }
   12429 
   12430       modrm = insn[3];
   12431       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12432 
   12433       if (epartIsReg(modrm)) {
   12434          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12435          delta += 3+1;
   12436          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12437                                      nameXMMReg(gregOfRM(modrm)));
   12438       } else {
   12439          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12440          gen_SEGV_if_not_16_aligned( addr );
   12441          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12442          delta += 3+alen;
   12443          DIP("psign%s %s,%s\n", str, dis_buf,
   12444                                      nameXMMReg(gregOfRM(modrm)));
   12445       }
   12446 
   12447       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12448       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12449       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12450       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12451 
   12452       putXMMReg(
   12453          gregOfRM(modrm),
   12454          binop(Iop_64HLtoV128,
   12455                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   12456                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   12457          )
   12458       );
   12459       goto decode_success;
   12460    }
   12461 
   12462    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   12463    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   12464    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   12465    if (sz == 4
   12466        && insn[0] == 0x0F && insn[1] == 0x38
   12467        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12468       IRTemp sV      = newTemp(Ity_I64);
   12469       const HChar* str = "???";
   12470       Int    laneszB = 0;
   12471 
   12472       switch (insn[2]) {
   12473          case 0x1C: laneszB = 1; str = "b"; break;
   12474          case 0x1D: laneszB = 2; str = "w"; break;
   12475          case 0x1E: laneszB = 4; str = "d"; break;
   12476          default: vassert(0);
   12477       }
   12478 
   12479       modrm = insn[3];
   12480       do_MMX_preamble();
   12481 
   12482       if (epartIsReg(modrm)) {
   12483          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12484          delta += 3+1;
   12485          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12486                                     nameMMXReg(gregOfRM(modrm)));
   12487       } else {
   12488          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12489          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12490          delta += 3+alen;
   12491          DIP("pabs%s %s,%s\n", str, dis_buf,
   12492                                     nameMMXReg(gregOfRM(modrm)));
   12493       }
   12494 
   12495       putMMXReg(
   12496          gregOfRM(modrm),
   12497          dis_PABS_helper( mkexpr(sV), laneszB )
   12498       );
   12499       goto decode_success;
   12500    }
   12501 
   12502    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   12503    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   12504    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   12505    if (sz == 2
   12506        && insn[0] == 0x0F && insn[1] == 0x38
   12507        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12508       IRTemp sV      = newTemp(Ity_V128);
   12509       IRTemp sHi     = newTemp(Ity_I64);
   12510       IRTemp sLo     = newTemp(Ity_I64);
   12511       const HChar* str = "???";
   12512       Int    laneszB = 0;
   12513 
   12514       switch (insn[2]) {
   12515          case 0x1C: laneszB = 1; str = "b"; break;
   12516          case 0x1D: laneszB = 2; str = "w"; break;
   12517          case 0x1E: laneszB = 4; str = "d"; break;
   12518          default: vassert(0);
   12519       }
   12520 
   12521       modrm = insn[3];
   12522 
   12523       if (epartIsReg(modrm)) {
   12524          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12525          delta += 3+1;
   12526          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12527                                     nameXMMReg(gregOfRM(modrm)));
   12528       } else {
   12529          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12530          gen_SEGV_if_not_16_aligned( addr );
   12531          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12532          delta += 3+alen;
   12533          DIP("pabs%s %s,%s\n", str, dis_buf,
   12534                                     nameXMMReg(gregOfRM(modrm)));
   12535       }
   12536 
   12537       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12538       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12539 
   12540       putXMMReg(
   12541          gregOfRM(modrm),
   12542          binop(Iop_64HLtoV128,
   12543                dis_PABS_helper( mkexpr(sHi), laneszB ),
   12544                dis_PABS_helper( mkexpr(sLo), laneszB )
   12545          )
   12546       );
   12547       goto decode_success;
   12548    }
   12549 
   12550    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   12551    if (sz == 4
   12552        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12553       IRTemp sV  = newTemp(Ity_I64);
   12554       IRTemp dV  = newTemp(Ity_I64);
   12555       IRTemp res = newTemp(Ity_I64);
   12556 
   12557       modrm = insn[3];
   12558       do_MMX_preamble();
   12559       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12560 
   12561       if (epartIsReg(modrm)) {
   12562          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12563          d32 = (UInt)insn[3+1];
   12564          delta += 3+1+1;
   12565          DIP("palignr $%d,%s,%s\n",  (Int)d32,
   12566                                      nameMMXReg(eregOfRM(modrm)),
   12567                                      nameMMXReg(gregOfRM(modrm)));
   12568       } else {
   12569          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12570          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12571          d32 = (UInt)insn[3+alen];
   12572          delta += 3+alen+1;
   12573          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12574                                     dis_buf,
   12575                                     nameMMXReg(gregOfRM(modrm)));
   12576       }
   12577 
   12578       if (d32 == 0) { /* no shift: result is the E operand */
   12579          assign( res, mkexpr(sV) );
   12580       }
   12581       else if (d32 >= 1 && d32 <= 7) { /* low bytes from E, high bytes from G */
   12582          assign(res,
   12583                 binop(Iop_Or64,
   12584                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
   12585                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
   12586                      )));
   12587       }
   12588       else if (d32 == 8) { /* result is the G operand */
   12589         assign( res, mkexpr(dV) );
   12590       }
   12591       else if (d32 >= 9 && d32 <= 15) { /* only G contributes */
   12592          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
   12593       }
   12594       else if (d32 >= 16 && d32 <= 255) { /* result is all zeroes */
   12595          assign( res, mkU64(0) );
   12596       }
   12597       else
   12598          vassert(0);
   12599 
   12600       putMMXReg( gregOfRM(modrm), mkexpr(res) );
   12601       goto decode_success;
   12602    }
   12603 
   12604    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   12605    if (sz == 2
   12606        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12607       IRTemp sV  = newTemp(Ity_V128);
   12608       IRTemp dV  = newTemp(Ity_V128);
   12609       IRTemp sHi = newTemp(Ity_I64);
   12610       IRTemp sLo = newTemp(Ity_I64);
   12611       IRTemp dHi = newTemp(Ity_I64);
   12612       IRTemp dLo = newTemp(Ity_I64);
   12613       IRTemp rHi = newTemp(Ity_I64);
   12614       IRTemp rLo = newTemp(Ity_I64);
   12615 
   12616       modrm = insn[3];
   12617       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12618 
   12619       if (epartIsReg(modrm)) {
   12620          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12621          d32 = (UInt)insn[3+1];
   12622          delta += 3+1+1;
   12623          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12624                                     nameXMMReg(eregOfRM(modrm)),
   12625                                     nameXMMReg(gregOfRM(modrm)));
   12626       } else {
   12627          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12628          gen_SEGV_if_not_16_aligned( addr );
   12629          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12630          d32 = (UInt)insn[3+alen];
   12631          delta += 3+alen+1;
   12632          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12633                                     dis_buf,
   12634                                     nameXMMReg(gregOfRM(modrm)));
   12635       }
   12636 
   12637       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12638       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12639       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12640       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12641 
   12642       if (d32 == 0) {
   12643          assign( rHi, mkexpr(sHi) );
   12644          assign( rLo, mkexpr(sLo) );
   12645       }
   12646       else if (d32 >= 1 && d32 <= 7) {
   12647          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
   12648          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
   12649       }
   12650       else if (d32 == 8) {
   12651          assign( rHi, mkexpr(dLo) );
   12652          assign( rLo, mkexpr(sHi) );
   12653       }
   12654       else if (d32 >= 9 && d32 <= 15) {
   12655          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
   12656          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
   12657       }
   12658       else if (d32 == 16) {
   12659          assign( rHi, mkexpr(dHi) );
   12660          assign( rLo, mkexpr(dLo) );
   12661       }
   12662       else if (d32 >= 17 && d32 <= 23) {
   12663          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
   12664          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
   12665       }
   12666       else if (d32 == 24) {
   12667          assign( rHi, mkU64(0) );
   12668          assign( rLo, mkexpr(dHi) );
   12669       }
   12670       else if (d32 >= 25 && d32 <= 31) {
   12671          assign( rHi, mkU64(0) );
   12672          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
   12673       }
   12674       else if (d32 >= 32 && d32 <= 255) {
   12675          assign( rHi, mkU64(0) );
   12676          assign( rLo, mkU64(0) );
   12677       }
   12678       else
   12679          vassert(0);
   12680 
   12681       putXMMReg(
   12682          gregOfRM(modrm),
   12683          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12684       );
   12685       goto decode_success;
   12686    }
   12687 
   12688    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   12689    if (sz == 4
   12690        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12691       IRTemp sV      = newTemp(Ity_I64);
   12692       IRTemp dV      = newTemp(Ity_I64);
   12693 
   12694       modrm = insn[3];
   12695       do_MMX_preamble();
   12696       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12697 
   12698       if (epartIsReg(modrm)) {
   12699          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12700          delta += 3+1;
   12701          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12702                                nameMMXReg(gregOfRM(modrm)));
   12703       } else {
   12704          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12705          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12706          delta += 3+alen;
   12707          DIP("pshufb %s,%s\n", dis_buf,
   12708                                nameMMXReg(gregOfRM(modrm)));
   12709       }
   12710 
   12711       putMMXReg(
   12712          gregOfRM(modrm),
   12713          binop(
   12714             Iop_And64,
   12715             /* permute the lanes */
   12716             binop(
   12717                Iop_Perm8x8,
   12718                mkexpr(dV),
   12719                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   12720             ),
   12721             /* mask off lanes which have (index & 0x80) == 0x80 */
   12722             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   12723          )
   12724       );
   12725       goto decode_success;
   12726    }
   12727 
   12728    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   12729    if (sz == 2
   12730        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12731       IRTemp sV         = newTemp(Ity_V128);
   12732       IRTemp dV         = newTemp(Ity_V128);
   12733       IRTemp sHi        = newTemp(Ity_I64);
   12734       IRTemp sLo        = newTemp(Ity_I64);
   12735       IRTemp dHi        = newTemp(Ity_I64);
   12736       IRTemp dLo        = newTemp(Ity_I64);
   12737       IRTemp rHi        = newTemp(Ity_I64);
   12738       IRTemp rLo        = newTemp(Ity_I64);
   12739       IRTemp sevens     = newTemp(Ity_I64);
   12740       IRTemp mask0x80hi = newTemp(Ity_I64);
   12741       IRTemp mask0x80lo = newTemp(Ity_I64);
   12742       IRTemp maskBit3hi = newTemp(Ity_I64);
   12743       IRTemp maskBit3lo = newTemp(Ity_I64);
   12744       IRTemp sAnd7hi    = newTemp(Ity_I64);
   12745       IRTemp sAnd7lo    = newTemp(Ity_I64);
   12746       IRTemp permdHi    = newTemp(Ity_I64);
   12747       IRTemp permdLo    = newTemp(Ity_I64);
   12748 
   12749       modrm = insn[3];
   12750       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12751 
   12752       if (epartIsReg(modrm)) {
   12753          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12754          delta += 3+1;
   12755          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12756                                nameXMMReg(gregOfRM(modrm)));
   12757       } else {
   12758          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12759          gen_SEGV_if_not_16_aligned( addr );
   12760          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12761          delta += 3+alen;
   12762          DIP("pshufb %s,%s\n", dis_buf,
   12763                                nameXMMReg(gregOfRM(modrm)));
   12764       }
   12765 
   12766       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12767       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12768       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12769       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12770 
   12771       assign( sevens, mkU64(0x0707070707070707ULL) );
   12772 
   12773       /*
   12774       mask0x80hi = Not(SarN8x8(sHi,7))
   12775       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
   12776       sAnd7hi    = And(sHi,sevens)
   12777       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
   12778                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
   12779       rHi        = And(permdHi,mask0x80hi)
   12780       */
   12781       assign(
   12782          mask0x80hi,
   12783          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
   12784 
   12785       assign(
   12786          maskBit3hi,
   12787          binop(Iop_SarN8x8,
   12788                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
   12789                mkU8(7)));
   12790 
   12791       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
   12792 
   12793       assign(
   12794          permdHi,
   12795          binop(
   12796             Iop_Or64,
   12797             binop(Iop_And64,
   12798                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
   12799                   mkexpr(maskBit3hi)),
   12800             binop(Iop_And64,
   12801                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
   12802                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
   12803 
   12804       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
   12805 
   12806       /* And the same for the lower half of the result.  What fun. */
   12807 
   12808       assign(
   12809          mask0x80lo,
   12810          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
   12811 
   12812       assign(
   12813          maskBit3lo,
   12814          binop(Iop_SarN8x8,
   12815                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
   12816                mkU8(7)));
   12817 
   12818       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
   12819 
   12820       assign(
   12821          permdLo,
   12822          binop(
   12823             Iop_Or64,
   12824             binop(Iop_And64,
   12825                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
   12826                   mkexpr(maskBit3lo)),
   12827             binop(Iop_And64,
   12828                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
   12829                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
   12830 
   12831       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
   12832 
   12833       putXMMReg(
   12834          gregOfRM(modrm),
   12835          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12836       );
   12837       goto decode_success;
   12838    }
   12839 
   12840    /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
   12841    /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
   12842    if ((sz == 2 || sz == 4)
   12843        && insn[0] == 0x0F && insn[1] == 0x38
   12844        && (insn[2] == 0xF0 || insn[2] == 0xF1)
   12845        && !epartIsReg(insn[3])) {
   12846 
   12847       modrm = insn[3];
   12848       addr = disAMode(&alen, sorb, delta + 3, dis_buf);
   12849       delta += 3 + alen;
   12850       ty = szToITy(sz);
   12851       IRTemp src = newTemp(ty);
   12852 
   12853       if (insn[2] == 0xF0) { /* LOAD */
   12854          assign(src, loadLE(ty, mkexpr(addr)));
   12855          IRTemp dst = math_BSWAP(src, ty);
   12856          putIReg(sz, gregOfRM(modrm), mkexpr(dst));
   12857          DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
   12858       } else { /* STORE */
   12859          assign(src, getIReg(sz, gregOfRM(modrm)));
   12860          IRTemp dst = math_BSWAP(src, ty);
   12861          storeLE(mkexpr(addr), mkexpr(dst));
   12862          DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
   12863       }
   12864       goto decode_success;
   12865    }
   12866 
   12867    /* ---------------------------------------------------- */
   12868    /* --- end of the SSSE3 decoder.                    --- */
   12869    /* ---------------------------------------------------- */
   12870 
   12871    /* ---------------------------------------------------- */
   12872    /* --- start of the SSE4 decoder                    --- */
   12873    /* ---------------------------------------------------- */
   12874 
   12875    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   12876       (Partial implementation only -- only deal with cases where
   12877       the rounding mode is specified directly by the immediate byte.)
   12878       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   12879       (Limitations ditto.)  NB: the 0x0B (ROUNDSD) opcode test below is
   12880       commented out, so only ROUNDSS is actually decoded here. */
   12881    if (sz == 2
   12882        && insn[0] == 0x0F && insn[1] == 0x3A
   12883        && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
   12884 
   12885       Bool   isD = insn[2] == 0x0B; /* True: F64 lane (SD), False: F32 (SS) */
   12886       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   12887       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   12888       Int    imm = 0;
   12889 
   12890       modrm = insn[3];
   12891 
   12892       if (epartIsReg(modrm)) {
   12893          assign( src,
   12894                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
   12895                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
   12896          imm = insn[3+1]; /* imm8 directly follows the modrm byte */
   12897          if (imm & ~3) goto decode_failure; /* only imm values 0..3 handled */
   12898          delta += 3+1+1;
   12899          DIP( "rounds%c $%d,%s,%s\n",
   12900               isD ? 'd' : 's',
   12901               imm, nameXMMReg( eregOfRM(modrm) ),
   12902                    nameXMMReg( gregOfRM(modrm) ) );
   12903       } else {
   12904          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12905          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   12906          imm = insn[3+alen]; /* imm8 follows the alen-byte amode */
   12907          if (imm & ~3) goto decode_failure; /* only imm values 0..3 handled */
   12908          delta += 3+alen+1;
   12909          DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's', /* was always "roundsd" */
   12910               imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
   12911       }
   12912 
   12913       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   12914          that encoding is the same as the encoding for IRRoundingMode,
   12915          we can use that value directly in the IR as a rounding
   12916          mode. */
   12917       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   12918                   mkU32(imm & 3), mkexpr(src)) );
   12919 
   12920       if (isD)
   12921          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
   12922       else
   12923          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
   12924 
   12925       goto decode_success;
   12926    }
   12927 
   12928    /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
   12929       which we can only decode if we're sure this is an AMD cpu that
   12930       supports LZCNT, since otherwise it's BSR, which behaves
   12931       differently. */
   12932    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
   12933        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
   12934       vassert(sz == 2 || sz == 4);
   12935       /*IRType*/ ty  = szToITy(sz);
   12936       IRTemp     src = newTemp(ty);
   12937       modrm = insn[3];
   12938       if (epartIsReg(modrm)) {
   12939          assign(src, getIReg(sz, eregOfRM(modrm)));
   12940          delta += 3+1;
   12941          DIP("lzcnt%c %s, %s\n", nameISize(sz),
   12942              nameIReg(sz, eregOfRM(modrm)),
   12943              nameIReg(sz, gregOfRM(modrm)));
   12944       } else {
   12945          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12946          assign(src, loadLE(ty, mkexpr(addr)));
   12947          delta += 3+alen;
   12948          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
   12949              nameIReg(sz, gregOfRM(modrm)));
   12950       }
   12951 
   12952       IRTemp res = gen_LZCNT(ty, src);
   12953       putIReg(sz, gregOfRM(modrm), mkexpr(res));
   12954 
   12955       // Update flags.  This is pretty lame .. perhaps can do better
   12956       // if this turns out to be performance critical.
   12957       // O S A P are cleared.  Z is set if RESULT == 0.
   12958       // C is set if SRC is zero.
   12959       IRTemp src32 = newTemp(Ity_I32);
   12960       IRTemp res32 = newTemp(Ity_I32);
   12961       assign(src32, widenUto32(mkexpr(src)));
   12962       assign(res32, widenUto32(mkexpr(res)));
   12963 
   12964       IRTemp oszacp = newTemp(Ity_I32);
   12965       assign(
   12966          oszacp,
   12967          binop(Iop_Or32,
   12968                binop(Iop_Shl32,
   12969                      unop(Iop_1Uto32,
   12970                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
   12971                      mkU8(X86G_CC_SHIFT_Z)),
   12972                binop(Iop_Shl32,
   12973                      unop(Iop_1Uto32,
   12974                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
   12975                      mkU8(X86G_CC_SHIFT_C))
   12976          )
   12977       );
   12978 
   12979       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12980       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12981       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12982       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
   12983 
   12984       goto decode_success;
   12985    }
   12986 
   12987    /* ---------------------------------------------------- */
   12988    /* --- end of the SSE4 decoder                      --- */
   12989    /* ---------------------------------------------------- */
   12990 
   12991    after_sse_decoders:
   12992 
   12993    /* ---------------------------------------------------- */
   12994    /* --- deal with misc 0x67 pfxs (addr size override) -- */
   12995    /* ---------------------------------------------------- */
   12996 
   12997    /* 67 E3 = JCXZ (for JECXZ see below) */
   12998    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
   12999       delta += 2;
   13000       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13001       delta ++;
   13002       stmt( IRStmt_Exit(
   13003                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
   13004                Ijk_Boring,
   13005                IRConst_U32(d32),
   13006                OFFB_EIP
   13007             ));
   13008        DIP("jcxz 0x%x\n", d32);
   13009        goto decode_success;
   13010    }
   13011 
   13012    /* ---------------------------------------------------- */
   13013    /* --- start of the baseline insn decoder            -- */
   13014    /* ---------------------------------------------------- */
   13015 
   13016    /* Get the primary opcode. */
   13017    opc = getIByte(delta); delta++;
   13018 
   13019    /* We get here if the current insn isn't SSE, or this CPU doesn't
   13020       support SSE. */
   13021 
   13022    switch (opc) {
   13023 
   13024    /* ------------------------ Control flow --------------- */
   13025 
   13026    case 0xC2: /* RET imm16 */
   13027       d32 = getUDisp16(delta);
   13028       delta += 2;
   13029       dis_ret(&dres, d32);
   13030       DIP("ret %d\n", (Int)d32);
   13031       break;
   13032    case 0xC3: /* RET */
   13033       dis_ret(&dres, 0);
   13034       DIP("ret\n");
   13035       break;
   13036 
   13037    case 0xCF: /* IRET */
   13038       /* Note, this is an extremely kludgey and limited implementation
   13039          of iret.  All it really does is:
   13040             popl %EIP; popl %CS; popl %EFLAGS.
   13041          %CS is set but ignored (as it is in (eg) popw %cs). */
   13042       t1 = newTemp(Ity_I32); /* ESP */
   13043       t2 = newTemp(Ity_I32); /* new EIP */
   13044       t3 = newTemp(Ity_I32); /* new CS */
   13045       t4 = newTemp(Ity_I32); /* new EFLAGS */
   13046       assign(t1, getIReg(4,R_ESP));
   13047       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
   13048       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
   13049       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
   13050       /* Get stuff off stack */
   13051       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
   13052       /* set %CS (which is ignored anyway) */
   13053       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
   13054       /* set %EFLAGS */
   13055       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
   13056       /* goto new EIP value */
   13057       jmp_treg(&dres, Ijk_Ret, t2);
   13058       vassert(dres.whatNext == Dis_StopHere);
   13059       DIP("iret (very kludgey)\n");
   13060       break;
   13061 
   13062    case 0xE8: /* CALL J4 */
   13063       d32 = getUDisp32(delta); delta += 4;
   13064       d32 += (guest_EIP_bbstart+delta);
   13065       /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
   13066       if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
   13067                                          && getIByte(delta) <= 0x5F) {
   13068          /* Specially treat the position-independent-code idiom
   13069                  call X
   13070               X: popl %reg
   13071             as
   13072                  movl %eip, %reg.
   13073             since this generates better code, but for no other reason. */
   13074          Int archReg = getIByte(delta) - 0x58;
   13075          /* vex_printf("-- fPIC thingy\n"); */
   13076          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
   13077          delta++; /* Step over the POP */
   13078          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
   13079       } else {
   13080          /* The normal sequence for a call. */
   13081          t1 = newTemp(Ity_I32);
   13082          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   13083          putIReg(4, R_ESP, mkexpr(t1));
   13084          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
   13085          if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
   13086             /* follow into the call target. */
   13087             dres.whatNext   = Dis_ResteerU;
   13088             dres.continueAt = (Addr64)(Addr32)d32;
   13089          } else {
   13090             jmp_lit(&dres, Ijk_Call, d32);
   13091             vassert(dres.whatNext == Dis_StopHere);
   13092          }
   13093          DIP("call 0x%x\n",d32);
   13094       }
   13095       break;
   13096 
   13097 //--    case 0xC8: /* ENTER */
   13098 //--       d32 = getUDisp16(eip); eip += 2;
   13099 //--       abyte = getIByte(delta); delta++;
   13100 //--
   13101 //--       vg_assert(sz == 4);
   13102 //--       vg_assert(abyte == 0);
   13103 //--
   13104 //--       t1 = newTemp(cb); t2 = newTemp(cb);
   13105 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   13106 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   13107 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   13108 //--       uLiteral(cb, sz);
   13109 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   13110 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   13111 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   13112 //--       if (d32) {
   13113 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   13114 //--          uLiteral(cb, d32);
   13115 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   13116 //--       }
   13117 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   13118 //--       break;
   13119 
   13120    case 0xC9: /* LEAVE */
   13121       vassert(sz == 4);
   13122       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13123       assign(t1, getIReg(4,R_EBP));
   13124       /* First PUT ESP looks redundant, but need it because ESP must
   13125          always be up-to-date for Memcheck to work... */
   13126       putIReg(4, R_ESP, mkexpr(t1));
   13127       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   13128       putIReg(4, R_EBP, mkexpr(t2));
   13129       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
   13130       DIP("leave\n");
   13131       break;
   13132 
   13133    /* ---------------- Misc weird-ass insns --------------- */
   13134 
   13135    case 0x27: /* DAA */
   13136    case 0x2F: /* DAS */
   13137    case 0x37: /* AAA */
   13138    case 0x3F: /* AAS */
   13139       /* An ugly implementation for some ugly instructions.  Oh
   13140 	 well. */
   13141       if (sz != 4) goto decode_failure;
   13142       t1 = newTemp(Ity_I32);
   13143       t2 = newTemp(Ity_I32);
   13144       /* Make up a 32-bit value (t1), with the old value of AX in the
   13145          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13146          bits. */
   13147       assign(t1,
   13148              binop(Iop_16HLto32,
   13149                    unop(Iop_32to16,
   13150                         mk_x86g_calculate_eflags_all()),
   13151                    getIReg(2, R_EAX)
   13152             ));
   13153       /* Call the helper fn, to get a new AX and OSZACP value, and
   13154          poke both back into the guest state.  Also pass the helper
   13155          the actual opcode so it knows which of the 4 instructions it
   13156          is doing the computation for. */
   13157       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
   13158       assign(t2,
   13159               mkIRExprCCall(
   13160                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
   13161                  &x86g_calculate_daa_das_aaa_aas,
   13162                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13163             ));
   13164      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13165 
   13166      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13167      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13168      stmt( IRStmt_Put( OFFB_CC_DEP1,
   13169                        binop(Iop_And32,
   13170                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13171                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13172                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13173                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13174                             )
   13175                       )
   13176          );
   13177      /* Set NDEP even though it isn't used.  This makes redundant-PUT
   13178         elimination of previous stores to this field work better. */
   13179      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13180      switch (opc) {
   13181         case 0x27: DIP("daa\n"); break;
   13182         case 0x2F: DIP("das\n"); break;
   13183         case 0x37: DIP("aaa\n"); break;
   13184         case 0x3F: DIP("aas\n"); break;
   13185         default: vassert(0);
   13186      }
   13187      break;
   13188 
   13189    case 0xD4: /* AAM */
   13190    case 0xD5: /* AAD */
   13191       d32 = getIByte(delta); delta++;
   13192       if (sz != 4 || d32 != 10) goto decode_failure;
   13193       t1 = newTemp(Ity_I32);
   13194       t2 = newTemp(Ity_I32);
   13195       /* Make up a 32-bit value (t1), with the old value of AX in the
   13196          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13197          bits. */
   13198       assign(t1,
   13199              binop(Iop_16HLto32,
   13200                    unop(Iop_32to16,
   13201                         mk_x86g_calculate_eflags_all()),
   13202                    getIReg(2, R_EAX)
   13203             ));
   13204       /* Call the helper fn, to get a new AX and OSZACP value, and
   13205          poke both back into the guest state.  Also pass the helper
   13206          the actual opcode so it knows which of the 2 instructions it
   13207          is doing the computation for. */
   13208       assign(t2,
   13209               mkIRExprCCall(
   13210                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
   13211                  &x86g_calculate_aad_aam,
   13212                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13213             ));
   13214       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13215 
   13216       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13217       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13218       stmt( IRStmt_Put( OFFB_CC_DEP1,
   13219                         binop(Iop_And32,
   13220                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13221                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13222                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13223                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13224                              )
   13225                        )
   13226           );
   13227       /* Set NDEP even though it isn't used.  This makes
   13228          redundant-PUT elimination of previous stores to this field
   13229          work better. */
   13230       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13231 
   13232       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   13233       break;
   13234 
   13235    /* ------------------------ CWD/CDQ -------------------- */
   13236 
   13237    case 0x98: /* CBW */
   13238       if (sz == 4) {
   13239          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
   13240          DIP("cwde\n");
   13241       } else {
   13242          vassert(sz == 2);
   13243          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
   13244          DIP("cbw\n");
   13245       }
   13246       break;
   13247 
   13248    case 0x99: /* CWD/CDQ: fill (E)DX with copies of the sign bit of (E)AX */
   13249       ty = szToITy(sz);
   13250       putIReg(sz, R_EDX,
   13251                   binop(mkSizedOp(ty,Iop_Sar8),
   13252                         getIReg(sz, R_EAX),
   13253                         mkU8(sz == 2 ? 15 : 31)) ); /* shift by width-1 */
   13254       DIP(sz == 2 ? "cwd\n" : "cdq\n"); /* was "cwdq"/"cdqq", not mnemonics */
   13255       break;
   13256 
   13257    /* ------------------------ FPU ops -------------------- */
   13258 
   13259    case 0x9E: /* SAHF */
   13260       codegen_SAHF();
   13261       DIP("sahf\n");
   13262       break;
   13263 
   13264    case 0x9F: /* LAHF */
   13265       codegen_LAHF();
   13266       DIP("lahf\n");
   13267       break;
   13268 
   13269    case 0x9B: /* FWAIT */
   13270       /* ignore? */
   13271       DIP("fwait\n");
   13272       break;
   13273 
   13274    case 0xD8:
   13275    case 0xD9:
   13276    case 0xDA:
   13277    case 0xDB:
   13278    case 0xDC:
   13279    case 0xDD:
   13280    case 0xDE:
   13281    case 0xDF: {
   13282       Int  delta0    = delta;
   13283       Bool decode_OK = False;
   13284       delta = dis_FPU ( &decode_OK, sorb, delta );
   13285       if (!decode_OK) {
   13286          delta = delta0;
   13287          goto decode_failure;
   13288       }
   13289       break;
   13290    }
   13291 
   13292    /* ------------------------ INC & DEC ------------------ */
   13293 
   13294    case 0x40: /* INC eAX */
   13295    case 0x41: /* INC eCX */
   13296    case 0x42: /* INC eDX */
   13297    case 0x43: /* INC eBX */
   13298    case 0x44: /* INC eSP */
   13299    case 0x45: /* INC eBP */
   13300    case 0x46: /* INC eSI */
   13301    case 0x47: /* INC eDI */
   13302       vassert(sz == 2 || sz == 4);
   13303       ty = szToITy(sz);
   13304       t1 = newTemp(ty);
   13305       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
   13306                         getIReg(sz, (UInt)(opc - 0x40)),
   13307                         mkU(ty,1)) );
   13308       setFlags_INC_DEC( True, t1, ty );
   13309       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
   13310       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
   13311       break;
   13312 
   13313    case 0x48: /* DEC eAX */
   13314    case 0x49: /* DEC eCX */
   13315    case 0x4A: /* DEC eDX */
   13316    case 0x4B: /* DEC eBX */
   13317    case 0x4C: /* DEC eSP */
   13318    case 0x4D: /* DEC eBP */
   13319    case 0x4E: /* DEC eSI */
   13320    case 0x4F: /* DEC eDI */
   13321       vassert(sz == 2 || sz == 4);
   13322       ty = szToITy(sz);
   13323       t1 = newTemp(ty);
   13324       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
   13325                         getIReg(sz, (UInt)(opc - 0x48)),
   13326                         mkU(ty,1)) );
   13327       setFlags_INC_DEC( False, t1, ty );
   13328       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
   13329       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
   13330       break;
   13331 
   13332    /* ------------------------ INT ------------------------ */
   13333 
   13334    case 0xCC: /* INT 3 */
   13335       jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
   13336       vassert(dres.whatNext == Dis_StopHere);
   13337       DIP("int $0x3\n");
   13338       break;
   13339 
   13340    case 0xCD: /* INT imm8 */
   13341       d32 = getIByte(delta); delta++;
   13342 
   13343       /* For any of the cases where we emit a jump (that is, for all
   13344          currently handled cases), it's important that all ArchRegs
   13345          carry their up-to-date value at this point.  So we declare an
   13346          end-of-block here, which forces any TempRegs caching ArchRegs
   13347          to be flushed. */
   13348 
   13349       /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
   13350          restart of this instruction (hence the "-2" two lines below,
   13351          to get the restart EIP to be this instruction).  This is
   13352          probably Linux-specific and it would be more correct to only
   13353          do this if the VexAbiInfo says that is what we should do.
   13354          This used to handle just 0x40-0x43; Jikes RVM uses a larger
   13355          range (0x3F-0x49), and this allows some slack as well. */
   13356       if (d32 >= 0x3F && d32 <= 0x4F) {
   13357          jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
   13358          vassert(dres.whatNext == Dis_StopHere);
   13359          DIP("int $0x%x\n", (Int)d32);
   13360          break;
   13361       }
   13362 
   13363       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
   13364          (darwin syscalls).  As part of this, note where we are, so we
   13365          can back up the guest to this point if the syscall needs to
   13366          be restarted. */
   13367       if (d32 == 0x80) {
   13368          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13369                            mkU32(guest_EIP_curr_instr) ) );
   13370          jmp_lit(&dres, Ijk_Sys_int128, ((Addr32)guest_EIP_bbstart)+delta);
   13371          vassert(dres.whatNext == Dis_StopHere);
   13372          DIP("int $0x80\n");
   13373          break;
   13374       }
   13375       if (d32 == 0x81) {
   13376          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13377                            mkU32(guest_EIP_curr_instr) ) );
   13378          jmp_lit(&dres, Ijk_Sys_int129, ((Addr32)guest_EIP_bbstart)+delta);
   13379          vassert(dres.whatNext == Dis_StopHere);
   13380          DIP("int $0x81\n");
   13381          break;
   13382       }
   13383       if (d32 == 0x82) {
   13384          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13385                            mkU32(guest_EIP_curr_instr) ) );
   13386          jmp_lit(&dres, Ijk_Sys_int130, ((Addr32)guest_EIP_bbstart)+delta);
   13387          vassert(dres.whatNext == Dis_StopHere);
   13388          DIP("int $0x82\n");
   13389          break;
   13390       }
   13391 
   13392       /* none of the above */
   13393       goto decode_failure;
   13394 
   13395    /* ------------------------ Jcond, byte offset --------- */
   13396 
   13397    case 0xEB: /* Jb (jump, byte offset) */
   13398       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13399       delta++;
   13400       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13401          dres.whatNext   = Dis_ResteerU;
   13402          dres.continueAt = (Addr64)(Addr32)d32;
   13403       } else {
   13404          jmp_lit(&dres, Ijk_Boring, d32);
   13405          vassert(dres.whatNext == Dis_StopHere);
   13406       }
   13407       DIP("jmp-8 0x%x\n", d32);
   13408       break;
   13409 
   13410    case 0xE9: /* Jv (jump, 16/32 offset) */
   13411       vassert(sz == 4); /* JRS added 2004 July 11 */
   13412       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
   13413       delta += sz;
   13414       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13415          dres.whatNext   = Dis_ResteerU;
   13416          dres.continueAt = (Addr64)(Addr32)d32;
   13417       } else {
   13418          jmp_lit(&dres, Ijk_Boring, d32);
   13419          vassert(dres.whatNext == Dis_StopHere);
   13420       }
   13421       DIP("jmp 0x%x\n", d32);
   13422       break;
   13423 
   13424    case 0x70: /* JOb (jump overflow) */
   13425    case 0x71: /* JNOb (jump not overflow) */
   13426    case 0x72: /* JBb/JNAEb (jump below) */
   13427    case 0x73: /* JNBb/JAEb (jump not below) */
   13428    case 0x74: /* JZb/JEb (jump zero) */
   13429    case 0x75: /* JNZb/JNEb (jump not zero) */
   13430    case 0x76: /* JBEb/JNAb (jump below or equal) */
   13431    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   13432    case 0x78: /* JSb (jump negative) */
   13433    case 0x79: /* JNSb (jump not negative) */
   13434    case 0x7A: /* JP (jump parity even) */
   13435    case 0x7B: /* JNP/JPO (jump parity odd) */
   13436    case 0x7C: /* JLb/JNGEb (jump less) */
   13437    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   13438    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   13439    case 0x7F: /* JGb/JNLEb (jump greater) */
   13440     { Int    jmpDelta;
   13441       const HChar* comment  = "";
   13442       jmpDelta = (Int)getSDisp8(delta);
   13443       vassert(-128 <= jmpDelta && jmpDelta < 128);
   13444       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
   13445       delta++; /* step over the disp8; delta now denotes the next insn */
   13446       if (resteerCisOk
   13447           && vex_control.guest_chase_cond
   13448           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13449           && jmpDelta < 0
   13450           && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13451          /* Speculation: assume this backward branch is taken.  So we
   13452             need to emit a side-exit to the insn following this one,
   13453             on the negation of the condition, and continue at the
   13454             branch target address (d32).  If we wind up back at the
   13455             first instruction of the trace, just stop; it's better to
   13456             let the IR loop unroller handle that case. */
   13457          stmt( IRStmt_Exit(
   13458                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
   13459                   Ijk_Boring,
   13460                   IRConst_U32(guest_EIP_bbstart+delta),
   13461                   OFFB_EIP ) );
   13462          dres.whatNext   = Dis_ResteerC;
   13463          dres.continueAt = (Addr64)(Addr32)d32;
   13464          comment = "(assumed taken)";
   13465       }
   13466       else
   13467       if (resteerCisOk
   13468           && vex_control.guest_chase_cond
   13469           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13470           && jmpDelta >= 0
   13471           && resteerOkFn( callback_opaque,
   13472                           (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   13473          /* Speculation: assume this forward branch is not taken.  So
   13474             we need to emit a side-exit to d32 (the dest) and continue
   13475             disassembling at the insn immediately following this
   13476             one. */
   13477          stmt( IRStmt_Exit(
   13478                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
   13479                   Ijk_Boring,
   13480                   IRConst_U32(d32),
   13481                   OFFB_EIP ) );
   13482          dres.whatNext   = Dis_ResteerC;
   13483          dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   13484          comment = "(assumed not taken)";
   13485       }
   13486       else {
   13487          /* Conservative default translation - end the block at this
   13488             point. */
   13489          jcc_01( &dres, (X86Condcode)(opc - 0x70),
   13490                  (Addr32)(guest_EIP_bbstart+delta), d32);
   13491          vassert(dres.whatNext == Dis_StopHere);
   13492       }
   13493       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
   13494       break;
   13495     }
   13496 
   13497    case 0xE3: /* JECXZ (for JCXZ see above) */
   13498       if (sz != 4) goto decode_failure;
   13499       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13500       delta ++;
   13501       stmt( IRStmt_Exit(
   13502                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
   13503             Ijk_Boring,
   13504             IRConst_U32(d32),
   13505             OFFB_EIP
   13506           ));
   13507       DIP("jecxz 0x%x\n", d32);
   13508       break;
   13509 
   13510    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   13511    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   13512    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   13513     { /* Again, the docs say this uses ECX/CX as a count depending on
   13514          the address size override, not the operand one.  Since we
   13515          don't handle address size overrides, I guess that means
   13516          ECX. */
   13517       IRExpr* zbit  = NULL;
   13518       IRExpr* count = NULL;
   13519       IRExpr* cond  = NULL;
   13520       const HChar* xtra = NULL;
   13521 
   13522       if (sz != 4) goto decode_failure;
   13523       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13524       delta++;
   13525       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
   13526 
   13527       count = getIReg(4,R_ECX);
   13528       cond = binop(Iop_CmpNE32, count, mkU32(0));
   13529       switch (opc) {
   13530          case 0xE2:
   13531             xtra = "";
   13532             break;
   13533          case 0xE1:
   13534             xtra = "e";
   13535             zbit = mk_x86g_calculate_condition( X86CondZ );
   13536 	    cond = mkAnd1(cond, zbit);
   13537             break;
   13538          case 0xE0:
   13539             xtra = "ne";
   13540             zbit = mk_x86g_calculate_condition( X86CondNZ );
   13541 	    cond = mkAnd1(cond, zbit);
   13542             break;
   13543          default:
   13544 	    vassert(0);
   13545       }
   13546       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
   13547 
   13548       DIP("loop%s 0x%x\n", xtra, d32);
   13549       break;
   13550     }
   13551 
   13552    /* ------------------------ IMUL ----------------------- */
   13553 
   13554    case 0x69: /* IMUL Iv, Ev, Gv */
   13555       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
   13556       break;
   13557    case 0x6B: /* IMUL Ib, Ev, Gv */
   13558       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
   13559       break;
   13560 
   13561    /* ------------------------ MOV ------------------------ */
   13562 
   13563    case 0x88: /* MOV Gb,Eb */
   13564       delta = dis_mov_G_E(sorb, 1, delta);
   13565       break;
   13566 
   13567    case 0x89: /* MOV Gv,Ev */
   13568       delta = dis_mov_G_E(sorb, sz, delta);
   13569       break;
   13570 
   13571    case 0x8A: /* MOV Eb,Gb */
   13572       delta = dis_mov_E_G(sorb, 1, delta);
   13573       break;
   13574 
   13575    case 0x8B: /* MOV Ev,Gv */
   13576       delta = dis_mov_E_G(sorb, sz, delta);
   13577       break;
   13578 
   13579    case 0x8D: /* LEA M,Gv */
   13580       if (sz != 4)
   13581          goto decode_failure;
   13582       modrm = getIByte(delta);
   13583       if (epartIsReg(modrm))
   13584          goto decode_failure;
   13585       /* NOTE!  this is the one place where a segment override prefix
   13586          has no effect on the address calculation.  Therefore we pass
   13587          zero instead of sorb here. */
   13588       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
   13589       delta += alen;
   13590       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
   13591       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   13592                             nameIReg(sz,gregOfRM(modrm)));
   13593       break;
   13594 
   13595    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   13596       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   13597       break;
   13598 
   13599    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   13600       delta = dis_mov_Ew_Sw(sorb, delta);
   13601       break;
   13602 
   13603    case 0xA0: /* MOV Ob,AL */
   13604       sz = 1;
   13605       /* Fall through ... */
   13606    case 0xA1: /* MOV Ov,eAX */
   13607       d32 = getUDisp32(delta); delta += 4;
   13608       ty = szToITy(sz);
   13609       addr = newTemp(Ity_I32);
   13610       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13611       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
   13612       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
   13613                                 d32, nameIReg(sz,R_EAX));
   13614       break;
   13615 
   13616    case 0xA2: /* MOV AL,Ob */
   13617       sz = 1;
   13618       /* Fall through ... */
   13619    case 0xA3: /* MOV eAX,Ov */
   13620       d32 = getUDisp32(delta); delta += 4;
   13621       ty = szToITy(sz);
   13622       addr = newTemp(Ity_I32);
   13623       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13624       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
   13625       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
   13626                                 sorbTxt(sorb), d32);
   13627       break;
   13628 
   13629    case 0xB0: /* MOV imm,AL */
   13630    case 0xB1: /* MOV imm,CL */
   13631    case 0xB2: /* MOV imm,DL */
   13632    case 0xB3: /* MOV imm,BL */
   13633    case 0xB4: /* MOV imm,AH */
   13634    case 0xB5: /* MOV imm,CH */
   13635    case 0xB6: /* MOV imm,DH */
   13636    case 0xB7: /* MOV imm,BH */
   13637       d32 = getIByte(delta); delta += 1;
   13638       putIReg(1, opc-0xB0, mkU8(d32));
   13639       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
   13640       break;
   13641 
   13642    case 0xB8: /* MOV imm,eAX */
   13643    case 0xB9: /* MOV imm,eCX */
   13644    case 0xBA: /* MOV imm,eDX */
   13645    case 0xBB: /* MOV imm,eBX */
   13646    case 0xBC: /* MOV imm,eSP */
   13647    case 0xBD: /* MOV imm,eBP */
   13648    case 0xBE: /* MOV imm,eSI */
   13649    case 0xBF: /* MOV imm,eDI */
   13650       d32 = getUDisp(sz,delta); delta += sz;
   13651       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
   13652       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
   13653       break;
   13654 
   13655    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   13656       sz = 1;
   13657       goto maybe_do_Mov_I_E;
   13658    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   13659       goto maybe_do_Mov_I_E;
   13660 
   13661    maybe_do_Mov_I_E:
   13662       modrm = getIByte(delta);
   13663       if (gregOfRM(modrm) == 0) {
   13664          if (epartIsReg(modrm)) {
   13665             delta++; /* mod/rm byte */
   13666             d32 = getUDisp(sz,delta); delta += sz;
   13667             putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
   13668             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
   13669                                      nameIReg(sz,eregOfRM(modrm)));
   13670          } else {
   13671             addr = disAMode ( &alen, sorb, delta, dis_buf );
   13672             delta += alen;
   13673             d32 = getUDisp(sz,delta); delta += sz;
   13674             storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
   13675             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   13676          }
   13677          break;
   13678       }
   13679       goto decode_failure;
   13680 
   13681    /* ------------------------ opl imm, A ----------------- */
   13682 
   13683    case 0x04: /* ADD Ib, AL */
   13684       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
   13685       break;
   13686    case 0x05: /* ADD Iv, eAX */
   13687       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
   13688       break;
   13689 
   13690    case 0x0C: /* OR Ib, AL */
   13691       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
   13692       break;
   13693    case 0x0D: /* OR Iv, eAX */
   13694       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   13695       break;
   13696 
   13697    case 0x14: /* ADC Ib, AL */
   13698       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
   13699       break;
   13700    case 0x15: /* ADC Iv, eAX */
   13701       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   13702       break;
   13703 
   13704    case 0x1C: /* SBB Ib, AL */
   13705       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   13706       break;
   13707    case 0x1D: /* SBB Iv, eAX */
   13708       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   13709       break;
   13710 
   13711    case 0x24: /* AND Ib, AL */
   13712       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
   13713       break;
   13714    case 0x25: /* AND Iv, eAX */
   13715       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   13716       break;
   13717 
   13718    case 0x2C: /* SUB Ib, AL */
   13719       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
   13720       break;
   13721    case 0x2D: /* SUB Iv, eAX */
   13722       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   13723       break;
   13724 
   13725    case 0x34: /* XOR Ib, AL */
   13726       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
   13727       break;
   13728    case 0x35: /* XOR Iv, eAX */
   13729       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   13730       break;
   13731 
   13732    case 0x3C: /* CMP Ib, AL */
   13733       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
   13734       break;
   13735    case 0x3D: /* CMP Iv, eAX */
   13736       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   13737       break;
   13738 
   13739    case 0xA8: /* TEST Ib, AL */
   13740       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
   13741       break;
   13742    case 0xA9: /* TEST Iv, eAX */
   13743       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   13744       break;
   13745 
   13746    /* ------------------------ opl Ev, Gv ----------------- */
   13747 
   13748    case 0x02: /* ADD Eb,Gb */
   13749       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
   13750       break;
   13751    case 0x03: /* ADD Ev,Gv */
   13752       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
   13753       break;
   13754 
   13755    case 0x0A: /* OR Eb,Gb */
   13756       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
   13757       break;
   13758    case 0x0B: /* OR Ev,Gv */
   13759       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
   13760       break;
   13761 
   13762    case 0x12: /* ADC Eb,Gb */
   13763       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
   13764       break;
   13765    case 0x13: /* ADC Ev,Gv */
   13766       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
   13767       break;
   13768 
   13769    case 0x1A: /* SBB Eb,Gb */
   13770       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
   13771       break;
   13772    case 0x1B: /* SBB Ev,Gv */
   13773       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
   13774       break;
   13775 
   13776    case 0x22: /* AND Eb,Gb */
   13777       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
   13778       break;
   13779    case 0x23: /* AND Ev,Gv */
   13780       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
   13781       break;
   13782 
   13783    case 0x2A: /* SUB Eb,Gb */
   13784       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
   13785       break;
   13786    case 0x2B: /* SUB Ev,Gv */
   13787       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
   13788       break;
   13789 
   13790    case 0x32: /* XOR Eb,Gb */
   13791       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
   13792       break;
   13793    case 0x33: /* XOR Ev,Gv */
   13794       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
   13795       break;
   13796 
   13797    case 0x3A: /* CMP Eb,Gb */
   13798       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
   13799       break;
   13800    case 0x3B: /* CMP Ev,Gv */
   13801       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
   13802       break;
   13803 
   13804    case 0x84: /* TEST Eb,Gb */
   13805       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
   13806       break;
   13807    case 0x85: /* TEST Ev,Gv */
   13808       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
   13809       break;
   13810 
   13811    /* ------------------------ opl Gv, Ev ----------------- */
   13812 
   13813    case 0x00: /* ADD Gb,Eb */
   13814       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13815                             Iop_Add8, True, 1, delta, "add" );
   13816       break;
   13817    case 0x01: /* ADD Gv,Ev */
   13818       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13819                             Iop_Add8, True, sz, delta, "add" );
   13820       break;
   13821 
   13822    case 0x08: /* OR Gb,Eb */
   13823       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13824                             Iop_Or8, True, 1, delta, "or" );
   13825       break;
   13826    case 0x09: /* OR Gv,Ev */
   13827       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13828                             Iop_Or8, True, sz, delta, "or" );
   13829       break;
   13830 
   13831    case 0x10: /* ADC Gb,Eb */
   13832       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13833                             Iop_Add8, True, 1, delta, "adc" );
   13834       break;
   13835    case 0x11: /* ADC Gv,Ev */
   13836       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13837                             Iop_Add8, True, sz, delta, "adc" );
   13838       break;
   13839 
   13840    case 0x18: /* SBB Gb,Eb */
   13841       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13842                             Iop_Sub8, True, 1, delta, "sbb" );
   13843       break;
   13844    case 0x19: /* SBB Gv,Ev */
   13845       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13846                             Iop_Sub8, True, sz, delta, "sbb" );
   13847       break;
   13848 
   13849    case 0x20: /* AND Gb,Eb */
   13850       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13851                             Iop_And8, True, 1, delta, "and" );
   13852       break;
   13853    case 0x21: /* AND Gv,Ev */
   13854       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13855                             Iop_And8, True, sz, delta, "and" );
   13856       break;
   13857 
   13858    case 0x28: /* SUB Gb,Eb */
   13859       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13860                             Iop_Sub8, True, 1, delta, "sub" );
   13861       break;
   13862    case 0x29: /* SUB Gv,Ev */
   13863       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13864                             Iop_Sub8, True, sz, delta, "sub" );
   13865       break;
   13866 
   13867    case 0x30: /* XOR Gb,Eb */
   13868       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13869                             Iop_Xor8, True, 1, delta, "xor" );
   13870       break;
   13871    case 0x31: /* XOR Gv,Ev */
   13872       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13873                             Iop_Xor8, True, sz, delta, "xor" );
   13874       break;
   13875 
   13876    case 0x38: /* CMP Gb,Eb */
   13877       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13878                             Iop_Sub8, False, 1, delta, "cmp" );
   13879       break;
   13880    case 0x39: /* CMP Gv,Ev */
   13881       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13882                             Iop_Sub8, False, sz, delta, "cmp" );
   13883       break;
   13884 
   13885    /* ------------------------ POP ------------------------ */
   13886 
   13887    case 0x58: /* POP eAX */
   13888    case 0x59: /* POP eCX */
   13889    case 0x5A: /* POP eDX */
   13890    case 0x5B: /* POP eBX */
   13891    case 0x5D: /* POP eBP */
   13892    case 0x5E: /* POP eSI */
   13893    case 0x5F: /* POP eDI */
   13894    case 0x5C: /* POP eSP */
   13895       vassert(sz == 2 || sz == 4);
   13896       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
   13897       assign(t2, getIReg(4, R_ESP));
   13898       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   13899       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13900       putIReg(sz, opc-0x58, mkexpr(t1));
   13901       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
   13902       break;
   13903 
   13904    case 0x9D: /* POPF */
   13905       vassert(sz == 2 || sz == 4);
   13906       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13907       assign(t2, getIReg(4, R_ESP));
   13908       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
   13909       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13910 
   13911       /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
   13912 	 value in t1. */
   13913       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
   13914                                  ((Addr32)guest_EIP_bbstart)+delta );
   13915 
   13916       DIP("popf%c\n", nameISize(sz));
   13917       break;
   13918 
   13919    case 0x61: /* POPA */
   13920       /* This is almost certainly wrong for sz==2.  So ... */
   13921       if (sz != 4) goto decode_failure;
   13922 
   13923       /* t5 is the old %ESP value. */
   13924       t5 = newTemp(Ity_I32);
   13925       assign( t5, getIReg(4, R_ESP) );
   13926 
   13927       /* Reload all the registers, except %esp. */
   13928       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   13929       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   13930       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   13931       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   13932       /* ignore saved %ESP */
   13933       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   13934       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   13935       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   13936 
   13937       /* and move %ESP back up */
   13938       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   13939 
   13940       DIP("popa%c\n", nameISize(sz));
   13941       break;
   13942 
   13943    case 0x8F: /* POPL/POPW m32 */
   13944      { Int    len;
   13945        UChar  rm = getIByte(delta);
   13946 
   13947        /* make sure this really is a POP: reg field 0, memory operand */
   13948        if (epartIsReg(rm) || gregOfRM(rm) != 0)
   13949           goto decode_failure;
   13950        /* and has correct size */
   13951        if (sz != 4 && sz != 2)
   13952           goto decode_failure;
   13953        ty = szToITy(sz);
   13954 
   13955        t1 = newTemp(Ity_I32); /* stack address */
   13956        t3 = newTemp(ty); /* data */
   13957        /* set t1 to ESP: t1 = ESP */
   13958        assign( t1, getIReg(4, R_ESP) );
   13959        /* load M[ESP] to virtual register t3: t3 = M[t1] */
   13960        assign( t3, loadLE(ty, mkexpr(t1)) );
   13961 
   13962        /* increase ESP; must be done before the STORE.  Intel manual says:
   13963             If the ESP register is used as a base register for addressing
   13964             a destination operand in memory, the POP instruction computes
   13965             the effective address of the operand after it increments the
   13966             ESP register.
   13967        */
   13968        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
   13969 
   13970        /* resolve MODR/M */
   13971        addr = disAMode ( &len, sorb, delta, dis_buf);
   13972        storeLE( mkexpr(addr), mkexpr(t3) );
   13973 
   13974        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
   13975 
   13976        delta += len;
   13977        break;
   13978      }
   13979 
   13980    case 0x1F: /* POP %DS */
   13981       dis_pop_segreg( R_DS, sz ); break;
   13982    case 0x07: /* POP %ES */
   13983       dis_pop_segreg( R_ES, sz ); break;
   13984    case 0x17: /* POP %SS */
   13985       dis_pop_segreg( R_SS, sz ); break;
   13986 
   13987    /* ------------------------ PUSH ----------------------- */
   13988 
   13989    case 0x50: /* PUSH eAX */
   13990    case 0x51: /* PUSH eCX */
   13991    case 0x52: /* PUSH eDX */
   13992    case 0x53: /* PUSH eBX */
   13993    case 0x55: /* PUSH eBP */
   13994    case 0x56: /* PUSH eSI */
   13995    case 0x57: /* PUSH eDI */
   13996    case 0x54: /* PUSH eSP */
   13997       /* This is the Right Way, in that the value to be pushed is
   13998          established before %esp is changed, so that pushl %esp
   13999          correctly pushes the old value. */
   14000       vassert(sz == 2 || sz == 4);
   14001       ty = sz==2 ? Ity_I16 : Ity_I32;
   14002       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
   14003       assign(t1, getIReg(sz, opc-0x50));
   14004       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
   14005       putIReg(4, R_ESP, mkexpr(t2) );
   14006       storeLE(mkexpr(t2),mkexpr(t1));
   14007       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
   14008       break;
   14009 
   14010 
   14011    case 0x68: /* PUSH Iv */
   14012       d32 = getUDisp(sz,delta); delta += sz;
   14013       goto do_push_I;
   14014    case 0x6A: /* PUSH Ib, sign-extended to sz */
   14015       d32 = getSDisp8(delta); delta += 1;
   14016       goto do_push_I;
   14017    do_push_I:
   14018       ty = szToITy(sz);
   14019       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
   14020       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   14021       putIReg(4, R_ESP, mkexpr(t1) );
   14022       /* stop mkU16 asserting if d32 is a negative 16-bit number
   14023          (bug #132813) */
   14024       if (ty == Ity_I16)
   14025          d32 &= 0xFFFF;
   14026       storeLE( mkexpr(t1), mkU(ty,d32) );
   14027       DIP("push%c $0x%x\n", nameISize(sz), d32);
   14028       break;
   14029 
   14030    case 0x9C: /* PUSHF */ {
   14031       vassert(sz == 2 || sz == 4);
   14032 
   14033       t1 = newTemp(Ity_I32);
   14034       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   14035       putIReg(4, R_ESP, mkexpr(t1) );
   14036 
   14037       /* Calculate OSZACP, and patch in fixed fields as per
   14038          Intel docs.
   14039          - bit 1 is always 1
   14040          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
   14041       */
   14042       t2 = newTemp(Ity_I32);
   14043       assign( t2, binop(Iop_Or32,
   14044                         mk_x86g_calculate_eflags_all(),
   14045                         mkU32( (1<<1)|(1<<9) ) ));
   14046 
   14047       /* Patch in the D flag.  This can simply be a copy of bit 10 of
   14048          baseBlock[OFFB_DFLAG]. */
   14049       t3 = newTemp(Ity_I32);
   14050       assign( t3, binop(Iop_Or32,
   14051                         mkexpr(t2),
   14052                         binop(Iop_And32,
   14053                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
   14054                               mkU32(1<<10)))
   14055             );
   14056 
   14057       /* And patch in the ID flag. */
   14058       t4 = newTemp(Ity_I32);
   14059       assign( t4, binop(Iop_Or32,
   14060                         mkexpr(t3),
   14061                         binop(Iop_And32,
   14062                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
   14063                                                mkU8(21)),
   14064                               mkU32(1<<21)))
   14065             );
   14066 
   14067       /* And patch in the AC flag. */
   14068       t5 = newTemp(Ity_I32);
   14069       assign( t5, binop(Iop_Or32,
   14070                         mkexpr(t4),
   14071                         binop(Iop_And32,
   14072                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
   14073                                                mkU8(18)),
   14074                               mkU32(1<<18)))
   14075             );
   14076 
   14077       /* if sz==2, the stored value needs to be narrowed. */
   14078       if (sz == 2)
   14079         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
   14080       else
   14081         storeLE( mkexpr(t1), mkexpr(t5) );
   14082 
   14083       DIP("pushf%c\n", nameISize(sz));
   14084       break;
   14085    }
   14086 
   14087    case 0x60: /* PUSHA */
   14088       /* This is almost certainly wrong for sz==2.  So ... */
   14089       if (sz != 4) goto decode_failure;
   14090 
   14091       /* This is the Right Way, in that the value to be pushed is
   14092          established before %esp is changed, so that pusha
   14093          correctly pushes the old %esp value.  New value of %esp is
   14094          pushed at start. */
   14095       /* t0 is the %ESP value we're going to push. */
   14096       t0 = newTemp(Ity_I32);
   14097       assign( t0, getIReg(4, R_ESP) );
   14098 
   14099       /* t5 will be the new %ESP value. */
   14100       t5 = newTemp(Ity_I32);
   14101       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   14102 
   14103       /* Update guest state before prodding memory. */
   14104       putIReg(4, R_ESP, mkexpr(t5));
   14105 
   14106       /* Dump all the registers. */
   14107       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   14108       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   14109       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   14110       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   14111       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   14112       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   14113       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   14114       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   14115 
   14116       DIP("pusha%c\n", nameISize(sz));
   14117       break;
   14118 
   14119    case 0x0E: /* PUSH %CS */
   14120       dis_push_segreg( R_CS, sz ); break;
   14121    case 0x1E: /* PUSH %DS */
   14122       dis_push_segreg( R_DS, sz ); break;
   14123    case 0x06: /* PUSH %ES */
   14124       dis_push_segreg( R_ES, sz ); break;
   14125    case 0x16: /* PUSH %SS */
   14126       dis_push_segreg( R_SS, sz ); break;
   14127 
   14128    /* ------------------------ SCAS et al ----------------- */
   14129 
   14130    case 0xA4: /* MOVS, no REP prefix */
   14131    case 0xA5:
   14132       if (sorb != 0)
   14133          goto decode_failure; /* else dis_string_op asserts */
   14134       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   14135       break;
   14136 
   14137   case 0xA6: /* CMPSb, no REP prefix */
   14138   case 0xA7:
   14139       if (sorb != 0)
   14140          goto decode_failure; /* else dis_string_op asserts */
   14141       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   14142       break;
   14143 
   14144    case 0xAA: /* STOS, no REP prefix */
   14145    case 0xAB:
   14146       if (sorb != 0)
   14147          goto decode_failure; /* else dis_string_op asserts */
   14148       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
   14149       break;
   14150 
   14151    case 0xAC: /* LODS, no REP prefix */
   14152    case 0xAD:
   14153       if (sorb != 0)
   14154          goto decode_failure; /* else dis_string_op asserts */
   14155       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
   14156       break;
   14157 
   14158    case 0xAE: /* SCAS, no REP prefix */
   14159    case 0xAF:
   14160       if (sorb != 0)
   14161          goto decode_failure; /* else dis_string_op asserts */
   14162       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   14163       break;
   14164 
   14165 
   14166    case 0xFC: /* CLD */
   14167       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
   14168       DIP("cld\n");
   14169       break;
   14170 
   14171    case 0xFD: /* STD */
   14172       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
   14173       DIP("std\n");
   14174       break;
   14175 
   14176    case 0xF8: /* CLC */
   14177    case 0xF9: /* STC */
   14178    case 0xF5: /* CMC */
   14179       t0 = newTemp(Ity_I32);
   14180       t1 = newTemp(Ity_I32);
   14181       assign( t0, mk_x86g_calculate_eflags_all() );
   14182       switch (opc) {
   14183          case 0xF8:
   14184             assign( t1, binop(Iop_And32, mkexpr(t0),
   14185                                          mkU32(~X86G_CC_MASK_C)));
   14186             DIP("clc\n");
   14187             break;
   14188          case 0xF9:
   14189             assign( t1, binop(Iop_Or32, mkexpr(t0),
   14190                                         mkU32(X86G_CC_MASK_C)));
   14191             DIP("stc\n");
   14192             break;
   14193          case 0xF5:
   14194             assign( t1, binop(Iop_Xor32, mkexpr(t0),
   14195                                          mkU32(X86G_CC_MASK_C)));
   14196             DIP("cmc\n");
   14197             break;
   14198          default:
   14199             vpanic("disInstr(x86)(clc/stc/cmc)");
   14200       }
   14201       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14202       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14203       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   14204       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   14205          elimination of previous stores to this field work better. */
   14206       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14207       break;
   14208 
   14209    case 0xD6: /* SALC */
   14210       t0 = newTemp(Ity_I32);
   14211       t1 = newTemp(Ity_I32);
   14212       assign( t0,  binop(Iop_And32,
   14213                          mk_x86g_calculate_eflags_c(),
   14214                          mkU32(1)) );
   14215       assign( t1, binop(Iop_Sar32,
   14216                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
   14217                         mkU8(31)) );
   14218       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
   14219       DIP("salc\n");
   14220       break;
   14221 
   14222    /* REPNE prefix insn */
   14223    case 0xF2: {
   14224       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14225       if (sorb != 0) goto decode_failure;
   14226       abyte = getIByte(delta); delta++;
   14227 
   14228       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14229 
   14230       switch (abyte) {
   14231       /* According to the Intel manual, "repne movs" should never occur, but
   14232        * in practice it has happened, so allow for it here... */
   14233       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   14234       case 0xA5:
   14235          dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
   14236                              guest_EIP_bbstart+delta, "repne movs" );
   14237          break;
   14238 
   14239       case 0xA6: sz = 1;   /* REPNE CMP<sz> */
   14240       case 0xA7:
   14241          dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
   14242                              guest_EIP_bbstart+delta, "repne cmps" );
   14243          break;
   14244 
   14245       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
   14246       case 0xAB:
   14247          dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
   14248                              guest_EIP_bbstart+delta, "repne stos" );
   14249          break;
   14250 
   14251       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   14252       case 0xAF:
   14253          dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
   14254                              guest_EIP_bbstart+delta, "repne scas" );
   14255          break;
   14256 
   14257       default:
   14258          goto decode_failure;
   14259       }
   14260       break;
   14261    }
   14262 
   14263    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
   14264       for the rest, it means REP) */
   14265    case 0xF3: {
   14266       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14267       abyte = getIByte(delta); delta++;
   14268 
   14269       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14270 
   14271       if (sorb != 0 && abyte != 0x0F) goto decode_failure;
   14272 
   14273       switch (abyte) {
   14274       case 0x0F:
   14275          switch (getIByte(delta)) {
   14276          /* On older CPUs, TZCNT behaves the same as BSF.  */
   14277          case 0xBC: /* REP BSF Gv,Ev */
   14278             delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
   14279             break;
   14280          /* On older CPUs, LZCNT behaves the same as BSR.  */
   14281          case 0xBD: /* REP BSR Gv,Ev */
   14282             delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
   14283             break;
   14284          default:
   14285             goto decode_failure;
   14286          }
   14287          break;
   14288 
   14289       case 0xA4: sz = 1;   /* REP MOVS<sz> */
   14290       case 0xA5:
   14291          dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
   14292                              guest_EIP_bbstart+delta, "rep movs" );
   14293          break;
   14294 
   14295       case 0xA6: sz = 1;   /* REPE CMPS<sz> */
   14296       case 0xA7:
   14297          dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
   14298                              guest_EIP_bbstart+delta, "repe cmps" );
   14299          break;
   14300 
   14301       case 0xAA: sz = 1;   /* REP STOS<sz> */
   14302       case 0xAB:
   14303          dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
   14304                              guest_EIP_bbstart+delta, "rep stos" );
   14305          break;
   14306 
   14307       case 0xAC: sz = 1;   /* REP LODS<sz> */
   14308       case 0xAD:
   14309          dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
   14310                              guest_EIP_bbstart+delta, "rep lods" );
   14311          break;
   14312 
   14313       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
   14314       case 0xAF:
   14315          dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
   14316                              guest_EIP_bbstart+delta, "repe scas" );
   14317          break;
   14318 
   14319       case 0x90:           /* REP NOP (PAUSE) */
   14320          /* a hint to the P4 re spin-wait loop */
   14321          DIP("rep nop (P4 pause)\n");
   14322          /* "observe" the hint.  The Vex client needs to be careful not
   14323             to cause very long delays as a result, though. */
   14324          jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
   14325          vassert(dres.whatNext == Dis_StopHere);
   14326          break;
   14327 
   14328       case 0xC3:           /* REP RET -- same as normal ret? */
   14329          dis_ret(&dres, 0);
   14330          DIP("rep ret\n");
   14331          break;
   14332 
   14333       default:
   14334          goto decode_failure;
   14335       }
   14336       break;
   14337    }
   14338 
   14339    /* ------------------------ XCHG ----------------------- */
   14340 
   14341    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   14342       prefix; hence it must be translated with an IRCAS (at least, the
   14343       memory variant). */
   14344    case 0x86: /* XCHG Gb,Eb */
   14345       sz = 1;
   14346       /* Fall through ... */
   14347    case 0x87: /* XCHG Gv,Ev */
   14348       modrm = getIByte(delta);
   14349       ty = szToITy(sz);
   14350       t1 = newTemp(ty); t2 = newTemp(ty);
   14351       if (epartIsReg(modrm)) {
   14352          assign(t1, getIReg(sz, eregOfRM(modrm)));
   14353          assign(t2, getIReg(sz, gregOfRM(modrm)));
   14354          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
   14355          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
   14356          delta++;
   14357          DIP("xchg%c %s, %s\n",
   14358              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
   14359                             nameIReg(sz,eregOfRM(modrm)));
   14360       } else {
   14361          *expect_CAS = True;
   14362          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14363          assign( t1, loadLE(ty,mkexpr(addr)) );
   14364          assign( t2, getIReg(sz,gregOfRM(modrm)) );
   14365          casLE( mkexpr(addr),
   14366                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   14367          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
   14368          delta += alen;
   14369          DIP("xchg%c %s, %s\n", nameISize(sz),
   14370                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
   14371       }
   14372       break;
   14373 
   14374    case 0x90: /* XCHG eAX,eAX */
   14375       DIP("nop\n");
   14376       break;
   14377    case 0x91: /* XCHG eAX,eCX */
   14378    case 0x92: /* XCHG eAX,eDX */
   14379    case 0x93: /* XCHG eAX,eBX */
   14380    case 0x94: /* XCHG eAX,eSP */
   14381    case 0x95: /* XCHG eAX,eBP */
   14382    case 0x96: /* XCHG eAX,eSI */
   14383    case 0x97: /* XCHG eAX,eDI */
   14384       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
   14385       break;
   14386 
   14387    /* ------------------------ XLAT ----------------------- */
   14388 
   14389    case 0xD7: /* XLAT */
   14390       if (sz != 4) goto decode_failure; /* sz == 2 (0x66 prefix) is architecturally valid, but is not handled here */
   14391       putIReg(
   14392          1,
   14393          R_EAX/*AL*/,
   14394          loadLE(Ity_I8,
   14395                 handleSegOverride(
   14396                    sorb,
   14397                    binop(Iop_Add32,
   14398                          getIReg(4, R_EBX),
   14399                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
   14400 
   14401       DIP("xlat%c [ebx]\n", nameISize(sz));
   14402       break;
   14403 
   14404    /* ------------------------ IN / OUT ----------------------- */
   14405 
   14406    case 0xE4: /* IN imm8, AL */
   14407       sz = 1;
   14408       t1 = newTemp(Ity_I32);
   14409       abyte = getIByte(delta); delta++;
   14410       assign(t1, mkU32( abyte & 0xFF ));
   14411       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14412       goto do_IN;
   14413    case 0xE5: /* IN imm8, eAX */
   14414       vassert(sz == 2 || sz == 4);
   14415       t1 = newTemp(Ity_I32);
   14416       abyte = getIByte(delta); delta++;
   14417       assign(t1, mkU32( abyte & 0xFF ));
   14418       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14419       goto do_IN;
   14420    case 0xEC: /* IN %DX, AL */
   14421       sz = 1;
   14422       t1 = newTemp(Ity_I32);
   14423       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14424       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14425                                          nameIReg(sz,R_EAX));
   14426       goto do_IN;
   14427    case 0xED: /* IN %DX, eAX */
   14428       vassert(sz == 2 || sz == 4);
   14429       t1 = newTemp(Ity_I32);
   14430       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14431       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14432                                          nameIReg(sz,R_EAX));
   14433       goto do_IN;
   14434    do_IN: {
   14435       /* At this point, sz indicates the width, and t1 is a 32-bit
   14436          value giving port number. */
   14437       IRDirty* d;
   14438       vassert(sz == 1 || sz == 2 || sz == 4);
   14439       ty = szToITy(sz);
   14440       t2 = newTemp(Ity_I32);
   14441       d = unsafeIRDirty_1_N(
   14442              t2,
   14443              0/*regparms*/,
   14444              "x86g_dirtyhelper_IN",
   14445              &x86g_dirtyhelper_IN,
   14446              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
   14447           );
   14448       /* do the call, dumping the result in t2. */
   14449       stmt( IRStmt_Dirty(d) );
   14450       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
   14451       break;
   14452    }
   14453 
   14454    case 0xE6: /* OUT AL, imm8 */
   14455       sz = 1;
   14456       t1 = newTemp(Ity_I32);
   14457       abyte = getIByte(delta); delta++;
   14458       assign( t1, mkU32( abyte & 0xFF ) );
   14459       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14460       goto do_OUT;
   14461    case 0xE7: /* OUT eAX, imm8 */
   14462       vassert(sz == 2 || sz == 4);
   14463       t1 = newTemp(Ity_I32);
   14464       abyte = getIByte(delta); delta++;
   14465       assign( t1, mkU32( abyte & 0xFF ) );
   14466       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14467       goto do_OUT;
   14468    case 0xEE: /* OUT AL, %DX */
   14469       sz = 1;
   14470       t1 = newTemp(Ity_I32);
   14471       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14472       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14473                                           nameIReg(2,R_EDX));
   14474       goto do_OUT;
   14475    case 0xEF: /* OUT eAX, %DX */
   14476       vassert(sz == 2 || sz == 4);
   14477       t1 = newTemp(Ity_I32);
   14478       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14479       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14480                                           nameIReg(2,R_EDX));
   14481       goto do_OUT;
   14482    do_OUT: {
   14483       /* At this point, sz indicates the width, and t1 is a 32-bit
   14484          value giving port number. */
   14485       IRDirty* d;
   14486       vassert(sz == 1 || sz == 2 || sz == 4);
   14487       ty = szToITy(sz);
   14488       d = unsafeIRDirty_0_N(
   14489              0/*regparms*/,
   14490              "x86g_dirtyhelper_OUT",
   14491              &x86g_dirtyhelper_OUT,
   14492              mkIRExprVec_3( mkexpr(t1),
   14493                             widenUto32( getIReg(sz, R_EAX) ),
   14494                             mkU32(sz) )
   14495           );
   14496       stmt( IRStmt_Dirty(d) );
   14497       break;
   14498    }
   14499 
   14500    /* ------------------------ (Grp1 extensions) ---------- */
   14501 
   14502    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
   14503                  case 0x80, but only in 32-bit mode. */
   14504       /* fallthru */
   14505    case 0x80: /* Grp1 Ib,Eb */
   14506       modrm = getIByte(delta);
   14507       am_sz = lengthAMode(delta);
   14508       sz    = 1;
   14509       d_sz  = 1;
   14510       d32   = getUChar(delta + am_sz);
   14511       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14512       break;
   14513 
   14514    case 0x81: /* Grp1 Iv,Ev */
   14515       modrm = getIByte(delta);
   14516       am_sz = lengthAMode(delta);
   14517       d_sz  = sz;
   14518       d32   = getUDisp(d_sz, delta + am_sz);
   14519       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14520       break;
   14521 
   14522    case 0x83: /* Grp1 Ib,Ev */
   14523       modrm = getIByte(delta);
   14524       am_sz = lengthAMode(delta);
   14525       d_sz  = 1;
   14526       d32   = getSDisp8(delta + am_sz);
   14527       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14528       break;
   14529 
   14530    /* ------------------------ (Grp2 extensions) ---------- */
   14531 
   14532    case 0xC0: { /* Grp2 Ib,Eb */
   14533       Bool decode_OK = True;
   14534       modrm = getIByte(delta);
   14535       am_sz = lengthAMode(delta);
   14536       d_sz  = 1;
   14537       d32   = getUChar(delta + am_sz);
   14538       sz    = 1;
   14539       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14540                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14541       if (!decode_OK)
   14542          goto decode_failure;
   14543       break;
   14544    }
   14545    case 0xC1: { /* Grp2 Ib,Ev */
   14546       Bool decode_OK = True;
   14547       modrm = getIByte(delta);
   14548       am_sz = lengthAMode(delta);
   14549       d_sz  = 1;
   14550       d32   = getUChar(delta + am_sz);
   14551       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14552                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14553       if (!decode_OK)
   14554          goto decode_failure;
   14555       break;
   14556    }
   14557    case 0xD0: { /* Grp2 1,Eb */
   14558       Bool decode_OK = True;
   14559       modrm = getIByte(delta);
   14560       am_sz = lengthAMode(delta);
   14561       d_sz  = 0;
   14562       d32   = 1;
   14563       sz    = 1;
   14564       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14565                          mkU8(d32), NULL, &decode_OK );
   14566       if (!decode_OK)
   14567          goto decode_failure;
   14568       break;
   14569    }
   14570    case 0xD1: { /* Grp2 1,Ev */
   14571       Bool decode_OK = True;
   14572       modrm = getUChar(delta);
   14573       am_sz = lengthAMode(delta);
   14574       d_sz  = 0;
   14575       d32   = 1;
   14576       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14577                          mkU8(d32), NULL, &decode_OK );
   14578       if (!decode_OK)
   14579          goto decode_failure;
   14580       break;
   14581    }
   14582    case 0xD2: { /* Grp2 CL,Eb */
   14583       Bool decode_OK = True;
   14584       modrm = getUChar(delta);
   14585       am_sz = lengthAMode(delta);
   14586       d_sz  = 0;
   14587       sz    = 1;
   14588       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14589                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14590       if (!decode_OK)
   14591          goto decode_failure;
   14592       break;
   14593    }
   14594    case 0xD3: { /* Grp2 CL,Ev */
   14595       Bool decode_OK = True;
   14596       modrm = getIByte(delta);
   14597       am_sz = lengthAMode(delta);
   14598       d_sz  = 0;
   14599       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14600                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14601       if (!decode_OK)
   14602          goto decode_failure;
   14603       break;
   14604    }
   14605 
   14606    /* ------------------------ (Grp3 extensions) ---------- */
   14607 
   14608    case 0xF6: { /* Grp3 Eb */
   14609       Bool decode_OK = True;
   14610       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
   14611       if (!decode_OK)
   14612          goto decode_failure;
   14613       break;
   14614    }
   14615    case 0xF7: { /* Grp3 Ev */
   14616       Bool decode_OK = True;
   14617       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
   14618       if (!decode_OK)
   14619          goto decode_failure;
   14620       break;
   14621    }
   14622 
   14623    /* ------------------------ (Grp4 extensions) ---------- */
   14624 
   14625    case 0xFE: { /* Grp4 Eb */
   14626       Bool decode_OK = True;
   14627       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
   14628       if (!decode_OK)
   14629          goto decode_failure;
   14630       break;
   14631    }
   14632 
   14633    /* ------------------------ (Grp5 extensions) ---------- */
   14634 
   14635    case 0xFF: { /* Grp5 Ev */
   14636       Bool decode_OK = True;
   14637       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
   14638       if (!decode_OK)
   14639          goto decode_failure;
   14640       break;
   14641    }
   14642 
   14643    /* ------------------------ Escapes to 2-byte opcodes -- */
   14644 
   14645    case 0x0F: {
   14646       opc = getIByte(delta); delta++;
   14647       switch (opc) {
   14648 
   14649       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   14650 
   14651       case 0xBA: { /* Grp8 Ib,Ev */
   14652          Bool decode_OK = False;
   14653          modrm = getUChar(delta);
   14654          am_sz = lengthAMode(delta);
   14655          d32   = getSDisp8(delta + am_sz);
   14656          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
   14657                                 am_sz, sz, d32, &decode_OK );
   14658          if (!decode_OK)
   14659             goto decode_failure;
   14660          break;
   14661       }
   14662 
   14663       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   14664 
   14665       case 0xBC: /* BSF Gv,Ev */
   14666          delta = dis_bs_E_G ( sorb, sz, delta, True );
   14667          break;
   14668       case 0xBD: /* BSR Gv,Ev */
   14669          delta = dis_bs_E_G ( sorb, sz, delta, False );
   14670          break;
   14671 
   14672       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   14673 
   14674       case 0xC8: /* BSWAP %eax */
   14675       case 0xC9:
   14676       case 0xCA:
   14677       case 0xCB:
   14678       case 0xCC:
   14679       case 0xCD:
   14680       case 0xCE:
   14681       case 0xCF: /* BSWAP %edi */
   14682          /* AFAICS from the Intel docs, this only exists at size 4. */
   14683          if (sz != 4) goto decode_failure;
   14684 
   14685          t1 = newTemp(Ity_I32);
   14686          assign( t1, getIReg(4, opc-0xC8) );
   14687          t2 = math_BSWAP(t1, Ity_I32);
   14688 
   14689          putIReg(4, opc-0xC8, mkexpr(t2));
   14690          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
   14691          break;
   14692 
   14693       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   14694 
   14695       case 0xA3: /* BT Gv,Ev */
   14696          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
   14697          break;
   14698       case 0xB3: /* BTR Gv,Ev */
   14699          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
   14700          break;
   14701       case 0xAB: /* BTS Gv,Ev */
   14702          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
   14703          break;
   14704       case 0xBB: /* BTC Gv,Ev */
   14705          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
   14706          break;
   14707 
   14708       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   14709 
   14710       case 0x40:
   14711       case 0x41:
   14712       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   14713       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   14714       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   14715       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   14716       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   14717       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   14718       case 0x48: /* CMOVSb (cmov negative) */
   14719       case 0x49: /* CMOVNSb (cmov not negative) */
   14720       case 0x4A: /* CMOVP (cmov parity even) */
   14721       case 0x4B: /* CMOVNP (cmov parity odd) */
   14722       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   14723       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   14724       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   14725       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   14726          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
   14727          break;
   14728 
   14729       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   14730 
   14731       case 0xB0: /* CMPXCHG Gb,Eb */
   14732          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
   14733          break;
   14734       case 0xB1: /* CMPXCHG Gv,Ev */
   14735          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
   14736          break;
   14737 
   14738       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
   14739          IRTemp expdHi    = newTemp(Ity_I32);
   14740          IRTemp expdLo    = newTemp(Ity_I32);
   14741          IRTemp dataHi    = newTemp(Ity_I32);
   14742          IRTemp dataLo    = newTemp(Ity_I32);
   14743          IRTemp oldHi     = newTemp(Ity_I32);
   14744          IRTemp oldLo     = newTemp(Ity_I32);
   14745          IRTemp flags_old = newTemp(Ity_I32);
   14746          IRTemp flags_new = newTemp(Ity_I32);
   14747          IRTemp success   = newTemp(Ity_I1);
   14748 
   14749          /* Translate this using a DCAS, even if there is no LOCK
   14750             prefix.  Life is too short to bother with generating two
   14751             different translations for the with/without-LOCK-prefix
   14752             cases. */
   14753          *expect_CAS = True;
   14754 
   14755 	 /* Decode, and generate address. */
   14756          if (sz != 4) goto decode_failure;
   14757          modrm = getIByte(delta);
   14758          if (epartIsReg(modrm)) goto decode_failure;
   14759          if (gregOfRM(modrm) != 1) goto decode_failure;
   14760          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14761          delta += alen;
   14762 
   14763          /* Get the expected and new values. */
   14764          assign( expdHi, getIReg(4,R_EDX) );
   14765          assign( expdLo, getIReg(4,R_EAX) );
   14766          assign( dataHi, getIReg(4,R_ECX) );
   14767          assign( dataLo, getIReg(4,R_EBX) );
   14768 
   14769          /* Do the DCAS */
   14770          stmt( IRStmt_CAS(
   14771                   mkIRCAS( oldHi, oldLo,
   14772                            Iend_LE, mkexpr(addr),
   14773                            mkexpr(expdHi), mkexpr(expdLo),
   14774                            mkexpr(dataHi), mkexpr(dataLo)
   14775                )));
   14776 
   14777          /* success when oldHi:oldLo == expdHi:expdLo */
   14778          assign( success,
   14779                  binop(Iop_CasCmpEQ32,
   14780                        binop(Iop_Or32,
   14781                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
   14782                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
   14783                        ),
   14784                        mkU32(0)
   14785                  ));
   14786 
   14787          /* If the DCAS is successful, that is to say oldHi:oldLo ==
   14788             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
   14789             which is where they came from originally.  Both the actual
   14790             contents of these two regs, and any shadow values, are
   14791             unchanged.  If the DCAS fails then we're putting into
   14792             EDX:EAX the value seen in memory. */
   14793          putIReg(4, R_EDX,
   14794                     IRExpr_ITE( mkexpr(success),
   14795                                 mkexpr(expdHi), mkexpr(oldHi)
   14796                 ));
   14797          putIReg(4, R_EAX,
   14798                     IRExpr_ITE( mkexpr(success),
   14799                                 mkexpr(expdLo), mkexpr(oldLo)
   14800                 ));
   14801 
   14802          /* Copy the success bit into the Z flag and leave the others
   14803             unchanged */
   14804          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
   14805          assign(
   14806             flags_new,
   14807             binop(Iop_Or32,
   14808                   binop(Iop_And32, mkexpr(flags_old),
   14809                                    mkU32(~X86G_CC_MASK_Z)),
   14810                   binop(Iop_Shl32,
   14811                         binop(Iop_And32,
   14812                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
   14813                         mkU8(X86G_CC_SHIFT_Z)) ));
   14814 
   14815          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14816          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   14817          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14818          /* Set NDEP even though it isn't used.  This makes
   14819             redundant-PUT elimination of previous stores to this field
   14820             work better. */
   14821          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14822 
   14823          /* Sheesh.  Aren't you glad it was me and not you that had to
   14824 	    write and validate all this grunge? */
   14825 
   14826 	 DIP("cmpxchg8b %s\n", dis_buf);
   14827 	 break;
   14828       }
   14829 
   14830       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   14831 
   14832       case 0xA2: { /* CPUID */
   14833          /* Uses one of the dirty helpers
   14834                void x86g_dirtyhelper_CPUID_{sse0,mmxext,sse1,sse2} ( VexGuestX86State* )
   14835             declared to mod eax and ecx, wr ebx and edx
   14836          */
   14837          IRDirty* d     = NULL;
   14838          void*    fAddr = NULL;
   14839          const HChar* fName = NULL;
   14840          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
   14841             fName = "x86g_dirtyhelper_CPUID_sse2";
   14842             fAddr = &x86g_dirtyhelper_CPUID_sse2;
   14843          }
   14844          else
   14845          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
   14846             fName = "x86g_dirtyhelper_CPUID_sse1";
   14847             fAddr = &x86g_dirtyhelper_CPUID_sse1;
   14848          }
   14849          else
   14850          if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
   14851             fName = "x86g_dirtyhelper_CPUID_mmxext";
   14852             fAddr = &x86g_dirtyhelper_CPUID_mmxext;
   14853          }
   14854          else
   14855          if (archinfo->hwcaps == 0/*no SSE*/) {
   14856             fName = "x86g_dirtyhelper_CPUID_sse0";
   14857             fAddr = &x86g_dirtyhelper_CPUID_sse0;
   14858          } else
   14859             vpanic("disInstr(x86)(cpuid)");
   14860 
   14861          vassert(fName); vassert(fAddr);
   14862          d = unsafeIRDirty_0_N ( 0/*regparms*/,
   14863                                  fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   14864          /* declare guest state effects */
   14865          d->nFxState = 4;
   14866          vex_bzero(&d->fxState, sizeof(d->fxState));
   14867          d->fxState[0].fx     = Ifx_Modify;
   14868          d->fxState[0].offset = OFFB_EAX;
   14869          d->fxState[0].size   = 4;
   14870          d->fxState[1].fx     = Ifx_Write;
   14871          d->fxState[1].offset = OFFB_EBX;
   14872          d->fxState[1].size   = 4;
   14873          d->fxState[2].fx     = Ifx_Modify;
   14874          d->fxState[2].offset = OFFB_ECX;
   14875          d->fxState[2].size   = 4;
   14876          d->fxState[3].fx     = Ifx_Write;
   14877          d->fxState[3].offset = OFFB_EDX;
   14878          d->fxState[3].size   = 4;
   14879          /* execute the dirty call, side-effecting guest state */
   14880          stmt( IRStmt_Dirty(d) );
   14881          /* CPUID is a serialising insn.  So, just in case someone is
   14882             using it as a memory fence ... */
   14883          stmt( IRStmt_MBE(Imbe_Fence) );
   14884          DIP("cpuid\n");
   14885          break;
   14886       }
   14887 
   14888 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
   14889 //--             goto decode_failure;
   14890 //--
   14891 //--          t1 = newTemp(cb);
   14892 //--          t2 = newTemp(cb);
   14893 //--          t3 = newTemp(cb);
   14894 //--          t4 = newTemp(cb);
   14895 //--          uInstr0(cb, CALLM_S, 0);
   14896 //--
   14897 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
   14898 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
   14899 //--
   14900 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
   14901 //--          uLiteral(cb, 0);
   14902 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
   14903 //--
   14904 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
   14905 //--          uLiteral(cb, 0);
   14906 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
   14907 //--
   14908 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
   14909 //--          uLiteral(cb, 0);
   14910 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
   14911 //--
   14912 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
   14913 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
   14914 //--
   14915 //--          uInstr1(cb, POP,   4, TempReg, t4);
   14916 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
   14917 //--
   14918 //--          uInstr1(cb, POP,   4, TempReg, t3);
   14919 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
   14920 //--
   14921 //--          uInstr1(cb, POP,   4, TempReg, t2);
   14922 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
   14923 //--
   14924 //--          uInstr1(cb, POP,   4, TempReg, t1);
   14925 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
   14926 //--
   14927 //--          uInstr0(cb, CALLM_E, 0);
   14928 //--          DIP("cpuid\n");
   14929 //--          break;
   14930 //--
   14931       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   14932 
   14933       case 0xB6: /* MOVZXb Eb,Gv */
   14934          if (sz != 2 && sz != 4)
   14935             goto decode_failure;
   14936          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
   14937          break;
   14938 
   14939       case 0xB7: /* MOVZXw Ew,Gv */
   14940          if (sz != 4)
   14941             goto decode_failure;
   14942          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
   14943          break;
   14944 
   14945       case 0xBE: /* MOVSXb Eb,Gv */
   14946          if (sz != 2 && sz != 4)
   14947             goto decode_failure;
   14948          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
   14949          break;
   14950 
   14951       case 0xBF: /* MOVSXw Ew,Gv */
   14952          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
   14953             goto decode_failure;
   14954          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
   14955          break;
   14956 
   14957 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   14958 //--
   14959 //--       case 0xC3: /* MOVNTI Gv,Ev */
   14960 //--          vg_assert(sz == 4);
   14961 //--          modrm = getUChar(eip);
   14962 //--          vg_assert(!epartIsReg(modrm));
   14963 //--          t1 = newTemp(cb);
   14964 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   14965 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   14966 //--          t2 = LOW24(pair);
   14967 //--          eip += HI8(pair);
   14968 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   14969 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   14970 //--          break;
   14971 
   14972       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   14973 
   14974       case 0xAF: /* IMUL Ev, Gv */
   14975          delta = dis_mul_E_G ( sorb, sz, delta );
   14976          break;
   14977 
   14978       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   14979 
   14980       case 0x1F:
   14981          modrm = getUChar(delta);
   14982          if (epartIsReg(modrm)) goto decode_failure;
   14983          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14984          delta += alen;
   14985          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   14986          break;
   14987 
   14988       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   14989       case 0x80:
   14990       case 0x81:
   14991       case 0x82: /* JBb/JNAEb (jump below) */
   14992       case 0x83: /* JNBb/JAEb (jump not below) */
   14993       case 0x84: /* JZb/JEb (jump zero) */
   14994       case 0x85: /* JNZb/JNEb (jump not zero) */
   14995       case 0x86: /* JBEb/JNAb (jump below or equal) */
   14996       case 0x87: /* JNBEb/JAb (jump not below or equal) */
   14997       case 0x88: /* JSb (jump negative) */
   14998       case 0x89: /* JSb (jump not negative) */
   14999       case 0x8A: /* JP (jump parity even) */
   15000       case 0x8B: /* JNP/JPO (jump parity odd) */
   15001       case 0x8C: /* JLb/JNGEb (jump less) */
   15002       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
   15003       case 0x8E: /* JLEb/JNGb (jump less or equal) */
   15004       case 0x8F: /* JGb/JNLEb (jump greater) */
   15005        { Int    jmpDelta;
   15006          const HChar* comment  = "";
   15007          jmpDelta = (Int)getUDisp32(delta);
   15008          d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
   15009          delta += 4;
   15010          if (resteerCisOk
   15011              && vex_control.guest_chase_cond
   15012              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   15013              && jmpDelta < 0
   15014              && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   15015             /* Speculation: assume this backward branch is taken.  So
   15016                we need to emit a side-exit to the insn following this
   15017                one, on the negation of the condition, and continue at
   15018                the branch target address (d32).  If we wind up back at
   15019                the first instruction of the trace, just stop; it's
   15020                better to let the IR loop unroller handle that case.*/
   15021             stmt( IRStmt_Exit(
   15022                      mk_x86g_calculate_condition((X86Condcode)
   15023                                                  (1 ^ (opc - 0x80))),
   15024                      Ijk_Boring,
   15025                      IRConst_U32(guest_EIP_bbstart+delta),
   15026                      OFFB_EIP ) );
   15027             dres.whatNext   = Dis_ResteerC;
   15028             dres.continueAt = (Addr64)(Addr32)d32;
   15029             comment = "(assumed taken)";
   15030          }
   15031          else
   15032          if (resteerCisOk
   15033              && vex_control.guest_chase_cond
   15034              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   15035              && jmpDelta >= 0
   15036              && resteerOkFn( callback_opaque,
   15037                              (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   15038             /* Speculation: assume this forward branch is not taken.
   15039                So we need to emit a side-exit to d32 (the dest) and
   15040                continue disassembling at the insn immediately
   15041                following this one. */
   15042             stmt( IRStmt_Exit(
   15043                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
   15044                      Ijk_Boring,
   15045                      IRConst_U32(d32),
   15046                      OFFB_EIP ) );
   15047             dres.whatNext   = Dis_ResteerC;
   15048             dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   15049             comment = "(assumed not taken)";
   15050          }
   15051          else {
   15052             /* Conservative default translation - end the block at
   15053                this point. */
   15054             jcc_01( &dres, (X86Condcode)(opc - 0x80),
   15055                     (Addr32)(guest_EIP_bbstart+delta), d32);
   15056             vassert(dres.whatNext == Dis_StopHere);
   15057          }
   15058          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
   15059          break;
   15060        }
   15061 
   15062       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   15063       case 0x31: { /* RDTSC */
   15064          IRTemp   val  = newTemp(Ity_I64);
   15065          IRExpr** args = mkIRExprVec_0();
   15066          IRDirty* d    = unsafeIRDirty_1_N (
   15067                             val,
   15068                             0/*regparms*/,
   15069                             "x86g_dirtyhelper_RDTSC",
   15070                             &x86g_dirtyhelper_RDTSC,
   15071                             args
   15072                          );
   15073          /* execute the dirty call, dumping the result in val. */
   15074          stmt( IRStmt_Dirty(d) );
   15075          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
   15076          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
   15077          DIP("rdtsc\n");
   15078          break;
   15079       }
   15080 
   15081       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   15082 
   15083       case 0xA1: /* POP %FS */
   15084          dis_pop_segreg( R_FS, sz ); break;
   15085       case 0xA9: /* POP %GS */
   15086          dis_pop_segreg( R_GS, sz ); break;
   15087 
   15088       case 0xA0: /* PUSH %FS */
   15089          dis_push_segreg( R_FS, sz ); break;
   15090       case 0xA8: /* PUSH %GS */
   15091          dis_push_segreg( R_GS, sz ); break;
   15092 
   15093       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   15094       case 0x90:
   15095       case 0x91:
   15096       case 0x92: /* set-Bb/set-NAEb (jump below) */
   15097       case 0x93: /* set-NBb/set-AEb (jump not below) */
   15098       case 0x94: /* set-Zb/set-Eb (jump zero) */
   15099       case 0x95: /* set-NZb/set-NEb (jump not zero) */
   15100       case 0x96: /* set-BEb/set-NAb (jump below or equal) */
   15101       case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
   15102       case 0x98: /* set-Sb (jump negative) */
   15103       case 0x99: /* set-Sb (jump not negative) */
   15104       case 0x9A: /* set-P (jump parity even) */
   15105       case 0x9B: /* set-NP (jump parity odd) */
   15106       case 0x9C: /* set-Lb/set-NGEb (jump less) */
   15107       case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
   15108       case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
   15109       case 0x9F: /* set-Gb/set-NLEb (jump greater) */
   15110          t1 = newTemp(Ity_I8);
   15111          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
   15112          modrm = getIByte(delta);
   15113          if (epartIsReg(modrm)) {
   15114             delta++;
   15115             putIReg(1, eregOfRM(modrm), mkexpr(t1));
   15116             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
   15117                               nameIReg(1,eregOfRM(modrm)));
   15118          } else {
   15119            addr = disAMode ( &alen, sorb, delta, dis_buf );
   15120            delta += alen;
   15121            storeLE( mkexpr(addr), mkexpr(t1) );
   15122            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
   15123          }
   15124          break;
   15125 
   15126       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   15127 
   15128       case 0xA4: /* SHLDv imm8,Gv,Ev */
   15129          modrm = getIByte(delta);
   15130          d32   = delta + lengthAMode(delta);
   15131          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   15132          delta = dis_SHLRD_Gv_Ev (
   15133                   sorb, delta, modrm, sz,
   15134                   mkU8(getIByte(d32)), True, /* literal */
   15135                   dis_buf, True );
   15136          break;
   15137       case 0xA5: /* SHLDv %cl,Gv,Ev */
   15138          modrm = getIByte(delta);
   15139          delta = dis_SHLRD_Gv_Ev (
   15140                     sorb, delta, modrm, sz,
   15141                     getIReg(1,R_ECX), False, /* not literal */
   15142                     "%cl", True );
   15143          break;
   15144 
   15145       case 0xAC: /* SHRDv imm8,Gv,Ev */
   15146          modrm = getIByte(delta);
   15147          d32   = delta + lengthAMode(delta);
   15148          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   15149          delta = dis_SHLRD_Gv_Ev (
   15150                     sorb, delta, modrm, sz,
   15151                     mkU8(getIByte(d32)), True, /* literal */
   15152                     dis_buf, False );
   15153          break;
   15154       case 0xAD: /* SHRDv %cl,Gv,Ev */
   15155          modrm = getIByte(delta);
   15156          delta = dis_SHLRD_Gv_Ev (
   15157                     sorb, delta, modrm, sz,
   15158                     getIReg(1,R_ECX), False, /* not literal */
   15159                     "%cl", False );
   15160          break;
   15161 
   15162       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
   15163 
   15164       case 0x34:
   15165          /* Simple implementation needing a long explanation.
   15166 
   15167             sysenter is a kind of syscall entry.  The key thing here
   15168             is that the return address is not known -- that is
   15169             something that is beyond Vex's knowledge.  So this IR
   15170             forces a return to the scheduler, which can do what it
   15171             likes to simulate the sysenter, but it MUST set this
   15172             thread's guest_EIP field with the continuation address
   15173             before resuming execution.  If that doesn't happen, the
   15174             thread will jump to address zero, which is probably
   15175             fatal.
   15176          */
   15177 
   15178          /* Note where we are, so we can back up the guest to this
   15179             point if the syscall needs to be restarted. */
   15180          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   15181                            mkU32(guest_EIP_curr_instr) ) );
   15182          jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
   15183          vassert(dres.whatNext == Dis_StopHere);
   15184          DIP("sysenter");
   15185          break;
   15186 
   15187       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   15188 
   15189       case 0xC0: { /* XADD Gb,Eb */
   15190          Bool decodeOK;
   15191          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
   15192          if (!decodeOK) goto decode_failure;
   15193          break;
   15194       }
   15195       case 0xC1: { /* XADD Gv,Ev */
   15196          Bool decodeOK;
   15197          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
   15198          if (!decodeOK) goto decode_failure;
   15199          break;
   15200       }
   15201 
   15202       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   15203 
   15204       case 0x71:
   15205       case 0x72:
   15206       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   15207 
   15208       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   15209       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   15210       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   15211       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   15212 
   15213       case 0xFC:
   15214       case 0xFD:
   15215       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   15216 
   15217       case 0xEC:
   15218       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15219 
   15220       case 0xDC:
   15221       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15222 
   15223       case 0xF8:
   15224       case 0xF9:
   15225       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   15226 
   15227       case 0xE8:
   15228       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15229 
   15230       case 0xD8:
   15231       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15232 
   15233       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   15234       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   15235 
   15236       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   15237 
   15238       case 0x74:
   15239       case 0x75:
   15240       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   15241 
   15242       case 0x64:
   15243       case 0x65:
   15244       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   15245 
   15246       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   15247       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15248       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15249 
   15250       case 0x68:
   15251       case 0x69:
   15252       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   15253 
   15254       case 0x60:
   15255       case 0x61:
   15256       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15257 
   15258       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   15259       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   15260       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   15261       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   15262 
   15263       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15264       case 0xF2:
   15265       case 0xF3:
   15266 
   15267       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15268       case 0xD2:
   15269       case 0xD3:
   15270 
   15271       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   15272       case 0xE2:
   15273       {
   15274          Int  delta0    = delta-1;
   15275          Bool decode_OK = False;
   15276 
   15277          /* If sz==2 this is SSE, and we assume sse idec has
   15278             already spotted those cases by now. */
   15279          if (sz != 4)
   15280             goto decode_failure;
   15281 
   15282          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
   15283          if (!decode_OK) {
   15284             delta = delta0;
   15285             goto decode_failure;
   15286          }
   15287          break;
   15288       }
   15289 
   15290       case 0x0E: /* FEMMS */
   15291       case 0x77: /* EMMS */
   15292          if (sz != 4)
   15293             goto decode_failure;
   15294          do_EMMS_preamble();
   15295          DIP("{f}emms\n");
   15296          break;
   15297 
   15298       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   15299       case 0x01: /* 0F 01 /0 -- SGDT */
   15300                  /* 0F 01 /1 -- SIDT */
   15301       {
   15302           /* This is really revolting, but ... since each processor
   15303              (core) only has one IDT and one GDT, just let the guest
   15304              see it (pass-through semantics).  I can't see any way to
   15305              construct a faked-up value, so don't bother to try. */
   15306          modrm = getUChar(delta);
   15307          addr = disAMode ( &alen, sorb, delta, dis_buf );
   15308          delta += alen;
   15309          if (epartIsReg(modrm)) goto decode_failure;
   15310          if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
   15311             goto decode_failure;
   15312          switch (gregOfRM(modrm)) {
   15313             case 0: DIP("sgdt %s\n", dis_buf); break;
   15314             case 1: DIP("sidt %s\n", dis_buf); break;
   15315             default: vassert(0); /*NOTREACHED*/
   15316          }
   15317 
   15318          IRDirty* d = unsafeIRDirty_0_N (
   15319                           0/*regparms*/,
   15320                           "x86g_dirtyhelper_SxDT",
   15321                           &x86g_dirtyhelper_SxDT,
   15322                           mkIRExprVec_2( mkexpr(addr),
   15323                                          mkU32(gregOfRM(modrm)) )
   15324                       );
   15325          /* declare we're writing memory */
   15326          d->mFx   = Ifx_Write;
   15327          d->mAddr = mkexpr(addr);
   15328          d->mSize = 6;
   15329          stmt( IRStmt_Dirty(d) );
   15330          break;
   15331       }
   15332 
   15333       case 0x05: /* AMD's syscall */
   15334          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   15335               mkU32(guest_EIP_curr_instr) ) );
   15336          jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta);
   15337          vassert(dres.whatNext == Dis_StopHere);
   15338          DIP("syscall\n");
   15339          break;
   15340 
   15341       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   15342 
   15343       default:
   15344          goto decode_failure;
   15345    } /* switch (opc) for the 2-byte opcodes */
   15346    goto decode_success;
   15347    } /* case 0x0F: of primary opcode */
   15348 
   15349    /* ------------------------ ??? ------------------------ */
   15350 
   15351   default:
   15352   decode_failure:
   15353    /* All decode failures end up here. */
   15354    if (sigill_diag) {
   15355       vex_printf("vex x86->IR: unhandled instruction bytes: "
   15356                  "0x%x 0x%x 0x%x 0x%x\n",
   15357                  (Int)getIByte(delta_start+0),
   15358                  (Int)getIByte(delta_start+1),
   15359                  (Int)getIByte(delta_start+2),
   15360                  (Int)getIByte(delta_start+3) );
   15361    }
   15362 
   15363    /* Tell the dispatcher that this insn cannot be decoded, and so has
   15364       not been executed, and (is currently) the next to be executed.
   15365       EIP should be up-to-date since it is made so at the start of each
   15366       insn, but nevertheless be paranoid and update it again right
   15367       now. */
   15368    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
   15369    jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
   15370    vassert(dres.whatNext == Dis_StopHere);
   15371    dres.len = 0;
   15372    /* We also need to say that a CAS is not expected now, regardless
   15373       of what it might have been set to at the start of the function,
   15374       since the IR that we've emitted just above (to synthesise a
   15375       SIGILL) does not involve any CAS, and presumably no other IR has
   15376       been emitted for this (non-decoded) insn. */
   15377    *expect_CAS = False;
   15378    return dres;
   15379 
   15380    } /* switch (opc) for the main (primary) opcode switch. */
   15381 
   15382   decode_success:
   15383    /* All decode successes end up here. */
   15384    switch (dres.whatNext) {
   15385       case Dis_Continue:
   15386          stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   15387          break;
   15388       case Dis_ResteerU:
   15389       case Dis_ResteerC:
   15390          stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
   15391          break;
   15392       case Dis_StopHere:
   15393          break;
   15394       default:
   15395          vassert(0);
   15396    }
   15397 
   15398    DIP("\n");
   15399    dres.len = delta - delta_start;
   15400    return dres;
   15401 }
   15402 
   15403 #undef DIP
   15404 #undef DIS
   15405 
   15406 
   15407 /*------------------------------------------------------------*/
   15408 /*--- Top-level fn                                         ---*/
   15409 /*------------------------------------------------------------*/
   15410 
   15411 /* Disassemble a single instruction into IR.  The instruction
   15412    is located in host memory at &guest_code[delta]. */
   15413 
   15414 DisResult disInstr_X86 ( IRSB*        irsb_IN,
   15415                          Bool         (*resteerOkFn) ( void*, Addr64 ),
   15416                          Bool         resteerCisOk,
   15417                          void*        callback_opaque,
   15418                          UChar*       guest_code_IN,
   15419                          Long         delta,
   15420                          Addr64       guest_IP,
   15421                          VexArch      guest_arch,
   15422                          VexArchInfo* archinfo,
   15423                          VexAbiInfo*  abiinfo,
   15424                          Bool         host_bigendian_IN,
   15425                          Bool         sigill_diag_IN )
   15426 {
   15427    Int       i, x1, x2;
   15428    Bool      expect_CAS, has_CAS;
   15429    DisResult dres;
   15430 
   15431    /* Set globals (see top of this file) */
   15432    vassert(guest_arch == VexArchX86);
   15433    guest_code           = guest_code_IN;
   15434    irsb                 = irsb_IN;
   15435    host_is_bigendian    = host_bigendian_IN;
   15436    guest_EIP_curr_instr = (Addr32)guest_IP;
   15437    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
   15438 
   15439    x1 = irsb_IN->stmts_used;
   15440    expect_CAS = False;
   15441    dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15442                              resteerCisOk,
   15443                              callback_opaque,
   15444                              delta, archinfo, abiinfo, sigill_diag_IN );
   15445    x2 = irsb_IN->stmts_used;
   15446    vassert(x2 >= x1);
   15447 
   15448    /* See comment at the top of disInstr_X86_WRK for meaning of
   15449       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   15450       IRCAS as directed by the returned expect_CAS value. */
   15451    has_CAS = False;
   15452    for (i = x1; i < x2; i++) {
   15453       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   15454          has_CAS = True;
   15455    }
   15456 
   15457    if (expect_CAS != has_CAS) {
   15458       /* inconsistency detected.  re-disassemble the instruction so as
   15459          to generate a useful error message; then assert. */
   15460       vex_traceflags |= VEX_TRACE_FE;
   15461       dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15462                                 resteerCisOk,
   15463                                 callback_opaque,
   15464                                 delta, archinfo, abiinfo, sigill_diag_IN );
   15465       for (i = x1; i < x2; i++) {
   15466          vex_printf("\t\t");
   15467          ppIRStmt(irsb_IN->stmts[i]);
   15468          vex_printf("\n");
   15469       }
   15470       /* Failure of this assertion is serious and denotes a bug in
   15471          disInstr. */
   15472       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
   15473    }
   15474 
   15475    return dres;
   15476 }
   15477 
   15478 
   15479 /*--------------------------------------------------------------------*/
   15480 /*--- end                                         guest_x86_toIR.c ---*/
   15481 /*--------------------------------------------------------------------*/
   15482