      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                               guest_x86_helpers.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex_emnote.h"
     38 #include "libvex_guest_x86.h"
     39 #include "libvex_ir.h"
     40 #include "libvex.h"
     41 
     42 #include "main_util.h"
     43 #include "main_globals.h"
     44 #include "guest_generic_bb_to_IR.h"
     45 #include "guest_x86_defs.h"
     46 #include "guest_generic_x87.h"
     47 
     48 
     49 /* This file contains helper functions for x86 guest code.
     50    Calls to these functions are generated by the back end.
     51    These calls are of course in the host machine code and
     52    this file will be compiled to host machine code, so that
     53    all makes sense.
     54 
     55    Only change the signatures of these helper functions very
     56    carefully.  If you change the signature here, you'll have to change
     57    the parameters passed to it in the IR calls constructed by
     58    guest-x86/toIR.c.
     59 
     60    The convention used is that all functions called from generated
     61    code are named x86g_<something>, and any function whose name lacks
     62    that prefix is not called from generated code.  Note that some
     63    LibVEX_* functions can however be called by VEX's client, but that
     64    is not the same as calling them from VEX-generated code.
     65 */
     66 
     67 
     68 /* Set to 1 to get detailed profiling info about use of the flag
     69    machinery. */
     70 #define PROFILE_EFLAGS 0
     71 
     72 
     73 /*---------------------------------------------------------------*/
     74 /*--- %eflags run-time helpers.                               ---*/
     75 /*---------------------------------------------------------------*/
     76 
/* Lookup table for the x86 parity flag (PF): entry i is
   X86G_CC_MASK_P when the byte value i contains an even number of
   set bits, and 0 otherwise.  Indexed by the low 8 bits of a
   result, e.g. parity_table[(UChar)res]. */
static const UChar parity_table[256] = {
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
};
    111 
    112 /* generalised left-shifter */
    113 inline static Int lshift ( Int x, Int n )
    114 {
    115    if (n >= 0)
    116       return x << n;
    117    else
    118       return x >> (-n);
    119 }
    120 
/* Identity on ULong.  Used below as the no-op "narrowing" function
   for the 32-bit cases of ACTIONS_UMUL/ACTIONS_SMUL, where the
   double-width product type is already ULong/Long. */
static inline ULong idULong ( ULong x )
{
   return x;
}
    126 
    127 
/* Common prologue for every ACTIONS_* body below.  Binds the formal
   thunk parameters (cc_dep1_formal etc.) to local names CC_DEP1/
   CC_DEP2/CC_NDEP, and computes the width-dependent DATA_MASK and
   SIGN_MASK for __data_bits in {8,16,32}. */
#define PREAMBLE(__data_bits)					\
   /* const */ UInt DATA_MASK 					\
      = __data_bits==8 ? 0xFF 					\
                       : (__data_bits==16 ? 0xFFFF 		\
                                          : 0xFFFFFFFF); 	\
   /* const */ UInt SIGN_MASK = 1 << (__data_bits - 1);		\
   /* const */ UInt CC_DEP1 = cc_dep1_formal;			\
   /* const */ UInt CC_DEP2 = cc_dep2_formal;			\
   /* const */ UInt CC_NDEP = cc_ndep_formal;			\
   /* Four bogus assignments, which hopefully gcc can     */	\
   /* optimise away, and which stop it complaining about  */	\
   /* unused variables.                                   */	\
   SIGN_MASK = SIGN_MASK;					\
   DATA_MASK = DATA_MASK;					\
   CC_DEP2 = CC_DEP2;						\
   CC_NDEP = CC_NDEP;
    144 
    145 
    146 /*-------------------------------------------------------------*/
    147 
/* ADD: CC_DEP1 = left arg, CC_DEP2 = right arg.  The result is
   recomputed here and all six OSZACP bits are derived from it:
   cf from unsigned wraparound, of from same-sign operands giving
   a differently-signed result. */
#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     Int argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & X86G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    166 
    167 /*-------------------------------------------------------------*/
    168 
/* SUB: CC_DEP1 = left arg, CC_DEP2 = right arg; res = argL - argR.
   cf is the unsigned borrow (argL < argR at the operand width),
   of is set when the operands have differing signs and the result's
   sign differs from argL's. */
#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     Int argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & X86G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    187 
    188 /*-------------------------------------------------------------*/
    189 
/* ADC: add with carry-in.  CC_NDEP carries the old flags, of which
   only the C bit is used.  NOTE(review): CC_DEP2 appears to arrive
   xor'd with the old carry (the xor here recovers argR) -- confirm
   against the thunk construction in toIR.c.  The carry-out test
   differs depending on oldC: with a carry-in, res == argL also
   indicates wraparound, hence <= rather than <. */
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     Int argL, argR, oldC, res;		       			\
     oldC = CC_NDEP & X86G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & X86G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    212 
    213 /*-------------------------------------------------------------*/
    214 
/* SBB: subtract with borrow-in.  CC_NDEP carries the old flags
   (only the C bit is used).  NOTE(review): as with ADC, CC_DEP2
   appears to arrive xor'd with the old carry -- confirm against
   toIR.c.  With a borrow-in, argL == argR also borrows, hence the
   <= test when oldC is set. */
#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     Int argL, argR, oldC, res;		       			\
     oldC = CC_NDEP & X86G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & X86G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    237 
    238 /*-------------------------------------------------------------*/
    239 
/* LOGIC (and/or/xor/...): CC_DEP1 = result.  cf, af and of are
   hardwired to zero; pf, zf and sf are derived from the result. */
#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    253 
    254 /*-------------------------------------------------------------*/
    255 
/* INC: CC_DEP1 = result; CC_NDEP = old flags.  x86 INC leaves the
   carry flag unchanged, so cf is taken from the old flags.  argL is
   reconstructed as res - 1.  of fires exactly when the result is
   the most negative value at this width (overflow 0x7F.. -> 0x80..). */
#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     Int argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & X86G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    273 
    274 /*-------------------------------------------------------------*/
    275 
/* DEC: CC_DEP1 = result; CC_NDEP = old flags.  Like INC, DEC leaves
   the carry flag unchanged (taken from CC_NDEP).  of fires exactly
   when the result is the most positive value at this width
   (overflow 0x80.. -> 0x7F..). */
#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     Int argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & X86G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((UInt)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    294 
    295 /*-------------------------------------------------------------*/
    296 
/* SHL: CC_DEP1 = result.  NOTE(review): cf is taken from the top
   bit of CC_DEP2, which therefore appears to hold the value shifted
   one place less than the result -- confirm against toIR.c. */
#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & X86G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & X86G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    312 
    313 /*-------------------------------------------------------------*/
    314 
/* SHR: CC_DEP1 = result.  NOTE(review): cf is the low bit of
   CC_DEP2, which therefore appears to hold the value shifted one
   place less than the result -- confirm against toIR.c. */
#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { Int cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & X86G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
    330 
    331 /*-------------------------------------------------------------*/
    332 
/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
/* Rotates only update the C and O bits; all other flag bits are
   passed through unchanged from the old flags in CC_NDEP. */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Int fl 							\
        = (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C))	\
          | (X86G_CC_MASK_C & CC_DEP1)				\
          | (X86G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}
    347 
    348 /*-------------------------------------------------------------*/
    349 
    350 /* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
    351 /* DEP1 = result, NDEP = old flags */
    352 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
    353 {								\
    354    PREAMBLE(DATA_BITS);						\
    355    { Int fl 							\
    356         = (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C))	\
    357           | (X86G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
    358           | (X86G_CC_MASK_O & (lshift(CC_DEP1, 			\
    359                                       11-(DATA_BITS-1)) 	\
    360                      ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
    361      return fl;							\
    362    }								\
    363 }
    364 
    365 /*-------------------------------------------------------------*/
    366 
/* Unsigned multiply: CC_DEP1, CC_DEP2 = the two operands.  The
   product is computed twice: once at the operand width (lo) and
   once at double width (rr) so the high half (hi) can be
   extracted.  cf and of are set iff the high half is nonzero,
   i.e. the product does not fit in the operand width. */
#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Int cf, pf, af, zf, sf, of;                                \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}
    390 
    391 /*-------------------------------------------------------------*/
    392 
/* Signed multiply: CC_DEP1, CC_DEP2 = the two operands.  Same
   double-width scheme as ACTIONS_UMUL, but cf/of are set iff the
   high half is not the sign-extension of the low half, i.e. the
   signed product does not fit in the operand width. */
#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Int cf, pf, af, zf, sf, of;                                \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_STYPE)CC_DEP1)                      \
                     * ((DATA_STYPE)CC_DEP2) );                 \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}
    416 
    417 
    418 #if PROFILE_EFLAGS
    419 
/* True once initCounts() has zeroed the tables below. */
static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[X86G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[X86G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[X86G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

/* True once every 2^22 total calls; rate-limits showCounts(). */
#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
    434 
    435 
    436 static void showCounts ( void )
    437 {
    438    Int op, co;
    439    HChar ch;
    440    vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
    441               n_calc_all, n_calc_cond, n_calc_c);
    442 
    443    vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
    444               "    S   NS    P   NP    L   NL   LE  NLE\n");
    445    vex_printf("     -----------------------------------------------------"
    446               "----------------------------------------\n");
    447    for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
    448 
    449       ch = ' ';
    450       if (op > 0 && (op-1) % 3 == 0)
    451          ch = 'B';
    452       if (op > 0 && (op-1) % 3 == 1)
    453          ch = 'W';
    454       if (op > 0 && (op-1) % 3 == 2)
    455          ch = 'L';
    456 
    457       vex_printf("%2d%c: ", op, ch);
    458       vex_printf("%6u ", tabc_slow[op]);
    459       vex_printf("%6u ", tabc_fast[op]);
    460       for (co = 0; co < 16; co++) {
    461          Int n = tab_cond[op][co];
    462          if (n >= 1000) {
    463             vex_printf(" %3dK", n / 1000);
    464          } else
    465          if (n >= 0) {
    466             vex_printf(" %3d ", n );
    467          } else {
    468             vex_printf("     ");
    469          }
    470       }
    471       vex_printf("\n");
    472    }
    473    vex_printf("\n");
    474 }
    475 
    476 static void initCounts ( void )
    477 {
    478    Int op, co;
    479    initted = True;
    480    for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
    481       tabc_fast[op] = tabc_slow[op] = 0;
    482       for (co = 0; co < 16; co++)
    483          tab_cond[op][co] = 0;
    484    }
    485 }
    486 
    487 #endif /* PROFILE_EFLAGS */
    488 
    489 
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code.

   (cc_op, cc_dep1, cc_dep2, cc_ndep) is the lazily-evaluated flags
   thunk: cc_op identifies the operation that last set the flags,
   and the ACTIONS_* macros recompute the OSZACP bits from the
   recorded dependencies.  Every ACTIONS_* invocation expands to a
   block ending in 'return', so control never falls out of the
   switch; unknown cc_op values panic. */
static
UInt x86g_calculate_eflags_all_WRK ( UInt cc_op,
                                     UInt cc_dep1_formal,
                                     UInt cc_dep2_formal,
                                     UInt cc_ndep_formal )
{
   switch (cc_op) {
      /* COPY: cc_dep1 already holds the flag bits verbatim; just
         mask down to the six architected status bits. */
      case X86G_CC_OP_COPY:
         return cc_dep1_formal
                & (X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z
                   | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P);

      case X86G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case X86G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case X86G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );

      case X86G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case X86G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case X86G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );

      case X86G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case X86G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case X86G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );

      case X86G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case X86G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case X86G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );

      case X86G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case X86G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case X86G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );

      case X86G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case X86G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case X86G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );

      case X86G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case X86G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case X86G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );

      case X86G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case X86G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case X86G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );

      case X86G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case X86G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case X86G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );

      case X86G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case X86G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case X86G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );

      case X86G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case X86G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case X86G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );

      /* Multiplies pass a narrowing function and the double-width
         type; the 32-bit cases use idULong as a no-op narrowing. */
      case X86G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                UShort, toUShort );
      case X86G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                UInt,   toUInt );
      case X86G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                ULong,  idULong );

      case X86G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                Short,  toUShort );
      case X86G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                Int,    toUInt   );
      case X86G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                Long,   idULong );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("x86g_calculate_eflags_all_WRK(X86)"
                    "( %u, 0x%x, 0x%x, 0x%x )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("x86g_calculate_eflags_all_WRK(X86)");
   }
}
    571 
    572 
    573 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
    574 /* Calculate all the 6 flags from the supplied thunk parameters. */
    575 UInt x86g_calculate_eflags_all ( UInt cc_op,
    576                                  UInt cc_dep1,
    577                                  UInt cc_dep2,
    578                                  UInt cc_ndep )
    579 {
    580 #  if PROFILE_EFLAGS
    581    if (!initted) initCounts();
    582    n_calc_all++;
    583    if (SHOW_COUNTS_NOW) showCounts();
    584 #  endif
    585    return
    586       x86g_calculate_eflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
    587 }
    588 
    589 
    590 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
    591 /* Calculate just the carry flag from the supplied thunk parameters. */
    592 VEX_REGPARM(3)
    593 UInt x86g_calculate_eflags_c ( UInt cc_op,
    594                                UInt cc_dep1,
    595                                UInt cc_dep2,
    596                                UInt cc_ndep )
    597 {
    598 #  if PROFILE_EFLAGS
    599    if (!initted) initCounts();
    600    n_calc_c++;
    601    tabc_fast[cc_op]++;
    602    if (SHOW_COUNTS_NOW) showCounts();
    603 #  endif
    604 
    605    /* Fast-case some common ones. */
    606    switch (cc_op) {
    607       case X86G_CC_OP_LOGICL:
    608       case X86G_CC_OP_LOGICW:
    609       case X86G_CC_OP_LOGICB:
    610          return 0;
    611       case X86G_CC_OP_SUBL:
    612          return ((UInt)cc_dep1) < ((UInt)cc_dep2)
    613                    ? X86G_CC_MASK_C : 0;
    614       case X86G_CC_OP_SUBW:
    615          return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
    616                    ? X86G_CC_MASK_C : 0;
    617       case X86G_CC_OP_SUBB:
    618          return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
    619                    ? X86G_CC_MASK_C : 0;
    620       case X86G_CC_OP_INCL:
    621       case X86G_CC_OP_DECL:
    622          return cc_ndep & X86G_CC_MASK_C;
    623       default:
    624          break;
    625    }
    626 
    627 #  if PROFILE_EFLAGS
    628    tabc_fast[cc_op]--;
    629    tabc_slow[cc_op]++;
    630 #  endif
    631 
    632    return x86g_calculate_eflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
    633           & X86G_CC_MASK_C;
    634 }
    635 
    636 
    637 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
    638 /* returns 1 or 0 */
    639 UInt x86g_calculate_condition ( UInt/*X86Condcode*/ cond,
    640                                 UInt cc_op,
    641                                 UInt cc_dep1,
    642                                 UInt cc_dep2,
    643                                 UInt cc_ndep )
    644 {
    645    UInt eflags = x86g_calculate_eflags_all_WRK(cc_op, cc_dep1,
    646                                                cc_dep2, cc_ndep);
    647    UInt of,sf,zf,cf,pf;
    648    UInt inv = cond & 1;
    649 
    650 #  if PROFILE_EFLAGS
    651    if (!initted) initCounts();
    652    tab_cond[cc_op][cond]++;
    653    n_calc_cond++;
    654    if (SHOW_COUNTS_NOW) showCounts();
    655 #  endif
    656 
    657    switch (cond) {
    658       case X86CondNO:
    659       case X86CondO: /* OF == 1 */
    660          of = eflags >> X86G_CC_SHIFT_O;
    661          return 1 & (inv ^ of);
    662 
    663       case X86CondNZ:
    664       case X86CondZ: /* ZF == 1 */
    665          zf = eflags >> X86G_CC_SHIFT_Z;
    666          return 1 & (inv ^ zf);
    667 
    668       case X86CondNB:
    669       case X86CondB: /* CF == 1 */
    670          cf = eflags >> X86G_CC_SHIFT_C;
    671          return 1 & (inv ^ cf);
    672          break;
    673 
    674       case X86CondNBE:
    675       case X86CondBE: /* (CF or ZF) == 1 */
    676          cf = eflags >> X86G_CC_SHIFT_C;
    677          zf = eflags >> X86G_CC_SHIFT_Z;
    678          return 1 & (inv ^ (cf | zf));
    679          break;
    680 
    681       case X86CondNS:
    682       case X86CondS: /* SF == 1 */
    683          sf = eflags >> X86G_CC_SHIFT_S;
    684          return 1 & (inv ^ sf);
    685 
    686       case X86CondNP:
    687       case X86CondP: /* PF == 1 */
    688          pf = eflags >> X86G_CC_SHIFT_P;
    689          return 1 & (inv ^ pf);
    690 
    691       case X86CondNL:
    692       case X86CondL: /* (SF xor OF) == 1 */
    693          sf = eflags >> X86G_CC_SHIFT_S;
    694          of = eflags >> X86G_CC_SHIFT_O;
    695          return 1 & (inv ^ (sf ^ of));
    696          break;
    697 
    698       case X86CondNLE:
    699       case X86CondLE: /* ((SF xor OF) or ZF)  == 1 */
    700          sf = eflags >> X86G_CC_SHIFT_S;
    701          of = eflags >> X86G_CC_SHIFT_O;
    702          zf = eflags >> X86G_CC_SHIFT_Z;
    703          return 1 & (inv ^ ((sf ^ of) | zf));
    704          break;
    705 
    706       default:
    707          /* shouldn't really make these calls from generated code */
    708          vex_printf("x86g_calculate_condition( %u, %u, 0x%x, 0x%x, 0x%x )\n",
    709                     cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
    710          vpanic("x86g_calculate_condition");
    711    }
    712 }
    713 
    714 
    715 /* VISIBLE TO LIBVEX CLIENT */
    716 UInt LibVEX_GuestX86_get_eflags ( /*IN*/const VexGuestX86State* vex_state )
    717 {
    718    UInt eflags = x86g_calculate_eflags_all_WRK(
    719                     vex_state->guest_CC_OP,
    720                     vex_state->guest_CC_DEP1,
    721                     vex_state->guest_CC_DEP2,
    722                     vex_state->guest_CC_NDEP
    723                  );
    724    UInt dflag = vex_state->guest_DFLAG;
    725    vassert(dflag == 1 || dflag == 0xFFFFFFFF);
    726    if (dflag == 0xFFFFFFFF)
    727       eflags |= (1<<10);
    728    if (vex_state->guest_IDFLAG == 1)
    729       eflags |= (1<<21);
    730    if (vex_state->guest_ACFLAG == 1)
    731       eflags |= (1<<18);
    732 
    733    return eflags;
    734 }
    735 
    736 /* VISIBLE TO LIBVEX CLIENT */
    737 void
    738 LibVEX_GuestX86_put_eflag_c ( UInt new_carry_flag,
    739                               /*MOD*/VexGuestX86State* vex_state )
    740 {
    741    UInt oszacp = x86g_calculate_eflags_all_WRK(
    742                     vex_state->guest_CC_OP,
    743                     vex_state->guest_CC_DEP1,
    744                     vex_state->guest_CC_DEP2,
    745                     vex_state->guest_CC_NDEP
    746                  );
    747    if (new_carry_flag & 1) {
    748       oszacp |= X86G_CC_MASK_C;
    749    } else {
    750       oszacp &= ~X86G_CC_MASK_C;
    751    }
    752    vex_state->guest_CC_OP   = X86G_CC_OP_COPY;
    753    vex_state->guest_CC_DEP1 = oszacp;
    754    vex_state->guest_CC_DEP2 = 0;
    755    vex_state->guest_CC_NDEP = 0;
    756 }
    757 
    758 
    759 /*---------------------------------------------------------------*/
    760 /*--- %eflags translation-time function specialisers.         ---*/
    761 /*--- These help iropt specialise calls the above run-time    ---*/
    762 /*--- %eflags functions.                                      ---*/
    763 /*---------------------------------------------------------------*/
    764 
    765 /* Used by the optimiser to try specialisations.  Returns an
    766    equivalent expression, or NULL if none. */
    767 
    768 static inline Bool isU32 ( IRExpr* e, UInt n )
    769 {
    770    return
    771       toBool( e->tag == Iex_Const
    772               && e->Iex.Const.con->tag == Ico_U32
    773               && e->Iex.Const.con->Ico.U32 == n );
    774 }
    775 
IRExpr* guest_x86_spechelper ( const HChar* function_name,
                               IRExpr** args,
                               IRStmt** precedingStmts,
                               Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   /* NOTE: precedingStmts/n_precedingStmts are not consulted by the
      x86 specialiser. */
   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "x86g_calculate_condition" --------- */

   if (vex_streq(function_name, "x86g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_ADDL) && isU32(cond, X86CondZ)) {
         /* long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32,
                           binop(Iop_Add32, cc_dep1, cc_dep2),
                           mkU32(0)));
      }

      /*---------------- SUBL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE32, cc_dep1, cc_dep2));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal)
            --> test !(dst <s src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLT32S, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNLE)) {
         /* long sub/cmp, then NLE (signed not less than or equal)
            --> test dst >s src
            --> test !(dst <=s src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLE32S, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLE32U, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test !(dst <u src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLT32U, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32S,
                           binop(Iop_Sub32, cc_dep1, cc_dep2),
                           mkU32(0)));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNS)) {
         /* long sub/cmp, then NS (not negative) --> test !(dst-src <s 0) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLT32S,
                                 binop(Iop_Sub32, cc_dep1, cc_dep2),
                                 mkU32(0))),
                      mkU32(1));
      }

      /*---------------- SUBW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ16,
                           unop(Iop_32to16,cc_dep1),
                           unop(Iop_32to16,cc_dep2)));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE16,
                           unop(Iop_32to16,cc_dep1),
                           unop(Iop_32to16,cc_dep2)));
      }

      /*---------------- SUBB ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ8,
                           unop(Iop_32to8,cc_dep1),
                           unop(Iop_32to8,cc_dep2)));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE8,
                           unop(Iop_32to8,cc_dep1),
                           unop(Iop_32to8,cc_dep2)));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
         /* byte sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U,
                           binop(Iop_And32,cc_dep2,mkU32(0xFF)),
                           binop(Iop_And32,cc_dep1,mkU32(0xFF))));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondS)
                                        && isU32(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (UInt)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU32(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U32s. */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(7)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNS)
                                        && isU32(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (UInt) !dst[7]
         */
         return binop(Iop_Xor32,
                      binop(Iop_And32,
                            binop(Iop_Shr32,cc_dep1,mkU8(7)),
                            mkU32(1)),
                mkU32(1));
      }

      /*---------------- LOGICL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto32,binop(Iop_CmpNE32, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SZ ^ OF) | ZF, but
            OF is zero, so this reduces to SZ | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto32,binop(Iop_CmpLE32S, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondBE)) {
         /* long and/or/xor, then BE
            LOGIC sets ZF according to the result and makes CF be zero.
            BE computes (CF | ZF), but CF is zero, so this reduces ZF
            -- which will be 1 iff the result is zero.  Hence ...
         */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
         /* see comment below for (LOGICB, CondS) */
         /* long and/or/xor, then S --> (UInt)result[31] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(31)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondNS)) {
         /* see comment below for (LOGICB, CondNS) */
         /* long and/or/xor, then NS --> (UInt) ~ result[31] */
         return binop(Iop_Xor32,
                binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(31)),
                      mkU32(1)),
                mkU32(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(0xFFFF)),
                                        mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondS)) {
         /* see comment below for (LOGICB, CondS) */
         /* word and/or/xor, then S --> (UInt)result[15] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(15)),
                      mkU32(1));
      }

      /*---------------- LOGICB ----------------*/

      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(255)),
                                        mkU32(0)));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         /* b9ac9:       84 c0                   test   %al,%al
            b9acb:       75 0d                   jne    b9ada */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE32, binop(Iop_And32,cc_dep1,mkU32(255)),
                                        mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (UInt)result[7] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(7)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondNS)) {
         /* ditto, for negation-of-S. */
         /* byte and/or/xor, then NS --> (UInt) ~ result[7] */
         return binop(Iop_Xor32,
                binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(7)),
                      mkU32(1)),
                mkU32(1));
      }

      /*---------------- DECL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondS)) {
         /* dec L, then S --> compare DST <s 0 */
         return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_DECW) && isU32(cond, X86CondZ)) {
         /* dec W, then Z --> test dst == 0 */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32,
                           binop(Iop_Shl32,cc_dep1,mkU8(16)),
                           mkU32(0)));
      }

      /*---------------- INCW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_INCW) && isU32(cond, X86CondZ)) {
         /* This rewrite helps memcheck on 'incw %ax ; je ...'. */
         /* inc W, then Z --> test dst == 0 */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32,
                           binop(Iop_Shl32,cc_dep1,mkU8(16)),
                           mkU32(0)));
      }

      /*---------------- SHRL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SHRL) && isU32(cond, X86CondZ)) {
         /* SHRL, then Z --> test dep1 == 0 */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }

      /*---------------- COPY ----------------*/
      /* This can happen, as a result of x87 FP compares: "fcom ... ;
         fnstsw %ax ; sahf ; jbe" for example. */

      if (isU32(cc_op, X86G_CC_OP_COPY) &&
          (isU32(cond, X86CondBE) || isU32(cond, X86CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test
            (C or Z) == 1. */
         /* COPY, then NBE --> extract C and Z from dep1, and test
            (C or Z) == 0. */
         UInt nnn = isU32(cond, X86CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(
                        Iop_Or32,
                        binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                        binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z))
                     ),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      if (isU32(cc_op, X86G_CC_OP_COPY)
          && (isU32(cond, X86CondB) || isU32(cond, X86CondNB))) {
         /* COPY, then B --> extract C from dep1, and test (C == 1). */
         /* COPY, then NB --> extract C from dep1, and test (C == 0). */
         UInt nnn = isU32(cond, X86CondB) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      if (isU32(cc_op, X86G_CC_OP_COPY)
          && (isU32(cond, X86CondZ) || isU32(cond, X86CondNZ))) {
         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
         UInt nnn = isU32(cond, X86CondZ) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z)),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      if (isU32(cc_op, X86G_CC_OP_COPY)
          && (isU32(cond, X86CondP) || isU32(cond, X86CondNP))) {
         /* COPY, then P --> extract P from dep1, and test (P == 1). */
         /* COPY, then NP --> extract P from dep1, and test (P == 0). */
         UInt nnn = isU32(cond, X86CondP) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_P)),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      return NULL;
   }

   /* --------- specialising "x86g_calculate_eflags_c" --------- */

   if (vex_streq(function_name, "x86g_calculate_eflags_c")) {
      /* specialise calls to above "calculate_eflags_c" function */
      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      cc_dep2 = args[2];
      cc_ndep = args[3];

      if (isU32(cc_op, X86G_CC_OP_SUBL)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBB)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U,
                           binop(Iop_And32,cc_dep1,mkU32(0xFF)),
                           binop(Iop_And32,cc_dep2,mkU32(0xFF))));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICL)
          || isU32(cc_op, X86G_CC_OP_LOGICW)
          || isU32(cc_op, X86G_CC_OP_LOGICB)) {
         /* cflag after logic is zero */
         return mkU32(0);
      }
      if (isU32(cc_op, X86G_CC_OP_DECL) || isU32(cc_op, X86G_CC_OP_INCL)) {
         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
         return cc_ndep;
      }
      if (isU32(cc_op, X86G_CC_OP_COPY)) {
         /* cflag after COPY is stored in DEP1. */
         return
            binop(
               Iop_And32,
               binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
               mkU32(1)
            );
      }
      if (isU32(cc_op, X86G_CC_OP_ADDL)) {
         /* C after add denotes sum <u either arg */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U,
                           binop(Iop_Add32, cc_dep1, cc_dep2),
                           cc_dep1));
      }
      // ATC, requires verification, no test case known
      //if (isU32(cc_op, X86G_CC_OP_SMULL)) {
      //   /* C after signed widening multiply denotes the case where
      //      the top half of the result isn't simply the sign extension
      //      of the bottom half (iow the result doesn't fit completely
      //      in the bottom half).  Hence:
      //        C = hi-half(dep1 x dep2) != lo-half(dep1 x dep2) >>s 31
      //      where 'x' denotes signed widening multiply.*/
      //   return
      //      unop(Iop_1Uto32,
      //           binop(Iop_CmpNE32,
      //                 unop(Iop_64HIto32,
      //                      binop(Iop_MullS32, cc_dep1, cc_dep2)),
      //                 binop(Iop_Sar32,
      //                       binop(Iop_Mul32, cc_dep1, cc_dep2), mkU8(31)) ));
      //}
#     if 0
      if (cc_op->tag == Iex_Const) {
         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
      }
#     endif

      return NULL;
   }

   /* --------- specialising "x86g_calculate_eflags_all" --------- */

   if (vex_streq(function_name, "x86g_calculate_eflags_all")) {
      /* specialise calls to above "calculate_eflags_all" function */
      IRExpr *cc_op, *cc_dep1; /*, *cc_dep2, *cc_ndep; */
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      /* cc_dep2 = args[2]; */
      /* cc_ndep = args[3]; */

      if (isU32(cc_op, X86G_CC_OP_COPY)) {
         /* eflags after COPY are stored in DEP1. */
         return
            binop(
               Iop_And32,
               cc_dep1,
               mkU32(X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z
                     | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P)
            );
      }
      return NULL;
   }

#  undef unop
#  undef binop
#  undef mkU32
#  undef mkU8

   return NULL;
}
   1323 
   1324 
   1325 /*---------------------------------------------------------------*/
   1326 /*--- Supporting functions for x87 FPU activities.            ---*/
   1327 /*---------------------------------------------------------------*/
   1328 
   1329 static inline Bool host_is_little_endian ( void )
   1330 {
   1331    UInt x = 0x76543210;
   1332    UChar* p = (UChar*)(&x);
   1333    return toBool(*p == 0x10);
   1334 }
   1335 
   1336 /* 80 and 64-bit floating point formats:
   1337 
   1338    80-bit:
   1339 
   1340     S  0       0-------0      zero
   1341     S  0       0X------X      denormals
   1342     S  1-7FFE  1X------X      normals (all normals have leading 1)
   1343     S  7FFF    10------0      infinity
   1344     S  7FFF    10X-----X      snan
   1345     S  7FFF    11X-----X      qnan
   1346 
   1347    S is the sign bit.  For runs X----X, at least one of the Xs must be
   1348    nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
   1349    there is an explicitly represented leading 1, and a sign bit,
   1350    giving 80 in total.
   1351 
   1352    64-bit avoids the confusion of an explicitly represented leading 1
   1353    and so is simpler:
   1354 
   1355     S  0      0------0   zero
   1356     S  0      X------X   denormals
   1357     S  1-7FE  any        normals
   1358     S  7FF    0------0   infinity
   1359     S  7FF    0X-----X   snan
   1360     S  7FF    1X-----X   qnan
   1361 
   1362    Exponent is 11 bits, fractional part is 52 bits, and there is a
   1363    sign bit, giving 64 in total.
   1364 */
   1365 
   1366 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
   1367 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1368 UInt x86g_calculate_FXAM ( UInt tag, ULong dbl )
   1369 {
   1370    Bool   mantissaIsZero;
   1371    Int    bexp;
   1372    UChar  sign;
   1373    UChar* f64;
   1374 
   1375    vassert(host_is_little_endian());
   1376 
   1377    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
   1378 
   1379    f64  = (UChar*)(&dbl);
   1380    sign = toUChar( (f64[7] >> 7) & 1 );
   1381 
   1382    /* First off, if the tag indicates the register was empty,
   1383       return 1,0,sign,1 */
   1384    if (tag == 0) {
   1385       /* vex_printf("Empty\n"); */
   1386       return X86G_FC_MASK_C3 | 0 | (sign << X86G_FC_SHIFT_C1)
   1387                                  | X86G_FC_MASK_C0;
   1388    }
   1389 
   1390    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   1391    bexp &= 0x7FF;
   1392 
   1393    mantissaIsZero
   1394       = toBool(
   1395            (f64[6] & 0x0F) == 0
   1396            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
   1397         );
   1398 
   1399    /* If both exponent and mantissa are zero, the value is zero.
   1400       Return 1,0,sign,0. */
   1401    if (bexp == 0 && mantissaIsZero) {
   1402       /* vex_printf("Zero\n"); */
   1403       return X86G_FC_MASK_C3 | 0
   1404                              | (sign << X86G_FC_SHIFT_C1) | 0;
   1405    }
   1406 
   1407    /* If exponent is zero but mantissa isn't, it's a denormal.
   1408       Return 1,1,sign,0. */
   1409    if (bexp == 0 && !mantissaIsZero) {
   1410       /* vex_printf("Denormal\n"); */
   1411       return X86G_FC_MASK_C3 | X86G_FC_MASK_C2
   1412                              | (sign << X86G_FC_SHIFT_C1) | 0;
   1413    }
   1414 
   1415    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
   1416       Return 0,1,sign,1. */
   1417    if (bexp == 0x7FF && mantissaIsZero) {
   1418       /* vex_printf("Inf\n"); */
   1419       return 0 | X86G_FC_MASK_C2 | (sign << X86G_FC_SHIFT_C1)
   1420                                  | X86G_FC_MASK_C0;
   1421    }
   1422 
   1423    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
   1424       Return 0,0,sign,1. */
   1425    if (bexp == 0x7FF && !mantissaIsZero) {
   1426       /* vex_printf("NaN\n"); */
   1427       return 0 | 0 | (sign << X86G_FC_SHIFT_C1) | X86G_FC_MASK_C0;
   1428    }
   1429 
   1430    /* Uh, ok, we give up.  It must be a normal finite number.
   1431       Return 0,1,sign,0.
   1432    */
   1433    /* vex_printf("normal\n"); */
   1434    return 0 | X86G_FC_MASK_C2 | (sign << X86G_FC_SHIFT_C1) | 0;
   1435 }
   1436 
   1437 
   1438 /* CALLED FROM GENERATED CODE */
   1439 /* DIRTY HELPER (reads guest memory) */
   1440 ULong x86g_dirtyhelper_loadF80le ( UInt addrU )
   1441 {
   1442    ULong f64;
   1443    convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
   1444    return f64;
   1445 }
   1446 
   1447 /* CALLED FROM GENERATED CODE */
   1448 /* DIRTY HELPER (writes guest memory) */
   1449 void x86g_dirtyhelper_storeF80le ( UInt addrU, ULong f64 )
   1450 {
   1451    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
   1452 }
   1453 
   1454 
   1455 /*----------------------------------------------*/
   1456 /*--- The exported fns ..                    ---*/
   1457 /*----------------------------------------------*/
   1458 
   1459 /* Layout of the real x87 state. */
   1460 /* 13 June 05: Fpu_State and auxiliary constants was moved to
   1461    g_generic_x87.h */
   1462 
   1463 
   1464 /* CLEAN HELPER */
   1465 /* fpucw[15:0] contains a x87 native format FPU control word.
   1466    Extract from it the required FPROUND value and any resulting
   1467    emulation warning, and return (warn << 32) | fpround value.
   1468 */
   1469 ULong x86g_check_fldcw ( UInt fpucw )
   1470 {
   1471    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   1472    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   1473    UInt rmode = (fpucw >> 10) & 3;
   1474 
   1475    /* Detect any required emulation warnings. */
   1476    VexEmNote ew = EmNote_NONE;
   1477 
   1478    if ((fpucw & 0x3F) != 0x3F) {
   1479       /* unmasked exceptions! */
   1480       ew = EmWarn_X86_x87exns;
   1481    }
   1482    else
   1483    if (((fpucw >> 8) & 3) != 3) {
   1484       /* unsupported precision */
   1485       ew = EmWarn_X86_x87precision;
   1486    }
   1487 
   1488    return (((ULong)ew) << 32) | ((ULong)rmode);
   1489 }
   1490 
   1491 /* CLEAN HELPER */
   1492 /* Given fpround as an IRRoundingMode value, create a suitable x87
   1493    native format FPU control word. */
   1494 UInt x86g_create_fpucw ( UInt fpround )
   1495 {
   1496    fpround &= 3;
   1497    return 0x037F | (fpround << 10);
   1498 }
   1499 
   1500 
   1501 /* CLEAN HELPER */
   1502 /* mxcsr[15:0] contains a SSE native format MXCSR value.
   1503    Extract from it the required SSEROUND value and any resulting
   1504    emulation warning, and return (warn << 32) | sseround value.
   1505 */
   1506 ULong x86g_check_ldmxcsr ( UInt mxcsr )
   1507 {
   1508    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   1509    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   1510    UInt rmode = (mxcsr >> 13) & 3;
   1511 
   1512    /* Detect any required emulation warnings. */
   1513    VexEmNote ew = EmNote_NONE;
   1514 
   1515    if ((mxcsr & 0x1F80) != 0x1F80) {
   1516       /* unmasked exceptions! */
   1517       ew = EmWarn_X86_sseExns;
   1518    }
   1519    else
   1520    if (mxcsr & (1<<15)) {
   1521       /* FZ is set */
   1522       ew = EmWarn_X86_fz;
   1523    }
   1524    else
   1525    if (mxcsr & (1<<6)) {
   1526       /* DAZ is set */
   1527       ew = EmWarn_X86_daz;
   1528    }
   1529 
   1530    return (((ULong)ew) << 32) | ((ULong)rmode);
   1531 }
   1532 
   1533 
   1534 /* CLEAN HELPER */
   1535 /* Given sseround as an IRRoundingMode value, create a suitable SSE
   1536    native format MXCSR value. */
   1537 UInt x86g_create_mxcsr ( UInt sseround )
   1538 {
   1539    sseround &= 3;
   1540    return 0x1F80 | (sseround << 13);
   1541 }
   1542 
   1543 
   1544 /* CALLED FROM GENERATED CODE */
   1545 /* DIRTY HELPER (writes guest state) */
   1546 /* Initialise the x87 FPU state as per 'finit'. */
   1547 void x86g_dirtyhelper_FINIT ( VexGuestX86State* gst )
   1548 {
   1549    Int i;
   1550    gst->guest_FTOP = 0;
   1551    for (i = 0; i < 8; i++) {
   1552       gst->guest_FPTAG[i] = 0; /* empty */
   1553       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   1554    }
   1555    gst->guest_FPROUND = (UInt)Irrm_NEAREST;
   1556    gst->guest_FC3210  = 0;
   1557 }
   1558 
   1559 
   1560 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
   1561    appears to differ from the former only in that the 8 FP registers
   1562    themselves are not transferred into the guest state. */
   1563 static
   1564 VexEmNote do_put_x87 ( Bool moveRegs,
   1565                        /*IN*/UChar* x87_state,
   1566                        /*OUT*/VexGuestX86State* vex_state )
   1567 {
   1568    Int        stno, preg;
   1569    UInt       tag;
   1570    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1571    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1572    Fpu_State* x87     = (Fpu_State*)x87_state;
   1573    UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
   1574    UInt       tagw    = x87->env[FP_ENV_TAG];
   1575    UInt       fpucw   = x87->env[FP_ENV_CTRL];
   1576    UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
   1577    VexEmNote  ew;
   1578    UInt       fpround;
   1579    ULong      pair;
   1580 
   1581    /* Copy registers and tags */
   1582    for (stno = 0; stno < 8; stno++) {
   1583       preg = (stno + ftop) & 7;
   1584       tag = (tagw >> (2*preg)) & 3;
   1585       if (tag == 3) {
   1586          /* register is empty */
   1587          /* hmm, if it's empty, does it still get written?  Probably
   1588             safer to say it does.  If we don't, memcheck could get out
   1589             of sync, in that it thinks all FP registers are defined by
   1590             this helper, but in reality some have not been updated. */
   1591          if (moveRegs)
   1592             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   1593          vexTags[preg] = 0;
   1594       } else {
   1595          /* register is non-empty */
   1596          if (moveRegs)
   1597             convert_f80le_to_f64le( &x87->reg[10*stno],
   1598                                     (UChar*)&vexRegs[preg] );
   1599          vexTags[preg] = 1;
   1600       }
   1601    }
   1602 
   1603    /* stack pointer */
   1604    vex_state->guest_FTOP = ftop;
   1605 
   1606    /* status word */
   1607    vex_state->guest_FC3210 = c3210;
   1608 
   1609    /* handle the control word, setting FPROUND and detecting any
   1610       emulation warnings. */
   1611    pair    = x86g_check_fldcw ( (UInt)fpucw );
   1612    fpround = (UInt)pair;
   1613    ew      = (VexEmNote)(pair >> 32);
   1614 
   1615    vex_state->guest_FPROUND = fpround & 3;
   1616 
   1617    /* emulation warnings --> caller */
   1618    return ew;
   1619 }
   1620 
   1621 
   1622 /* Create an x87 FPU state from the guest state, as close as
   1623    we can approximate it. */
   1624 static
   1625 void do_get_x87 ( /*IN*/VexGuestX86State* vex_state,
   1626                   /*OUT*/UChar* x87_state )
   1627 {
   1628    Int        i, stno, preg;
   1629    UInt       tagw;
   1630    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1631    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1632    Fpu_State* x87     = (Fpu_State*)x87_state;
   1633    UInt       ftop    = vex_state->guest_FTOP;
   1634    UInt       c3210   = vex_state->guest_FC3210;
   1635 
   1636    for (i = 0; i < 14; i++)
   1637       x87->env[i] = 0;
   1638 
   1639    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   1640    x87->env[FP_ENV_STAT]
   1641       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   1642    x87->env[FP_ENV_CTRL]
   1643       = toUShort(x86g_create_fpucw( vex_state->guest_FPROUND ));
   1644 
   1645    /* Dump the register stack in ST order. */
   1646    tagw = 0;
   1647    for (stno = 0; stno < 8; stno++) {
   1648       preg = (stno + ftop) & 7;
   1649       if (vexTags[preg] == 0) {
   1650          /* register is empty */
   1651          tagw |= (3 << (2*preg));
   1652          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1653                                  &x87->reg[10*stno] );
   1654       } else {
   1655          /* register is full. */
   1656          tagw |= (0 << (2*preg));
   1657          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1658                                  &x87->reg[10*stno] );
   1659       }
   1660    }
   1661    x87->env[FP_ENV_TAG] = toUShort(tagw);
   1662 }
   1663 
   1664 
   1665 /* CALLED FROM GENERATED CODE */
   1666 /* DIRTY HELPER (reads guest state, writes guest mem) */
   1667 void x86g_dirtyhelper_FXSAVE ( VexGuestX86State* gst, HWord addr )
   1668 {
   1669    /* Somewhat roundabout, but at least it's simple. */
   1670    Fpu_State tmp;
   1671    UShort*   addrS = (UShort*)addr;
   1672    UChar*    addrC = (UChar*)addr;
   1673    U128*     xmm   = (U128*)(addr + 160);
   1674    UInt      mxcsr;
   1675    UShort    fp_tags;
   1676    UInt      summary_tags;
   1677    Int       r, stno;
   1678    UShort    *srcS, *dstS;
   1679 
   1680    do_get_x87( gst, (UChar*)&tmp );
   1681    mxcsr = x86g_create_mxcsr( gst->guest_SSEROUND );
   1682 
   1683    /* Now build the proper fxsave image from the x87 image we just
   1684       made. */
   1685 
   1686    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   1687    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FCW: fpu status word */
   1688 
   1689    /* set addrS[2] in an endian-independent way */
   1690    summary_tags = 0;
   1691    fp_tags = tmp.env[FP_ENV_TAG];
   1692    for (r = 0; r < 8; r++) {
   1693       if ( ((fp_tags >> (2*r)) & 3) != 3 )
   1694          summary_tags |= (1 << r);
   1695    }
   1696    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   1697    addrC[5]  = 0; /* pad */
   1698 
   1699    addrS[3]  = 0; /* FOP: fpu opcode (bogus) */
   1700    addrS[4]  = 0;
   1701    addrS[5]  = 0; /* FPU IP (bogus) */
   1702    addrS[6]  = 0; /* FPU IP's segment selector (bogus) (although we
   1703                      could conceivably dump %CS here) */
   1704 
   1705    addrS[7]  = 0; /* Intel reserved */
   1706 
   1707    addrS[8]  = 0; /* FPU DP (operand pointer) (bogus) */
   1708    addrS[9]  = 0; /* FPU DP (operand pointer) (bogus) */
   1709    addrS[10] = 0; /* segment selector for above operand pointer; %DS
   1710                      perhaps? */
   1711    addrS[11] = 0; /* Intel reserved */
   1712 
   1713    addrS[12] = toUShort(mxcsr);  /* MXCSR */
   1714    addrS[13] = toUShort(mxcsr >> 16);
   1715 
   1716    addrS[14] = 0xFFFF; /* MXCSR mask (lo16); who knows what for */
   1717    addrS[15] = 0xFFFF; /* MXCSR mask (hi16); who knows what for */
   1718 
   1719    /* Copy in the FP registers, in ST order. */
   1720    for (stno = 0; stno < 8; stno++) {
   1721       srcS = (UShort*)(&tmp.reg[10*stno]);
   1722       dstS = (UShort*)(&addrS[16 + 8*stno]);
   1723       dstS[0] = srcS[0];
   1724       dstS[1] = srcS[1];
   1725       dstS[2] = srcS[2];
   1726       dstS[3] = srcS[3];
   1727       dstS[4] = srcS[4];
   1728       dstS[5] = 0;
   1729       dstS[6] = 0;
   1730       dstS[7] = 0;
   1731    }
   1732 
   1733    /* That's the first 160 bytes of the image done.  Now only %xmm0
   1734       .. %xmm7 remain to be copied.  If the host is big-endian, these
   1735       need to be byte-swapped. */
   1736    vassert(host_is_little_endian());
   1737 
   1738 #  define COPY_U128(_dst,_src)                       \
   1739       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   1740            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   1741       while (0)
   1742 
   1743    COPY_U128( xmm[0], gst->guest_XMM0 );
   1744    COPY_U128( xmm[1], gst->guest_XMM1 );
   1745    COPY_U128( xmm[2], gst->guest_XMM2 );
   1746    COPY_U128( xmm[3], gst->guest_XMM3 );
   1747    COPY_U128( xmm[4], gst->guest_XMM4 );
   1748    COPY_U128( xmm[5], gst->guest_XMM5 );
   1749    COPY_U128( xmm[6], gst->guest_XMM6 );
   1750    COPY_U128( xmm[7], gst->guest_XMM7 );
   1751 
   1752 #  undef COPY_U128
   1753 }
   1754 
   1755 
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest state, reads guest mem) */
/* Restore guest x87/SSE state from an fxsave-format image at
   'addr'.  Returns an emulation warning if the restored control
   words request behaviour we can't emulate; an x87 warning is
   preferred over an XMM one if both arise. */
VexEmNote x86g_dirtyhelper_FXRSTOR ( VexGuestX86State* gst, HWord addr )
{
   Fpu_State tmp;
   VexEmNote warnX87 = EmNote_NONE;
   VexEmNote warnXMM = EmNote_NONE;
   UShort*   addrS   = (UShort*)addr;
   UChar*    addrC   = (UChar*)addr;
   U128*     xmm     = (U128*)(addr + 160); /* xmm regs start at byte 160 */
   UShort    fp_tags;
   Int       r, stno, i;

   /* Restore %xmm0 .. %xmm7.  If the host is big-endian, these need
      to be byte-swapped. */
   vassert(host_is_little_endian());

#  define COPY_U128(_dst,_src)                       \
      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
      while (0)

   COPY_U128( gst->guest_XMM0, xmm[0] );
   COPY_U128( gst->guest_XMM1, xmm[1] );
   COPY_U128( gst->guest_XMM2, xmm[2] );
   COPY_U128( gst->guest_XMM3, xmm[3] );
   COPY_U128( gst->guest_XMM4, xmm[4] );
   COPY_U128( gst->guest_XMM5, xmm[5] );
   COPY_U128( gst->guest_XMM6, xmm[6] );
   COPY_U128( gst->guest_XMM7, xmm[7] );

#  undef COPY_U128

   /* Copy the x87 registers out of the image, into a temporary
      Fpu_State struct. */

   /* LLVM on Darwin turns the following loop into a movaps plus a
      handful of scalar stores.  This would work fine except for the
      fact that VEX doesn't keep the stack correctly (16-) aligned for
      the call, so it segfaults.  Hence, split the loop into two
      pieces (and pray LLVM doesn't merely glue them back together) so
      it's composed only of scalar stores and so is alignment
      insensitive.  Of course this is a kludge of the lamest kind --
      VEX should be fixed properly. */
   /* Code that seems to trigger the problem:
      for (i = 0; i < 14; i++) tmp.env[i] = 0; */
   for (i = 0; i < 7; i++) tmp.env[i+0] = 0;
   for (i = 0; i < 7; i++) tmp.env[i+7] = 0;

   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   /* fill in tmp.reg[0..7]: 10 bytes per register out of the 16-byte
      slots in the image; the 6 padding bytes are ignored */
   for (stno = 0; stno < 8; stno++) {
      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
      dstS[0] = srcS[0];
      dstS[1] = srcS[1];
      dstS[2] = srcS[2];
      dstS[3] = srcS[3];
      dstS[4] = srcS[4];
   }
   /* fill in tmp.env[0..13] */
   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */

   /* Expand the 1-bit-per-reg FTW summary byte back into the classic
      2-bit-per-reg tag word.  A set summary bit means "not empty". */
   fp_tags = 0;
   for (r = 0; r < 8; r++) {
      if (addrC[4] & (1<<r))
         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
      else
         fp_tags |= (3 << (2*r)); /* EMPTY */
   }
   tmp.env[FP_ENV_TAG] = fp_tags;

   /* Now write 'tmp' into the guest state. */
   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );

   /* Restore SSEROUND from the 32-bit MXCSR at halfwords 12/13. */
   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
     ULong w64 = x86g_check_ldmxcsr( w32 );

     warnXMM = (VexEmNote)(w64 >> 32);

     gst->guest_SSEROUND = (UInt)w64;
   }

   /* Prefer an X87 emwarn over an XMM one, if both exist. */
   if (warnX87 != EmNote_NONE)
      return warnX87;
   else
      return warnXMM;
}
   1847 
   1848 
   1849 /* CALLED FROM GENERATED CODE */
   1850 /* DIRTY HELPER (reads guest state, writes guest mem) */
   1851 void x86g_dirtyhelper_FSAVE ( VexGuestX86State* gst, HWord addr )
   1852 {
   1853    do_get_x87( gst, (UChar*)addr );
   1854 }
   1855 
   1856 /* CALLED FROM GENERATED CODE */
   1857 /* DIRTY HELPER (writes guest state, reads guest mem) */
   1858 VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State* gst, HWord addr )
   1859 {
   1860    return do_put_x87( True/*regs too*/, (UChar*)addr, gst );
   1861 }
   1862 
   1863 /* CALLED FROM GENERATED CODE */
   1864 /* DIRTY HELPER (reads guest state, writes guest mem) */
   1865 void x86g_dirtyhelper_FSTENV ( VexGuestX86State* gst, HWord addr )
   1866 {
   1867    /* Somewhat roundabout, but at least it's simple. */
   1868    Int       i;
   1869    UShort*   addrP = (UShort*)addr;
   1870    Fpu_State tmp;
   1871    do_get_x87( gst, (UChar*)&tmp );
   1872    for (i = 0; i < 14; i++)
   1873       addrP[i] = tmp.env[i];
   1874 }
   1875 
   1876 /* CALLED FROM GENERATED CODE */
   1877 /* DIRTY HELPER (writes guest state, reads guest mem) */
   1878 VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State* gst, HWord addr )
   1879 {
   1880    return do_put_x87( False/*don't move regs*/, (UChar*)addr, gst);
   1881 }
   1882 
   1883 
   1884 /*---------------------------------------------------------------*/
   1885 /*--- Misc integer helpers, including rotates and CPUID.      ---*/
   1886 /*---------------------------------------------------------------*/
   1887 
   1888 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1889 /* Calculate both flags and value result for rotate right
   1890    through the carry bit.  Result in low 32 bits,
   1891    new flags (OSZACP) in high 32 bits.
   1892 */
   1893 ULong x86g_calculate_RCR ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
   1894 {
   1895    UInt tempCOUNT = rot_amt & 0x1F, cf=0, of=0, tempcf;
   1896 
   1897    switch (sz) {
   1898       case 4:
   1899          cf        = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   1900          of        = ((arg >> 31) ^ cf) & 1;
   1901          while (tempCOUNT > 0) {
   1902             tempcf = arg & 1;
   1903             arg    = (arg >> 1) | (cf << 31);
   1904             cf     = tempcf;
   1905             tempCOUNT--;
   1906          }
   1907          break;
   1908       case 2:
   1909          while (tempCOUNT >= 17) tempCOUNT -= 17;
   1910          cf        = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   1911          of        = ((arg >> 15) ^ cf) & 1;
   1912          while (tempCOUNT > 0) {
   1913             tempcf = arg & 1;
   1914             arg    = ((arg >> 1) & 0x7FFF) | (cf << 15);
   1915             cf     = tempcf;
   1916             tempCOUNT--;
   1917          }
   1918          break;
   1919       case 1:
   1920          while (tempCOUNT >= 9) tempCOUNT -= 9;
   1921          cf        = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   1922          of        = ((arg >> 7) ^ cf) & 1;
   1923          while (tempCOUNT > 0) {
   1924             tempcf = arg & 1;
   1925             arg    = ((arg >> 1) & 0x7F) | (cf << 7);
   1926             cf     = tempcf;
   1927             tempCOUNT--;
   1928          }
   1929          break;
   1930       default:
   1931          vpanic("calculate_RCR: invalid size");
   1932    }
   1933 
   1934    cf &= 1;
   1935    of &= 1;
   1936    eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
   1937    eflags_in |= (cf << X86G_CC_SHIFT_C) | (of << X86G_CC_SHIFT_O);
   1938 
   1939    return (((ULong)eflags_in) << 32) | ((ULong)arg);
   1940 }
   1941 
   1942 
   1943 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1944 /* Calculate both flags and value result for rotate left
   1945    through the carry bit.  Result in low 32 bits,
   1946    new flags (OSZACP) in high 32 bits.
   1947 */
   1948 ULong x86g_calculate_RCL ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
   1949 {
   1950    UInt tempCOUNT = rot_amt & 0x1F, cf=0, of=0, tempcf;
   1951 
   1952    switch (sz) {
   1953       case 4:
   1954          cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   1955          while (tempCOUNT > 0) {
   1956             tempcf = (arg >> 31) & 1;
   1957             arg    = (arg << 1) | (cf & 1);
   1958             cf     = tempcf;
   1959             tempCOUNT--;
   1960          }
   1961          of = ((arg >> 31) ^ cf) & 1;
   1962          break;
   1963       case 2:
   1964          while (tempCOUNT >= 17) tempCOUNT -= 17;
   1965          cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   1966          while (tempCOUNT > 0) {
   1967             tempcf = (arg >> 15) & 1;
   1968             arg    = 0xFFFF & ((arg << 1) | (cf & 1));
   1969             cf     = tempcf;
   1970             tempCOUNT--;
   1971          }
   1972          of = ((arg >> 15) ^ cf) & 1;
   1973          break;
   1974       case 1:
   1975          while (tempCOUNT >= 9) tempCOUNT -= 9;
   1976          cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   1977          while (tempCOUNT > 0) {
   1978             tempcf = (arg >> 7) & 1;
   1979             arg    = 0xFF & ((arg << 1) | (cf & 1));
   1980             cf     = tempcf;
   1981             tempCOUNT--;
   1982          }
   1983          of = ((arg >> 7) ^ cf) & 1;
   1984          break;
   1985       default:
   1986          vpanic("calculate_RCL: invalid size");
   1987    }
   1988 
   1989    cf &= 1;
   1990    of &= 1;
   1991    eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
   1992    eflags_in |= (cf << X86G_CC_SHIFT_C) | (of << X86G_CC_SHIFT_O);
   1993 
   1994    return (((ULong)eflags_in) << 32) | ((ULong)arg);
   1995 }
   1996 
   1997 
   1998 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1999 /* Calculate both flags and value result for DAA/DAS/AAA/AAS.
   2000    AX value in low half of arg, OSZACP in upper half.
   2001    See guest-x86/toIR.c usage point for details.
   2002 */
   2003 static UInt calc_parity_8bit ( UInt w32 ) {
   2004    UInt i;
   2005    UInt p = 1;
   2006    for (i = 0; i < 8; i++)
   2007       p ^= (1 & (w32 >> i));
   2008    return p;
   2009 }
/* Implement DAA (0x27), DAS (0x2F), AAA (0x37) and AAS (0x3F),
   the x86 BCD-adjust instructions.  'flags_and_AX' carries AL in
   bits [7:0], AH in bits [15:8] and the OSZACP flags starting at
   bit 16; the result is packed the same way.  'opcode' selects
   which instruction to model. */
UInt x86g_calculate_daa_das_aaa_aas ( UInt flags_and_AX, UInt opcode )
{
   UInt r_AL = (flags_and_AX >> 0) & 0xFF;
   UInt r_AH = (flags_and_AX >> 8) & 0xFF;
   UInt r_O  = (flags_and_AX >> (16 + X86G_CC_SHIFT_O)) & 1;
   UInt r_S  = (flags_and_AX >> (16 + X86G_CC_SHIFT_S)) & 1;
   UInt r_Z  = (flags_and_AX >> (16 + X86G_CC_SHIFT_Z)) & 1;
   UInt r_A  = (flags_and_AX >> (16 + X86G_CC_SHIFT_A)) & 1;
   UInt r_C  = (flags_and_AX >> (16 + X86G_CC_SHIFT_C)) & 1;
   UInt r_P  = (flags_and_AX >> (16 + X86G_CC_SHIFT_P)) & 1;
   UInt result = 0;

   switch (opcode) {
      case 0x27: { /* DAA: decimal-adjust AL after addition */
         UInt old_AL = r_AL;
         UInt old_C  = r_C;
         r_C = 0;
         /* Adjust the low nibble if it isn't a valid BCD digit or
            there was an auxiliary carry. */
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            r_AL = r_AL + 6;
            r_C  = old_C;
            if (r_AL >= 0x100) r_C = 1;
            r_A = 1;
         } else {
            r_A = 0;
         }
         /* Adjust the high nibble. */
         if (old_AL > 0x99 || old_C == 1) {
            r_AL = r_AL + 0x60;
            r_C  = 1;
         } else {
            r_C = 0;
         }
         /* O is undefined.  S Z and P are set according to the
            result. */
         r_AL &= 0xFF;
         r_O = 0; /* let's say */
         r_S = (r_AL & 0x80) ? 1 : 0;
         r_Z = (r_AL == 0) ? 1 : 0;
         r_P = calc_parity_8bit( r_AL );
         break;
      }
      case 0x2F: { /* DAS: decimal-adjust AL after subtraction */
         UInt old_AL = r_AL;
         UInt old_C  = r_C;
         r_C = 0;
         /* Adjust the low nibble, tracking borrow out of AL. */
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            Bool borrow = r_AL < 6;
            r_AL = r_AL - 6;
            r_C  = old_C;
            if (borrow) r_C = 1;
            r_A = 1;
         } else {
            r_A = 0;
         }
         /* Adjust the high nibble. */
         if (old_AL > 0x99 || old_C == 1) {
            r_AL = r_AL - 0x60;
            r_C  = 1;
         } else {
            /* Intel docs are wrong: r_C = 0; */
         }
         /* O is undefined.  S Z and P are set according to the
            result. */
         r_AL &= 0xFF;
         r_O = 0; /* let's say */
         r_S = (r_AL & 0x80) ? 1 : 0;
         r_Z = (r_AL == 0) ? 1 : 0;
         r_P = calc_parity_8bit( r_AL );
         break;
      }
      case 0x37: { /* AAA: ASCII-adjust AX after addition */
         /* 'nudge': adding 6 to AL will carry into AH. */
         Bool nudge = r_AL > 0xF9;
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            r_AL = r_AL + 6;
            r_AH = r_AH + 1 + (nudge ? 1 : 0);
            r_A  = 1;
            r_C  = 1;
            r_AL = r_AL & 0xF;
         } else {
            r_A  = 0;
            r_C  = 0;
            r_AL = r_AL & 0xF;
         }
         /* O S Z and P are undefined. */
         r_O = r_S = r_Z = r_P = 0; /* let's say */
         break;
      }
      case 0x3F: { /* AAS: ASCII-adjust AX after subtraction */
         /* 'nudge': subtracting 6 from AL will borrow from AH. */
         Bool nudge = r_AL < 0x06;
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            r_AL = r_AL - 6;
            r_AH = r_AH - 1 - (nudge ? 1 : 0);
            r_A  = 1;
            r_C  = 1;
            r_AL = r_AL & 0xF;
         } else {
            r_A  = 0;
            r_C  = 0;
            r_AL = r_AL & 0xF;
         }
         /* O S Z and P are undefined. */
         r_O = r_S = r_Z = r_P = 0; /* let's say */
         break;
      }
      default:
         vassert(0);
   }
   /* Repack flags and AX into the result word. */
   result =   ( (r_O & 1) << (16 + X86G_CC_SHIFT_O) )
            | ( (r_S & 1) << (16 + X86G_CC_SHIFT_S) )
            | ( (r_Z & 1) << (16 + X86G_CC_SHIFT_Z) )
            | ( (r_A & 1) << (16 + X86G_CC_SHIFT_A) )
            | ( (r_C & 1) << (16 + X86G_CC_SHIFT_C) )
            | ( (r_P & 1) << (16 + X86G_CC_SHIFT_P) )
            | ( (r_AH & 0xFF) << 8 )
            | ( (r_AL & 0xFF) << 0 );
   return result;
}
   2125 
   2126 UInt x86g_calculate_aad_aam ( UInt flags_and_AX, UInt opcode )
   2127 {
   2128    UInt r_AL = (flags_and_AX >> 0) & 0xFF;
   2129    UInt r_AH = (flags_and_AX >> 8) & 0xFF;
   2130    UInt r_O  = (flags_and_AX >> (16 + X86G_CC_SHIFT_O)) & 1;
   2131    UInt r_S  = (flags_and_AX >> (16 + X86G_CC_SHIFT_S)) & 1;
   2132    UInt r_Z  = (flags_and_AX >> (16 + X86G_CC_SHIFT_Z)) & 1;
   2133    UInt r_A  = (flags_and_AX >> (16 + X86G_CC_SHIFT_A)) & 1;
   2134    UInt r_C  = (flags_and_AX >> (16 + X86G_CC_SHIFT_C)) & 1;
   2135    UInt r_P  = (flags_and_AX >> (16 + X86G_CC_SHIFT_P)) & 1;
   2136    UInt result = 0;
   2137 
   2138    switch (opcode) {
   2139       case 0xD4: { /* AAM */
   2140          r_AH = r_AL / 10;
   2141          r_AL = r_AL % 10;
   2142          break;
   2143       }
   2144       case 0xD5: { /* AAD */
   2145          r_AL = ((r_AH * 10) + r_AL) & 0xff;
   2146          r_AH = 0;
   2147          break;
   2148       }
   2149       default:
   2150          vassert(0);
   2151    }
   2152 
   2153    r_O = 0; /* let's say (undefined) */
   2154    r_C = 0; /* let's say (undefined) */
   2155    r_A = 0; /* let's say (undefined) */
   2156    r_S = (r_AL & 0x80) ? 1 : 0;
   2157    r_Z = (r_AL == 0) ? 1 : 0;
   2158    r_P = calc_parity_8bit( r_AL );
   2159 
   2160    result =   ( (r_O & 1) << (16 + X86G_CC_SHIFT_O) )
   2161             | ( (r_S & 1) << (16 + X86G_CC_SHIFT_S) )
   2162             | ( (r_Z & 1) << (16 + X86G_CC_SHIFT_Z) )
   2163             | ( (r_A & 1) << (16 + X86G_CC_SHIFT_A) )
   2164             | ( (r_C & 1) << (16 + X86G_CC_SHIFT_C) )
   2165             | ( (r_P & 1) << (16 + X86G_CC_SHIFT_P) )
   2166             | ( (r_AH & 0xFF) << 8 )
   2167             | ( (r_AL & 0xFF) << 0 );
   2168    return result;
   2169 }
   2170 
   2171 
   2172 /* CALLED FROM GENERATED CODE */
   2173 /* DIRTY HELPER (non-referentially-transparent) */
   2174 /* Horrible hack.  On non-x86 platforms, return 1. */
   2175 ULong x86g_dirtyhelper_RDTSC ( void )
   2176 {
   2177 #  if defined(__i386__)
   2178    ULong res;
   2179    __asm__ __volatile__("rdtsc" : "=A" (res));
   2180    return res;
   2181 #  else
   2182    return 1ULL;
   2183 #  endif
   2184 }
   2185 
   2186 
   2187 /* CALLED FROM GENERATED CODE */
   2188 /* DIRTY HELPER (modifies guest state) */
   2189 /* Claim to be a P55C (Intel Pentium/MMX) */
   2190 void x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* st )
   2191 {
   2192    switch (st->guest_EAX) {
   2193       case 0:
   2194          st->guest_EAX = 0x1;
   2195          st->guest_EBX = 0x756e6547;
   2196          st->guest_ECX = 0x6c65746e;
   2197          st->guest_EDX = 0x49656e69;
   2198          break;
   2199       default:
   2200          st->guest_EAX = 0x543;
   2201          st->guest_EBX = 0x0;
   2202          st->guest_ECX = 0x0;
   2203          st->guest_EDX = 0x8001bf;
   2204          break;
   2205    }
   2206 }
   2207 
   2208 /* CALLED FROM GENERATED CODE */
   2209 /* DIRTY HELPER (modifies guest state) */
   2210 /* Claim to be a Athlon "Classic" (Model 2, K75 "Pluto/Orion") */
   2211 /* But without 3DNow support (weird, but we really don't support it). */
   2212 void x86g_dirtyhelper_CPUID_mmxext ( VexGuestX86State* st )
   2213 {
   2214    switch (st->guest_EAX) {
   2215       /* vendor ID */
   2216       case 0:
   2217          st->guest_EAX = 0x1;
   2218          st->guest_EBX = 0x68747541;
   2219          st->guest_ECX = 0x444d4163;
   2220          st->guest_EDX = 0x69746e65;
   2221          break;
   2222       /* feature bits */
   2223       case 1:
   2224          st->guest_EAX = 0x621;
   2225          st->guest_EBX = 0x0;
   2226          st->guest_ECX = 0x0;
   2227          st->guest_EDX = 0x183f9ff;
   2228          break;
   2229       /* Highest Extended Function Supported (0x80000004 brand string) */
   2230       case 0x80000000:
   2231          st->guest_EAX = 0x80000004;
   2232          st->guest_EBX = 0x68747541;
   2233          st->guest_ECX = 0x444d4163;
   2234          st->guest_EDX = 0x69746e65;
   2235          break;
   2236       /* Extended Processor Info and Feature Bits */
   2237       case 0x80000001:
   2238          st->guest_EAX = 0x721;
   2239          st->guest_EBX = 0x0;
   2240          st->guest_ECX = 0x0;
   2241          st->guest_EDX = 0x1c3f9ff; /* Note no 3DNow. */
   2242          break;
   2243       /* Processor Brand String "AMD Athlon(tm) Processor" */
   2244       case 0x80000002:
   2245          st->guest_EAX = 0x20444d41;
   2246          st->guest_EBX = 0x6c687441;
   2247          st->guest_ECX = 0x74286e6f;
   2248          st->guest_EDX = 0x5020296d;
   2249          break;
   2250       case 0x80000003:
   2251          st->guest_EAX = 0x65636f72;
   2252          st->guest_EBX = 0x726f7373;
   2253          st->guest_ECX = 0x0;
   2254          st->guest_EDX = 0x0;
   2255          break;
   2256       default:
   2257          st->guest_EAX = 0x0;
   2258          st->guest_EBX = 0x0;
   2259          st->guest_ECX = 0x0;
   2260          st->guest_EDX = 0x0;
   2261          break;
   2262    }
   2263 }
   2264 
   2265 /* CALLED FROM GENERATED CODE */
   2266 /* DIRTY HELPER (modifies guest state) */
   2267 /* Claim to be the following SSE1-capable CPU:
   2268    vendor_id       : GenuineIntel
   2269    cpu family      : 6
   2270    model           : 11
   2271    model name      : Intel(R) Pentium(R) III CPU family      1133MHz
   2272    stepping        : 1
   2273    cpu MHz         : 1131.013
   2274    cache size      : 512 KB
   2275 */
   2276 void x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* st )
   2277 {
   2278    switch (st->guest_EAX) {
   2279       case 0:
   2280          st->guest_EAX = 0x00000002;
   2281          st->guest_EBX = 0x756e6547;
   2282          st->guest_ECX = 0x6c65746e;
   2283          st->guest_EDX = 0x49656e69;
   2284          break;
   2285       case 1:
   2286          st->guest_EAX = 0x000006b1;
   2287          st->guest_EBX = 0x00000004;
   2288          st->guest_ECX = 0x00000000;
   2289          st->guest_EDX = 0x0383fbff;
   2290          break;
   2291       default:
   2292          st->guest_EAX = 0x03020101;
   2293          st->guest_EBX = 0x00000000;
   2294          st->guest_ECX = 0x00000000;
   2295          st->guest_EDX = 0x0c040883;
   2296          break;
   2297    }
   2298 }
   2299 
   2300 /* Claim to be the following SSSE3-capable CPU (2 x ...):
   2301    vendor_id       : GenuineIntel
   2302    cpu family      : 6
   2303    model           : 15
   2304    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
   2305    stepping        : 6
   2306    cpu MHz         : 2394.000
   2307    cache size      : 4096 KB
   2308    physical id     : 0
   2309    siblings        : 2
   2310    core id         : 0
   2311    cpu cores       : 2
   2312    fpu             : yes
   2313    fpu_exception   : yes
   2314    cpuid level     : 10
   2315    wp              : yes
   2316    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2317                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2318                      mmx fxsr sse sse2 ss ht tm syscall nx lm
   2319                      constant_tsc pni monitor ds_cpl vmx est tm2
   2320                      cx16 xtpr lahf_lm
   2321    bogomips        : 4798.78
   2322    clflush size    : 64
   2323    cache_alignment : 64
   2324    address sizes   : 36 bits physical, 48 bits virtual
   2325    power management:
   2326 */
   2327 void x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* st )
   2328 {
   2329 #  define SET_ABCD(_a,_b,_c,_d)               \
   2330       do { st->guest_EAX = (UInt)(_a);        \
   2331            st->guest_EBX = (UInt)(_b);        \
   2332            st->guest_ECX = (UInt)(_c);        \
   2333            st->guest_EDX = (UInt)(_d);        \
   2334       } while (0)
   2335 
   2336    switch (st->guest_EAX) {
   2337       case 0x00000000:
   2338          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
   2339          break;
   2340       case 0x00000001:
   2341          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
   2342          break;
   2343       case 0x00000002:
   2344          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
   2345          break;
   2346       case 0x00000003:
   2347          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2348          break;
   2349       case 0x00000004: {
   2350          switch (st->guest_ECX) {
   2351             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
   2352                                       0x0000003f, 0x00000001); break;
   2353             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
   2354                                       0x0000003f, 0x00000001); break;
   2355             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
   2356                                       0x00000fff, 0x00000001); break;
   2357             default:         SET_ABCD(0x00000000, 0x00000000,
   2358                                       0x00000000, 0x00000000); break;
   2359          }
   2360          break;
   2361       }
   2362       case 0x00000005:
   2363          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
   2364          break;
   2365       case 0x00000006:
   2366          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
   2367          break;
   2368       case 0x00000007:
   2369          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2370          break;
   2371       case 0x00000008:
   2372          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
   2373          break;
   2374       case 0x00000009:
   2375          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2376          break;
   2377       case 0x0000000a:
   2378       unhandled_eax_value:
   2379          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
   2380          break;
   2381       case 0x80000000:
   2382          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2383          break;
   2384       case 0x80000001:
   2385          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100000);
   2386          break;
   2387       case 0x80000002:
   2388          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2389          break;
   2390       case 0x80000003:
   2391          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
   2392          break;
   2393       case 0x80000004:
   2394          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
   2395          break;
   2396       case 0x80000005:
   2397          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2398          break;
   2399       case 0x80000006:
   2400          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
   2401          break;
   2402       case 0x80000007:
   2403          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2404          break;
   2405       case 0x80000008:
   2406          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2407          break;
   2408       default:
   2409          goto unhandled_eax_value;
   2410    }
   2411 #  undef SET_ABCD
   2412 }
   2413 
   2414 
   2415 /* CALLED FROM GENERATED CODE */
   2416 /* DIRTY HELPER (non-referentially-transparent) */
   2417 /* Horrible hack.  On non-x86 platforms, return 0. */
UInt x86g_dirtyhelper_IN ( UInt portno, UInt sz/*1,2 or 4*/ )
{
#  if defined(__i386__)
   /* On a real x86 host, perform the actual IN instruction of the
      requested width.  %eax is zeroed first so the narrow (1/2 byte)
      reads leave the unused upper bytes of the result as zero. */
   UInt r = 0;
   portno &= 0xFFFF;  /* I/O port space is 16 bits wide */
   switch (sz) {
      case 4:
         __asm__ __volatile__("movl $0,%%eax; inl %w1,%0"
                              : "=a" (r) : "Nd" (portno));
	 break;
      case 2:
         __asm__ __volatile__("movl $0,%%eax; inw %w1,%w0"
                              : "=a" (r) : "Nd" (portno));
	 break;
      case 1:
         __asm__ __volatile__("movl $0,%%eax; inb %w1,%b0"
                              : "=a" (r) : "Nd" (portno));
	 break;
      default:
         /* Unexpected size: fall through and return 0. */
         break;
   }
   return r;
#  else
   /* Not on x86: port I/O is meaningless; just return 0. */
   return 0;
#  endif
}
   2444 
   2445 
   2446 /* CALLED FROM GENERATED CODE */
   2447 /* DIRTY HELPER (non-referentially-transparent) */
   2448 /* Horrible hack.  On non-x86 platforms, do nothing. */
void x86g_dirtyhelper_OUT ( UInt portno, UInt data, UInt sz/*1,2 or 4*/ )
{
#  if defined(__i386__)
   /* On a real x86 host, perform the actual OUT instruction of the
      requested width, writing 'data' to the given port. */
   portno &= 0xFFFF;  /* I/O port space is 16 bits wide */
   switch (sz) {
      case 4:
         __asm__ __volatile__("outl %0, %w1"
                              : : "a" (data), "Nd" (portno));
	 break;
      case 2:
         __asm__ __volatile__("outw %w0, %w1"
                              : : "a" (data), "Nd" (portno));
	 break;
      case 1:
         __asm__ __volatile__("outb %b0, %w1"
                              : : "a" (data), "Nd" (portno));
	 break;
      default:
         /* Unexpected size: silently do nothing. */
         break;
   }
#  else
   /* do nothing */
#  endif
}
   2473 
   2474 /* CALLED FROM GENERATED CODE */
   2475 /* DIRTY HELPER (non-referentially-transparent) */
   2476 /* Horrible hack.  On non-x86 platforms, do nothing. */
   2477 /* op = 0: call the native SGDT instruction.
   2478    op = 1: call the native SIDT instruction.
   2479 */
void x86g_dirtyhelper_SxDT ( void *address, UInt op ) {
#  if defined(__i386__)
   /* On a real x86 host, execute the actual SGDT (op==0) or
      SIDT (op==1) instruction, storing the result at 'address'. */
   switch (op) {
      case 0:
         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
         break;
      case 1:
         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
         break;
      default:
         vpanic("x86g_dirtyhelper_SxDT");
   }
#  else
   /* do nothing */
   /* Fake an all-zeroes result: 6 bytes, matching the 16-bit limit
      plus 32-bit base that SGDT/SIDT store in 32-bit mode. */
   UChar* p = (UChar*)address;
   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
#  endif
}
   2498 
   2499 /*---------------------------------------------------------------*/
   2500 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
   2501 /*---------------------------------------------------------------*/
   2502 
   2503 static inline UChar abdU8 ( UChar xx, UChar yy ) {
   2504    return toUChar(xx>yy ? xx-yy : yy-xx);
   2505 }
   2506 
   2507 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   2508    return (((ULong)w1) << 32) | ((ULong)w0);
   2509 }
   2510 
   2511 static inline UShort sel16x4_3 ( ULong w64 ) {
   2512    UInt hi32 = toUInt(w64 >> 32);
   2513    return toUShort(hi32 >> 16);
   2514 }
   2515 static inline UShort sel16x4_2 ( ULong w64 ) {
   2516    UInt hi32 = toUInt(w64 >> 32);
   2517    return toUShort(hi32);
   2518 }
   2519 static inline UShort sel16x4_1 ( ULong w64 ) {
   2520    UInt lo32 = toUInt(w64);
   2521    return toUShort(lo32 >> 16);
   2522 }
   2523 static inline UShort sel16x4_0 ( ULong w64 ) {
   2524    UInt lo32 = toUInt(w64);
   2525    return toUShort(lo32);
   2526 }
   2527 
   2528 static inline UChar sel8x8_7 ( ULong w64 ) {
   2529    UInt hi32 = toUInt(w64 >> 32);
   2530    return toUChar(hi32 >> 24);
   2531 }
   2532 static inline UChar sel8x8_6 ( ULong w64 ) {
   2533    UInt hi32 = toUInt(w64 >> 32);
   2534    return toUChar(hi32 >> 16);
   2535 }
   2536 static inline UChar sel8x8_5 ( ULong w64 ) {
   2537    UInt hi32 = toUInt(w64 >> 32);
   2538    return toUChar(hi32 >> 8);
   2539 }
   2540 static inline UChar sel8x8_4 ( ULong w64 ) {
   2541    UInt hi32 = toUInt(w64 >> 32);
   2542    return toUChar(hi32 >> 0);
   2543 }
   2544 static inline UChar sel8x8_3 ( ULong w64 ) {
   2545    UInt lo32 = toUInt(w64);
   2546    return toUChar(lo32 >> 24);
   2547 }
   2548 static inline UChar sel8x8_2 ( ULong w64 ) {
   2549    UInt lo32 = toUInt(w64);
   2550    return toUChar(lo32 >> 16);
   2551 }
   2552 static inline UChar sel8x8_1 ( ULong w64 ) {
   2553    UInt lo32 = toUInt(w64);
   2554    return toUChar(lo32 >> 8);
   2555 }
   2556 static inline UChar sel8x8_0 ( ULong w64 ) {
   2557    UInt lo32 = toUInt(w64);
   2558    return toUChar(lo32 >> 0);
   2559 }
   2560 
   2561 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   2562 ULong x86g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
   2563 {
   2564    return
   2565       mk32x2(
   2566          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
   2567             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
   2568          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
   2569             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
   2570       );
   2571 }
   2572 
   2573 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   2574 ULong x86g_calculate_mmx_psadbw ( ULong xx, ULong yy )
   2575 {
   2576    UInt t = 0;
   2577    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   2578    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   2579    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   2580    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   2581    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   2582    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   2583    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   2584    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   2585    t &= 0xFFFF;
   2586    return (ULong)t;
   2587 }
   2588 
   2589 
   2590 /*---------------------------------------------------------------*/
   2591 /*--- Helpers for dealing with segment overrides.             ---*/
   2592 /*---------------------------------------------------------------*/
   2593 
   2594 static inline
   2595 UInt get_segdescr_base ( VexGuestX86SegDescr* ent )
   2596 {
   2597    UInt lo  = 0xFFFF & (UInt)ent->LdtEnt.Bits.BaseLow;
   2598    UInt mid =   0xFF & (UInt)ent->LdtEnt.Bits.BaseMid;
   2599    UInt hi  =   0xFF & (UInt)ent->LdtEnt.Bits.BaseHi;
   2600    return (hi << 24) | (mid << 16) | lo;
   2601 }
   2602 
   2603 static inline
   2604 UInt get_segdescr_limit ( VexGuestX86SegDescr* ent )
   2605 {
   2606     UInt lo    = 0xFFFF & (UInt)ent->LdtEnt.Bits.LimitLow;
   2607     UInt hi    =    0xF & (UInt)ent->LdtEnt.Bits.LimitHi;
   2608     UInt limit = (hi << 16) | lo;
   2609     if (ent->LdtEnt.Bits.Granularity)
   2610        limit = (limit << 12) | 0xFFF;
   2611     return limit;
   2612 }
   2613 
   2614 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   2615 ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2616                               UInt seg_selector, UInt virtual_addr )
   2617 {
   2618    UInt tiBit, base, limit;
   2619    VexGuestX86SegDescr* the_descrs;
   2620 
   2621    Bool verboze = False;
   2622 
   2623    /* If this isn't true, we're in Big Trouble. */
   2624    vassert(8 == sizeof(VexGuestX86SegDescr));
   2625 
   2626    if (verboze)
   2627       vex_printf("x86h_use_seg_selector: "
   2628                  "seg_selector = 0x%x, vaddr = 0x%x\n",
   2629                  seg_selector, virtual_addr);
   2630 
   2631    /* Check for wildly invalid selector. */
   2632    if (seg_selector & ~0xFFFF)
   2633       goto bad;
   2634 
   2635    seg_selector &= 0x0000FFFF;
   2636 
   2637    /* Sanity check the segment selector.  Ensure that RPL=11b (least
   2638       privilege).  This forms the bottom 2 bits of the selector. */
   2639    if ((seg_selector & 3) != 3)
   2640       goto bad;
   2641 
   2642    /* Extract the TI bit (0 means GDT, 1 means LDT) */
   2643    tiBit = (seg_selector >> 2) & 1;
   2644 
   2645    /* Convert the segment selector onto a table index */
   2646    seg_selector >>= 3;
   2647    vassert(seg_selector >= 0 && seg_selector < 8192);
   2648 
   2649    if (tiBit == 0) {
   2650 
   2651       /* GDT access. */
   2652       /* Do we actually have a GDT to look at? */
   2653       if (gdt == 0)
   2654          goto bad;
   2655 
   2656       /* Check for access to non-existent entry. */
   2657       if (seg_selector >= VEX_GUEST_X86_GDT_NENT)
   2658          goto bad;
   2659 
   2660       the_descrs = (VexGuestX86SegDescr*)gdt;
   2661       base  = get_segdescr_base (&the_descrs[seg_selector]);
   2662       limit = get_segdescr_limit(&the_descrs[seg_selector]);
   2663 
   2664    } else {
   2665 
   2666       /* All the same stuff, except for the LDT. */
   2667       if (ldt == 0)
   2668          goto bad;
   2669 
   2670       if (seg_selector >= VEX_GUEST_X86_LDT_NENT)
   2671          goto bad;
   2672 
   2673       the_descrs = (VexGuestX86SegDescr*)ldt;
   2674       base  = get_segdescr_base (&the_descrs[seg_selector]);
   2675       limit = get_segdescr_limit(&the_descrs[seg_selector]);
   2676 
   2677    }
   2678 
   2679    /* Do the limit check.  Note, this check is just slightly too
   2680       slack.  Really it should be "if (virtual_addr + size - 1 >=
   2681       limit)," but we don't have the size info to hand.  Getting it
   2682       could be significantly complex.  */
   2683    if (virtual_addr >= limit)
   2684       goto bad;
   2685 
   2686    if (verboze)
   2687       vex_printf("x86h_use_seg_selector: "
   2688                  "base = 0x%x, addr = 0x%x\n",
   2689                  base, base + virtual_addr);
   2690 
   2691    /* High 32 bits are zero, indicating success. */
   2692    return (ULong)( ((UInt)virtual_addr) + base );
   2693 
   2694  bad:
   2695    return 1ULL << 32;
   2696 }
   2697 
   2698 
   2699 /*---------------------------------------------------------------*/
   2700 /*--- Helpers for dealing with, and describing,               ---*/
   2701 /*--- guest state as a whole.                                 ---*/
   2702 /*---------------------------------------------------------------*/
   2703 
   2704 /* Initialise the entire x86 guest state. */
   2705 /* VISIBLE TO LIBVEX CLIENT */
   2706 void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state )
   2707 {
   2708    vex_state->host_EvC_FAILADDR = 0;
   2709    vex_state->host_EvC_COUNTER = 0;
   2710 
   2711    vex_state->guest_EAX = 0;
   2712    vex_state->guest_ECX = 0;
   2713    vex_state->guest_EDX = 0;
   2714    vex_state->guest_EBX = 0;
   2715    vex_state->guest_ESP = 0;
   2716    vex_state->guest_EBP = 0;
   2717    vex_state->guest_ESI = 0;
   2718    vex_state->guest_EDI = 0;
   2719 
   2720    vex_state->guest_CC_OP   = X86G_CC_OP_COPY;
   2721    vex_state->guest_CC_DEP1 = 0;
   2722    vex_state->guest_CC_DEP2 = 0;
   2723    vex_state->guest_CC_NDEP = 0;
   2724    vex_state->guest_DFLAG   = 1; /* forwards */
   2725    vex_state->guest_IDFLAG  = 0;
   2726    vex_state->guest_ACFLAG  = 0;
   2727 
   2728    vex_state->guest_EIP = 0;
   2729 
   2730    /* Initialise the simulated FPU */
   2731    x86g_dirtyhelper_FINIT( vex_state );
   2732 
   2733    /* Initialse the SSE state. */
   2734 #  define SSEZERO(_xmm) _xmm[0]=_xmm[1]=_xmm[2]=_xmm[3] = 0;
   2735 
   2736    vex_state->guest_SSEROUND = (UInt)Irrm_NEAREST;
   2737    SSEZERO(vex_state->guest_XMM0);
   2738    SSEZERO(vex_state->guest_XMM1);
   2739    SSEZERO(vex_state->guest_XMM2);
   2740    SSEZERO(vex_state->guest_XMM3);
   2741    SSEZERO(vex_state->guest_XMM4);
   2742    SSEZERO(vex_state->guest_XMM5);
   2743    SSEZERO(vex_state->guest_XMM6);
   2744    SSEZERO(vex_state->guest_XMM7);
   2745 
   2746 #  undef SSEZERO
   2747 
   2748    vex_state->guest_CS  = 0;
   2749    vex_state->guest_DS  = 0;
   2750    vex_state->guest_ES  = 0;
   2751    vex_state->guest_FS  = 0;
   2752    vex_state->guest_GS  = 0;
   2753    vex_state->guest_SS  = 0;
   2754    vex_state->guest_LDT = 0;
   2755    vex_state->guest_GDT = 0;
   2756 
   2757    vex_state->guest_EMNOTE = EmNote_NONE;
   2758 
   2759    /* SSE2 has a 'clflush' cache-line-invalidator which uses these. */
   2760    vex_state->guest_CMSTART = 0;
   2761    vex_state->guest_CMLEN   = 0;
   2762 
   2763    vex_state->guest_NRADDR   = 0;
   2764    vex_state->guest_SC_CLASS = 0;
   2765    vex_state->guest_IP_AT_SYSCALL = 0;
   2766 
   2767    vex_state->padding1 = 0;
   2768 }
   2769 
   2770 
   2771 /* Figure out if any part of the guest state contained in minoff
   2772    .. maxoff requires precise memory exceptions.  If in doubt return
   2773    True (but this generates significantly slower code).
   2774 
   2775    By default we enforce precise exns for guest %ESP, %EBP and %EIP
   2776    only.  These are the minimum needed to extract correct stack
   2777    backtraces from x86 code.
   2778 
   2779    Only %ESP is needed in mode VexRegUpdSpAtMemAccess.
   2780 */
   2781 Bool guest_x86_state_requires_precise_mem_exns ( Int minoff,
   2782                                                  Int maxoff)
   2783 {
   2784    Int ebp_min = offsetof(VexGuestX86State, guest_EBP);
   2785    Int ebp_max = ebp_min + 4 - 1;
   2786    Int esp_min = offsetof(VexGuestX86State, guest_ESP);
   2787    Int esp_max = esp_min + 4 - 1;
   2788    Int eip_min = offsetof(VexGuestX86State, guest_EIP);
   2789    Int eip_max = eip_min + 4 - 1;
   2790 
   2791    if (maxoff < esp_min || minoff > esp_max) {
   2792       /* no overlap with esp */
   2793       if (vex_control.iropt_register_updates == VexRegUpdSpAtMemAccess)
   2794          return False; // We only need to check stack pointer.
   2795    } else {
   2796       return True;
   2797    }
   2798 
   2799    if (maxoff < ebp_min || minoff > ebp_max) {
   2800       /* no overlap with ebp */
   2801    } else {
   2802       return True;
   2803    }
   2804 
   2805    if (maxoff < eip_min || minoff > eip_max) {
   2806       /* no overlap with eip */
   2807    } else {
   2808       return True;
   2809    }
   2810 
   2811    return False;
   2812 }
   2813 
   2814 
/* Expand to an { offset, size } pair describing 'field' within
   VexGuestX86State, for use in the alwaysDefd table below. */
#define ALWAYSDEFD(field)                           \
    { offsetof(VexGuestX86State, field),            \
      (sizeof ((VexGuestX86State*)0)->field) }

/* Describes the x86 guest state layout to the rest of the system
   (stack/frame/instruction pointers, plus the state sections that
   Memcheck may treat as always-defined). */
VexGuestLayout
   x86guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestX86State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestX86State,guest_ESP),
          .sizeof_SP = 4,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestX86State,guest_EBP),
          .sizeof_FP = 4,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestX86State,guest_EIP),
          .sizeof_IP = 4,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'.  NB: must equal the number of entries
             in .alwaysDefd below (indices 0 .. 23). */
          .n_alwaysDefd = 24,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_ACFLAG),
                 /*  5 */ ALWAYSDEFD(guest_EIP),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 /* 10 */ ALWAYSDEFD(guest_CS),
                 /* 11 */ ALWAYSDEFD(guest_DS),
                 /* 12 */ ALWAYSDEFD(guest_ES),
                 /* 13 */ ALWAYSDEFD(guest_FS),
                 /* 14 */ ALWAYSDEFD(guest_GS),
                 /* 15 */ ALWAYSDEFD(guest_SS),
                 /* 16 */ ALWAYSDEFD(guest_LDT),
                 /* 17 */ ALWAYSDEFD(guest_GDT),
                 /* 18 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 19 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 20 */ ALWAYSDEFD(guest_CMSTART),
                 /* 21 */ ALWAYSDEFD(guest_CMLEN),
                 /* 22 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 23 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };
   2871 
   2872 
   2873 /*---------------------------------------------------------------*/
   2874 /*--- end                                 guest_x86_helpers.c ---*/
   2875 /*---------------------------------------------------------------*/
   2876