
/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2015 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.  These
   calls are of course in the host machine code and this file will be
   compiled to host machine code, so that all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change a signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   const Long halfMask = 0xFFFFFFFFLL;
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = (Long)((ULong)u * (ULong)v);
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   const ULong halfMask = 0xFFFFFFFFULL;
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
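
/* Illustrative sketch (not part of the build): where the compiler
   provides an unsigned 128-bit type, mullU64 can be cross-checked
   against it.  The helper name below is local to this example. */
#if 0
#include <assert.h>
static void check_mullU64 ( ULong u, ULong v )
{
   ULong hi, lo;
   mullU64(u, v, &hi, &lo);
   /* Assumes gcc/clang on a 64-bit host, for unsigned __int128. */
   unsigned __int128 full = (unsigned __int128)u * v;
   assert( lo == (ULong)full );
   assert( hi == (ULong)(full >> 64) );
}
#endif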


/* parity_table[b] == AMD64G_CC_MASK_P iff byte b contains an even
   number of 1 bits, which is how x86 defines PF. */
static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
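
/* Illustrative sketch (not part of the build): the table could be
   regenerated as follows.  Names are local to this example. */
#if 0
static void gen_parity_table ( UChar* tab /* room for 256 entries */ )
{
   Int b, i, nSet;
   for (b = 0; b < 256; b++) {
      nSet = 0;
      for (i = 0; i < 8; i++)
         nSet += (b >> i) & 1;
      /* PF is set when the byte has an even number of 1 bits. */
      tab[b] = (nSet & 1) ? 0 : AMD64G_CC_MASK_P;
   }
}
#endif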

/* Generalised left-shifter: a positive count shifts left, a negative
   count shifts (arithmetically) right. */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return (ULong)x << n;
   else
      return x >> (-n);
}
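
/* Example (illustrative, not part of the build): the flag macros below
   rely on both directions, e.g. to move a value's sign bit into bit 7
   (the SF position) regardless of the data width. */
#if 0
#include <assert.h>
static void demo_lshift ( void )
{
   /* 8-bit data: shift left by 0; 64-bit data: shift right by 56. */
   assert( (lshift(0x80LL, 8 - 8) & 0x80) == 0x80 );
   assert( (lshift((Long)0x8000000000000000ULL, 8 - 64) & 0x80) == 0x80 );
}
#endif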

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)                                   \
   /* const */ ULong DATA_MASK                                  \
      = __data_bits==8                                          \
           ? 0xFFULL                                            \
           : (__data_bits==16                                   \
                ? 0xFFFFULL                                     \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
   /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
   /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
   /* Four bogus assignments, which hopefully gcc can     */    \
   /* optimise away, and which stop it complaining about  */    \
   /* unused variables.                                   */    \
   SIGN_MASK = SIGN_MASK;                                       \
   DATA_MASK = DATA_MASK;                                       \
   CC_DEP2 = CC_DEP2;                                           \
   CC_NDEP = CC_NDEP;


/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL + argR;                                        \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
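
/* Worked example (illustrative, not part of the build): an 8-bit add
   of 0x7F + 0x01 gives 0x80, which sets AF (carry out of bit 3), SF
   (bit 7 of the result) and OF (like-signed operands, oppositely
   signed result), but leaves CF and ZF clear: */
#if 0
#include <assert.h>
static void demo_add8_flags ( void )
{
   /* Forward reference: amd64g_calculate_rflags_all_WRK is defined
      later in this file. */
   ULong f = amd64g_calculate_rflags_all_WRK(AMD64G_CC_OP_ADDB,
                                             0x7F, 0x01, 0);
   assert(  (f & AMD64G_CC_MASK_A) );
   assert(  (f & AMD64G_CC_MASK_S) );
   assert(  (f & AMD64G_CC_MASK_O) );
   assert( !(f & AMD64G_CC_MASK_C) );
   assert( !(f & AMD64G_CC_MASK_Z) );
}
#endif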

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL - argR;                                        \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* Note: for ADC, the thunk convention (see guest_amd64_defs.h) is
   DEP1 = argL, DEP2 = argR ^ oldC, NDEP = old carry, which is why
   argR is recovered below by XORing oldC back out. */
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL + argR) + oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
     else                                                       \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;           \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
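
/* Illustrative sketch (not part of the build): how a front end would
   be expected to populate an ADC thunk, given the true operands and
   the old carry.  The helper name is local to this example. */
#if 0
static ULong flags_after_adc8 ( UChar argL, UChar argR, ULong oldCarry )
{
   oldCarry &= 1;
   return amd64g_calculate_rflags_all_WRK( AMD64G_CC_OP_ADCB,
                                           argL,
                                           (ULong)argR ^ oldCarry, /* note XOR */
                                           oldCarry );
}
#endif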

/*-------------------------------------------------------------*/

/* SBB uses the same DEP2 = argR ^ oldC encoding as ADC above. */
#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL - argR) - oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
     else                                                       \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = 0;                                                    \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res - 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res + 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK)                                    \
          == ((ULong)SIGN_MASK - 1)) << 11;                     \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
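
/* INC and DEC must leave CF unchanged, so their thunks carry the old
   CF in CC_NDEP, and OF is recomputed purely from the result: INC
   overflows exactly when the result is the smallest signed value, DEC
   exactly when it is the largest.  Illustrative check (not part of
   the build): */
#if 0
#include <assert.h>
static void demo_inc_dec_of ( void )
{
   /* 8-bit: INC 0x7F -> 0x80 overflows; DEC 0x80 -> 0x7F overflows.
      CC_DEP1 holds the *result* for these ops. */
   assert( amd64g_calculate_rflags_all_WRK(AMD64G_CC_OP_INCB, 0x80, 0, 0)
           & AMD64G_CC_MASK_O );
   assert( amd64g_calculate_rflags_all_WRK(AMD64G_CC_OP_DECB, 0x7F, 0, 0)
           & AMD64G_CC_MASK_O );
}
#endif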

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;      \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = CC_DEP2 & 1;                                          \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
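
/* For the shift thunks, CC_DEP1 is the final result and CC_DEP2 is
   the same value shifted by one bit less, so the bit that is about to
   fall off -- the x86 CF -- can be read straight out of CC_DEP2.
   Illustrative sketch (not part of the build), with names local to
   this example and 1 <= count <= 7 assumed: */
#if 0
static ULong flags_after_shl8 ( UChar src, UInt count )
{
   return amd64g_calculate_rflags_all_WRK(
             AMD64G_CC_OP_SHLB,
             (UChar)(src << count),          /* result */
             (UChar)(src << (count - 1)),    /* shifted one bit less */
             0 );
}
#endif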

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong fl                                                   \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & CC_DEP1)                        \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                      11-(DATA_BITS-1))         \
                     ^ lshift(CC_DEP1, 11)));                   \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong fl                                                   \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))     \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                      11-(DATA_BITS-1))         \
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));   \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = 0;                                                    \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 != 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)                    \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 == 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = 0;                                                    \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 == 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
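
/* The BMI1 thunks above take DEP1 = the instruction's result and
   DEP2 = its source operand, which is all the CF definitions need.
   For reference, the architectural results themselves are
   (illustrative, not part of the build):
*/
#if 0
static ULong blsi64   ( ULong x ) { return x & (0 - x); }  /* isolate lowest set bit */
static ULong blsmsk64 ( ULong x ) { return x ^ (x - 1); }  /* mask up to lowest set bit */
static ULong blsr64   ( ULong x ) { return x & (x - 1); }  /* reset lowest set bit */
#endif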

/*-------------------------------------------------------------*/


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function for the clean helpers below; not directly called
   from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}
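
/* Example (illustrative, not part of the build): after "cmpq $5, %rax"
   with %rax == 3, the front end leaves the thunk (SUBQ, 3, 5, 0);
   evaluating it shows the expected borrow and sign: */
#if 0
#include <assert.h>
static void demo_cmpq_thunk ( void )
{
   ULong f = amd64g_calculate_rflags_all_WRK(AMD64G_CC_OP_SUBQ, 3, 5, 0);
   assert(  (f & AMD64G_CC_MASK_C) );   /* 3 <u 5, so borrow  */
   assert(  (f & AMD64G_CC_MASK_S) );   /* 3 - 5 is negative  */
   assert( !(f & AMD64G_CC_MASK_Z) );
}
#endif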


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      // case AMD64G_CC_OP_SUBL:
      //    return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_SUBW:
      //    return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_SUBB:
      //    return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //              ? AMD64G_CC_MASK_C : 0;
      // case AMD64G_CC_OP_INCL:
      // case AMD64G_CC_OP_DECL:
      //    return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of,sf,zf,cf,pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}
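
/* The even/odd pairing of the AMD64Condcode values is what makes the
   "inv = cond & 1" trick above work: each negated condition is the
   corresponding positive condition with bit 0 set.  Example
   (illustrative, not part of the build): */
#if 0
#include <assert.h>
static void demo_cond_pairing ( void )
{
   /* Z and NZ evaluated on the thunk for "cmpq %rbx, %rax" with
      %rax == %rbx == 7. */
   assert( 1 == amd64g_calculate_condition(AMD64CondZ,
                                           AMD64G_CC_OP_SUBQ, 7, 7, 0) );
   assert( 0 == amd64g_calculate_condition(AMD64CondNZ,
                                           AMD64G_CC_OP_SUBQ, 7, 7, 0) );
}
#endif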


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflags ( ULong rflags,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   /* D flag */
   if (rflags & AMD64G_CC_MASK_D) {
      vex_state->guest_DFLAG = -1;
      rflags &= ~AMD64G_CC_MASK_D;
   }
   else
      vex_state->guest_DFLAG = 1;

   /* ID flag */
   if (rflags & AMD64G_CC_MASK_ID) {
      vex_state->guest_IDFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_ID;
   }
   else
      vex_state->guest_IDFLAG = 0;

   /* AC flag */
   if (rflags & AMD64G_CC_MASK_AC) {
      vex_state->guest_ACFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_AC;
   }
   else
      vex_state->guest_ACFLAG = 0;

   UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
                  AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = rflags & cc_mask;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
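
/* Usage sketch (illustrative, not part of the build): a client that
   wants to flip the guest's carry bit could combine the two accessors
   above; the thunk is re-normalised to OP_COPY on the way back in. */
#if 0
static void flip_guest_carry ( VexGuestAMD64State* st )
{
   ULong rf = LibVEX_GuestAMD64_get_rflags(st);
   LibVEX_GuestAMD64_put_rflag_c( (rf & AMD64G_CC_MASK_C) ^ 1, st );
}
#endif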


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to the above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- ADDL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit add and jump to out-of-line code if
            an overflow occurs". */
         /* long add, then O (overflow)
            --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
            --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
            --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              unop(Iop_Not64,
                                   binop(Iop_Xor64, cc_dep1, cc_dep2)),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Add64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
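
      /* Sanity-check of the identity above, in plain C (illustrative,
         not part of the build; assumes the usual two's-complement
         truncation on ULong -> Int conversion): signed 32-bit add
         overflow is "operands agree in sign, result disagrees", which
         is exactly bit 31 of (~(a ^ b)) & (a ^ (a + b)). */
#if 0
#include <assert.h>
static void check_addl_overflow ( ULong a, ULong b )
{
   ULong t       = (~(a ^ b)) & (a ^ (a + b));
   ULong viaBits = (t >> 31) & 1;
   Long  sa  = (Int)a, sb = (Int)b;   /* low 32 bits, sign-extended */
   Long  sum = sa + sb;               /* exact in 64 bits */
   ULong viaRange = (sum < -2147483648LL || sum > 2147483647LL);
   assert( viaBits == viaRange );
}
#endif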

      /*---------------- SUBQ ----------------*/

      /* 0, 1 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
         /* long long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return binop(Iop_Shr64,
                      binop(Iop_And64,
                            binop(Iop_Xor64, cc_dep1, cc_dep2),
                            binop(Iop_Xor64,
                                  cc_dep1,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2))),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
         /* long long sub/cmp, then S (negative)
            --> (dst-src)[63]
            --> (dst-src) >>u 63 */
         return binop(Iop_Shr64,
                      binop(Iop_Sub64, cc_dep1, cc_dep2),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
         /* long long sub/cmp, then NS (not negative)
            --> (dst-src)[63] ^ 1
            --> ((dst-src) >>u 63) ^ 1 */
         return binop(Iop_Xor64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(63)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
         /* long long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
         /* long long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
      }

      /*---------------- SUBL ----------------*/

      /* 0, 1 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit subtract and jump to out-of-line
            code if an overflow occurs". */
         /* long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
            --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              binop(Iop_Xor64, cc_dep1, cc_dep2),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Sub64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative)
            --> (dst-src)[31]
            --> ((dst -64 src) >>u 31) & 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
         /* long sub/cmp, then NS (not negative)
            --> (dst-src)[31] ^ 1
            --> (((dst -64 src) >>u 31) & 1) ^ 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2),
                                  mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /*---------------- SUBW ----------------*/
   1329 
   1330       /* 4, 5 */
   1331       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
   1332          /* word sub/cmp, then Z --> test dst==src */
   1333          return unop(Iop_1Uto64,
   1334                      binop(Iop_CmpEQ16,
   1335                            unop(Iop_64to16,cc_dep1),
   1336                            unop(Iop_64to16,cc_dep2)));
   1337       }
   1338       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
   1339          /* word sub/cmp, then NZ --> test dst!=src */
   1340          return unop(Iop_1Uto64,
   1341                      binop(Iop_CmpNE16,
   1342                            unop(Iop_64to16,cc_dep1),
   1343                            unop(Iop_64to16,cc_dep2)));
   1344       }
   1345 
   1346       /* 6, */
   1347       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
   1348          /* word sub/cmp, then BE (unsigned less than or equal)
   1349             --> test dst <=u src */
   1350          return unop(Iop_1Uto64,
   1351                      binop(Iop_CmpLE64U,
   1352                            binop(Iop_Shl64, cc_dep1, mkU8(48)),
   1353                            binop(Iop_Shl64, cc_dep2, mkU8(48))));
   1354       }
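      /* Re the Shl64-by-48 trick just above: shifting both args left
         by 48 moves bits 15:0 into bits 63:48 and zeroes everything
         below, so a 64-bit unsigned (or, in the LE case below, signed)
         comparison of the shifted values agrees with the corresponding
         16-bit comparison of the original values. */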
   1355 
   1356       /* 14, */
   1357       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
   1358          /* word sub/cmp, then LE (signed less than or equal)
   1359             --> test dst <=s src */
   1360          return unop(Iop_1Uto64,
   1361                      binop(Iop_CmpLE64S,
   1362                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1363                            binop(Iop_Shl64,cc_dep2,mkU8(48))));
   1364 
   1365       }
   1366 
   1367       /*---------------- SUBB ----------------*/
   1368 
   1369       /* 2, 3 */
   1370       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
   1371          /* byte sub/cmp, then B (unsigned less than)
   1372             --> test dst <u src */
   1373          return unop(Iop_1Uto64,
   1374                      binop(Iop_CmpLT64U,
   1375                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1376                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1377       }
   1378       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
   1379          /* byte sub/cmp, then NB (unsigned greater than or equal)
   1380             --> test src <=u dst */
   1381          /* Note, args are opposite way round from the usual */
   1382          return unop(Iop_1Uto64,
   1383                      binop(Iop_CmpLE64U,
   1384                            binop(Iop_And64, cc_dep2, mkU64(0xFF)),
   1385                            binop(Iop_And64, cc_dep1, mkU64(0xFF))));
   1386       }
   1387 
   1388       /* 4, 5 */
   1389       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
   1390          /* byte sub/cmp, then Z --> test dst==src */
   1391          return unop(Iop_1Uto64,
   1392                      binop(Iop_CmpEQ8,
   1393                            unop(Iop_64to8,cc_dep1),
   1394                            unop(Iop_64to8,cc_dep2)));
   1395       }
   1396       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
   1397          /* byte sub/cmp, then NZ --> test dst!=src */
   1398          return unop(Iop_1Uto64,
   1399                      binop(Iop_CmpNE8,
   1400                            unop(Iop_64to8,cc_dep1),
   1401                            unop(Iop_64to8,cc_dep2)));
   1402       }
   1403 
   1404       /* 6, */
   1405       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
   1406          /* byte sub/cmp, then BE (unsigned less than or equal)
   1407             --> test dst <=u src */
   1408          return unop(Iop_1Uto64,
   1409                      binop(Iop_CmpLE64U,
   1410                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1411                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1412       }
   1413 
   1414       /* 8, 9 */
   1415       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
   1416                                           && isU64(cc_dep2, 0)) {
   1417          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
   1418                                          --> test dst <s 0
   1419                                          --> (ULong)dst[7]
   1420             This is yet another scheme by which gcc figures out if the
   1421             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
   1422          /* Note: isU64(cc_dep2, 0) is correct, even though this is
   1423             for an 8-bit comparison, since the args to the helper
   1424             function are always U64s. */
   1425          return binop(Iop_And64,
   1426                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1427                       mkU64(1));
   1428       }
   1429       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
   1430                                           && isU64(cc_dep2, 0)) {
   1431          /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
   1432                                           --> test !(dst <s 0)
   1433                                           --> (ULong) !dst[7]
   1434          */
   1435          return binop(Iop_Xor64,
   1436                       binop(Iop_And64,
   1437                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1438                             mkU64(1)),
   1439                       mkU64(1));
   1440       }
   1441 
   1442       /*---------------- LOGICQ ----------------*/
   1443 
   1444       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
   1445          /* long long and/or/xor, then Z --> test dst==0 */
   1446          return unop(Iop_1Uto64,
   1447                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
   1448       }
   1449       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
   1450          /* long long and/or/xor, then NZ --> test dst!=0 */
   1451          return unop(Iop_1Uto64,
   1452                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
   1453       }
   1454 
   1455       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
   1456          /* long long and/or/xor, then L
   1457             LOGIC sets SF and ZF according to the
   1458             result and makes OF be zero.  L computes SF ^ OF, but
   1459             OF is zero, so this reduces to SF -- which will be 1 iff
   1460             the result is < signed 0.  Hence ...
   1461          */
   1462          return unop(Iop_1Uto64,
   1463                      binop(Iop_CmpLT64S,
   1464                            cc_dep1,
   1465                            mkU64(0)));
   1466       }
   1467 
   1468       /*---------------- LOGICL ----------------*/
   1469 
   1470       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
   1471          /* long and/or/xor, then Z --> test dst==0 */
   1472          return unop(Iop_1Uto64,
   1473                      binop(Iop_CmpEQ32,
   1474                            unop(Iop_64to32, cc_dep1),
   1475                            mkU32(0)));
   1476       }
   1477       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
   1478          /* long and/or/xor, then NZ --> test dst!=0 */
   1479          return unop(Iop_1Uto64,
   1480                      binop(Iop_CmpNE32,
   1481                            unop(Iop_64to32, cc_dep1),
   1482                            mkU32(0)));
   1483       }
   1484 
   1485       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
   1486          /* long and/or/xor, then LE
   1487             This is pretty subtle.  LOGIC sets SF and ZF according to the
   1488             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
   1489             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
   1490             the result is <=signed 0.  Hence ...
   1491          */
   1492          return unop(Iop_1Uto64,
   1493                      binop(Iop_CmpLE32S,
   1494                            unop(Iop_64to32, cc_dep1),
   1495                            mkU32(0)));
   1496       }
   1497 
   1498       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
   1499          /* long and/or/xor, then S --> (ULong)result[31] */
   1500          return binop(Iop_And64,
   1501                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1502                       mkU64(1));
   1503       }
   1504       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
    1505          /* long and/or/xor, then NS --> (ULong) !result[31] */
   1506          return binop(Iop_Xor64,
   1507                 binop(Iop_And64,
   1508                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1509                       mkU64(1)),
   1510                 mkU64(1));
   1511       }
   1512 
   1513       /*---------------- LOGICW ----------------*/
   1514 
   1515       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
   1516          /* word and/or/xor, then Z --> test dst==0 */
   1517          return unop(Iop_1Uto64,
   1518                      binop(Iop_CmpEQ64,
   1519                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1520                            mkU64(0)));
   1521       }
   1522       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
   1523          /* word and/or/xor, then NZ --> test dst!=0 */
   1524          return unop(Iop_1Uto64,
   1525                      binop(Iop_CmpNE64,
   1526                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1527                            mkU64(0)));
   1528       }
   1529 
   1530       /*---------------- LOGICB ----------------*/
   1531 
   1532       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
   1533          /* byte and/or/xor, then Z --> test dst==0 */
   1534          return unop(Iop_1Uto64,
   1535                      binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1536                                         mkU64(0)));
   1537       }
   1538       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
   1539          /* byte and/or/xor, then NZ --> test dst!=0 */
   1540          return unop(Iop_1Uto64,
   1541                      binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1542                                         mkU64(0)));
   1543       }
   1544 
   1545       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
   1546          /* this is an idiom gcc sometimes uses to find out if the top
    1547             bit of a byte register is set: e.g. testb %al,%al; js ..
   1548             Since it just depends on the top bit of the byte, extract
   1549             that bit and explicitly get rid of all the rest.  This
   1550             helps memcheck avoid false positives in the case where any
   1551             of the other bits in the byte are undefined. */
    1552          /* byte and/or/xor, then S --> (ULong)result[7] */
   1553          return binop(Iop_And64,
   1554                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1555                       mkU64(1));
   1556       }
   1557       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
    1558          /* byte and/or/xor, then NS --> (ULong) !result[7] */
   1559          return binop(Iop_Xor64,
   1560                       binop(Iop_And64,
   1561                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1562                             mkU64(1)),
   1563                       mkU64(1));
   1564       }
   1565 
   1566       /*---------------- INCB ----------------*/
   1567 
   1568       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
   1569          /* 8-bit inc, then LE --> sign bit of the arg */
   1570          return binop(Iop_And64,
   1571                       binop(Iop_Shr64,
   1572                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
   1573                             mkU8(7)),
   1574                       mkU64(1));
   1575       }
   1576 
   1577       /*---------------- INCW ----------------*/
   1578 
   1579       if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
   1580          /* 16-bit inc, then Z --> test dst == 0 */
   1581          return unop(Iop_1Uto64,
   1582                      binop(Iop_CmpEQ64,
   1583                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1584                            mkU64(0)));
   1585       }
   1586 
   1587       /*---------------- DECL ----------------*/
   1588 
   1589       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
   1590          /* dec L, then Z --> test dst == 0 */
   1591          return unop(Iop_1Uto64,
   1592                      binop(Iop_CmpEQ32,
   1593                            unop(Iop_64to32, cc_dep1),
   1594                            mkU32(0)));
   1595       }
   1596 
   1597       /*---------------- DECW ----------------*/
   1598 
   1599       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
   1600          /* 16-bit dec, then NZ --> test dst != 0 */
   1601          return unop(Iop_1Uto64,
   1602                      binop(Iop_CmpNE64,
   1603                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1604                            mkU64(0)));
   1605       }
   1606 
   1607       /*---------------- COPY ----------------*/
   1608       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
   1609          jbe" for example. */
   1610 
   1611       if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
   1612           (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
   1613          /* COPY, then BE --> extract C and Z from dep1, and test (C
   1614             or Z == 1). */
   1615          /* COPY, then NBE --> extract C and Z from dep1, and test (C
   1616             or Z == 0). */
   1617          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
   1618          return
   1619             unop(
   1620                Iop_1Uto64,
   1621                binop(
   1622                   Iop_CmpEQ64,
   1623                   binop(
   1624                      Iop_And64,
   1625                      binop(
   1626                         Iop_Or64,
   1627                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1628                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
   1629                      ),
   1630                      mkU64(1)
   1631                   ),
   1632                   mkU64(nnn)
   1633                )
   1634             );
   1635       }
   1636 
   1637       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
   1638          /* COPY, then B --> extract C dep1, and test (C == 1). */
   1639          return
   1640             unop(
   1641                Iop_1Uto64,
   1642                binop(
   1643                   Iop_CmpNE64,
   1644                   binop(
   1645                      Iop_And64,
   1646                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1647                      mkU64(1)
   1648                   ),
   1649                   mkU64(0)
   1650                )
   1651             );
   1652       }
   1653 
   1654       if (isU64(cc_op, AMD64G_CC_OP_COPY)
   1655           && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
   1656          /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
   1657          /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
   1658          UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
   1659          return
   1660             unop(
   1661                Iop_1Uto64,
   1662                binop(
   1663                   Iop_CmpEQ64,
   1664                   binop(
   1665                      Iop_And64,
   1666                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
   1667                      mkU64(1)
   1668                   ),
   1669                   mkU64(nnn)
   1670                )
   1671             );
   1672       }
   1673 
   1674       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
   1675          /* COPY, then P --> extract P from dep1, and test (P == 1). */
   1676          return
   1677             unop(
   1678                Iop_1Uto64,
   1679                binop(
   1680                   Iop_CmpNE64,
   1681                   binop(
   1682                      Iop_And64,
   1683                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
   1684                      mkU64(1)
   1685                   ),
   1686                   mkU64(0)
   1687                )
   1688             );
   1689       }
   1690 
   1691       return NULL;
   1692    }
   1693 
   1694    /* --------- specialising "amd64g_calculate_rflags_c" --------- */
   1695 
   1696    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
   1697       /* specialise calls to above "calculate_rflags_c" function */
   1698       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
   1699       vassert(arity == 4);
   1700       cc_op   = args[0];
   1701       cc_dep1 = args[1];
   1702       cc_dep2 = args[2];
   1703       cc_ndep = args[3];
   1704 
   1705       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
   1706          /* C after sub denotes unsigned less than */
   1707          return unop(Iop_1Uto64,
   1708                      binop(Iop_CmpLT64U,
   1709                            cc_dep1,
   1710                            cc_dep2));
   1711       }
   1712       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
   1713          /* C after sub denotes unsigned less than */
   1714          return unop(Iop_1Uto64,
   1715                      binop(Iop_CmpLT32U,
   1716                            unop(Iop_64to32, cc_dep1),
   1717                            unop(Iop_64to32, cc_dep2)));
   1718       }
   1719       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
   1720          /* C after sub denotes unsigned less than */
   1721          return unop(Iop_1Uto64,
   1722                      binop(Iop_CmpLT64U,
   1723                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
   1724                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
   1725       }
   1726       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
   1727           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
   1728           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
   1729           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
   1730          /* cflag after logic is zero */
   1731          return mkU64(0);
   1732       }
   1733       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
   1734           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
   1735          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
   1736          return cc_ndep;
   1737       }
   1738 
   1739 #     if 0
   1740       if (cc_op->tag == Iex_Const) {
   1741          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
   1742       }
   1743 #     endif
   1744 
   1745       return NULL;
   1746    }
   1747 
   1748 #  undef unop
   1749 #  undef binop
   1750 #  undef mkU64
   1751 #  undef mkU32
   1752 #  undef mkU8
   1753 
   1754    return NULL;
   1755 }
   1756 
   1757 
   1758 /*---------------------------------------------------------------*/
   1759 /*--- Supporting functions for x87 FPU activities.            ---*/
   1760 /*---------------------------------------------------------------*/
   1761 
   1762 static inline Bool host_is_little_endian ( void )
   1763 {
   1764    UInt x = 0x76543210;
   1765    UChar* p = (UChar*)(&x);
   1766    return toBool(*p == 0x10);
   1767 }
   1768 
   1769 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
   1770 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1771 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
   1772 {
   1773    Bool   mantissaIsZero;
   1774    Int    bexp;
   1775    UChar  sign;
   1776    UChar* f64;
   1777 
   1778    vassert(host_is_little_endian());
   1779 
   1780    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
   1781 
   1782    f64  = (UChar*)(&dbl);
   1783    sign = toUChar( (f64[7] >> 7) & 1 );
   1784 
   1785    /* First off, if the tag indicates the register was empty,
   1786       return 1,0,sign,1 */
   1787    if (tag == 0) {
   1788       /* vex_printf("Empty\n"); */
   1789       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
   1790                                    | AMD64G_FC_MASK_C0;
   1791    }
   1792 
   1793    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   1794    bexp &= 0x7FF;
   1795 
   1796    mantissaIsZero
   1797       = toBool(
   1798            (f64[6] & 0x0F) == 0
   1799            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
   1800         );
   1801 
   1802    /* If both exponent and mantissa are zero, the value is zero.
   1803       Return 1,0,sign,0. */
   1804    if (bexp == 0 && mantissaIsZero) {
   1805       /* vex_printf("Zero\n"); */
   1806       return AMD64G_FC_MASK_C3 | 0
   1807                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1808    }
   1809 
   1810    /* If exponent is zero but mantissa isn't, it's a denormal.
   1811       Return 1,1,sign,0. */
   1812    if (bexp == 0 && !mantissaIsZero) {
   1813       /* vex_printf("Denormal\n"); */
   1814       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
   1815                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1816    }
   1817 
   1818    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
   1819       Return 0,1,sign,1. */
   1820    if (bexp == 0x7FF && mantissaIsZero) {
   1821       /* vex_printf("Inf\n"); */
   1822       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
   1823                                    | AMD64G_FC_MASK_C0;
   1824    }
   1825 
   1826    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
   1827       Return 0,0,sign,1. */
   1828    if (bexp == 0x7FF && !mantissaIsZero) {
   1829       /* vex_printf("NaN\n"); */
   1830       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
   1831    }
   1832 
   1833    /* Uh, ok, we give up.  It must be a normal finite number.
   1834       Return 0,1,sign,0.
   1835    */
   1836    /* vex_printf("normal\n"); */
   1837    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1838 }
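
/* A minimal, disabled self-check sketch for the classifier above
   (hypothetical test code, following this file's existing "#if 0"
   convention).  The constants are standard IEEE754 bit patterns; the
   expected C3..C0 results follow the comments in the function. */
#if 0
static void selfcheck_FXAM ( void )
{
   /* +0.0 with a non-empty tag: 1,0,0,0 -> C3 only. */
   vassert(amd64g_calculate_FXAM(1, 0x0000000000000000ULL)
           == AMD64G_FC_MASK_C3);
   /* +1.0, a normal number: 0,1,0,0 -> C2 only. */
   vassert(amd64g_calculate_FXAM(1, 0x3FF0000000000000ULL)
           == AMD64G_FC_MASK_C2);
   /* +infinity: 0,1,0,1 -> C2 and C0. */
   vassert(amd64g_calculate_FXAM(1, 0x7FF0000000000000ULL)
           == (AMD64G_FC_MASK_C2 | AMD64G_FC_MASK_C0));
   /* a quiet NaN: 0,0,0,1 -> C0 only. */
   vassert(amd64g_calculate_FXAM(1, 0x7FF8000000000000ULL)
           == AMD64G_FC_MASK_C0);
   /* smallest positive denormal: 1,1,0,0 -> C3 and C2. */
   vassert(amd64g_calculate_FXAM(1, 0x0000000000000001ULL)
           == (AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2));
   /* empty register (tag 0): 1,0,0,1 -> C3 and C0. */
   vassert(amd64g_calculate_FXAM(0, 0ULL)
           == (AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C0));
}
#endif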
   1839 
   1840 
   1841 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
   1842    appears to differ from the former only in that the 8 FP registers
   1843    themselves are not transferred into the guest state. */
   1844 static
   1845 VexEmNote do_put_x87 ( Bool moveRegs,
   1846                        /*IN*/UChar* x87_state,
   1847                        /*OUT*/VexGuestAMD64State* vex_state )
   1848 {
   1849    Int        stno, preg;
   1850    UInt       tag;
   1851    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1852    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1853    Fpu_State* x87     = (Fpu_State*)x87_state;
   1854    UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
   1855    UInt       tagw    = x87->env[FP_ENV_TAG];
   1856    UInt       fpucw   = x87->env[FP_ENV_CTRL];
   1857    UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
   1858    VexEmNote  ew;
   1859    UInt       fpround;
   1860    ULong      pair;
   1861 
   1862    /* Copy registers and tags */
   1863    for (stno = 0; stno < 8; stno++) {
   1864       preg = (stno + ftop) & 7;
   1865       tag = (tagw >> (2*preg)) & 3;
   1866       if (tag == 3) {
   1867          /* register is empty */
   1868          /* hmm, if it's empty, does it still get written?  Probably
   1869             safer to say it does.  If we don't, memcheck could get out
   1870             of sync, in that it thinks all FP registers are defined by
   1871             this helper, but in reality some have not been updated. */
   1872          if (moveRegs)
   1873             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   1874          vexTags[preg] = 0;
   1875       } else {
   1876          /* register is non-empty */
   1877          if (moveRegs)
   1878             convert_f80le_to_f64le( &x87->reg[10*stno],
   1879                                     (UChar*)&vexRegs[preg] );
   1880          vexTags[preg] = 1;
   1881       }
   1882    }
   1883 
   1884    /* stack pointer */
   1885    vex_state->guest_FTOP = ftop;
   1886 
   1887    /* status word */
   1888    vex_state->guest_FC3210 = c3210;
   1889 
   1890    /* handle the control word, setting FPROUND and detecting any
   1891       emulation warnings. */
   1892    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   1893    fpround = (UInt)pair & 0xFFFFFFFFULL;
   1894    ew      = (VexEmNote)(pair >> 32);
   1895 
   1896    vex_state->guest_FPROUND = fpround & 3;
   1897 
   1898    /* emulation warnings --> caller */
   1899    return ew;
   1900 }
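
/* A note on tag encodings (background, per the x87 architecture):
   the native tag word uses two bits per register -- 00 = valid,
   01 = zero, 10 = special, 11 = empty -- while the VEX guest state
   keeps one byte per register in guest_FPTAG, with just 0 = empty and
   1 = non-empty.  Hence do_put_x87 above collapses the three
   non-empty encodings to 1, and the reverse direction below can only
   regenerate tags 00 and 11. */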
   1901 
   1902 
   1903 /* Create an x87 FPU state from the guest state, as close as
   1904    we can approximate it. */
   1905 static
   1906 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
   1907                   /*OUT*/UChar* x87_state )
   1908 {
   1909    Int        i, stno, preg;
   1910    UInt       tagw;
   1911    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1912    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1913    Fpu_State* x87     = (Fpu_State*)x87_state;
   1914    UInt       ftop    = vex_state->guest_FTOP;
   1915    UInt       c3210   = vex_state->guest_FC3210;
   1916 
   1917    for (i = 0; i < 14; i++)
   1918       x87->env[i] = 0;
   1919 
   1920    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   1921    x87->env[FP_ENV_STAT]
   1922       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   1923    x87->env[FP_ENV_CTRL]
   1924       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   1925 
   1926    /* Dump the register stack in ST order. */
   1927    tagw = 0;
   1928    for (stno = 0; stno < 8; stno++) {
   1929       preg = (stno + ftop) & 7;
   1930       if (vexTags[preg] == 0) {
   1931          /* register is empty */
   1932          tagw |= (3 << (2*preg));
   1933          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1934                                  &x87->reg[10*stno] );
   1935       } else {
   1936          /* register is full. */
   1937          tagw |= (0 << (2*preg));
   1938          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1939                                  &x87->reg[10*stno] );
   1940       }
   1941    }
   1942    x87->env[FP_ENV_TAG] = toUShort(tagw);
   1943 }
   1944 
   1945 
   1946 /*---------------------------------------------------------------*/
   1947 /*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
   1948 /*---------------------------------------------------------------*/
   1949 
   1950 /* CALLED FROM GENERATED CODE */
   1951 /* DIRTY HELPER (reads guest state, writes guest mem) */
   1952 /* XSAVE component 0 is the x87 FPU state. */
   1953 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
   1954         ( VexGuestAMD64State* gst, HWord addr )
   1955 {
   1956    /* Derived from values obtained from
   1957       vendor_id       : AuthenticAMD
   1958       cpu family      : 15
   1959       model           : 12
   1960       model name      : AMD Athlon(tm) 64 Processor 3200+
   1961       stepping        : 0
   1962       cpu MHz         : 2200.000
   1963       cache size      : 512 KB
   1964    */
   1965    /* Somewhat roundabout, but at least it's simple. */
   1966    Fpu_State tmp;
   1967    UShort*   addrS = (UShort*)addr;
   1968    UChar*    addrC = (UChar*)addr;
   1969    UShort    fp_tags;
   1970    UInt      summary_tags;
   1971    Int       r, stno;
   1972    UShort    *srcS, *dstS;
   1973 
   1974    do_get_x87( gst, (UChar*)&tmp );
   1975 
   1976    /* Now build the proper fxsave x87 image from the fsave x87 image
   1977       we just made. */
   1978 
   1979    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
    1980    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
   1981 
   1982    /* set addrS[2] in an endian-independent way */
   1983    summary_tags = 0;
   1984    fp_tags = tmp.env[FP_ENV_TAG];
   1985    for (r = 0; r < 8; r++) {
   1986       if ( ((fp_tags >> (2*r)) & 3) != 3 )
   1987          summary_tags |= (1 << r);
   1988    }
   1989    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   1990    addrC[5]  = 0; /* pad */
   1991 
   1992    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
   1993       does not write this field. (?!) */
   1994    addrS[3]  = 0; /* BOGUS */
   1995 
   1996    /* RIP (Last x87 instruction pointer).  From experimentation, the
   1997       real CPU does not write this field. (?!) */
   1998    addrS[4]  = 0; /* BOGUS */
   1999    addrS[5]  = 0; /* BOGUS */
   2000    addrS[6]  = 0; /* BOGUS */
   2001    addrS[7]  = 0; /* BOGUS */
   2002 
   2003    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
   2004       does not write this field. (?!) */
   2005    addrS[8]  = 0; /* BOGUS */
   2006    addrS[9]  = 0; /* BOGUS */
   2007    addrS[10] = 0; /* BOGUS */
   2008    addrS[11] = 0; /* BOGUS */
   2009 
   2010    /* addrS[13,12] are MXCSR -- not written */
   2011    /* addrS[15,14] are MXCSR_MASK -- not written */
   2012 
   2013    /* Copy in the FP registers, in ST order. */
   2014    for (stno = 0; stno < 8; stno++) {
   2015       srcS = (UShort*)(&tmp.reg[10*stno]);
   2016       dstS = (UShort*)(&addrS[16 + 8*stno]);
   2017       dstS[0] = srcS[0];
   2018       dstS[1] = srcS[1];
   2019       dstS[2] = srcS[2];
   2020       dstS[3] = srcS[3];
   2021       dstS[4] = srcS[4];
   2022       dstS[5] = 0;
   2023       dstS[6] = 0;
   2024       dstS[7] = 0;
   2025    }
   2026 }
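
/* For reference, the layout of the image written above and by the
   component-1 helper below (byte offsets, per the AMD/Intel manuals):
      0  FCW        2  FSW            4  FTW (abridged)   5  pad
      6  FOP        8  RIP (8 bytes) 16  RDP (8 bytes)
     24  MXCSR     28  MXCSR_MASK
     32  ST0..ST7, one 80-bit register per 16-byte slot
    160  XMM0..XMM15 (handled by in-line IR, not by these helpers) */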
   2027 
   2028 
   2029 /* CALLED FROM GENERATED CODE */
   2030 /* DIRTY HELPER (reads guest state, writes guest mem) */
   2031 /* XSAVE component 1 is the SSE state. */
   2032 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
   2033         ( VexGuestAMD64State* gst, HWord addr )
   2034 {
   2035    UShort* addrS = (UShort*)addr;
   2036    UInt    mxcsr;
   2037 
   2038    /* The only non-register parts of the SSE state are MXCSR and
   2039       MXCSR_MASK. */
   2040    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
   2041 
   2042    addrS[12] = toUShort(mxcsr);  /* MXCSR */
   2043    addrS[13] = toUShort(mxcsr >> 16);
   2044 
   2045    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
   2046    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
   2047 }
   2048 
   2049 
   2050 /* VISIBLE TO LIBVEX CLIENT */
   2051 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
   2052    the result at the given address which represents a buffer of at
   2053    least 416 bytes.
   2054 
   2055    This function is not called from generated code.  FXSAVE is dealt
   2056    with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
   2057    functions above plus some in-line IR.  This function is merely a
   2058    convenience function for VEX's users.
   2059 */
   2060 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
   2061                                 /*OUT*/HWord fp_state )
   2062 {
   2063    /* Do the x87 part */
   2064    amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
   2065 
   2066    /* And now the SSE part, except for the registers themselves. */
   2067    amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
   2068 
   2069    /* That's the first 160 bytes of the image done. */
   2070    /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
   2071       big-endian, these need to be byte-swapped. */
   2072    U128 *xmm = (U128 *)(fp_state + 160);
   2073    vassert(host_is_little_endian());
   2074 
   2075 #  define COPY_U128(_dst,_src)                       \
   2076       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   2077            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   2078       while (0)
   2079 
   2080    COPY_U128( xmm[0],  gst->guest_YMM0 );
   2081    COPY_U128( xmm[1],  gst->guest_YMM1 );
   2082    COPY_U128( xmm[2],  gst->guest_YMM2 );
   2083    COPY_U128( xmm[3],  gst->guest_YMM3 );
   2084    COPY_U128( xmm[4],  gst->guest_YMM4 );
   2085    COPY_U128( xmm[5],  gst->guest_YMM5 );
   2086    COPY_U128( xmm[6],  gst->guest_YMM6 );
   2087    COPY_U128( xmm[7],  gst->guest_YMM7 );
   2088    COPY_U128( xmm[8],  gst->guest_YMM8 );
   2089    COPY_U128( xmm[9],  gst->guest_YMM9 );
   2090    COPY_U128( xmm[10], gst->guest_YMM10 );
   2091    COPY_U128( xmm[11], gst->guest_YMM11 );
   2092    COPY_U128( xmm[12], gst->guest_YMM12 );
   2093    COPY_U128( xmm[13], gst->guest_YMM13 );
   2094    COPY_U128( xmm[14], gst->guest_YMM14 );
   2095    COPY_U128( xmm[15], gst->guest_YMM15 );
   2096 #  undef COPY_U128
   2097 }
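
/* A minimal usage sketch (hypothetical caller, not part of the
   library).  FXSAVE areas are architecturally 512 bytes and 16-byte
   aligned, even though only the first 416 bytes are written here. */
#if 0
static void example_fxsave ( VexGuestAMD64State* gst )
{
   UChar buf[512] __attribute__((aligned(16)));  /* gcc-style alignment */
   LibVEX_GuestAMD64_fxsave( gst, (HWord)&buf[0] );
   /* buf[0..415] now holds the x87/SSE image described above. */
}
#endif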
   2098 
   2099 
   2100 /*---------------------------------------------------------------*/
   2101 /*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
   2102 /*---------------------------------------------------------------*/
   2103 
   2104 /* CALLED FROM GENERATED CODE */
   2105 /* DIRTY HELPER (writes guest state, reads guest mem) */
   2106 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
   2107              ( VexGuestAMD64State* gst, HWord addr )
   2108 {
   2109    Fpu_State tmp;
   2110    UShort*   addrS   = (UShort*)addr;
   2111    UChar*    addrC   = (UChar*)addr;
   2112    UShort    fp_tags;
   2113    Int       r, stno, i;
   2114 
   2115    /* Copy the x87 registers out of the image, into a temporary
   2116       Fpu_State struct. */
   2117    for (i = 0; i < 14; i++) tmp.env[i] = 0;
   2118    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   2119    /* fill in tmp.reg[0..7] */
   2120    for (stno = 0; stno < 8; stno++) {
   2121       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
   2122       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
   2123       dstS[0] = srcS[0];
   2124       dstS[1] = srcS[1];
   2125       dstS[2] = srcS[2];
   2126       dstS[3] = srcS[3];
   2127       dstS[4] = srcS[4];
   2128    }
   2129    /* fill in tmp.env[0..13] */
   2130    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
    2131    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
   2132 
   2133    fp_tags = 0;
   2134    for (r = 0; r < 8; r++) {
   2135       if (addrC[4] & (1<<r))
    2136          fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
   2137       else
    2138          fp_tags |= (3 << (2*r)); /* EMPTY */
   2139    }
   2140    tmp.env[FP_ENV_TAG] = fp_tags;
   2141 
   2142    /* Now write 'tmp' into the guest state. */
   2143    VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
   2144 
   2145    return warnX87;
   2146 }
   2147 
   2148 
   2149 /* CALLED FROM GENERATED CODE */
   2150 /* DIRTY HELPER (writes guest state, reads guest mem) */
   2151 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
   2152              ( VexGuestAMD64State* gst, HWord addr )
   2153 {
   2154    UShort* addrS = (UShort*)addr;
   2155    UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
   2156                    | ((((UInt)addrS[13]) & 0xFFFF) << 16);
   2157    ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
   2158 
   2159    VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
   2160 
   2161    gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
   2162    return warnXMM;
   2163 }
   2164 
   2165 
   2166 /* VISIBLE TO LIBVEX CLIENT */
   2167 /* Do FXRSTOR from the supplied address and store read values to the given
   2168    VexGuestAMD64State structure.
   2169 
   2170    This function is not called from generated code.  FXRSTOR is dealt
   2171    with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
   2172    functions above plus some in-line IR.  This function is merely a
   2173    convenience function for VEX's users.
   2174 */
   2175 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
   2176                                       /*MOD*/VexGuestAMD64State* gst )
   2177 {
   2178    /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
   2179       to be byte-swapped. */
   2180    U128 *xmm = (U128 *)(fp_state + 160);
   2181 
   2182    vassert(host_is_little_endian());
   2183 
   2184 #  define COPY_U128(_dst,_src)                       \
   2185       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   2186            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   2187       while (0)
   2188 
   2189    COPY_U128( gst->guest_YMM0, xmm[0] );
   2190    COPY_U128( gst->guest_YMM1, xmm[1] );
   2191    COPY_U128( gst->guest_YMM2, xmm[2] );
   2192    COPY_U128( gst->guest_YMM3, xmm[3] );
   2193    COPY_U128( gst->guest_YMM4, xmm[4] );
   2194    COPY_U128( gst->guest_YMM5, xmm[5] );
   2195    COPY_U128( gst->guest_YMM6, xmm[6] );
   2196    COPY_U128( gst->guest_YMM7, xmm[7] );
   2197    COPY_U128( gst->guest_YMM8, xmm[8] );
   2198    COPY_U128( gst->guest_YMM9, xmm[9] );
   2199    COPY_U128( gst->guest_YMM10, xmm[10] );
   2200    COPY_U128( gst->guest_YMM11, xmm[11] );
   2201    COPY_U128( gst->guest_YMM12, xmm[12] );
   2202    COPY_U128( gst->guest_YMM13, xmm[13] );
   2203    COPY_U128( gst->guest_YMM14, xmm[14] );
   2204    COPY_U128( gst->guest_YMM15, xmm[15] );
   2205 
   2206 #  undef COPY_U128
   2207 
   2208    VexEmNote warnXMM
   2209       = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
   2210    VexEmNote warnX87
   2211       = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
   2212 
   2213    /* Prefer an X87 emwarn over an XMM one, if both exist. */
   2214    if (warnX87 != EmNote_NONE)
   2215       return warnX87;
   2216    else
   2217       return warnXMM;
   2218 }
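
/* A round-trip sketch under the same assumptions as the fxsave
   example earlier: save, then immediately restore.  No emulation
   warning can arise, because the image was produced by
   amd64g_create_fpucw/amd64g_create_mxcsr below, whose outputs the
   corresponding check functions always accept. */
#if 0
static void example_fx_roundtrip ( VexGuestAMD64State* gst )
{
   UChar buf[512] __attribute__((aligned(16)));  /* gcc-style alignment */
   VexEmNote ew;
   LibVEX_GuestAMD64_fxsave ( gst, (HWord)&buf[0] );
   ew = LibVEX_GuestAMD64_fxrstor ( (HWord)&buf[0], gst );
   vassert(ew == EmNote_NONE);
}
#endif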
   2219 
   2220 
   2221 /*---------------------------------------------------------------*/
   2222 /*--- Supporting functions for FSAVE/FRSTOR                   ---*/
   2223 /*---------------------------------------------------------------*/
   2224 
   2225 /* DIRTY HELPER (writes guest state) */
   2226 /* Initialise the x87 FPU state as per 'finit'. */
   2227 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
   2228 {
   2229    Int i;
   2230    gst->guest_FTOP = 0;
   2231    for (i = 0; i < 8; i++) {
   2232       gst->guest_FPTAG[i] = 0; /* empty */
   2233       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   2234    }
   2235    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
   2236    gst->guest_FC3210  = 0;
   2237 }
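
/* These are the architected 'finit' defaults: control word 0x037F
   (round to nearest, 64-bit precision, all exceptions masked --
   exactly what amd64g_create_fpucw below encodes for Irrm_NEAREST),
   status word zero, and all eight registers tagged empty. */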
   2238 
   2239 
   2240 /* CALLED FROM GENERATED CODE */
   2241 /* DIRTY HELPER (reads guest memory) */
   2242 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
   2243 {
   2244    ULong f64;
   2245    convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
   2246    return f64;
   2247 }
   2248 
   2249 /* CALLED FROM GENERATED CODE */
   2250 /* DIRTY HELPER (writes guest memory) */
   2251 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
   2252 {
   2253    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
   2254 }
   2255 
   2256 
   2257 /* CALLED FROM GENERATED CODE */
   2258 /* CLEAN HELPER */
   2259 /* mxcsr[15:0] contains a SSE native format MXCSR value.
   2260    Extract from it the required SSEROUND value and any resulting
   2261    emulation warning, and return (warn << 32) | sseround value.
   2262 */
   2263 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
   2264 {
   2265    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   2266    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2267    ULong rmode = (mxcsr >> 13) & 3;
   2268 
   2269    /* Detect any required emulation warnings. */
   2270    VexEmNote ew = EmNote_NONE;
   2271 
   2272    if ((mxcsr & 0x1F80) != 0x1F80) {
   2273       /* unmasked exceptions! */
   2274       ew = EmWarn_X86_sseExns;
   2275    }
   2276    else
   2277    if (mxcsr & (1<<15)) {
   2278       /* FZ is set */
   2279       ew = EmWarn_X86_fz;
   2280    }
   2281    else
   2282    if (mxcsr & (1<<6)) {
   2283       /* DAZ is set */
   2284       ew = EmWarn_X86_daz;
   2285    }
   2286 
   2287    return (((ULong)ew) << 32) | ((ULong)rmode);
   2288 }
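
/* Worked examples: 0x1F80, the power-on MXCSR value, has all six
   exception masks set, FZ and DAZ clear, and RC = 00, so the result
   is ((ULong)EmNote_NONE << 32) | 0 (round to nearest).  0x9F80 is
   the same but with FZ set, giving EmWarn_X86_fz in the top half. */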
   2289 
   2290 
   2291 /* CALLED FROM GENERATED CODE */
   2292 /* CLEAN HELPER */
   2293 /* Given sseround as an IRRoundingMode value, create a suitable SSE
   2294    native format MXCSR value. */
   2295 ULong amd64g_create_mxcsr ( ULong sseround )
   2296 {
   2297    sseround &= 3;
   2298    return 0x1F80 | (sseround << 13);
   2299 }
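
/* By construction, amd64g_check_ldmxcsr(amd64g_create_mxcsr(r))
   returns r with no emulation warning for any r in 0..3: the created
   value has all exceptions masked and FZ/DAZ clear. */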
   2300 
   2301 
   2302 /* CLEAN HELPER */
   2303 /* fpucw[15:0] contains a x87 native format FPU control word.
   2304    Extract from it the required FPROUND value and any resulting
   2305    emulation warning, and return (warn << 32) | fpround value.
   2306 */
   2307 ULong amd64g_check_fldcw ( ULong fpucw )
   2308 {
   2309    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   2310    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2311    ULong rmode = (fpucw >> 10) & 3;
   2312 
   2313    /* Detect any required emulation warnings. */
   2314    VexEmNote ew = EmNote_NONE;
   2315 
   2316    if ((fpucw & 0x3F) != 0x3F) {
   2317       /* unmasked exceptions! */
   2318       ew = EmWarn_X86_x87exns;
   2319    }
   2320    else
   2321    if (((fpucw >> 8) & 3) != 3) {
   2322       /* unsupported precision */
   2323       ew = EmWarn_X86_x87precision;
   2324    }
   2325 
   2326    return (((ULong)ew) << 32) | ((ULong)rmode);
   2327 }
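
/* Worked examples: 0x037F, the 'finit' default, masks everything and
   selects 64-bit precision and round-to-nearest, so the result is
   ((ULong)EmNote_NONE << 32) | 0.  0x027F, a common default on some
   Windows ABIs, selects 53-bit precision ((fpucw >> 8) & 3 == 2) and
   therefore yields EmWarn_X86_x87precision in the top half. */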
   2328 
   2329 
   2330 /* CLEAN HELPER */
   2331 /* Given fpround as an IRRoundingMode value, create a suitable x87
   2332    native format FPU control word. */
   2333 ULong amd64g_create_fpucw ( ULong fpround )
   2334 {
   2335    fpround &= 3;
   2336    return 0x037F | (fpround << 10);
   2337 }
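
/* As with the MXCSR pair above,
   amd64g_check_fldcw(amd64g_create_fpucw(r)) recovers r with no
   emulation warning, since the created word masks all exceptions and
   keeps the precision field at 3 (64-bit). */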
   2338 
   2339 
   2340 /* This is used to implement 'fldenv'.
   2341    Reads 28 bytes at x87_state[0 .. 27]. */
   2342 /* CALLED FROM GENERATED CODE */
   2343 /* DIRTY HELPER */
   2344 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
   2345                                       /*IN*/HWord x87_state)
   2346 {
   2347    return do_put_x87( False, (UChar*)x87_state, vex_state );
   2348 }
   2349 
   2350 
   2351 /* CALLED FROM GENERATED CODE */
   2352 /* DIRTY HELPER */
   2353 /* Create an x87 FPU env from the guest state, as close as we can
   2354    approximate it.  Writes 28 bytes at x87_state[0..27]. */
   2355 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
   2356                                  /*OUT*/HWord x87_state )
   2357 {
   2358    Int        i, stno, preg;
   2359    UInt       tagw;
   2360    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2361    Fpu_State* x87     = (Fpu_State*)x87_state;
   2362    UInt       ftop    = vex_state->guest_FTOP;
   2363    ULong      c3210   = vex_state->guest_FC3210;
   2364 
   2365    for (i = 0; i < 14; i++)
   2366       x87->env[i] = 0;
   2367 
   2368    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   2369    x87->env[FP_ENV_STAT]
   2370       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
   2371    x87->env[FP_ENV_CTRL]
   2372       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
   2373 
   2374    /* Compute the x87 tag word. */
   2375    tagw = 0;
   2376    for (stno = 0; stno < 8; stno++) {
   2377       preg = (stno + ftop) & 7;
   2378       if (vexTags[preg] == 0) {
   2379          /* register is empty */
   2380          tagw |= (3 << (2*preg));
   2381       } else {
   2382          /* register is full. */
   2383          tagw |= (0 << (2*preg));
   2384       }
   2385    }
   2386    x87->env[FP_ENV_TAG] = toUShort(tagw);
   2387 
   2388    /* We don't dump the x87 registers, tho. */
   2389 }
   2390 
   2391 
   2392 /* This is used to implement 'fnsave'.
   2393    Writes 108 bytes at x87_state[0 .. 107]. */
   2394 /* CALLED FROM GENERATED CODE */
   2395 /* DIRTY HELPER */
   2396 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
   2397                                  /*OUT*/HWord x87_state)
   2398 {
   2399    do_get_x87( vex_state, (UChar*)x87_state );
   2400 }
   2401 
   2402 
   2403 /* This is used to implement 'fnsaves'.
   2404    Writes 94 bytes at x87_state[0 .. 93]. */
   2405 /* CALLED FROM GENERATED CODE */
   2406 /* DIRTY HELPER */
   2407 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
   2408                                   /*OUT*/HWord x87_state)
   2409 {
   2410    Int           i, stno, preg;
   2411    UInt          tagw;
   2412    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2413    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2414    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2415    UInt          ftop    = vex_state->guest_FTOP;
   2416    UInt          c3210   = vex_state->guest_FC3210;
   2417 
   2418    for (i = 0; i < 7; i++)
   2419       x87->env[i] = 0;
   2420 
   2421    x87->env[FPS_ENV_STAT]
   2422       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   2423    x87->env[FPS_ENV_CTRL]
   2424       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   2425 
   2426    /* Dump the register stack in ST order. */
   2427    tagw = 0;
   2428    for (stno = 0; stno < 8; stno++) {
   2429       preg = (stno + ftop) & 7;
   2430       if (vexTags[preg] == 0) {
   2431          /* register is empty */
   2432          tagw |= (3 << (2*preg));
   2433          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2434                                  &x87->reg[10*stno] );
   2435       } else {
   2436          /* register is full. */
   2437          tagw |= (0 << (2*preg));
   2438          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2439                                  &x87->reg[10*stno] );
   2440       }
   2441    }
   2442    x87->env[FPS_ENV_TAG] = toUShort(tagw);
   2443 }
   2444 
   2445 
   2446 /* This is used to implement 'frstor'.
   2447    Reads 108 bytes at x87_state[0 .. 107]. */
   2448 /* CALLED FROM GENERATED CODE */
   2449 /* DIRTY HELPER */
   2450 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
   2451                                       /*IN*/HWord x87_state)
   2452 {
   2453    return do_put_x87( True, (UChar*)x87_state, vex_state );
   2454 }
   2455 
   2456 
   2457 /* This is used to implement 'frstors'.
   2458    Reads 94 bytes at x87_state[0 .. 93]. */
   2459 /* CALLED FROM GENERATED CODE */
   2460 /* DIRTY HELPER */
   2461 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
   2462                                        /*IN*/HWord x87_state)
   2463 {
   2464    Int           stno, preg;
   2465    UInt          tag;
   2466    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2467    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2468    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2469    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
   2470    UInt          tagw    = x87->env[FPS_ENV_TAG];
   2471    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
   2472    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
   2473    VexEmNote     ew;
   2474    UInt          fpround;
   2475    ULong         pair;
   2476 
   2477    /* Copy registers and tags */
   2478    for (stno = 0; stno < 8; stno++) {
   2479       preg = (stno + ftop) & 7;
   2480       tag = (tagw >> (2*preg)) & 3;
   2481       if (tag == 3) {
   2482          /* register is empty */
   2483          /* hmm, if it's empty, does it still get written?  Probably
   2484             safer to say it does.  If we don't, memcheck could get out
   2485             of sync, in that it thinks all FP registers are defined by
   2486             this helper, but in reality some have not been updated. */
   2487          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   2488          vexTags[preg] = 0;
   2489       } else {
   2490          /* register is non-empty */
   2491          convert_f80le_to_f64le( &x87->reg[10*stno],
   2492                                  (UChar*)&vexRegs[preg] );
   2493          vexTags[preg] = 1;
   2494       }
   2495    }
   2496 
   2497    /* stack pointer */
   2498    vex_state->guest_FTOP = ftop;
   2499 
   2500    /* status word */
   2501    vex_state->guest_FC3210 = c3210;
   2502 
   2503    /* handle the control word, setting FPROUND and detecting any
   2504       emulation warnings. */
   2505    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   2506    fpround = (UInt)pair & 0xFFFFFFFFULL;
   2507    ew      = (VexEmNote)(pair >> 32);
   2508 
   2509    vex_state->guest_FPROUND = fpround & 3;
   2510 
   2511    /* emulation warnings --> caller */
   2512    return ew;
   2513 }
   2514 
   2515 
   2516 /*---------------------------------------------------------------*/
   2517 /*--- CPUID helpers.                                          ---*/
   2518 /*---------------------------------------------------------------*/
   2519 
   2520 /* Claim to be the following CPU, which is probably representative of
   2521    the lowliest (earliest) amd64 offerings.  It can do neither sse3
   2522    nor cx16.
   2523 
   2524    vendor_id       : AuthenticAMD
   2525    cpu family      : 15
   2526    model           : 5
   2527    model name      : AMD Opteron (tm) Processor 848
   2528    stepping        : 10
   2529    cpu MHz         : 1797.682
   2530    cache size      : 1024 KB
   2531    fpu             : yes
   2532    fpu_exception   : yes
   2533    cpuid level     : 1
   2534    wp              : yes
   2535    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2536                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
   2537                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
   2538    bogomips        : 3600.62
   2539    TLB size        : 1088 4K pages
   2540    clflush size    : 64
   2541    cache_alignment : 64
   2542    address sizes   : 40 bits physical, 48 bits virtual
   2543    power management: ts fid vid ttp
   2544 
   2545    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
   2546    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
   2547    and 3dnowext is 80000001.EDX.30.
   2548 */
   2549 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
   2550 {
   2551 #  define SET_ABCD(_a,_b,_c,_d)                \
   2552       do { st->guest_RAX = (ULong)(_a);        \
   2553            st->guest_RBX = (ULong)(_b);        \
   2554            st->guest_RCX = (ULong)(_c);        \
   2555            st->guest_RDX = (ULong)(_d);        \
   2556       } while (0)
   2557 
   2558    switch (0xFFFFFFFF & st->guest_RAX) {
   2559       case 0x00000000:
   2560          SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
   2561          break;
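         /* Those four words are the maximum standard leaf (EAX = 1)
            plus the vendor string, read in the customary EBX,EDX,ECX
            order: 0x68747541 = "Auth", 0x69746e65 = "enti",
            0x444d4163 = "cAMD" (little-endian) -- "AuthenticAMD". */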
   2562       case 0x00000001:
   2563          SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
   2564          break;
   2565       case 0x80000000:
   2566          SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
   2567          break;
   2568       case 0x80000001:
   2569          /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
   2570             the original it-is-supported value that the h/w provides.
   2571             See #291568. */
   2572          SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
   2573                                                       0x21d3fbff);
   2574          break;
   2575       case 0x80000002:
   2576          SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
   2577          break;
   2578       case 0x80000003:
   2579          SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
   2580          break;
   2581       case 0x80000004:
   2582          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2583          break;
   2584       case 0x80000005:
   2585          SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
   2586          break;
   2587       case 0x80000006:
   2588          SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
   2589          break;
   2590       case 0x80000007:
   2591          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
   2592          break;
   2593       case 0x80000008:
   2594          SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
   2595          break;
   2596       default:
   2597          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2598          break;
   2599    }
   2600 #  undef SET_ABCD
   2601 }
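
        /* Decoding note (an editor's illustration, not from the original
           source): for leaf 0, the vendor string is returned as
           little-endian 4-byte chunks in EBX, EDX, ECX order.  In the
           baseline helper above,
              EBX = 0x68747541 -> "Auth"
              EDX = 0x69746e65 -> "enti"
              ECX = 0x444d4163 -> "cAMD"
           spells "AuthenticAMD"; the GenuineIntel helpers below encode
           their vendor strings the same way. */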
   2602 
   2603 
   2604 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
   2605    capable.
   2606 
   2607    vendor_id       : GenuineIntel
   2608    cpu family      : 6
   2609    model           : 15
   2610    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
   2611    stepping        : 6
   2612    cpu MHz         : 2394.000
   2613    cache size      : 4096 KB
   2614    physical id     : 0
   2615    siblings        : 2
   2616    core id         : 0
   2617    cpu cores       : 2
   2618    fpu             : yes
   2619    fpu_exception   : yes
   2620    cpuid level     : 10
   2621    wp              : yes
   2622    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2623                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2624                      mmx fxsr sse sse2 ss ht tm syscall nx lm
   2625                      constant_tsc pni monitor ds_cpl vmx est tm2
   2626                      cx16 xtpr lahf_lm
   2627    bogomips        : 4798.78
   2628    clflush size    : 64
   2629    cache_alignment : 64
   2630    address sizes   : 36 bits physical, 48 bits virtual
   2631    power management:
   2632 */
   2633 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
   2634 {
   2635 #  define SET_ABCD(_a,_b,_c,_d)                \
   2636       do { st->guest_RAX = (ULong)(_a);        \
   2637            st->guest_RBX = (ULong)(_b);        \
   2638            st->guest_RCX = (ULong)(_c);        \
   2639            st->guest_RDX = (ULong)(_d);        \
   2640       } while (0)
   2641 
   2642    switch (0xFFFFFFFF & st->guest_RAX) {
   2643       case 0x00000000:
   2644          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
   2645          break;
   2646       case 0x00000001:
   2647          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
   2648          break;
   2649       case 0x00000002:
   2650          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
   2651          break;
   2652       case 0x00000003:
   2653          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2654          break;
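              /* Editor's note (an assumption, not in the original source):
                 leaf 4 is Intel's Deterministic Cache Parameters leaf; ECX
                 selects which cache is described, and a subleaf returning
                 cache type 0 in EAX[4:0] terminates the enumeration. */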
   2655       case 0x00000004: {
   2656          switch (0xFFFFFFFF & st->guest_RCX) {
   2657             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
   2658                                       0x0000003f, 0x00000001); break;
   2659             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
   2660                                       0x0000003f, 0x00000001); break;
   2661             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
   2662                                       0x00000fff, 0x00000001); break;
   2663             default:         SET_ABCD(0x00000000, 0x00000000,
   2664                                       0x00000000, 0x00000000); break;
   2665          }
   2666          break;
   2667       }
   2668       case 0x00000005:
   2669          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
   2670          break;
   2671       case 0x00000006:
   2672          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
   2673          break;
   2674       case 0x00000007:
   2675          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2676          break;
   2677       case 0x00000008:
   2678          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
   2679          break;
   2680       case 0x00000009:
   2681          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2682          break;
   2683       case 0x0000000a:
   2684       unhandled_eax_value:
   2685          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
   2686          break;
   2687       case 0x80000000:
   2688          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2689          break;
   2690       case 0x80000001:
   2691          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
   2692          break;
   2693       case 0x80000002:
   2694          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2695          break;
   2696       case 0x80000003:
   2697          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
   2698          break;
   2699       case 0x80000004:
   2700          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
   2701          break;
   2702       case 0x80000005:
   2703          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2704          break;
   2705       case 0x80000006:
   2706          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
   2707          break;
   2708       case 0x80000007:
   2709          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2710          break;
   2711       case 0x80000008:
   2712          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2713          break;
   2714       default:
   2715          goto unhandled_eax_value;
   2716    }
   2717 #  undef SET_ABCD
   2718 }
   2719 
   2720 
   2721 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
   2722    capable.
   2723 
   2724    vendor_id       : GenuineIntel
   2725    cpu family      : 6
   2726    model           : 37
   2727    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
   2728    stepping        : 2
   2729    cpu MHz         : 3334.000
   2730    cache size      : 4096 KB
   2731    physical id     : 0
   2732    siblings        : 4
   2733    core id         : 0
   2734    cpu cores       : 2
   2735    apicid          : 0
   2736    initial apicid  : 0
   2737    fpu             : yes
   2738    fpu_exception   : yes
   2739    cpuid level     : 11
   2740    wp              : yes
   2741    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2742                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2743                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2744                      lm constant_tsc arch_perfmon pebs bts rep_good
   2745                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2746                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
   2747                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
   2748                      arat tpr_shadow vnmi flexpriority ept vpid
   2749    bogomips        : 6957.57
   2750    clflush size    : 64
   2751    cache_alignment : 64
   2752    address sizes   : 36 bits physical, 48 bits virtual
   2753    power management:
   2754 */
   2755 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
   2756 {
   2757 #  define SET_ABCD(_a,_b,_c,_d)                \
   2758       do { st->guest_RAX = (ULong)(_a);        \
   2759            st->guest_RBX = (ULong)(_b);        \
   2760            st->guest_RCX = (ULong)(_c);        \
   2761            st->guest_RDX = (ULong)(_d);        \
   2762       } while (0)
   2763 
   2764    UInt old_eax = (UInt)st->guest_RAX;
   2765    UInt old_ecx = (UInt)st->guest_RCX;
   2766 
   2767    switch (old_eax) {
   2768       case 0x00000000:
   2769          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
   2770          break;
   2771       case 0x00000001:
   2772          SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
   2773          break;
   2774       case 0x00000002:
   2775          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
   2776          break;
   2777       case 0x00000003:
   2778          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2779          break;
   2780       case 0x00000004:
   2781          switch (old_ecx) {
   2782             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2783                                       0x0000003f, 0x00000000); break;
   2784             case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
   2785                                       0x0000007f, 0x00000000); break;
   2786             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2787                                       0x000001ff, 0x00000000); break;
   2788             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
   2789                                       0x00000fff, 0x00000002); break;
   2790             default:         SET_ABCD(0x00000000, 0x00000000,
   2791                                       0x00000000, 0x00000000); break;
   2792          }
   2793          break;
   2794       case 0x00000005:
   2795          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2796          break;
   2797       case 0x00000006:
   2798          SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
   2799          break;
   2800       case 0x00000007:
   2801          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2802          break;
   2803       case 0x00000008:
   2804          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2805          break;
   2806       case 0x00000009:
   2807          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2808          break;
   2809       case 0x0000000a:
   2810          SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
   2811          break;
   2812       case 0x0000000b:
   2813          switch (old_ecx) {
   2814             case 0x00000000:
   2815                SET_ABCD(0x00000001, 0x00000002,
   2816                         0x00000100, 0x00000000); break;
   2817             case 0x00000001:
   2818                SET_ABCD(0x00000004, 0x00000004,
   2819                         0x00000201, 0x00000000); break;
   2820             default:
   2821                SET_ABCD(0x00000000, 0x00000000,
   2822                         old_ecx,    0x00000000); break;
   2823          }
   2824          break;
   2825       case 0x0000000c:
   2826          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2827          break;
   2828       case 0x0000000d:
   2829          switch (old_ecx) {
   2830             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
   2831                                       0x00000100, 0x00000000); break;
   2832             case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
   2833                                       0x00000201, 0x00000000); break;
   2834             default:         SET_ABCD(0x00000000, 0x00000000,
   2835                                       old_ecx,    0x00000000); break;
   2836          }
   2837          break;
   2838       case 0x80000000:
   2839          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2840          break;
   2841       case 0x80000001:
   2842          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   2843          break;
   2844       case 0x80000002:
   2845          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2846          break;
   2847       case 0x80000003:
   2848          SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
   2849          break;
   2850       case 0x80000004:
   2851          SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
   2852          break;
   2853       case 0x80000005:
   2854          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2855          break;
   2856       case 0x80000006:
   2857          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   2858          break;
   2859       case 0x80000007:
   2860          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   2861          break;
   2862       case 0x80000008:
   2863          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2864          break;
   2865       default:
   2866          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2867          break;
   2868    }
   2869 #  undef SET_ABCD
   2870 }
   2871 
   2872 
   2873 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
   2874    capable.  Plus (kludge!) it "supports" HTM.
   2875 
   2876    Also with the following change: claim that XSaveOpt is not
   2877    available: cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1
   2878    on the real CPU.  Consequently, programs that correctly observe
   2879    these CPUID values should only try to use 3 of the 8 XSave-family
   2880    instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
   2881    having to implement the compacted or optimised save/restore
   2882    variants.
   2883 
   2884    vendor_id       : GenuineIntel
   2885    cpu family      : 6
   2886    model           : 42
   2887    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
   2888    stepping        : 7
   2889    cpu MHz         : 1600.000
   2890    cache size      : 6144 KB
   2891    physical id     : 0
   2892    siblings        : 4
   2893    core id         : 3
   2894    cpu cores       : 4
   2895    apicid          : 6
   2896    initial apicid  : 6
   2897    fpu             : yes
   2898    fpu_exception   : yes
   2899    cpuid level     : 13
   2900    wp              : yes
   2901    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2902                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2903                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2904                      lm constant_tsc arch_perfmon pebs bts rep_good
   2905                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2906                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
   2907                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
   2908                      lahf_lm ida arat epb xsaveopt pln pts dts
   2909                      tpr_shadow vnmi flexpriority ept vpid
   2910 
   2911    bogomips        : 5768.94
   2912    clflush size    : 64
   2913    cache_alignment : 64
   2914    address sizes   : 36 bits physical, 48 bits virtual
   2915    power management:
   2916 */
   2917 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
   2918 {
   2919 #  define SET_ABCD(_a,_b,_c,_d)                \
   2920       do { st->guest_RAX = (ULong)(_a);        \
   2921            st->guest_RBX = (ULong)(_b);        \
   2922            st->guest_RCX = (ULong)(_c);        \
   2923            st->guest_RDX = (ULong)(_d);        \
   2924       } while (0)
   2925 
   2926    UInt old_eax = (UInt)st->guest_RAX;
   2927    UInt old_ecx = (UInt)st->guest_RCX;
   2928 
   2929    switch (old_eax) {
   2930       case 0x00000000:
   2931          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
   2932          break;
   2933       case 0x00000001:
   2934          SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
   2935          break;
   2936       case 0x00000002:
   2937          SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
   2938          break;
   2939       case 0x00000003:
   2940          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2941          break;
   2942       case 0x00000004:
   2943          switch (old_ecx) {
   2944             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2945                                       0x0000003f, 0x00000000); break;
   2946             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
   2947                                       0x0000003f, 0x00000000); break;
   2948             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2949                                       0x000001ff, 0x00000000); break;
   2950             case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
   2951                                       0x00001fff, 0x00000006); break;
   2952             default:         SET_ABCD(0x00000000, 0x00000000,
   2953                                       0x00000000, 0x00000000); break;
   2954          }
   2955          break;
   2956       case 0x00000005:
   2957          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2958          break;
   2959       case 0x00000006:
   2960          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
   2961          break;
   2962       case 0x00000007:
   2963          SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
   2964          break;
   2965       case 0x00000008:
   2966          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2967          break;
   2968       case 0x00000009:
   2969          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2970          break;
   2971       case 0x0000000a:
   2972          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
   2973          break;
   2974       case 0x0000000b:
   2975          switch (old_ecx) {
   2976             case 0x00000000:
   2977                SET_ABCD(0x00000001, 0x00000001,
   2978                         0x00000100, 0x00000000); break;
   2979             case 0x00000001:
   2980                SET_ABCD(0x00000004, 0x00000004,
   2981                         0x00000201, 0x00000000); break;
   2982             default:
   2983                SET_ABCD(0x00000000, 0x00000000,
   2984                         old_ecx,    0x00000000); break;
   2985          }
   2986          break;
   2987       case 0x0000000c:
   2988          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2989          break;
   2990       case 0x0000000d:
   2991          switch (old_ecx) {
   2992             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
   2993                                       0x00000340, 0x00000000); break;
   2994             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
   2995                                       0x00000000, 0x00000000); break;
   2996             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
   2997                                       0x00000000, 0x00000000); break;
   2998             default:         SET_ABCD(0x00000000, 0x00000000,
   2999                                       0x00000000, 0x00000000); break;
   3000          }
   3001          break;
   3002       case 0x0000000e:
   3003          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3004          break;
   3005       case 0x0000000f:
   3006          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3007          break;
   3008       case 0x80000000:
   3009          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   3010          break;
   3011       case 0x80000001:
   3012          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   3013          break;
   3014       case 0x80000002:
   3015          SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
   3016          break;
   3017       case 0x80000003:
   3018          SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
   3019          break;
   3020       case 0x80000004:
   3021          SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
   3022          break;
   3023       case 0x80000005:
   3024          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3025          break;
   3026       case 0x80000006:
   3027          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   3028          break;
   3029       case 0x80000007:
   3030          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   3031          break;
   3032       case 0x80000008:
   3033          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   3034          break;
   3035       default:
   3036          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3037          break;
   3038    }
   3039 #  undef SET_ABCD
   3040 }
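
        /* Hedged illustration (an editor's sketch, not part of the
           original source; assumes GCC/Clang's <cpuid.h>): how a guest
           program would observe the XSaveOpt suppression described in
           the comment above, i.e. cpuid(eax=0xD,ecx=1).eax[0] == 0.

              #include <cpuid.h>
              static int xsaveopt_available ( void )
              {
                 unsigned int a, b, c, d;
                 __cpuid_count(0xD, 1, a, b, c, d); // leaf 0xD, subleaf 1
                 return a & 1;                      // EAX[0] = XSAVEOPT
              }

           Under this helper (and the AVX2 one below) this returns 0, so
           a well-behaved program falls back to plain XSAVE/XRSTOR. */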
   3041 
   3042 
   3043 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
   3044 
   3045    With the following change: claim that XSaveOpt is not available:
   3046    cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
   3047    CPU.  Consequently, programs that correctly observe these CPUID
   3048    values should only try to use 3 of the 8 XSave-family instructions:
   3049    XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
   3050    implement the compacted or optimised save/restore variants.
   3051 
   3052    vendor_id       : GenuineIntel
   3053    cpu family      : 6
   3054    model           : 60
   3055    model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
   3056    stepping        : 3
   3057    microcode       : 0x1c
   3058    cpu MHz         : 919.957
   3059    cache size      : 8192 KB
   3060    physical id     : 0
   3061    siblings        : 4
   3062    core id         : 3
   3063    cpu cores       : 4
   3064    apicid          : 6
   3065    initial apicid  : 6
   3066    fpu             : yes
   3067    fpu_exception   : yes
   3068    cpuid level     : 13
   3069    wp              : yes
   3070    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
   3071                      cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
   3072                      tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
   3073                      arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
   3074                      aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
   3075                      vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
   3076                      sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
   3077                      avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
   3078                      tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
   3079                      bmi1 avx2 smep bmi2 erms invpcid xsaveopt
   3080    bugs            :
   3081    bogomips        : 5786.68
   3082    clflush size    : 64
   3083    cache_alignment : 64
   3084    address sizes   : 39 bits physical, 48 bits virtual
   3085    power management:
   3086 */
   3087 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
   3088 {
   3089 #  define SET_ABCD(_a,_b,_c,_d)                \
   3090       do { st->guest_RAX = (ULong)(_a);        \
   3091            st->guest_RBX = (ULong)(_b);        \
   3092            st->guest_RCX = (ULong)(_c);        \
   3093            st->guest_RDX = (ULong)(_d);        \
   3094       } while (0)
   3095 
   3096    UInt old_eax = (UInt)st->guest_RAX;
   3097    UInt old_ecx = (UInt)st->guest_RCX;
   3098 
   3099    switch (old_eax) {
   3100       case 0x00000000:
   3101          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
   3102          break;
   3103       case 0x00000001:
   3104          /* Don't advertise RDRAND support, bit 30 in ECX.  */
   3105          SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
   3106          break;
   3107       case 0x00000002:
   3108          SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
   3109          break;
   3110       case 0x00000003:
   3111          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3112          break;
   3113       case 0x00000004:
   3114          switch (old_ecx) {
   3115             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   3116                                       0x0000003f, 0x00000000); break;
   3117             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
   3118                                       0x0000003f, 0x00000000); break;
   3119             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   3120                                       0x000001ff, 0x00000000); break;
   3121             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
   3122                                       0x00001fff, 0x00000006); break;
   3123             default:         SET_ABCD(0x00000000, 0x00000000,
   3124                                       0x00000000, 0x00000000); break;
   3125          }
   3126          break;
   3127       case 0x00000005:
   3128          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
   3129          break;
   3130       case 0x00000006:
   3131          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
   3132          break;
   3133       case 0x00000007:
   3134          switch (old_ecx) {
   3135             case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
   3136                                       0x00000000, 0x00000000); break;
   3137             default:         SET_ABCD(0x00000000, 0x00000000,
   3138                                       0x00000000, 0x00000000); break;
   3139          }
   3140          break;
   3141       case 0x00000008:
   3142          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3143          break;
   3144       case 0x00000009:
   3145          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3146          break;
   3147       case 0x0000000a:
   3148          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
   3149          break;
   3150       case 0x0000000b:
   3151          switch (old_ecx) {
   3152             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
   3153                                       0x00000100, 0x00000002); break;
   3154             case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
   3155                                       0x00000201, 0x00000002); break;
   3156             default:         SET_ABCD(0x00000000, 0x00000000,
   3157                                       old_ecx,    0x00000002); break;
   3158          }
   3159          break;
   3160       case 0x0000000c:
   3161          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3162          break;
   3163       case 0x0000000d:
   3164          switch (old_ecx) {
   3165             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
   3166                                       0x00000340, 0x00000000); break;
   3167             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
   3168                                       0x00000000, 0x00000000); break;
   3169             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
   3170                                       0x00000000, 0x00000000); break;
   3171             default:         SET_ABCD(0x00000000, 0x00000000,
   3172                                       0x00000000, 0x00000000); break;
   3173          }
   3174          break;
   3175       case 0x80000000:
   3176          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   3177          break;
   3178       case 0x80000001:
   3179          SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
   3180          break;
   3181       case 0x80000002:
   3182          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   3183          break;
   3184       case 0x80000003:
   3185          SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
   3186          break;
   3187       case 0x80000004:
   3188          SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
   3189          break;
   3190       case 0x80000005:
   3191          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3192          break;
   3193       case 0x80000006:
   3194          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   3195          break;
   3196       case 0x80000007:
   3197          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   3198          break;
   3199       case 0x80000008:
   3200          SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
   3201          break;
   3202       default:
   3203          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3204          break;
   3205    }
   3206 #  undef SET_ABCD
   3207 }
   3208 
   3209 
   3210 /*---------------------------------------------------------------*/
   3211 /*--- Misc integer helpers, including rotates and crypto.     ---*/
   3212 /*---------------------------------------------------------------*/
   3213 
   3214 ULong amd64g_calculate_RCR ( ULong arg,
   3215                              ULong rot_amt,
   3216                              ULong rflags_in,
   3217                              Long  szIN )
   3218 {
   3219    Bool  wantRflags = toBool(szIN < 0);
   3220    ULong sz         = wantRflags ? (-szIN) : szIN;
   3221    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   3222    ULong cf=0, of=0, tempcf;
   3223 
   3224    switch (sz) {
   3225       case 8:
   3226          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3227          of        = ((arg >> 63) ^ cf) & 1;
   3228          while (tempCOUNT > 0) {
   3229             tempcf = arg & 1;
   3230             arg    = (arg >> 1) | (cf << 63);
   3231             cf     = tempcf;
   3232             tempCOUNT--;
   3233          }
   3234          break;
   3235       case 4:
   3236          while (tempCOUNT >= 33) tempCOUNT -= 33;
   3237          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3238          of        = ((arg >> 31) ^ cf) & 1;
   3239          while (tempCOUNT > 0) {
   3240             tempcf = arg & 1;
   3241             arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
   3242             cf     = tempcf;
   3243             tempCOUNT--;
   3244          }
   3245          break;
   3246       case 2:
   3247          while (tempCOUNT >= 17) tempCOUNT -= 17;
   3248          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3249          of        = ((arg >> 15) ^ cf) & 1;
   3250          while (tempCOUNT > 0) {
   3251             tempcf = arg & 1;
   3252             arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
   3253             cf     = tempcf;
   3254             tempCOUNT--;
   3255          }
   3256          break;
   3257       case 1:
   3258          while (tempCOUNT >= 9) tempCOUNT -= 9;
   3259          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3260          of        = ((arg >> 7) ^ cf) & 1;
   3261          while (tempCOUNT > 0) {
   3262             tempcf = arg & 1;
   3263             arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
   3264             cf     = tempcf;
   3265             tempCOUNT--;
   3266          }
   3267          break;
   3268       default:
   3269          vpanic("calculate_RCR(amd64g): invalid size");
   3270    }
   3271 
   3272    cf &= 1;
   3273    of &= 1;
   3274    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   3275    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   3276 
   3277    /* caller can ask to have back either the resulting flags or
   3278       resulting value, but not both */
   3279    return wantRflags ? rflags_in : arg;
   3280 }
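
        /* Hedged usage sketch (an editor's illustration, not part of the
           original source): the sign of szIN encodes which result the
           caller wants back, so emulating e.g. "rcrq $1, %reg" takes two
           calls:

              ULong newVal    = amd64g_calculate_RCR(arg, 1, rflags,  8);
              ULong newRflags = amd64g_calculate_RCR(arg, 1, rflags, -8);

           The same convention applies to amd64g_calculate_RCL below. */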
   3281 
   3282 ULong amd64g_calculate_RCL ( ULong arg,
   3283                              ULong rot_amt,
   3284                              ULong rflags_in,
   3285                              Long  szIN )
   3286 {
   3287    Bool  wantRflags = toBool(szIN < 0);
   3288    ULong sz         = wantRflags ? (-szIN) : szIN;
   3289    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   3290    ULong cf=0, of=0, tempcf;
   3291 
   3292    switch (sz) {
   3293       case 8:
   3294          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3295          while (tempCOUNT > 0) {
   3296             tempcf = (arg >> 63) & 1;
   3297             arg    = (arg << 1) | (cf & 1);
   3298             cf     = tempcf;
   3299             tempCOUNT--;
   3300          }
   3301          of = ((arg >> 63) ^ cf) & 1;
   3302          break;
   3303       case 4:
   3304          while (tempCOUNT >= 33) tempCOUNT -= 33;
   3305          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3306          while (tempCOUNT > 0) {
   3307             tempcf = (arg >> 31) & 1;
   3308             arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
   3309             cf     = tempcf;
   3310             tempCOUNT--;
   3311          }
   3312          of = ((arg >> 31) ^ cf) & 1;
   3313          break;
   3314       case 2:
   3315          while (tempCOUNT >= 17) tempCOUNT -= 17;
   3316          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3317          while (tempCOUNT > 0) {
   3318             tempcf = (arg >> 15) & 1;
   3319             arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
   3320             cf     = tempcf;
   3321             tempCOUNT--;
   3322          }
   3323          of = ((arg >> 15) ^ cf) & 1;
   3324          break;
   3325       case 1:
   3326          while (tempCOUNT >= 9) tempCOUNT -= 9;
   3327          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3328          while (tempCOUNT > 0) {
   3329             tempcf = (arg >> 7) & 1;
   3330             arg    = 0xFFULL & ((arg << 1) | (cf & 1));
   3331             cf     = tempcf;
   3332             tempCOUNT--;
   3333          }
   3334          of = ((arg >> 7) ^ cf) & 1;
   3335          break;
   3336       default:
   3337          vpanic("calculate_RCL(amd64g): invalid size");
   3338    }
   3339 
   3340    cf &= 1;
   3341    of &= 1;
   3342    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   3343    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   3344 
   3345    return wantRflags ? rflags_in : arg;
   3346 }
   3347 
   3348 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
   3349  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
   3350  */
   3351 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
   3352 {
   3353    ULong hi, lo, tmp, A[16];
   3354 
   3355    A[0] = 0;            A[1] = a;
   3356    A[2] = A[1] << 1;    A[3] = A[2] ^ a;
   3357    A[4] = A[2] << 1;    A[5] = A[4] ^ a;
   3358    A[6] = A[3] << 1;    A[7] = A[6] ^ a;
   3359    A[8] = A[4] << 1;    A[9] = A[8] ^ a;
   3360    A[10] = A[5] << 1;   A[11] = A[10] ^ a;
   3361    A[12] = A[6] << 1;   A[13] = A[12] ^ a;
   3362    A[14] = A[7] << 1;   A[15] = A[14] ^ a;
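
           /* At this point A[i] holds the carry-less (GF(2)) product of i
              and a, for every 4-bit value i; the chain below consumes b
              four bits at a time, accumulating into (hi:lo) with
              byte-sized shifts. */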
   3363 
   3364    lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
   3365    hi = lo >> 56;
   3366    lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
   3367    hi = (hi << 8) | (lo >> 56);
   3368    lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
   3369    hi = (hi << 8) | (lo >> 56);
   3370    lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
   3371    hi = (hi << 8) | (lo >> 56);
   3372    lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
   3373    hi = (hi << 8) | (lo >> 56);
   3374    lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
   3375    hi = (hi << 8) | (lo >> 56);
   3376    lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
   3377    hi = (hi << 8) | (lo >> 56);
   3378    lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
   3379 
   3380    ULong m0 = -1;
   3381    m0 /= 255;
   3382    tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
   3383    tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
   3384    tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
   3385    tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
   3386    tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
   3387    tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
   3388    tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
   3389 
   3390    return which ? hi : lo;
   3391 }
   3392 
   3393 
   3394 /* CALLED FROM GENERATED CODE */
   3395 /* DIRTY HELPER (non-referentially-transparent) */
   3396 /* Horrible hack.  On non-amd64 platforms, return 1. */
   3397 ULong amd64g_dirtyhelper_RDTSC ( void )
   3398 {
   3399 #  if defined(__x86_64__)
   3400    UInt  eax, edx;
   3401    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
   3402    return (((ULong)edx) << 32) | ((ULong)eax);
   3403 #  else
   3404    return 1ULL;
   3405 #  endif
   3406 }
   3407 
   3408 /* CALLED FROM GENERATED CODE */
   3409 /* DIRTY HELPER (non-referentially-transparent) */
   3410 /* Horrible hack.  On non-amd64 platforms, return 1. */
   3411 /* This uses a different calling convention from _RDTSC just above
   3412    only because of the difficulty of returning 96 bits from a C
   3413    function -- RDTSC returns only 64 bits and so, on amd64, is
   3414    simple by comparison. */
   3415 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
   3416 {
   3417 #  if defined(__x86_64__)
   3418    UInt eax, ecx, edx;
   3419    __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
   3420    st->guest_RAX = (ULong)eax;
   3421    st->guest_RCX = (ULong)ecx;
   3422    st->guest_RDX = (ULong)edx;
   3423 #  else
   3424    /* Do nothing. */
   3425 #  endif
   3426 }
   3427 
   3428 /* CALLED FROM GENERATED CODE */
   3429 /* DIRTY HELPER (non-referentially-transparent) */
   3430 /* Horrible hack.  On non-amd64 platforms, return 0. */
   3431 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
   3432 {
   3433 #  if defined(__x86_64__)
   3434    ULong r = 0;
   3435    portno &= 0xFFFF;
   3436    switch (sz) {
   3437       case 4:
   3438          __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
   3439                               : "=a" (r) : "Nd" (portno));
   3440 	 break;
   3441       case 2:
   3442          __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
   3443                               : "=a" (r) : "Nd" (portno));
   3444 	 break;
   3445       case 1:
   3446          __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
   3447                               : "=a" (r) : "Nd" (portno));
   3448 	 break;
   3449       default:
   3450          break; /* note: no 64-bit version of insn exists */
   3451    }
   3452    return r;
   3453 #  else
   3454    return 0;
   3455 #  endif
   3456 }
   3457 
   3458 
   3459 /* CALLED FROM GENERATED CODE */
   3460 /* DIRTY HELPER (non-referentially-transparent) */
   3461 /* Horrible hack.  On non-amd64 platforms, do nothing. */
   3462 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
   3463 {
   3464 #  if defined(__x86_64__)
   3465    portno &= 0xFFFF;
   3466    switch (sz) {
   3467       case 4:
   3468          __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
   3469                               : : "a" (data), "Nd" (portno));
   3470 	 break;
   3471       case 2:
   3472          __asm__ __volatile__("outw %w0, %w1"
   3473                               : : "a" (data), "Nd" (portno));
   3474 	 break;
   3475       case 1:
   3476          __asm__ __volatile__("outb %b0, %w1"
   3477                               : : "a" (data), "Nd" (portno));
   3478 	 break;
   3479       default:
   3480          break; /* note: no 64-bit version of insn exists */
   3481    }
   3482 #  else
   3483    /* do nothing */
   3484 #  endif
   3485 }
   3486 
   3487 /* CALLED FROM GENERATED CODE */
   3488 /* DIRTY HELPER (non-referentially-transparent) */
   3489 /* Horrible hack.  On non-amd64 platforms, write a zeroed 10-byte
   3489    descriptor. */
   3490 /* op = 0: call the native SGDT instruction.
   3491    op = 1: call the native SIDT instruction.
   3492 */
   3493 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
   3494 #  if defined(__x86_64__)
   3495    switch (op) {
   3496       case 0:
   3497          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
   3498          break;
   3499       case 1:
   3500          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
   3501          break;
   3502       default:
   3503          vpanic("amd64g_dirtyhelper_SxDT");
   3504    }
   3505 #  else
   3506    /* Fake a result: zero out the 10-byte descriptor area. */
   3507    UChar* p = (UChar*)address;
   3508    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
   3509    p[6] = p[7] = p[8] = p[9] = 0;
   3510 #  endif
   3511 }
   3512 
   3513 /*---------------------------------------------------------------*/
   3514 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
   3515 /*---------------------------------------------------------------*/
   3516 
   3517 static inline UChar abdU8 ( UChar xx, UChar yy ) {
   3518    return toUChar(xx>yy ? xx-yy : yy-xx);
   3519 }
   3520 
   3521 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   3522    return (((ULong)w1) << 32) | ((ULong)w0);
   3523 }
   3524 
   3525 static inline UShort sel16x4_3 ( ULong w64 ) {
   3526    UInt hi32 = toUInt(w64 >> 32);
   3527    return toUShort(hi32 >> 16);
   3528 }
   3529 static inline UShort sel16x4_2 ( ULong w64 ) {
   3530    UInt hi32 = toUInt(w64 >> 32);
   3531    return toUShort(hi32);
   3532 }
   3533 static inline UShort sel16x4_1 ( ULong w64 ) {
   3534    UInt lo32 = toUInt(w64);
   3535    return toUShort(lo32 >> 16);
   3536 }
   3537 static inline UShort sel16x4_0 ( ULong w64 ) {
   3538    UInt lo32 = toUInt(w64);
   3539    return toUShort(lo32);
   3540 }
   3541 
   3542 static inline UChar sel8x8_7 ( ULong w64 ) {
   3543    UInt hi32 = toUInt(w64 >> 32);
   3544    return toUChar(hi32 >> 24);
   3545 }
   3546 static inline UChar sel8x8_6 ( ULong w64 ) {
   3547    UInt hi32 = toUInt(w64 >> 32);
   3548    return toUChar(hi32 >> 16);
   3549 }
   3550 static inline UChar sel8x8_5 ( ULong w64 ) {
   3551    UInt hi32 = toUInt(w64 >> 32);
   3552    return toUChar(hi32 >> 8);
   3553 }
   3554 static inline UChar sel8x8_4 ( ULong w64 ) {
   3555    UInt hi32 = toUInt(w64 >> 32);
   3556    return toUChar(hi32 >> 0);
   3557 }
   3558 static inline UChar sel8x8_3 ( ULong w64 ) {
   3559    UInt lo32 = toUInt(w64);
   3560    return toUChar(lo32 >> 24);
   3561 }
   3562 static inline UChar sel8x8_2 ( ULong w64 ) {
   3563    UInt lo32 = toUInt(w64);
   3564    return toUChar(lo32 >> 16);
   3565 }
   3566 static inline UChar sel8x8_1 ( ULong w64 ) {
   3567    UInt lo32 = toUInt(w64);
   3568    return toUChar(lo32 >> 8);
   3569 }
   3570 static inline UChar sel8x8_0 ( ULong w64 ) {
   3571    UInt lo32 = toUInt(w64);
   3572    return toUChar(lo32 >> 0);
   3573 }
   3574 
   3575 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3576 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
   3577 {
   3578    return
   3579       mk32x2(
   3580          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
   3581             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
   3582          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
   3583             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
   3584       );
   3585 }
   3586 
   3587 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3588 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
   3589 {
   3590    UInt t = 0;
   3591    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   3592    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   3593    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   3594    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   3595    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3596    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3597    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3598    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3599    t &= 0xFFFF;
   3600    return (ULong)t;
   3601 }
   3602 
   3603 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3604 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
   3605 {
   3606    UShort t, min;
   3607    UInt   idx;
   3608    t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
   3609    t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
   3610    t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
   3611    t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
   3612    t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
   3613    t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
   3614    t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
   3615    t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
   3616    return ((ULong)(idx << 16)) | ((ULong)min);
   3617 }
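
        /* Editor's note (illustrative): this matches PHMINPOSUW's result
           layout -- the minimum value lands in result word 0 and the
           index of the winning 16-bit lane in bits 18:16, with all
           higher bits zeroed. */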
   3618 
   3619 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3620 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
   3621 {
   3622    UInt  i;
   3623    ULong crc = (b & 0xFFULL) ^ crcIn;
   3624    for (i = 0; i < 8; i++)
   3625       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3626    return crc;
   3627 }
   3628 
   3629 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3630 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
   3631 {
   3632    UInt  i;
   3633    ULong crc = (w & 0xFFFFULL) ^ crcIn;
   3634    for (i = 0; i < 16; i++)
   3635       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3636    return crc;
   3637 }
   3638 
   3639 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3640 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
   3641 {
   3642    UInt i;
   3643    ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   3644    for (i = 0; i < 32; i++)
   3645       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3646    return crc;
   3647 }
   3648 
   3649 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3650 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
   3651 {
   3652    ULong crc = amd64g_calc_crc32l(crcIn, q);
   3653    return amd64g_calc_crc32l(crc, q >> 32);
   3654 }
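
        /* Hedged self-check sketch (an editor's addition, not part of the
           original source): these helpers implement the bit-reflected
           CRC32-C (Castagnoli) update, polynomial 0x82f63b78.  With the
           standard all-ones seed and final inversion, the usual test
           vector "123456789" should give the check value 0xe3069283:

              ULong crc = 0xFFFFFFFFULL;  UInt i;
              const UChar* s = (const UChar*)"123456789";
              for (i = 0; i < 9; i++)
                 crc = amd64g_calc_crc32b(crc, s[i]);
              // now ~(UInt)crc == 0xe3069283
        */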
   3655 
   3656 
   3657 /* .. helper for next fn .. */
   3658 static inline ULong sad_8x4 ( ULong xx, ULong yy )
   3659 {
   3660    UInt t = 0;
   3661    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3662    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3663    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3664    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3665    return (ULong)t;
   3666 }
   3667 
   3668 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3669 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
   3670                             ULong dHi, ULong dLo,
   3671                             ULong imm_and_return_control_bit )
   3672 {
   3673    UInt imm8     = imm_and_return_control_bit & 7;
   3674    Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
   3675    UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
   3676    UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
   3677    /* For src we only need 32 bits, so get them into the
   3678       lower half of a 64 bit word. */
   3679    ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
   3680    /* For dst we need to get hold of 56 bits (7 bytes) from a total of
   3681       11 bytes.  If calculating the low part of the result, need bytes
   3682       dstOffsL * 4 + (0 .. 6); if calculating the high part,
   3683       dstOffsL * 4 + (4 .. 10). */
   3684    ULong dst;
   3685    /* dstOffL = 0, Lo  ->  0 .. 6
   3686       dstOffL = 1, Lo  ->  4 .. 10
   3687       dstOffL = 0, Hi  ->  4 .. 10
   3688       dstOffL = 1, Hi  ->  8 .. 14
   3689    */
   3690    if (calcHi && dstOffsL) {
   3691       /* 8 .. 14 */
   3692       dst = dHi & 0x00FFFFFFFFFFFFFFULL;
   3693    }
   3694    else if (!calcHi && !dstOffsL) {
   3695       /* 0 .. 6 */
   3696       dst = dLo & 0x00FFFFFFFFFFFFFFULL;
   3697    }
   3698    else {
   3699       /* 4 .. 10 */
   3700       dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
   3701    }
   3702    ULong r0  = sad_8x4( dst >>  0, src );
   3703    ULong r1  = sad_8x4( dst >>  8, src );
   3704    ULong r2  = sad_8x4( dst >> 16, src );
   3705    ULong r3  = sad_8x4( dst >> 24, src );
   3706    ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
   3707    return res;
   3708 }
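
        /* Worked example (an editor's illustration): with
           imm_and_return_control_bit == 0 we get srcOffsL == 0,
           dstOffsL == 0 and the low half of the result, so the four
           16-bit lanes are the SADs of the 4-byte src block against the
           four overlapping 4-byte dst windows at byte offsets 0, 1, 2
           and 3. */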
   3709 
   3710 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3711 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
   3712 {
   3713    ULong dst = 0;
   3714    ULong src_bit;
   3715    ULong dst_bit = 1;
   3716    for (src_bit = 1; src_bit; src_bit <<= 1) {
   3717       if (mask & src_bit) {
   3718          if (src_masked & src_bit) dst |= dst_bit;
   3719          dst_bit <<= 1;
   3720       }
   3721    }
   3722    return dst;
   3723 }
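
        /* Worked example (an editor's illustration): with mask 0b1010,
           PEXT gathers the two masked bits of src into the low bits of
           the result, so amd64g_calculate_pext(0b1000, 0b1010) == 0b10. */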
   3724 
   3725 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3726 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
   3727 {
   3728    ULong dst = 0;
   3729    ULong dst_bit;
   3730    ULong src_bit = 1;
   3731    for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
   3732       if (mask & dst_bit) {
   3733          if (src & src_bit) dst |= dst_bit;
   3734          src_bit <<= 1;
   3735       }
   3736    }
   3737    return dst;
   3738 }
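
        /* Worked example (an editor's illustration): PDEP is the inverse
           scatter, so amd64g_calculate_pdep(0b10, 0b1010) == 0b1000. */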
   3739 
   3740 /*---------------------------------------------------------------*/
   3741 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
   3742 /*---------------------------------------------------------------*/
   3743 
   3744 static UInt zmask_from_V128 ( V128* arg )
   3745 {
   3746    UInt i, res = 0;
   3747    for (i = 0; i < 16; i++) {
   3748       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
   3749    }
   3750    return res;
   3751 }
   3752 
   3753 static UInt zmask_from_V128_wide ( V128* arg )
   3754 {
   3755    UInt i, res = 0;
   3756    for (i = 0; i < 8; i++) {
   3757       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
   3758    }
   3759    return res;
   3760 }
   3761 
   3762 /* Helps with PCMP{I,E}STR{I,M}.
   3763 
   3764    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (It could in fact
   3765    be a clean helper, were it not that we can't pass 2 x V128 by
   3766    value to a clean helper, nor have one returned.)
   3767    Reads guest state, and writes to guest state for the xSTRM cases;
   3768    makes no memory accesses, and is otherwise a pure function.
   3769 
   3770    opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
   3771    the callee knows which I/E and I/M variant it is dealing with and
   3772    what the specific operation is.  4th byte of opcode is in the range
   3773    0x60 to 0x63:
   3774        istri  66 0F 3A 63
   3775        istrm  66 0F 3A 62
   3776        estri  66 0F 3A 61
   3777        estrm  66 0F 3A 60
   3778 
   3779    gstOffL and gstOffR are the guest state offsets for the two XMM
   3780    register inputs.  We never have to deal with the memory case since
   3781    that is handled by pre-loading the relevant value into the fake
   3782    XMM16 register.
   3783 
   3784    For ESTRx variants, edxIN and eaxIN hold the values of those two
   3785    registers.
   3786 
   3787    In all cases, the bottom 16 bits of the result contain the new
   3788    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
   3789    result hold the new %ecx value.  For xSTRM variants, the helper
   3790    writes the result directly to the guest XMM0.
   3791 
   3792    Declarable side effects: in all cases, reads guest state at
   3793    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
   3794    guest_XMM0.
   3795 
   3796    Is expected to be called with opc_and_imm combinations which have
   3797    actually been validated, and will assert otherwise.  The front
   3798    end should ensure we're only called with verified values.
   3799 */
   3800 ULong amd64g_dirtyhelper_PCMPxSTRx (
   3801           VexGuestAMD64State* gst,
   3802           HWord opc4_and_imm,
   3803           HWord gstOffL, HWord gstOffR,
   3804           HWord edxIN, HWord eaxIN
   3805        )
   3806 {
   3807    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
   3808    HWord imm8 = opc4_and_imm & 0xFF;
   3809    HWord isISTRx = opc4 & 2;
   3810    HWord isxSTRM = (opc4 & 1) ^ 1;
   3811    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   3812    HWord wide = (imm8 & 1);
   3813 
   3814    // where the args are
   3815    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3816    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3817 
   3818    /* Create the arg validity masks, either from the vectors
   3819       themselves or from the supplied edx/eax values. */
   3820    // Note: both the 8-bit and the 16-bit ("wide") data cases are
   3821    // handled here, selected by the 'wide' flag derived from imm8.
   3822    UInt zmaskL, zmaskR;
   3823 
   3824    // temp spot for the resulting flags and vector.
   3825    V128 resV;
   3826    UInt resOSZACP;
   3827 
   3828    // for checking whether case was handled
   3829    Bool ok = False;
   3830 
   3831    if (wide) {
   3832       if (isISTRx) {
   3833          zmaskL = zmask_from_V128_wide(argL);
   3834          zmaskR = zmask_from_V128_wide(argR);
   3835       } else {
   3836          Int tmp;
   3837          tmp = edxIN & 0xFFFFFFFF;
   3838          if (tmp < -8) tmp = -8;
   3839          if (tmp > 8)  tmp = 8;
   3840          if (tmp < 0)  tmp = -tmp;
   3841          vassert(tmp >= 0 && tmp <= 8);
   3842          zmaskL = (1 << tmp) & 0xFF;
   3843          tmp = eaxIN & 0xFFFFFFFF;
   3844          if (tmp < -8) tmp = -8;
   3845          if (tmp > 8)  tmp = 8;
   3846          if (tmp < 0)  tmp = -tmp;
   3847          vassert(tmp >= 0 && tmp <= 8);
   3848          zmaskR = (1 << tmp) & 0xFF;
   3849       }
   3850       // do the math
   3851       ok = compute_PCMPxSTRx_wide (
   3852               &resV, &resOSZACP, argL, argR,
   3853               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3854            );
   3855    } else {
   3856       if (isISTRx) {
   3857          zmaskL = zmask_from_V128(argL);
   3858          zmaskR = zmask_from_V128(argR);
   3859       } else {
   3860          Int tmp;
   3861          tmp = edxIN & 0xFFFFFFFF;
   3862          if (tmp < -16) tmp = -16;
   3863          if (tmp > 16)  tmp = 16;
   3864          if (tmp < 0)   tmp = -tmp;
   3865          vassert(tmp >= 0 && tmp <= 16);
   3866          zmaskL = (1 << tmp) & 0xFFFF;
   3867          tmp = eaxIN & 0xFFFFFFFF;
   3868          if (tmp < -16) tmp = -16;
   3869          if (tmp > 16)  tmp = 16;
   3870          if (tmp < 0)   tmp = -tmp;
   3871          vassert(tmp >= 0 && tmp <= 16);
   3872          zmaskR = (1 << tmp) & 0xFFFF;
   3873       }
   3874       // do the math
   3875       ok = compute_PCMPxSTRx (
   3876               &resV, &resOSZACP, argL, argR,
   3877               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3878            );
   3879    }
   3880 
   3881    // front end shouldn't pass us any imm8 variants we can't
   3882    // handle.  Hence:
   3883    vassert(ok);
   3884 
   3885    // So, finally we need to get the results back to the caller.
   3886    // In all cases, the new OSZACP value is the lowest 16 of
   3887    // the return value.
   3888    if (isxSTRM) {
   3889       gst->guest_YMM0[0] = resV.w32[0];
   3890       gst->guest_YMM0[1] = resV.w32[1];
   3891       gst->guest_YMM0[2] = resV.w32[2];
   3892       gst->guest_YMM0[3] = resV.w32[3];
   3893       return resOSZACP & 0x8D5;
   3894    } else {
   3895       UInt newECX = resV.w32[0] & 0xFFFF;
   3896       return (newECX << 16) | (resOSZACP & 0x8D5);
   3897    }
   3898 }
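
        /* Hedged decode sketch (an editor's illustration, not part of the
           original source): how a caller of the xSTRI variants would
           unpack the return value described above:

              ULong res    = amd64g_dirtyhelper_PCMPxSTRx(gst, opc4_and_imm,
                                                          gstOffL, gstOffR,
                                                          edxIN, eaxIN);
              UInt  oszacp = res & 0x8D5;           // new OSZACP flag bits
              UInt  newECX = (res >> 16) & 0xFFFF;  // new %ecx (xSTRI only)
        */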
   3899 
   3900 /*---------------------------------------------------------------*/
   3901 /*--- AES primitives and helpers                              ---*/
   3902 /*---------------------------------------------------------------*/
   3903 /* a 16 x 16 matrix */
   3904 static const UChar sbox[256] = {                   // row nr
   3905    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   3906    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   3907    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   3908    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   3909    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   3910    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   3911    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   3912    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   3913    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   3914    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   3915    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   3916    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   3917    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   3918    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   3919    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   3920    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   3921    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   3922    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   3923    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   3924    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   3925    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   3926    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   3927    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   3928    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   3929    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   3930    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   3931    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   3932    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   3933    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   3934    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   3935    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   3936    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
   3937 };
   3938 static void SubBytes (V128* v)
   3939 {
   3940    V128 r;
   3941    UInt i;
   3942    for (i = 0; i < 16; i++)
   3943       r.w8[i] = sbox[v->w8[i]];
   3944    *v = r;
   3945 }
   3946 
   3947 /* a 16 x 16 matrix */
   3948 static const UChar invsbox[256] = {                // row nr
   3949    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   3950    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   3951    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   3952    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   3953    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   3954    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   3955    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   3956    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   3957    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   3958    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   3959    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   3960    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   3961    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   3962    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   3963    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   3964    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   3965    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   3966    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   3967    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   3968    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   3969    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   3970    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   3971    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   3972    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   3973    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   3974    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   3975    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   3976    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   3977    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   3978    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   3979    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   3980    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
   3981 };
   3982 static void InvSubBytes (V128* v)
   3983 {
   3984    V128 r;
   3985    UInt i;
   3986    for (i = 0; i < 16; i++)
   3987       r.w8[i] = invsbox[v->w8[i]];
   3988    *v = r;
   3989 }
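
         /* Sanity sketch (illustrative only; never called from generated
            code): sbox and invsbox are mutual inverses, so InvSubBytes
            undoes SubBytes exactly. */
         static inline void aes_sbox_selfcheck ( void )
         {
            UInt i;
            for (i = 0; i < 256; i++) {
               vassert(invsbox[sbox[i]] == i);
               vassert(sbox[invsbox[i]] == i);
            }
         }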
   3990 
   3991 static const UChar ShiftRows_op[16] =
   3992    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
   3993 static void ShiftRows (V128* v)
   3994 {
   3995    V128 r;
   3996    UInt i;
   3997    for (i = 0; i < 16; i++)
   3998       r.w8[i] = v->w8[ShiftRows_op[15-i]];
   3999    *v = r;
   4000 }
   4001 
   4002 static const UChar InvShiftRows_op[16] =
   4003    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
   4004 static void InvShiftRows (V128* v)
   4005 {
   4006    V128 r;
   4007    UInt i;
   4008    for (i = 0; i < 16; i++)
   4009       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   4010    *v = r;
   4011 }
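
         /* The same style of sanity sketch: the two tables above encode
            inverse permutations -- ShiftRows reads lane 5*i mod 16 into
            lane i, InvShiftRows reads lane 13*i mod 16 into lane i, and
            5*13 == 65 == 1 (mod 16) -- so InvShiftRows undoes ShiftRows. */
         static inline void aes_shiftrows_selfcheck ( void )
         {
            V128 v, w;
            UInt i;
            for (i = 0; i < 16; i++)
               v.w8[i] = (UChar)i;
            w = v;
            ShiftRows (&w);
            InvShiftRows (&w);
            for (i = 0; i < 16; i++)
               vassert(w.w8[i] == v.w8[i]);
         }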
   4012 
   4013 /* Multiplication of the finite fields elements of AES.
   4014    See "A Specification for The AES Algorithm Rijndael
   4015         (by Joan Daemen & Vincent Rijmen)"
   4016         Dr. Brian Gladman, v3.1, 3rd March 2001. */
    4017 /* N values such that (hex) xy = 0x03^N.
    4018    0x00 has no logarithm; we put 0xff for this value. */
   4019 /* a 16 x 16 matrix */
   4020 static const UChar Nxy[256] = {                    // row nr
   4021    0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   4022    0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   4023    0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   4024    0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   4025    0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   4026    0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   4027    0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   4028    0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   4029    0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   4030    0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   4031    0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   4032    0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   4033    0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   4034    0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   4035    0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   4036    0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   4037    0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   4038    0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   4039    0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   4040    0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   4041    0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   4042    0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   4043    0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   4044    0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   4045    0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   4046    0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   4047    0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   4048    0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   4049    0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   4050    0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   4051    0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   4052    0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
   4053 };
   4054 
   4055 /* E values so that E = 0x03^xy. */
   4056 static const UChar Exy[256] = {                    // row nr
   4057    0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   4058    0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   4059    0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   4060    0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   4061    0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   4062    0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   4063    0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   4064    0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   4065    0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   4066    0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   4067    0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   4068    0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   4069    0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   4070    0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   4071    0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   4072    0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   4073    0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   4074    0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   4075    0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   4076    0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   4077    0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   4078    0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   4079    0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   4080    0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   4081    0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   4082    0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   4083    0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   4084    0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   4085    0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   4086    0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   4087    0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   4088    0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
   4089 
   4090 static inline UChar ff_mul(UChar u1, UChar u2)
   4091 {
   4092    if ((u1 > 0) && (u2 > 0)) {
   4093       UInt ui = Nxy[u1] + Nxy[u2];
   4094       if (ui >= 255)
   4095          ui = ui - 255;
   4096       return Exy[ui];
   4097    } else {
   4098       return 0;
    4099    }
   4100 }
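
         /* Worked example, from FIPS-197 section 4.2: {57} x {83} = {c1}
            in GF(2^8).  Via the tables above: Nxy[0x57] = 0x62,
            Nxy[0x83] = 0x50, 0x62 + 0x50 = 0xB2 < 255, and
            Exy[0xB2] = 0xC1.  A self-check sketch (illustrative only): */
         static inline void ff_mul_selfcheck ( void )
         {
            vassert(ff_mul(0x57, 0x83) == 0xc1);
            vassert(ff_mul(0x83, 0x57) == 0xc1);  /* commutative */
            vassert(ff_mul(0x00, 0x57) == 0x00);  /* zero short-circuits */
         }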
   4101 
   4102 static void MixColumns (V128* v)
   4103 {
   4104    V128 r;
   4105    Int j;
   4106 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   4107    for (j = 0; j < 4; j++) {
   4108       P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
   4109          ^ P(v,j,2) ^ P(v,j,3);
   4110       P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
   4111          ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
   4112       P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
   4113          ^ ff_mul(0x03, P(v,j,3) );
   4114       P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
   4115          ^ ff_mul( 0x02, P(v,j,3) );
   4116    }
   4117    *v = r;
   4118 #undef P
   4119 }
   4120 
   4121 static void InvMixColumns (V128* v)
   4122 {
   4123    V128 r;
   4124    Int j;
   4125 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   4126    for (j = 0; j < 4; j++) {
   4127       P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
   4128          ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
   4129       P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
   4130          ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
   4131       P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
   4132          ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
   4133       P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
   4134          ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
   4135    }
   4136    *v = r;
   4137 #undef P
    4139 }
   4140 
    4141 /* For a description, see the declaration in guest_amd64_defs.h. */
   4142 void amd64g_dirtyhelper_AES (
   4143           VexGuestAMD64State* gst,
   4144           HWord opc4, HWord gstOffD,
   4145           HWord gstOffL, HWord gstOffR
   4146        )
   4147 {
   4148    // where the args are
   4149    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   4150    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   4151    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   4152    V128  r;
   4153 
   4154    switch (opc4) {
   4155       case 0xDC: /* AESENC */
   4156       case 0xDD: /* AESENCLAST */
   4157          r = *argR;
   4158          ShiftRows (&r);
   4159          SubBytes  (&r);
   4160          if (opc4 == 0xDC)
   4161             MixColumns (&r);
   4162          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   4163          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   4164          break;
   4165 
   4166       case 0xDE: /* AESDEC */
   4167       case 0xDF: /* AESDECLAST */
   4168          r = *argR;
   4169          InvShiftRows (&r);
   4170          InvSubBytes (&r);
   4171          if (opc4 == 0xDE)
   4172             InvMixColumns (&r);
   4173          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   4174          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   4175          break;
   4176 
   4177       case 0xDB: /* AESIMC */
   4178          *argD = *argL;
   4179          InvMixColumns (argD);
   4180          break;
   4181       default: vassert(0);
   4182    }
   4183 }
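
         /* Illustrative sketch only (assumptions: the round keys rk[0..10]
            come from some key schedule; this function is hypothetical and
            is never called from generated code).  It shows how the
            primitives above compose into a full AES-128 encryption --
            equivalent to an initial whitening XOR, nine AESENCs and one
            final AESENCLAST. */
         static inline void aes128_encrypt_sketch ( V128* block,
                                                    const V128 rk[11] )
         {
            UInt i;
            block->w64[0] ^= rk[0].w64[0];   /* initial AddRoundKey */
            block->w64[1] ^= rk[0].w64[1];
            for (i = 1; i <= 10; i++) {
               ShiftRows (block);         /* SubBytes and ShiftRows commute */
               SubBytes  (block);
               if (i != 10)
                  MixColumns (block);     /* omitted in the final round */
               block->w64[0] ^= rk[i].w64[0];
               block->w64[1] ^= rk[i].w64[1];
            }
         }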
   4184 
    4185 static inline UInt RotWord (UInt w32)
   4186 {
   4187    return ((w32 >> 8) | (w32 << 24));
   4188 }
   4189 
    4190 static inline UInt SubWord (UInt w32)
   4191 {
   4192    UChar *w8;
   4193    UChar *r8;
   4194    UInt res;
   4195    w8 = (UChar*) &w32;
   4196    r8 = (UChar*) &res;
   4197    r8[0] = sbox[w8[0]];
   4198    r8[1] = sbox[w8[1]];
   4199    r8[2] = sbox[w8[2]];
   4200    r8[3] = sbox[w8[3]];
   4201    return res;
   4202 }
   4203 
    4204 /* For a description, see the declaration in guest_amd64_defs.h. */
   4205 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
   4206           VexGuestAMD64State* gst,
   4207           HWord imm8,
   4208           HWord gstOffL, HWord gstOffR
   4209        )
   4210 {
   4211    // where the args are
   4212    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   4213    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   4214 
   4215    // We have to create the result in a temporary in the
   4216    // case where the src and dst regs are the same.  See #341698.
   4217    V128 tmp;
   4218 
   4219    tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   4220    tmp.w32[2] = SubWord (argL->w32[3]);
   4221    tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   4222    tmp.w32[0] = SubWord (argL->w32[1]);
   4223 
   4224    argR->w32[3] = tmp.w32[3];
   4225    argR->w32[2] = tmp.w32[2];
   4226    argR->w32[1] = tmp.w32[1];
   4227    argR->w32[0] = tmp.w32[0];
   4228 }
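
         /* Illustrative sketch (a hypothetical helper, not part of the VEX
            API): the w32[3] lane computed above --
            RotWord(SubWord(high word)) ^ rcon -- is exactly the value
            AES-128 key expansion needs to step from round key i-1 ('prev')
            to round key i ('next'). */
         static inline void aes128_next_round_key_sketch
            ( V128* next, const V128* prev, UInt rcon )
         {
            UInt t = RotWord (SubWord (prev->w32[3])) ^ rcon;
            next->w32[0] = prev->w32[0] ^ t;
            next->w32[1] = prev->w32[1] ^ next->w32[0];
            next->w32[2] = prev->w32[2] ^ next->w32[1];
            next->w32[3] = prev->w32[3] ^ next->w32[2];
         }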
   4229 
   4230 
   4231 
   4232 /*---------------------------------------------------------------*/
   4233 /*--- Helpers for dealing with, and describing,               ---*/
   4234 /*--- guest state as a whole.                                 ---*/
   4235 /*---------------------------------------------------------------*/
   4236 
   4237 /* Initialise the entire amd64 guest state. */
   4238 /* VISIBLE TO LIBVEX CLIENT */
   4239 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
   4240 {
   4241    vex_state->host_EvC_FAILADDR = 0;
   4242    vex_state->host_EvC_COUNTER = 0;
   4243    vex_state->pad0 = 0;
   4244 
   4245    vex_state->guest_RAX = 0;
   4246    vex_state->guest_RCX = 0;
   4247    vex_state->guest_RDX = 0;
   4248    vex_state->guest_RBX = 0;
   4249    vex_state->guest_RSP = 0;
   4250    vex_state->guest_RBP = 0;
   4251    vex_state->guest_RSI = 0;
   4252    vex_state->guest_RDI = 0;
   4253    vex_state->guest_R8  = 0;
   4254    vex_state->guest_R9  = 0;
   4255    vex_state->guest_R10 = 0;
   4256    vex_state->guest_R11 = 0;
   4257    vex_state->guest_R12 = 0;
   4258    vex_state->guest_R13 = 0;
   4259    vex_state->guest_R14 = 0;
   4260    vex_state->guest_R15 = 0;
   4261 
   4262    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   4263    vex_state->guest_CC_DEP1 = 0;
   4264    vex_state->guest_CC_DEP2 = 0;
   4265    vex_state->guest_CC_NDEP = 0;
   4266 
   4267    vex_state->guest_DFLAG   = 1; /* forwards */
   4268    vex_state->guest_IDFLAG  = 0;
   4269    vex_state->guest_ACFLAG  = 0;
   4270 
   4271    /* HACK: represent the offset associated with a constant %fs.
    4272       Typically, on Linux, this assumes that %fs is only ever zero (main
   4273       thread) or 0x63. */
   4274    vex_state->guest_FS_CONST = 0;
   4275 
   4276    vex_state->guest_RIP = 0;
   4277 
   4278    /* Initialise the simulated FPU */
   4279    amd64g_dirtyhelper_FINIT( vex_state );
   4280 
   4281    /* Initialise the AVX state. */
   4282 #  define AVXZERO(_ymm) \
   4283       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
   4284            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
   4285       } while (0)
   4286    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   4287    AVXZERO(vex_state->guest_YMM0);
   4288    AVXZERO(vex_state->guest_YMM1);
   4289    AVXZERO(vex_state->guest_YMM2);
   4290    AVXZERO(vex_state->guest_YMM3);
   4291    AVXZERO(vex_state->guest_YMM4);
   4292    AVXZERO(vex_state->guest_YMM5);
   4293    AVXZERO(vex_state->guest_YMM6);
   4294    AVXZERO(vex_state->guest_YMM7);
   4295    AVXZERO(vex_state->guest_YMM8);
   4296    AVXZERO(vex_state->guest_YMM9);
   4297    AVXZERO(vex_state->guest_YMM10);
   4298    AVXZERO(vex_state->guest_YMM11);
   4299    AVXZERO(vex_state->guest_YMM12);
   4300    AVXZERO(vex_state->guest_YMM13);
   4301    AVXZERO(vex_state->guest_YMM14);
   4302    AVXZERO(vex_state->guest_YMM15);
   4303    AVXZERO(vex_state->guest_YMM16);
   4304 
   4305 #  undef AVXZERO
   4306 
   4307    vex_state->guest_EMNOTE = EmNote_NONE;
   4308 
    4309    /* These should never be either read or written, but we
   4310       initialise them anyway. */
   4311    vex_state->guest_CMSTART = 0;
   4312    vex_state->guest_CMLEN   = 0;
   4313 
   4314    vex_state->guest_NRADDR   = 0;
   4315    vex_state->guest_SC_CLASS = 0;
   4316    vex_state->guest_GS_CONST = 0;
   4317 
   4318    vex_state->guest_IP_AT_SYSCALL = 0;
   4319    vex_state->pad1 = 0;
   4320 }
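
         /* A minimal client-side sketch (assumptions: a typical libVEX
            embedding; the register values are arbitrary examples).  A
            client initialises the whole state first and then fills in only
            the registers it cares about: */
         static inline void client_state_setup_sketch
            ( VexGuestAMD64State* st, ULong entry_rip, ULong stack_top )
         {
            LibVEX_GuestAMD64_initialise(st);
            st->guest_RIP = entry_rip;  /* where simulated execution starts */
            st->guest_RSP = stack_top;  /* top of the simulated stack */
         }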
   4321 
   4322 
   4323 /* Figure out if any part of the guest state contained in minoff
   4324    .. maxoff requires precise memory exceptions.  If in doubt return
   4325    True (but this generates significantly slower code).
   4326 
   4327    By default we enforce precise exns for guest %RSP, %RBP and %RIP
   4328    only.  These are the minimum needed to extract correct stack
   4329    backtraces from amd64 code.
   4330 
   4331    Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
   4332 */
   4333 Bool guest_amd64_state_requires_precise_mem_exns (
   4334         Int minoff, Int maxoff, VexRegisterUpdates pxControl
   4335      )
   4336 {
   4337    Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   4338    Int rbp_max = rbp_min + 8 - 1;
   4339    Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   4340    Int rsp_max = rsp_min + 8 - 1;
   4341    Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   4342    Int rip_max = rip_min + 8 - 1;
   4343 
   4344    if (maxoff < rsp_min || minoff > rsp_max) {
   4345       /* no overlap with rsp */
   4346       if (pxControl == VexRegUpdSpAtMemAccess)
    4347          return False; // We only need to check the stack pointer.
   4348    } else {
   4349       return True;
   4350    }
   4351 
   4352    if (maxoff < rbp_min || minoff > rbp_max) {
   4353       /* no overlap with rbp */
   4354    } else {
   4355       return True;
   4356    }
   4357 
   4358    if (maxoff < rip_min || minoff > rip_max) {
    4359       /* no overlap with rip */
   4360    } else {
   4361       return True;
   4362    }
   4363 
   4364    return False;
   4365 }
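
         /* Usage sketch (illustrative; VexRegUpdUnwindregsAtMemAccess is
            the usual default mode).  A write covering only %rax never
            requires precise exceptions, whereas any write overlapping
            %rsp always does: */
         static inline void precise_exns_examples ( void )
         {
            Int rax = offsetof(VexGuestAMD64State, guest_RAX);
            Int rsp = offsetof(VexGuestAMD64State, guest_RSP);
            vassert(!guest_amd64_state_requires_precise_mem_exns
                       (rax, rax+7, VexRegUpdUnwindregsAtMemAccess));
            vassert( guest_amd64_state_requires_precise_mem_exns
                       (rsp, rsp+7, VexRegUpdUnwindregsAtMemAccess));
         }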
   4366 
   4367 
   4368 #define ALWAYSDEFD(field)                             \
   4369     { offsetof(VexGuestAMD64State, field),            \
   4370       (sizeof ((VexGuestAMD64State*)0)->field) }
   4371 
   4372 VexGuestLayout
   4373    amd64guest_layout
   4374       = {
   4375           /* Total size of the guest state, in bytes. */
   4376           .total_sizeB = sizeof(VexGuestAMD64State),
   4377 
   4378           /* Describe the stack pointer. */
   4379           .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
   4380           .sizeof_SP = 8,
   4381 
   4382           /* Describe the frame pointer. */
   4383           .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
   4384           .sizeof_FP = 8,
   4385 
   4386           /* Describe the instruction pointer. */
   4387           .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
   4388           .sizeof_IP = 8,
   4389 
   4390           /* Describe any sections to be regarded by Memcheck as
   4391              'always-defined'. */
   4392           .n_alwaysDefd = 16,
   4393 
   4394           /* flags thunk: OP and NDEP are always defd, whereas DEP1
   4395              and DEP2 have to be tracked.  See detailed comment in
   4396              gdefs.h on meaning of thunk fields. */
   4397           .alwaysDefd
   4398              = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
   4399                  /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
    4400                  /*  2 */ ALWAYSDEFD(guest_DFLAG),
   4401                  /*  3 */ ALWAYSDEFD(guest_IDFLAG),
   4402                  /*  4 */ ALWAYSDEFD(guest_RIP),
   4403                  /*  5 */ ALWAYSDEFD(guest_FS_CONST),
   4404                  /*  6 */ ALWAYSDEFD(guest_FTOP),
   4405                  /*  7 */ ALWAYSDEFD(guest_FPTAG),
   4406                  /*  8 */ ALWAYSDEFD(guest_FPROUND),
   4407                  /*  9 */ ALWAYSDEFD(guest_FC3210),
   4408                  // /* */ ALWAYSDEFD(guest_CS),
   4409                  // /* */ ALWAYSDEFD(guest_DS),
   4410                  // /* */ ALWAYSDEFD(guest_ES),
   4411                  // /* */ ALWAYSDEFD(guest_FS),
   4412                  // /* */ ALWAYSDEFD(guest_GS),
   4413                  // /* */ ALWAYSDEFD(guest_SS),
   4414                  // /* */ ALWAYSDEFD(guest_LDT),
   4415                  // /* */ ALWAYSDEFD(guest_GDT),
   4416                  /* 10 */ ALWAYSDEFD(guest_EMNOTE),
   4417                  /* 11 */ ALWAYSDEFD(guest_SSEROUND),
   4418                  /* 12 */ ALWAYSDEFD(guest_CMSTART),
   4419                  /* 13 */ ALWAYSDEFD(guest_CMLEN),
   4420                  /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
   4421                  /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
   4422                }
   4423         };
   4424 
   4425 
   4426 /*---------------------------------------------------------------*/
   4427 /*--- end                               guest_amd64_helpers.c ---*/
   4428 /*---------------------------------------------------------------*/
   4429