
/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change a signature here, you'll have to change
   the parameters passed to them in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
    Long u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
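
/* Illustrative check (disabled; a sketch, not part of the build):
   mullU64 assembles the 128-bit product from 32-bit partial products,
   avoiding any need for a __int128 type.  For example: */
#if 0
static void mullU64_check ( void )
{
   ULong hi, lo;
   /* 0xFFFFFFFFFFFFFFFF * 2 == 2^65 - 2 == (hi=1, lo=0xFFFFFFFFFFFFFFFE) */
   mullU64(0xFFFFFFFFFFFFFFFFULL, 2ULL, &hi, &lo);
   vassert(hi == 1 && lo == 0xFFFFFFFFFFFFFFFEULL);
}
#endif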


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
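
/* Sketch (disabled, not part of the build): PF is set when the low 8
   bits of a result contain an even number of 1 bits, so the table
   above could be regenerated like this: */
#if 0
static UChar parity_of_low_byte ( UInt b )
{
   UInt i, ones = 0;
   for (i = 0; i < 8; i++)
      ones += (b >> i) & 1;
   return (ones & 1) ? 0 : AMD64G_CC_MASK_P;   /* even parity => PF set */
}
#endif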

/* Generalised left-shifter: shifts left by n when n >= 0, otherwise
   arithmetically right by -n.  (Relies on >> of a signed Long being
   an arithmetic shift, which gcc guarantees.) */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return x << n;
   else
      return x >> (-n);
}

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)					\
   /* const */ ULong DATA_MASK 					\
      = __data_bits==8                                          \
           ? 0xFFULL 					        \
           : (__data_bits==16                                   \
                ? 0xFFFFULL 		                        \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
   /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
   /* const */ ULong CC_NDEP = cc_ndep_formal;			\
   /* Four bogus assignments, which hopefully gcc can     */	\
   /* optimise away, and which stop it complaining about  */	\
   /* unused variables.                                   */	\
   SIGN_MASK = SIGN_MASK;					\
   DATA_MASK = DATA_MASK;					\
   CC_DEP2 = CC_DEP2;						\
   CC_NDEP = CC_NDEP;

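/* A note on the thunk convention, as reflected in PREAMBLE and the
   ACTIONS_* macros below: the guest state does not store %rflags
   directly.  It stores (CC_OP, CC_DEP1, CC_DEP2, CC_NDEP), where CC_OP
   records which kind of operation last set the flags and the DEP/NDEP
   fields carry just enough data (operands, result, old carry) for the
   helpers here to recompute any individual flag on demand. */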

/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
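
/* Worked example for the OF formula above (a sketch, not a statement
   of spec): for an 8-bit add 0x7F + 0x01, res = 0x80.  Then
   (argL ^ argR ^ -1) & (argL ^ res) has bit 7 set (the operands agree
   in sign but the result differs), and lshift by 12-8 moves it to bit
   11, which is exactly AMD64G_CC_MASK_O; SF is also set and CF clear,
   matching what 'addb $1, %al' with %al == 0x7F produces. */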

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

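/* Note on ADC/SBB (inferred from the 'argR = CC_DEP2 ^ oldC' lines
   below): the thunk stores the second operand XOR'd with the old
   carry, with NDEP holding the old flags, so the macros first undo
   that XOR to recover the true argR. */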
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, oldC, res;		 		\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, oldC, res;	       			\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     Long argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((ULong)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

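/* For the two shift thunks below, DEP1 is the shifted result and DEP2
   is the value as it stood before the final shift step, so the bit the
   cf expression extracts from DEP2 is the last bit shifted out.  (This
   reading is inferred from the cf/of computations, not stated in any
   spec.) */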
#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined only if the shift count is 1 */		\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { Long cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined only if the shift count is 1 */		\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & CC_DEP1)			\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}
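
/* Sketch (not part of the build): for the 8-bit instantiation above,
   16 * 16 gives rr = 256, lo = 0 and hi = 1, so cf (and hence of) is
   set exactly when the product overflows the narrow type -- which is
   how mulb defines CF and OF. */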

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_STYPE)CC_DEP1)                      \
                     * ((DATA_STYPE)CC_DEP2) );                 \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long cf, pf, af, zf, sf, of;                               \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

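/* The next three thunks are for the BMI1 bit-twiddling ops.  Their
   architectural definitions, for reference: BLSI is src & -src
   (isolate lowest set bit), BLSMSK is src ^ (src-1) (mask up to and
   including the lowest set bit), BLSR is src & (src-1) (clear lowest
   set bit).  In each, DEP1 holds the result and DEP2 the source. */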
#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = 0;							\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function for the clean helpers below; not itself called
   directly from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}
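
/* Usage sketch (illustrative only): after a guest 'cmpq $3, %rax'
   with %rax == 2, the thunk holds (AMD64G_CC_OP_SUBQ, 2, 3, 0), and
   amd64g_calculate_rflags_all_WRK(AMD64G_CC_OP_SUBQ, 2, 3, 0) returns
   a value with CF and SF set (the subtract borrows and the result is
   negative) and ZF clear. */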


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      //      case AMD64G_CC_OP_SUBL:
      //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBW:
      //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBB:
      //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_INCL:
      //      case AMD64G_CC_OP_DECL:
      //         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of,sf,zf,cf,pf;
   ULong inv = cond & 1;
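   /* Conditions come in even/odd pairs (AMD64CondO = 0, AMD64CondNO = 1,
      and so on), so bit 0 of 'cond' selects the negated member of each
      pair; XOR-ing 'inv' into the computed flag bit below implements
      that negation for free. */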

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);   /* DF */
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);   /* ID */
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);   /* AC */

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above          ---*/
/*--- run-time %rflags functions.                             ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to the above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }
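
      /* Each rule below recognises a (cc_op, cond) pair that is known
         at translation time and rewrites the helper call as flat IR,
         so e.g. the add-then-Z case above becomes a single CmpEQ64
         rather than a call into the flag machinery.  The rules for
         the other sizes and conditions follow the same pattern. */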

      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
      }
   1039 
   1040       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
   1041          /* long long sub/cmp, then BE (unsigned less than or equal)
   1042             --> test dst <=u src */
   1043          return unop(Iop_1Uto64,
   1044                      binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
   1045       }
   1046       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
   1047          /* long long sub/cmp, then NBE (unsigned greater than)
   1048             --> test !(dst <=u src) */
   1049          return binop(Iop_Xor64,
   1050                       unop(Iop_1Uto64,
   1051                            binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
   1052                       mkU64(1));
   1053       }
   1054 
   1055       /*---------------- SUBL ----------------*/
   1056 
   1057       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
   1058          /* long sub/cmp, then Z --> test dst==src */
   1059          return unop(Iop_1Uto64,
   1060                      binop(Iop_CmpEQ32,
   1061                            unop(Iop_64to32, cc_dep1),
   1062                            unop(Iop_64to32, cc_dep2)));
   1063       }
   1064       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
   1065          /* long sub/cmp, then NZ --> test dst!=src */
   1066          return unop(Iop_1Uto64,
   1067                      binop(Iop_CmpNE32,
   1068                            unop(Iop_64to32, cc_dep1),
   1069                            unop(Iop_64to32, cc_dep2)));
   1070       }
   1071 
   1072       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
   1073          /* long sub/cmp, then L (signed less than)
   1074             --> test dst <s src */
   1075          return unop(Iop_1Uto64,
   1076                      binop(Iop_CmpLT32S,
   1077                            unop(Iop_64to32, cc_dep1),
   1078                            unop(Iop_64to32, cc_dep2)));
   1079       }
   1080 
   1081       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
   1082          /* long sub/cmp, then LE (signed less than or equal)
   1083             --> test dst <=s src */
   1084          return unop(Iop_1Uto64,
   1085                      binop(Iop_CmpLE32S,
   1086                            unop(Iop_64to32, cc_dep1),
   1087                            unop(Iop_64to32, cc_dep2)));
   1088 
   1089       }
   1090       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
   1091          /* long sub/cmp, then NLE (signed greater than)
   1092             --> test !(dst <=s src)
   1093             --> test (dst >s src)
   1094             --> test (src <s dst) */
   1095          return unop(Iop_1Uto64,
   1096                      binop(Iop_CmpLT32S,
   1097                            unop(Iop_64to32, cc_dep2),
   1098                            unop(Iop_64to32, cc_dep1)));
   1099 
   1100       }
   1101 
   1102       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
   1103          /* long sub/cmp, then BE (unsigned less than or equal)
   1104             --> test dst <=u src */
   1105          return unop(Iop_1Uto64,
   1106                      binop(Iop_CmpLE32U,
   1107                            unop(Iop_64to32, cc_dep1),
   1108                            unop(Iop_64to32, cc_dep2)));
   1109       }
   1110       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
   1111          /* long sub/cmp, then NBE (unsigned greater than)
   1112             --> test src <u dst */
   1113          /* Note, args are opposite way round from the usual */
   1114          return unop(Iop_1Uto64,
   1115                      binop(Iop_CmpLT32U,
   1116                            unop(Iop_64to32, cc_dep2),
   1117                            unop(Iop_64to32, cc_dep1)));
   1118       }
   1119 
   1120       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
   1121          /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
   1122          return unop(Iop_1Uto64,
   1123                      binop(Iop_CmpLT32S,
   1124                            binop(Iop_Sub32,
   1125                                  unop(Iop_64to32, cc_dep1),
   1126                                  unop(Iop_64to32, cc_dep2)),
   1127                            mkU32(0)));
   1128       }
   1129 
   1130       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
   1131          /* long sub/cmp, then B (unsigned less than)
   1132             --> test dst <u src */
   1133          return unop(Iop_1Uto64,
   1134                      binop(Iop_CmpLT32U,
   1135                            unop(Iop_64to32, cc_dep1),
   1136                            unop(Iop_64to32, cc_dep2)));
   1137       }
   1138 
   1139       /*---------------- SUBW ----------------*/
   1140 
   1141       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
   1142          /* word sub/cmp, then Z --> test dst==src */
   1143          return unop(Iop_1Uto64,
   1144                      binop(Iop_CmpEQ16,
   1145                            unop(Iop_64to16,cc_dep1),
   1146                            unop(Iop_64to16,cc_dep2)));
   1147       }
   1148       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
   1149          /* word sub/cmp, then NZ --> test dst!=src */
   1150          return unop(Iop_1Uto64,
   1151                      binop(Iop_CmpNE16,
   1152                            unop(Iop_64to16,cc_dep1),
   1153                            unop(Iop_64to16,cc_dep2)));
   1154       }
   1155 
   1156       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
   1157          /* word sub/cmp, then LE (signed less than or equal)
   1158             --> test dst <=s src */
   1159          return unop(Iop_1Uto64,
   1160                      binop(Iop_CmpLE64S,
   1161                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1162                            binop(Iop_Shl64,cc_dep2,mkU8(48))));
   1163 
   1164       }
   1165 
   1166       /*---------------- SUBB ----------------*/
   1167 
   1168       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
   1169          /* byte sub/cmp, then Z --> test dst==src */
   1170          return unop(Iop_1Uto64,
   1171                      binop(Iop_CmpEQ8,
   1172                            unop(Iop_64to8,cc_dep1),
   1173                            unop(Iop_64to8,cc_dep2)));
   1174       }
   1175       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
   1176          /* byte sub/cmp, then NZ --> test dst!=src */
   1177          return unop(Iop_1Uto64,
   1178                      binop(Iop_CmpNE8,
   1179                            unop(Iop_64to8,cc_dep1),
   1180                            unop(Iop_64to8,cc_dep2)));
   1181       }
   1182 
   1183       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
   1184          /* byte sub/cmp, then BE (unsigned less than or equal)
   1185             --> test dst <=u src */
   1186          return unop(Iop_1Uto64,
   1187                      binop(Iop_CmpLE64U,
   1188                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1189                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1190       }
   1191 
   1192       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
   1193                                           && isU64(cc_dep2, 0)) {
   1194          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
   1195                                          --> test dst <s 0
   1196                                          --> (ULong)dst[7]
   1197             This is yet another scheme by which gcc figures out if the
   1198             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
   1199          /* Note: isU64(cc_dep2, 0) is correct, even though this is
   1200             for an 8-bit comparison, since the args to the helper
   1201             function are always U64s. */
   1202          return binop(Iop_And64,
   1203                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1204                       mkU64(1));
   1205       }
   1206       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
   1207                                           && isU64(cc_dep2, 0)) {
   1208          /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
   1209                                           --> test !(dst <s 0)
   1210                                           --> (ULong) !dst[7]
   1211          */
   1212          return binop(Iop_Xor64,
   1213                       binop(Iop_And64,
   1214                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1215                             mkU64(1)),
   1216                       mkU64(1));
   1217       }
   1218 
   1219       /*---------------- LOGICQ ----------------*/
   1220 
   1221       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
   1222          /* long long and/or/xor, then Z --> test dst==0 */
   1223          return unop(Iop_1Uto64,
   1224                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
   1225       }
   1226       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
   1227          /* long long and/or/xor, then NZ --> test dst!=0 */
   1228          return unop(Iop_1Uto64,
   1229                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
   1230       }
   1231 
   1232       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
   1233          /* long long and/or/xor, then L
   1234             LOGIC sets SF and ZF according to the
   1235             result and makes OF be zero.  L computes SF ^ OF, but
   1236             OF is zero, so this reduces to SF -- which will be 1 iff
   1237             the result is < signed 0.  Hence ...
   1238          */
   1239          return unop(Iop_1Uto64,
   1240                      binop(Iop_CmpLT64S,
   1241                            cc_dep1,
   1242                            mkU64(0)));
   1243       }
   1244 
   1245       /*---------------- LOGICL ----------------*/
   1246 
   1247       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
   1248          /* long and/or/xor, then Z --> test dst==0 */
   1249          return unop(Iop_1Uto64,
   1250                      binop(Iop_CmpEQ32,
   1251                            unop(Iop_64to32, cc_dep1),
   1252                            mkU32(0)));
   1253       }
   1254       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
   1255          /* long and/or/xor, then NZ --> test dst!=0 */
   1256          return unop(Iop_1Uto64,
   1257                      binop(Iop_CmpNE32,
   1258                            unop(Iop_64to32, cc_dep1),
   1259                            mkU32(0)));
   1260       }
   1261 
   1262       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
   1263          /* long and/or/xor, then LE
   1264             This is pretty subtle.  LOGIC sets SF and ZF according to the
   1265             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
   1266             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
   1267             the result is <=signed 0.  Hence ...
   1268          */
   1269          return unop(Iop_1Uto64,
   1270                      binop(Iop_CmpLE32S,
   1271                            unop(Iop_64to32, cc_dep1),
   1272                            mkU32(0)));
   1273       }
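               /* Spot-check of that reduction (illustration only):
                     result = -1 : SF=1 ZF=0 -> SF|ZF = 1, and -1 <=s 0
                     result =  0 : SF=0 ZF=1 -> SF|ZF = 1, and  0 <=s 0
                     result = +1 : SF=0 ZF=0 -> SF|ZF = 0, and !(+1 <=s 0) */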
   1274 
   1275       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
   1276          /* long and/or/xor, then S --> (ULong)result[31] */
   1277          return binop(Iop_And64,
   1278                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1279                       mkU64(1));
   1280       }
   1281       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
    1282          /* long and/or/xor, then NS --> (ULong) !result[31] */
   1283          return binop(Iop_Xor64,
   1284                 binop(Iop_And64,
   1285                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1286                       mkU64(1)),
   1287                 mkU64(1));
   1288       }
   1289 
   1290       /*---------------- LOGICW ----------------*/
   1291 
   1292       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
   1293          /* word and/or/xor, then Z --> test dst==0 */
   1294          return unop(Iop_1Uto64,
   1295                      binop(Iop_CmpEQ64,
   1296                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1297                            mkU64(0)));
   1298       }
   1299       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
   1300          /* word and/or/xor, then NZ --> test dst!=0 */
   1301          return unop(Iop_1Uto64,
   1302                      binop(Iop_CmpNE64,
   1303                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1304                            mkU64(0)));
   1305       }
   1306 
   1307       /*---------------- LOGICB ----------------*/
   1308 
   1309       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
   1310          /* byte and/or/xor, then Z --> test dst==0 */
   1311          return unop(Iop_1Uto64,
   1312                      binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1313                                         mkU64(0)));
   1314       }
   1315       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
   1316          /* byte and/or/xor, then NZ --> test dst!=0 */
   1317          return unop(Iop_1Uto64,
   1318                      binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1319                                         mkU64(0)));
   1320       }
   1321 
   1322       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
   1323          /* this is an idiom gcc sometimes uses to find out if the top
   1324             bit of a byte register is set: eg testb %al,%al; js ..
   1325             Since it just depends on the top bit of the byte, extract
   1326             that bit and explicitly get rid of all the rest.  This
   1327             helps memcheck avoid false positives in the case where any
   1328             of the other bits in the byte are undefined. */
    1329          /* byte and/or/xor, then S --> (ULong)result[7] */
   1330          return binop(Iop_And64,
   1331                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1332                       mkU64(1));
   1333       }
   1334       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
    1335          /* byte and/or/xor, then NS --> (ULong)!result[7] */
   1336          return binop(Iop_Xor64,
   1337                       binop(Iop_And64,
   1338                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1339                             mkU64(1)),
   1340                       mkU64(1));
   1341       }
   1342 
   1343       /*---------------- INCB ----------------*/
   1344 
   1345       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
   1346          /* 8-bit inc, then LE --> sign bit of the arg */
   1347          return binop(Iop_And64,
   1348                       binop(Iop_Shr64,
   1349                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
   1350                             mkU8(7)),
   1351                       mkU64(1));
   1352       }
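               /* Why bit 7 of the arg is the right answer (illustration
                  only): with x = arg+1 (8 bits), LE = (SF^OF)|ZF.
                  arg=0x7F gives x=0x80 (SF=1,OF=1,ZF=0), so LE=0;
                  arg=0xFF gives x=0x00 (ZF=1), so LE=1.  Both agree
                  with arg[7]. */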
   1353 
   1354       /*---------------- INCW ----------------*/
   1355 
   1356       if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
   1357          /* 16-bit inc, then Z --> test dst == 0 */
   1358          return unop(Iop_1Uto64,
   1359                      binop(Iop_CmpEQ64,
   1360                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1361                            mkU64(0)));
   1362       }
   1363 
   1364       /*---------------- DECL ----------------*/
   1365 
   1366       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
   1367          /* dec L, then Z --> test dst == 0 */
   1368          return unop(Iop_1Uto64,
   1369                      binop(Iop_CmpEQ32,
   1370                            unop(Iop_64to32, cc_dep1),
   1371                            mkU32(0)));
   1372       }
   1373 
   1374       /*---------------- DECW ----------------*/
   1375 
   1376       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
   1377          /* 16-bit dec, then NZ --> test dst != 0 */
   1378          return unop(Iop_1Uto64,
   1379                      binop(Iop_CmpNE64,
   1380                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1381                            mkU64(0)));
   1382       }
   1383 
   1384       /*---------------- COPY ----------------*/
   1385       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
   1386          jbe" for example. */
   1387 
   1388       if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
   1389           (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
   1390          /* COPY, then BE --> extract C and Z from dep1, and test (C
   1391             or Z == 1). */
   1392          /* COPY, then NBE --> extract C and Z from dep1, and test (C
   1393             or Z == 0). */
   1394          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
   1395          return
   1396             unop(
   1397                Iop_1Uto64,
   1398                binop(
   1399                   Iop_CmpEQ64,
   1400                   binop(
   1401                      Iop_And64,
   1402                      binop(
   1403                         Iop_Or64,
   1404                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1405                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
   1406                      ),
   1407                      mkU64(1)
   1408                   ),
   1409                   mkU64(nnn)
   1410                )
   1411             );
   1412       }
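               /* Read symbolically (illustration only): the inner
                  expression is ((dep1 >> SHIFT_C) | (dep1 >> SHIFT_Z)) & 1,
                  i.e. C|Z, so comparing it against nnn yields BE for
                  nnn==1 and NBE for nnn==0. */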
   1413 
   1414       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
    1415          /* COPY, then B --> extract C from dep1, and test (C == 1). */
   1416          return
   1417             unop(
   1418                Iop_1Uto64,
   1419                binop(
   1420                   Iop_CmpNE64,
   1421                   binop(
   1422                      Iop_And64,
   1423                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1424                      mkU64(1)
   1425                   ),
   1426                   mkU64(0)
   1427                )
   1428             );
   1429       }
   1430 
   1431       if (isU64(cc_op, AMD64G_CC_OP_COPY)
   1432           && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
   1433          /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
   1434          /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
   1435          UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
   1436          return
   1437             unop(
   1438                Iop_1Uto64,
   1439                binop(
   1440                   Iop_CmpEQ64,
   1441                   binop(
   1442                      Iop_And64,
   1443                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
   1444                      mkU64(1)
   1445                   ),
   1446                   mkU64(nnn)
   1447                )
   1448             );
   1449       }
   1450 
   1451       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
   1452          /* COPY, then P --> extract P from dep1, and test (P == 1). */
   1453          return
   1454             unop(
   1455                Iop_1Uto64,
   1456                binop(
   1457                   Iop_CmpNE64,
   1458                   binop(
   1459                      Iop_And64,
   1460                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
   1461                      mkU64(1)
   1462                   ),
   1463                   mkU64(0)
   1464                )
   1465             );
   1466       }
   1467 
   1468       return NULL;
   1469    }
   1470 
   1471    /* --------- specialising "amd64g_calculate_rflags_c" --------- */
   1472 
   1473    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
   1474       /* specialise calls to above "calculate_rflags_c" function */
   1475       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
   1476       vassert(arity == 4);
   1477       cc_op   = args[0];
   1478       cc_dep1 = args[1];
   1479       cc_dep2 = args[2];
   1480       cc_ndep = args[3];
   1481 
   1482       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
   1483          /* C after sub denotes unsigned less than */
   1484          return unop(Iop_1Uto64,
   1485                      binop(Iop_CmpLT64U,
   1486                            cc_dep1,
   1487                            cc_dep2));
   1488       }
   1489       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
   1490          /* C after sub denotes unsigned less than */
   1491          return unop(Iop_1Uto64,
   1492                      binop(Iop_CmpLT32U,
   1493                            unop(Iop_64to32, cc_dep1),
   1494                            unop(Iop_64to32, cc_dep2)));
   1495       }
   1496       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
   1497          /* C after sub denotes unsigned less than */
   1498          return unop(Iop_1Uto64,
   1499                      binop(Iop_CmpLT64U,
   1500                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
   1501                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
   1502       }
   1503       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
   1504           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
   1505           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
   1506           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
   1507          /* cflag after logic is zero */
   1508          return mkU64(0);
   1509       }
   1510       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
   1511           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
   1512          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
   1513          return cc_ndep;
   1514       }
   1515 
   1516 #     if 0
   1517       if (cc_op->tag == Iex_Const) {
   1518          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
   1519       }
   1520 #     endif
   1521 
   1522       return NULL;
   1523    }
   1524 
   1525 #  undef unop
   1526 #  undef binop
   1527 #  undef mkU64
   1528 #  undef mkU32
   1529 #  undef mkU8
   1530 
   1531    return NULL;
   1532 }
   1533 
   1534 
   1535 /*---------------------------------------------------------------*/
   1536 /*--- Supporting functions for x87 FPU activities.            ---*/
   1537 /*---------------------------------------------------------------*/
   1538 
   1539 static inline Bool host_is_little_endian ( void )
   1540 {
   1541    UInt x = 0x76543210;
   1542    UChar* p = (UChar*)(&x);
   1543    return toBool(*p == 0x10);
   1544 }
   1545 
   1546 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
   1547 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1548 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
   1549 {
   1550    Bool   mantissaIsZero;
   1551    Int    bexp;
   1552    UChar  sign;
   1553    UChar* f64;
   1554 
   1555    vassert(host_is_little_endian());
   1556 
   1557    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
   1558 
   1559    f64  = (UChar*)(&dbl);
   1560    sign = toUChar( (f64[7] >> 7) & 1 );
   1561 
   1562    /* First off, if the tag indicates the register was empty,
   1563       return 1,0,sign,1 */
   1564    if (tag == 0) {
   1565       /* vex_printf("Empty\n"); */
   1566       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
   1567                                    | AMD64G_FC_MASK_C0;
   1568    }
   1569 
   1570    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   1571    bexp &= 0x7FF;
   1572 
   1573    mantissaIsZero
   1574       = toBool(
   1575            (f64[6] & 0x0F) == 0
   1576            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
   1577         );
   1578 
   1579    /* If both exponent and mantissa are zero, the value is zero.
   1580       Return 1,0,sign,0. */
   1581    if (bexp == 0 && mantissaIsZero) {
   1582       /* vex_printf("Zero\n"); */
   1583       return AMD64G_FC_MASK_C3 | 0
   1584                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1585    }
   1586 
   1587    /* If exponent is zero but mantissa isn't, it's a denormal.
   1588       Return 1,1,sign,0. */
   1589    if (bexp == 0 && !mantissaIsZero) {
   1590       /* vex_printf("Denormal\n"); */
   1591       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
   1592                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1593    }
   1594 
   1595    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
   1596       Return 0,1,sign,1. */
   1597    if (bexp == 0x7FF && mantissaIsZero) {
   1598       /* vex_printf("Inf\n"); */
   1599       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
   1600                                    | AMD64G_FC_MASK_C0;
   1601    }
   1602 
   1603    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
   1604       Return 0,0,sign,1. */
   1605    if (bexp == 0x7FF && !mantissaIsZero) {
   1606       /* vex_printf("NaN\n"); */
   1607       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
   1608    }
   1609 
   1610    /* Uh, ok, we give up.  It must be a normal finite number.
   1611       Return 0,1,sign,0.
   1612    */
   1613    /* vex_printf("normal\n"); */
   1614    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1615 }
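         /* Summary of the C3,C2,C1,C0 values produced above; C1 always
            holds the sign bit:

               class      C3 C2 C1    C0
               Empty       1  0 sign   1
               Zero        1  0 sign   0
               Denormal    1  1 sign   0
               Infinity    0  1 sign   1
               NaN         0  0 sign   1
               Normal      0  1 sign   0
         */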
   1616 
   1617 
   1618 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
   1619    appears to differ from the former only in that the 8 FP registers
   1620    themselves are not transferred into the guest state. */
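         /* (Hence the callers below: amd64g_dirtyhelper_FRSTOR and
            amd64g_dirtyhelper_FXRSTOR pass moveRegs=True, while
            amd64g_dirtyhelper_FLDENV passes moveRegs=False.) */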
   1621 static
   1622 VexEmNote do_put_x87 ( Bool moveRegs,
   1623                        /*IN*/UChar* x87_state,
   1624                        /*OUT*/VexGuestAMD64State* vex_state )
   1625 {
   1626    Int        stno, preg;
   1627    UInt       tag;
   1628    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1629    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1630    Fpu_State* x87     = (Fpu_State*)x87_state;
   1631    UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
   1632    UInt       tagw    = x87->env[FP_ENV_TAG];
   1633    UInt       fpucw   = x87->env[FP_ENV_CTRL];
   1634    UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
   1635    VexEmNote  ew;
   1636    UInt       fpround;
   1637    ULong      pair;
   1638 
   1639    /* Copy registers and tags */
   1640    for (stno = 0; stno < 8; stno++) {
   1641       preg = (stno + ftop) & 7;
   1642       tag = (tagw >> (2*preg)) & 3;
   1643       if (tag == 3) {
   1644          /* register is empty */
   1645          /* hmm, if it's empty, does it still get written?  Probably
   1646             safer to say it does.  If we don't, memcheck could get out
   1647             of sync, in that it thinks all FP registers are defined by
   1648             this helper, but in reality some have not been updated. */
   1649          if (moveRegs)
   1650             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   1651          vexTags[preg] = 0;
   1652       } else {
   1653          /* register is non-empty */
   1654          if (moveRegs)
   1655             convert_f80le_to_f64le( &x87->reg[10*stno],
   1656                                     (UChar*)&vexRegs[preg] );
   1657          vexTags[preg] = 1;
   1658       }
   1659    }
   1660 
   1661    /* stack pointer */
   1662    vex_state->guest_FTOP = ftop;
   1663 
   1664    /* status word */
   1665    vex_state->guest_FC3210 = c3210;
   1666 
   1667    /* handle the control word, setting FPROUND and detecting any
   1668       emulation warnings. */
   1669    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   1670    fpround = (UInt)pair & 0xFFFFFFFFULL;
   1671    ew      = (VexEmNote)(pair >> 32);
   1672 
   1673    vex_state->guest_FPROUND = fpround & 3;
   1674 
   1675    /* emulation warnings --> caller */
   1676    return ew;
   1677 }
   1678 
   1679 
   1680 /* Create an x87 FPU state from the guest state, as close as
   1681    we can approximate it. */
   1682 static
   1683 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
   1684                   /*OUT*/UChar* x87_state )
   1685 {
   1686    Int        i, stno, preg;
   1687    UInt       tagw;
   1688    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1689    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1690    Fpu_State* x87     = (Fpu_State*)x87_state;
   1691    UInt       ftop    = vex_state->guest_FTOP;
   1692    UInt       c3210   = vex_state->guest_FC3210;
   1693 
   1694    for (i = 0; i < 14; i++)
   1695       x87->env[i] = 0;
   1696 
   1697    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   1698    x87->env[FP_ENV_STAT]
   1699       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   1700    x87->env[FP_ENV_CTRL]
   1701       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   1702 
   1703    /* Dump the register stack in ST order. */
   1704    tagw = 0;
   1705    for (stno = 0; stno < 8; stno++) {
   1706       preg = (stno + ftop) & 7;
   1707       if (vexTags[preg] == 0) {
   1708          /* register is empty */
   1709          tagw |= (3 << (2*preg));
   1710          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1711                                  &x87->reg[10*stno] );
   1712       } else {
   1713          /* register is full. */
   1714          tagw |= (0 << (2*preg));
   1715          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1716                                  &x87->reg[10*stno] );
   1717       }
   1718    }
   1719    x87->env[FP_ENV_TAG] = toUShort(tagw);
   1720 }
   1721 
   1722 
   1723 /* CALLED FROM GENERATED CODE */
   1724 /* DIRTY HELPER (reads guest state, writes guest mem) */
   1725 /* NOTE: only handles 32-bit format (no REX.W on the insn) */
   1726 void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
   1727 {
   1728    /* Derived from values obtained from
   1729       vendor_id       : AuthenticAMD
   1730       cpu family      : 15
   1731       model           : 12
   1732       model name      : AMD Athlon(tm) 64 Processor 3200+
   1733       stepping        : 0
   1734       cpu MHz         : 2200.000
   1735       cache size      : 512 KB
   1736    */
   1737    /* Somewhat roundabout, but at least it's simple. */
   1738    Fpu_State tmp;
   1739    UShort*   addrS = (UShort*)addr;
   1740    UChar*    addrC = (UChar*)addr;
   1741    U128*     xmm   = (U128*)(addr + 160);
   1742    UInt      mxcsr;
   1743    UShort    fp_tags;
   1744    UInt      summary_tags;
   1745    Int       r, stno;
   1746    UShort    *srcS, *dstS;
   1747 
   1748    do_get_x87( gst, (UChar*)&tmp );
   1749    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
   1750 
   1751    /* Now build the proper fxsave image from the x87 image we just
   1752       made. */
   1753 
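            /* Byte-offset map of the image built below, derived from the
               stores that follow:
                     0 ..   1   FCW
                     2 ..   3   FSW
                     4          FTW summary byte
                     5          pad
                     6 ..   7   FOP         (zeroed; see note below)
                     8 ..  15   RIP         (zeroed; see note below)
                    16 ..  23   RDP         (zeroed; see note below)
                    24 ..  27   MXCSR
                    28 ..  31   MXCSR mask
                    32 .. 159   ST0..ST7, 16 bytes each (10 used, 6 zero)
                   160 .. 415   XMM0..XMM15, 16 bytes each
            */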
   1754    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
    1755    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
   1756 
   1757    /* set addrS[2] in an endian-independent way */
   1758    summary_tags = 0;
   1759    fp_tags = tmp.env[FP_ENV_TAG];
   1760    for (r = 0; r < 8; r++) {
   1761       if ( ((fp_tags >> (2*r)) & 3) != 3 )
   1762          summary_tags |= (1 << r);
   1763    }
   1764    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   1765    addrC[5]  = 0; /* pad */
   1766 
   1767    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
   1768       does not write this field. (?!) */
   1769    addrS[3]  = 0; /* BOGUS */
   1770 
   1771    /* RIP (Last x87 instruction pointer).  From experimentation, the
   1772       real CPU does not write this field. (?!) */
   1773    addrS[4]  = 0; /* BOGUS */
   1774    addrS[5]  = 0; /* BOGUS */
   1775    addrS[6]  = 0; /* BOGUS */
   1776    addrS[7]  = 0; /* BOGUS */
   1777 
   1778    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
   1779       does not write this field. (?!) */
   1780    addrS[8]  = 0; /* BOGUS */
   1781    addrS[9]  = 0; /* BOGUS */
   1782    addrS[10] = 0; /* BOGUS */
   1783    addrS[11] = 0; /* BOGUS */
   1784 
   1785    addrS[12] = toUShort(mxcsr);  /* MXCSR */
   1786    addrS[13] = toUShort(mxcsr >> 16);
   1787 
   1788    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
   1789    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
   1790 
   1791    /* Copy in the FP registers, in ST order. */
   1792    for (stno = 0; stno < 8; stno++) {
   1793       srcS = (UShort*)(&tmp.reg[10*stno]);
   1794       dstS = (UShort*)(&addrS[16 + 8*stno]);
   1795       dstS[0] = srcS[0];
   1796       dstS[1] = srcS[1];
   1797       dstS[2] = srcS[2];
   1798       dstS[3] = srcS[3];
   1799       dstS[4] = srcS[4];
   1800       dstS[5] = 0;
   1801       dstS[6] = 0;
   1802       dstS[7] = 0;
   1803    }
   1804 
   1805    /* That's the first 160 bytes of the image done.  Now only %xmm0
   1806       .. %xmm15 remain to be copied.  If the host is big-endian, these
   1807       need to be byte-swapped. */
   1808    vassert(host_is_little_endian());
   1809 
   1810 #  define COPY_U128(_dst,_src)                       \
   1811       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   1812            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   1813       while (0)
   1814 
   1815    COPY_U128( xmm[0],  gst->guest_YMM0 );
   1816    COPY_U128( xmm[1],  gst->guest_YMM1 );
   1817    COPY_U128( xmm[2],  gst->guest_YMM2 );
   1818    COPY_U128( xmm[3],  gst->guest_YMM3 );
   1819    COPY_U128( xmm[4],  gst->guest_YMM4 );
   1820    COPY_U128( xmm[5],  gst->guest_YMM5 );
   1821    COPY_U128( xmm[6],  gst->guest_YMM6 );
   1822    COPY_U128( xmm[7],  gst->guest_YMM7 );
   1823    COPY_U128( xmm[8],  gst->guest_YMM8 );
   1824    COPY_U128( xmm[9],  gst->guest_YMM9 );
   1825    COPY_U128( xmm[10], gst->guest_YMM10 );
   1826    COPY_U128( xmm[11], gst->guest_YMM11 );
   1827    COPY_U128( xmm[12], gst->guest_YMM12 );
   1828    COPY_U128( xmm[13], gst->guest_YMM13 );
   1829    COPY_U128( xmm[14], gst->guest_YMM14 );
   1830    COPY_U128( xmm[15], gst->guest_YMM15 );
   1831 
   1832 #  undef COPY_U128
   1833 }
   1834 
   1835 
   1836 /* CALLED FROM GENERATED CODE */
   1837 /* DIRTY HELPER (writes guest state, reads guest mem) */
   1838 VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
   1839 {
   1840    Fpu_State tmp;
   1841    VexEmNote warnX87 = EmNote_NONE;
   1842    VexEmNote warnXMM = EmNote_NONE;
   1843    UShort*   addrS   = (UShort*)addr;
   1844    UChar*    addrC   = (UChar*)addr;
   1845    U128*     xmm     = (U128*)(addr + 160);
   1846    UShort    fp_tags;
   1847    Int       r, stno, i;
   1848 
   1849    /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
   1850       to be byte-swapped. */
   1851    vassert(host_is_little_endian());
   1852 
   1853 #  define COPY_U128(_dst,_src)                       \
   1854       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   1855            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   1856       while (0)
   1857 
   1858    COPY_U128( gst->guest_YMM0, xmm[0] );
   1859    COPY_U128( gst->guest_YMM1, xmm[1] );
   1860    COPY_U128( gst->guest_YMM2, xmm[2] );
   1861    COPY_U128( gst->guest_YMM3, xmm[3] );
   1862    COPY_U128( gst->guest_YMM4, xmm[4] );
   1863    COPY_U128( gst->guest_YMM5, xmm[5] );
   1864    COPY_U128( gst->guest_YMM6, xmm[6] );
   1865    COPY_U128( gst->guest_YMM7, xmm[7] );
   1866    COPY_U128( gst->guest_YMM8, xmm[8] );
   1867    COPY_U128( gst->guest_YMM9, xmm[9] );
   1868    COPY_U128( gst->guest_YMM10, xmm[10] );
   1869    COPY_U128( gst->guest_YMM11, xmm[11] );
   1870    COPY_U128( gst->guest_YMM12, xmm[12] );
   1871    COPY_U128( gst->guest_YMM13, xmm[13] );
   1872    COPY_U128( gst->guest_YMM14, xmm[14] );
   1873    COPY_U128( gst->guest_YMM15, xmm[15] );
   1874 
   1875 #  undef COPY_U128
   1876 
   1877    /* Copy the x87 registers out of the image, into a temporary
   1878       Fpu_State struct. */
   1879    for (i = 0; i < 14; i++) tmp.env[i] = 0;
   1880    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   1881    /* fill in tmp.reg[0..7] */
   1882    for (stno = 0; stno < 8; stno++) {
   1883       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
   1884       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
   1885       dstS[0] = srcS[0];
   1886       dstS[1] = srcS[1];
   1887       dstS[2] = srcS[2];
   1888       dstS[3] = srcS[3];
   1889       dstS[4] = srcS[4];
   1890    }
   1891    /* fill in tmp.env[0..13] */
   1892    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
    1893    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
   1894 
   1895    fp_tags = 0;
   1896    for (r = 0; r < 8; r++) {
   1897       if (addrC[4] & (1<<r))
    1898          fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
    1899       else
    1900          fp_tags |= (3 << (2*r)); /* EMPTY */
   1901    }
   1902    tmp.env[FP_ENV_TAG] = fp_tags;
   1903 
   1904    /* Now write 'tmp' into the guest state. */
   1905    warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
   1906 
   1907    { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
   1908                 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
   1909      ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
   1910 
   1911      warnXMM = (VexEmNote)(w64 >> 32);
   1912 
   1913      gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
   1914    }
   1915 
   1916    /* Prefer an X87 emwarn over an XMM one, if both exist. */
   1917    if (warnX87 != EmNote_NONE)
   1918       return warnX87;
   1919    else
   1920       return warnXMM;
   1921 }
   1922 
   1923 
   1924 /* DIRTY HELPER (writes guest state) */
   1925 /* Initialise the x87 FPU state as per 'finit'. */
   1926 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
   1927 {
   1928    Int i;
   1929    gst->guest_FTOP = 0;
   1930    for (i = 0; i < 8; i++) {
   1931       gst->guest_FPTAG[i] = 0; /* empty */
   1932       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   1933    }
   1934    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
   1935    gst->guest_FC3210  = 0;
   1936 }
   1937 
   1938 
   1939 /* CALLED FROM GENERATED CODE */
   1940 /* DIRTY HELPER (reads guest memory) */
   1941 ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
   1942 {
   1943    ULong f64;
   1944    convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
   1945    return f64;
   1946 }
   1947 
   1948 /* CALLED FROM GENERATED CODE */
   1949 /* DIRTY HELPER (writes guest memory) */
   1950 void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
   1951 {
   1952    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
   1953 }
   1954 
   1955 
   1956 /* CALLED FROM GENERATED CODE */
   1957 /* CLEAN HELPER */
    1958 /* mxcsr[15:0] contains an SSE native format MXCSR value.
   1959    Extract from it the required SSEROUND value and any resulting
   1960    emulation warning, and return (warn << 32) | sseround value.
   1961 */
   1962 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
   1963 {
   1964    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   1965    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   1966    ULong rmode = (mxcsr >> 13) & 3;
   1967 
   1968    /* Detect any required emulation warnings. */
   1969    VexEmNote ew = EmNote_NONE;
   1970 
   1971    if ((mxcsr & 0x1F80) != 0x1F80) {
   1972       /* unmasked exceptions! */
   1973       ew = EmWarn_X86_sseExns;
   1974    }
   1975    else
   1976    if (mxcsr & (1<<15)) {
   1977       /* FZ is set */
   1978       ew = EmWarn_X86_fz;
   1979    }
   1980    else
   1981    if (mxcsr & (1<<6)) {
   1982       /* DAZ is set */
   1983       ew = EmWarn_X86_daz;
   1984    }
   1985 
   1986    return (((ULong)ew) << 32) | ((ULong)rmode);
   1987 }
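         #  if 0
         /* A minimal sanity sketch (hypothetical helper, deliberately not
            compiled): the MXCSR reset value 0x1F80 -- all exceptions
            masked, round-to-nearest, FZ and DAZ clear -- should yield
            rmode 0 (Irrm_NEAREST) and no warning. */
         static void example_check_ldmxcsr ( void )
         {
            ULong r = amd64g_check_ldmxcsr(0x1F80);
            vassert((r & 0xFFFFFFFFULL) == 0);            /* rmode 0 */
            vassert((VexEmNote)(r >> 32) == EmNote_NONE); /* no emwarn */
         }
         #  endif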
   1988 
   1989 
   1990 /* CALLED FROM GENERATED CODE */
   1991 /* CLEAN HELPER */
   1992 /* Given sseround as an IRRoundingMode value, create a suitable SSE
   1993    native format MXCSR value. */
   1994 ULong amd64g_create_mxcsr ( ULong sseround )
   1995 {
   1996    sseround &= 3;
   1997    return 0x1F80 | (sseround << 13);
   1998 }
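         /* Note the round trip: for any rmode r in 0..3,
            amd64g_check_ldmxcsr(amd64g_create_mxcsr(r)) gives back r with
            no warning, since 0x1F80 masks all exceptions and leaves
            FZ/DAZ clear. */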
   1999 
   2000 
   2001 /* CLEAN HELPER */
    2002 /* fpucw[15:0] contains an x87 native format FPU control word.
   2003    Extract from it the required FPROUND value and any resulting
   2004    emulation warning, and return (warn << 32) | fpround value.
   2005 */
   2006 ULong amd64g_check_fldcw ( ULong fpucw )
   2007 {
   2008    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   2009    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2010    ULong rmode = (fpucw >> 10) & 3;
   2011 
   2012    /* Detect any required emulation warnings. */
   2013    VexEmNote ew = EmNote_NONE;
   2014 
   2015    if ((fpucw & 0x3F) != 0x3F) {
   2016       /* unmasked exceptions! */
   2017       ew = EmWarn_X86_x87exns;
   2018    }
   2019    else
   2020    if (((fpucw >> 8) & 3) != 3) {
   2021       /* unsupported precision */
   2022       ew = EmWarn_X86_x87precision;
   2023    }
   2024 
   2025    return (((ULong)ew) << 32) | ((ULong)rmode);
   2026 }
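         #  if 0
         /* A minimal sanity sketch (hypothetical helper, deliberately not
            compiled): the 'finit' control word 0x037F -- all exceptions
            masked, 64-bit precision, round-to-nearest -- gives rmode 0
            and no warning, and round-trips with amd64g_create_fpucw
            below. */
         static void example_check_fldcw ( void )
         {
            ULong r = amd64g_check_fldcw(0x037F);
            vassert((r & 0xFFFFFFFFULL) == 0);            /* rmode 0 */
            vassert((VexEmNote)(r >> 32) == EmNote_NONE); /* no emwarn */
            vassert(amd64g_create_fpucw(0) == 0x037F);
         }
         #  endif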
   2027 
   2028 
   2029 /* CLEAN HELPER */
   2030 /* Given fpround as an IRRoundingMode value, create a suitable x87
   2031    native format FPU control word. */
   2032 ULong amd64g_create_fpucw ( ULong fpround )
   2033 {
   2034    fpround &= 3;
   2035    return 0x037F | (fpround << 10);
   2036 }
   2037 
   2038 
   2039 /* This is used to implement 'fldenv'.
   2040    Reads 28 bytes at x87_state[0 .. 27]. */
   2041 /* CALLED FROM GENERATED CODE */
   2042 /* DIRTY HELPER */
   2043 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
   2044                                       /*IN*/HWord x87_state)
   2045 {
   2046    return do_put_x87( False, (UChar*)x87_state, vex_state );
   2047 }
   2048 
   2049 
   2050 /* CALLED FROM GENERATED CODE */
   2051 /* DIRTY HELPER */
   2052 /* Create an x87 FPU env from the guest state, as close as we can
   2053    approximate it.  Writes 28 bytes at x87_state[0..27]. */
   2054 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
   2055                                  /*OUT*/HWord x87_state )
   2056 {
   2057    Int        i, stno, preg;
   2058    UInt       tagw;
   2059    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2060    Fpu_State* x87     = (Fpu_State*)x87_state;
   2061    UInt       ftop    = vex_state->guest_FTOP;
   2062    ULong      c3210   = vex_state->guest_FC3210;
   2063 
   2064    for (i = 0; i < 14; i++)
   2065       x87->env[i] = 0;
   2066 
   2067    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   2068    x87->env[FP_ENV_STAT]
   2069       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
   2070    x87->env[FP_ENV_CTRL]
   2071       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
   2072 
   2073    /* Compute the x87 tag word. */
   2074    tagw = 0;
   2075    for (stno = 0; stno < 8; stno++) {
   2076       preg = (stno + ftop) & 7;
   2077       if (vexTags[preg] == 0) {
   2078          /* register is empty */
   2079          tagw |= (3 << (2*preg));
   2080       } else {
   2081          /* register is full. */
   2082          tagw |= (0 << (2*preg));
   2083       }
   2084    }
   2085    x87->env[FP_ENV_TAG] = toUShort(tagw);
   2086 
    2087    /* We don't dump the x87 registers, though. */
   2088 }
   2089 
   2090 
   2091 /* This is used to implement 'fnsave'.
   2092    Writes 108 bytes at x87_state[0 .. 107]. */
   2093 /* CALLED FROM GENERATED CODE */
   2094 /* DIRTY HELPER */
   2095 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
   2096                                  /*OUT*/HWord x87_state)
   2097 {
   2098    do_get_x87( vex_state, (UChar*)x87_state );
   2099 }
   2100 
   2101 
   2102 /* This is used to implement 'fnsaves'.
   2103    Writes 94 bytes at x87_state[0 .. 93]. */
   2104 /* CALLED FROM GENERATED CODE */
   2105 /* DIRTY HELPER */
   2106 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
   2107                                   /*OUT*/HWord x87_state)
   2108 {
   2109    Int           i, stno, preg;
   2110    UInt          tagw;
   2111    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2112    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2113    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2114    UInt          ftop    = vex_state->guest_FTOP;
   2115    UInt          c3210   = vex_state->guest_FC3210;
   2116 
   2117    for (i = 0; i < 7; i++)
   2118       x87->env[i] = 0;
   2119 
   2120    x87->env[FPS_ENV_STAT]
   2121       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   2122    x87->env[FPS_ENV_CTRL]
   2123       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   2124 
   2125    /* Dump the register stack in ST order. */
   2126    tagw = 0;
   2127    for (stno = 0; stno < 8; stno++) {
   2128       preg = (stno + ftop) & 7;
   2129       if (vexTags[preg] == 0) {
   2130          /* register is empty */
   2131          tagw |= (3 << (2*preg));
   2132          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2133                                  &x87->reg[10*stno] );
   2134       } else {
   2135          /* register is full. */
   2136          tagw |= (0 << (2*preg));
   2137          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2138                                  &x87->reg[10*stno] );
   2139       }
   2140    }
   2141    x87->env[FPS_ENV_TAG] = toUShort(tagw);
   2142 }
   2143 
   2144 
   2145 /* This is used to implement 'frstor'.
   2146    Reads 108 bytes at x87_state[0 .. 107]. */
   2147 /* CALLED FROM GENERATED CODE */
   2148 /* DIRTY HELPER */
   2149 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
   2150                                       /*IN*/HWord x87_state)
   2151 {
   2152    return do_put_x87( True, (UChar*)x87_state, vex_state );
   2153 }
   2154 
   2155 
   2156 /* This is used to implement 'frstors'.
   2157    Reads 94 bytes at x87_state[0 .. 93]. */
   2158 /* CALLED FROM GENERATED CODE */
   2159 /* DIRTY HELPER */
   2160 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
   2161                                        /*IN*/HWord x87_state)
   2162 {
   2163    Int           stno, preg;
   2164    UInt          tag;
   2165    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2166    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2167    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2168    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
   2169    UInt          tagw    = x87->env[FPS_ENV_TAG];
   2170    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
   2171    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
   2172    VexEmNote     ew;
   2173    UInt          fpround;
   2174    ULong         pair;
   2175 
   2176    /* Copy registers and tags */
   2177    for (stno = 0; stno < 8; stno++) {
   2178       preg = (stno + ftop) & 7;
   2179       tag = (tagw >> (2*preg)) & 3;
   2180       if (tag == 3) {
   2181          /* register is empty */
   2182          /* hmm, if it's empty, does it still get written?  Probably
   2183             safer to say it does.  If we don't, memcheck could get out
   2184             of sync, in that it thinks all FP registers are defined by
   2185             this helper, but in reality some have not been updated. */
   2186          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   2187          vexTags[preg] = 0;
   2188       } else {
   2189          /* register is non-empty */
   2190          convert_f80le_to_f64le( &x87->reg[10*stno],
   2191                                  (UChar*)&vexRegs[preg] );
   2192          vexTags[preg] = 1;
   2193       }
   2194    }
   2195 
   2196    /* stack pointer */
   2197    vex_state->guest_FTOP = ftop;
   2198 
   2199    /* status word */
   2200    vex_state->guest_FC3210 = c3210;
   2201 
   2202    /* handle the control word, setting FPROUND and detecting any
   2203       emulation warnings. */
   2204    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   2205    fpround = (UInt)pair & 0xFFFFFFFFULL;
   2206    ew      = (VexEmNote)(pair >> 32);
   2207 
   2208    vex_state->guest_FPROUND = fpround & 3;
   2209 
   2210    /* emulation warnings --> caller */
   2211    return ew;
   2212 }
   2213 
   2214 
   2215 /*---------------------------------------------------------------*/
   2216 /*--- Misc integer helpers, including rotates and CPUID.      ---*/
   2217 /*---------------------------------------------------------------*/
   2218 
   2219 /* Claim to be the following CPU, which is probably representative of
   2220    the lowliest (earliest) amd64 offerings.  It can do neither sse3
   2221    nor cx16.
   2222 
   2223    vendor_id       : AuthenticAMD
   2224    cpu family      : 15
   2225    model           : 5
   2226    model name      : AMD Opteron (tm) Processor 848
   2227    stepping        : 10
   2228    cpu MHz         : 1797.682
   2229    cache size      : 1024 KB
   2230    fpu             : yes
   2231    fpu_exception   : yes
   2232    cpuid level     : 1
   2233    wp              : yes
   2234    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2235                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
   2236                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
   2237    bogomips        : 3600.62
   2238    TLB size        : 1088 4K pages
   2239    clflush size    : 64
   2240    cache_alignment : 64
   2241    address sizes   : 40 bits physical, 48 bits virtual
   2242    power management: ts fid vid ttp
   2243 
   2244    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
   2245    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
   2246    and 3dnowext is 80000001.EDX.30.
   2247 */
   2248 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
   2249 {
   2250 #  define SET_ABCD(_a,_b,_c,_d)                \
   2251       do { st->guest_RAX = (ULong)(_a);        \
   2252            st->guest_RBX = (ULong)(_b);        \
   2253            st->guest_RCX = (ULong)(_c);        \
   2254            st->guest_RDX = (ULong)(_d);        \
   2255       } while (0)
   2256 
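            /* For leaf 0 below, the vendor string is the 12 bytes of
               EBX:EDX:ECX in that order, read as little-endian ASCII:
               0x68747541 = "Auth", 0x69746e65 = "enti",
               0x444d4163 = "cAMD", i.e. "AuthenticAMD". */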
   2257    switch (0xFFFFFFFF & st->guest_RAX) {
   2258       case 0x00000000:
   2259          SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
   2260          break;
   2261       case 0x00000001:
   2262          SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
   2263          break;
   2264       case 0x80000000:
   2265          SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
   2266          break;
   2267       case 0x80000001:
   2268          /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
   2269             the original it-is-supported value that the h/w provides.
   2270             See #291568. */
   2271          SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
   2272                                                       0x21d3fbff);
   2273          break;
   2274       case 0x80000002:
   2275          SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
   2276          break;
   2277       case 0x80000003:
   2278          SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
   2279          break;
   2280       case 0x80000004:
   2281          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2282          break;
   2283       case 0x80000005:
   2284          SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
   2285          break;
   2286       case 0x80000006:
   2287          SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
   2288          break;
   2289       case 0x80000007:
   2290          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
   2291          break;
   2292       case 0x80000008:
   2293          SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
   2294          break;
   2295       default:
   2296          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2297          break;
   2298    }
   2299 #  undef SET_ABCD
   2300 }
   2301 
   2302 
   2303 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
   2304    capable.
   2305 
   2306    vendor_id       : GenuineIntel
   2307    cpu family      : 6
   2308    model           : 15
   2309    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
   2310    stepping        : 6
   2311    cpu MHz         : 2394.000
   2312    cache size      : 4096 KB
   2313    physical id     : 0
   2314    siblings        : 2
   2315    core id         : 0
   2316    cpu cores       : 2
   2317    fpu             : yes
   2318    fpu_exception   : yes
   2319    cpuid level     : 10
   2320    wp              : yes
   2321    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2322                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2323                      mmx fxsr sse sse2 ss ht tm syscall nx lm
   2324                      constant_tsc pni monitor ds_cpl vmx est tm2
   2325                      cx16 xtpr lahf_lm
   2326    bogomips        : 4798.78
   2327    clflush size    : 64
   2328    cache_alignment : 64
   2329    address sizes   : 36 bits physical, 48 bits virtual
   2330    power management:
   2331 */
   2332 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
   2333 {
   2334 #  define SET_ABCD(_a,_b,_c,_d)                \
   2335       do { st->guest_RAX = (ULong)(_a);        \
   2336            st->guest_RBX = (ULong)(_b);        \
   2337            st->guest_RCX = (ULong)(_c);        \
   2338            st->guest_RDX = (ULong)(_d);        \
   2339       } while (0)
   2340 
   2341    switch (0xFFFFFFFF & st->guest_RAX) {
   2342       case 0x00000000:
   2343          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
   2344          break;
   2345       case 0x00000001:
   2346          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
   2347          break;
   2348       case 0x00000002:
   2349          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
   2350          break;
   2351       case 0x00000003:
   2352          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2353          break;
   2354       case 0x00000004: {
   2355          switch (0xFFFFFFFF & st->guest_RCX) {
   2356             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
   2357                                       0x0000003f, 0x00000001); break;
   2358             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
   2359                                       0x0000003f, 0x00000001); break;
   2360             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
   2361                                       0x00000fff, 0x00000001); break;
   2362             default:         SET_ABCD(0x00000000, 0x00000000,
   2363                                       0x00000000, 0x00000000); break;
   2364          }
   2365          break;
   2366       }
   2367       case 0x00000005:
   2368          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
   2369          break;
   2370       case 0x00000006:
   2371          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
   2372          break;
   2373       case 0x00000007:
   2374          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2375          break;
   2376       case 0x00000008:
   2377          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
   2378          break;
   2379       case 0x00000009:
   2380          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2381          break;
   2382       case 0x0000000a:
    2383       unhandled_eax_value:  /* unknown leaves get the leaf-0xa values */
   2384          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
   2385          break;
   2386       case 0x80000000:
   2387          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2388          break;
   2389       case 0x80000001:
   2390          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
   2391          break;
   2392       case 0x80000002:
   2393          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2394          break;
   2395       case 0x80000003:
   2396          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
   2397          break;
   2398       case 0x80000004:
   2399          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
   2400          break;
   2401       case 0x80000005:
   2402          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2403          break;
   2404       case 0x80000006:
   2405          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
   2406          break;
   2407       case 0x80000007:
   2408          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2409          break;
   2410       case 0x80000008:
   2411          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2412          break;
   2413       default:
   2414          goto unhandled_eax_value;
   2415    }
   2416 #  undef SET_ABCD
   2417 }
   2418 
   2419 
   2420 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
   2421    capable.
   2422 
   2423    vendor_id       : GenuineIntel
   2424    cpu family      : 6
   2425    model           : 37
   2426    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
   2427    stepping        : 2
   2428    cpu MHz         : 3334.000
   2429    cache size      : 4096 KB
   2430    physical id     : 0
   2431    siblings        : 4
   2432    core id         : 0
   2433    cpu cores       : 2
   2434    apicid          : 0
   2435    initial apicid  : 0
   2436    fpu             : yes
   2437    fpu_exception   : yes
   2438    cpuid level     : 11
   2439    wp              : yes
   2440    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2441                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2442                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2443                      lm constant_tsc arch_perfmon pebs bts rep_good
   2444                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2445                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
   2446                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
   2447                      arat tpr_shadow vnmi flexpriority ept vpid
   2448    bogomips        : 6957.57
   2449    clflush size    : 64
   2450    cache_alignment : 64
   2451    address sizes   : 36 bits physical, 48 bits virtual
   2452    power management:
   2453 */
   2454 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
   2455 {
   2456 #  define SET_ABCD(_a,_b,_c,_d)                \
   2457       do { st->guest_RAX = (ULong)(_a);        \
   2458            st->guest_RBX = (ULong)(_b);        \
   2459            st->guest_RCX = (ULong)(_c);        \
   2460            st->guest_RDX = (ULong)(_d);        \
   2461       } while (0)
   2462 
   2463    UInt old_eax = (UInt)st->guest_RAX;
   2464    UInt old_ecx = (UInt)st->guest_RCX;
   2465 
   2466    switch (old_eax) {
   2467       case 0x00000000:
   2468          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
   2469          break;
   2470       case 0x00000001:
   2471          SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
   2472          break;
   2473       case 0x00000002:
   2474          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
   2475          break;
   2476       case 0x00000003:
   2477          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2478          break;
   2479       case 0x00000004:
   2480          switch (old_ecx) {
   2481             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2482                                       0x0000003f, 0x00000000); break;
   2483             case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
   2484                                       0x0000007f, 0x00000000); break;
   2485             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2486                                       0x000001ff, 0x00000000); break;
   2487             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
   2488                                       0x00000fff, 0x00000002); break;
   2489             default:         SET_ABCD(0x00000000, 0x00000000,
   2490                                       0x00000000, 0x00000000); break;
   2491          }
   2492          break;
   2493       case 0x00000005:
   2494          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2495          break;
   2496       case 0x00000006:
   2497          SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
   2498          break;
   2499       case 0x00000007:
   2500          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2501          break;
   2502       case 0x00000008:
   2503          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2504          break;
   2505       case 0x00000009:
   2506          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2507          break;
   2508       case 0x0000000a:
   2509          SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
   2510          break;
   2511       case 0x0000000b:
   2512          switch (old_ecx) {
   2513             case 0x00000000:
   2514                SET_ABCD(0x00000001, 0x00000002,
   2515                         0x00000100, 0x00000000); break;
   2516             case 0x00000001:
   2517                SET_ABCD(0x00000004, 0x00000004,
   2518                         0x00000201, 0x00000000); break;
   2519             default:
   2520                SET_ABCD(0x00000000, 0x00000000,
   2521                         old_ecx,    0x00000000); break;
   2522          }
   2523          break;
   2524       case 0x0000000c:
   2525          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2526          break;
   2527       case 0x0000000d:
   2528          switch (old_ecx) {
   2529             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
   2530                                       0x00000100, 0x00000000); break;
   2531             case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
   2532                                       0x00000201, 0x00000000); break;
   2533             default:         SET_ABCD(0x00000000, 0x00000000,
   2534                                       old_ecx,    0x00000000); break;
   2535          }
   2536          break;
   2537       case 0x80000000:
   2538          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2539          break;
   2540       case 0x80000001:
   2541          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   2542          break;
   2543       case 0x80000002:
   2544          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2545          break;
   2546       case 0x80000003:
   2547          SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
   2548          break;
   2549       case 0x80000004:
   2550          SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
   2551          break;
   2552       case 0x80000005:
   2553          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2554          break;
   2555       case 0x80000006:
   2556          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   2557          break;
   2558       case 0x80000007:
   2559          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   2560          break;
   2561       case 0x80000008:
   2562          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2563          break;
   2564       default:
   2565          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2566          break;
   2567    }
   2568 #  undef SET_ABCD
   2569 }
   2570 
   2571 
   2572 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
   2573    capable.  Plus (kludge!) it "supports" HTM.
   2574 
   2575    vendor_id       : GenuineIntel
   2576    cpu family      : 6
   2577    model           : 42
   2578    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
   2579    stepping        : 7
   2580    cpu MHz         : 1600.000
   2581    cache size      : 6144 KB
   2582    physical id     : 0
   2583    siblings        : 4
   2584    core id         : 3
   2585    cpu cores       : 4
   2586    apicid          : 6
   2587    initial apicid  : 6
   2588    fpu             : yes
   2589    fpu_exception   : yes
   2590    cpuid level     : 13
   2591    wp              : yes
   2592    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2593                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2594                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2595                      lm constant_tsc arch_perfmon pebs bts rep_good
   2596                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2597                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
   2598                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
   2599                      lahf_lm ida arat epb xsaveopt pln pts dts
   2600                      tpr_shadow vnmi flexpriority ept vpid
   2601 
   2602    bogomips        : 5768.94
   2603    clflush size    : 64
   2604    cache_alignment : 64
   2605    address sizes   : 36 bits physical, 48 bits virtual
   2606    power management:
   2607 */
   2608 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
   2609 {
   2610 #  define SET_ABCD(_a,_b,_c,_d)                \
   2611       do { st->guest_RAX = (ULong)(_a);        \
   2612            st->guest_RBX = (ULong)(_b);        \
   2613            st->guest_RCX = (ULong)(_c);        \
   2614            st->guest_RDX = (ULong)(_d);        \
   2615       } while (0)
   2616 
   2617    UInt old_eax = (UInt)st->guest_RAX;
   2618    UInt old_ecx = (UInt)st->guest_RCX;
   2619 
   2620    switch (old_eax) {
   2621       case 0x00000000:
   2622          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
   2623          break;
   2624       case 0x00000001:
   2625          SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
   2626          break;
   2627       case 0x00000002:
   2628          SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
   2629          break;
   2630       case 0x00000003:
   2631          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2632          break;
   2633       case 0x00000004:
   2634          switch (old_ecx) {
   2635             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2636                                       0x0000003f, 0x00000000); break;
   2637             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
   2638                                       0x0000003f, 0x00000000); break;
   2639             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2640                                       0x000001ff, 0x00000000); break;
   2641             case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
   2642                                       0x00001fff, 0x00000006); break;
   2643             default:         SET_ABCD(0x00000000, 0x00000000,
   2644                                       0x00000000, 0x00000000); break;
   2645          }
   2646          break;
   2647       case 0x00000005:
   2648          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2649          break;
   2650       case 0x00000006:
   2651          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
   2652          break;
   2653       case 0x00000007:
   2654          SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
   2655          break;
   2656       case 0x00000008:
   2657          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2658          break;
   2659       case 0x00000009:
   2660          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2661          break;
   2662       case 0x0000000a:
   2663          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
   2664          break;
   2665       case 0x0000000b:
   2666          switch (old_ecx) {
   2667             case 0x00000000:
   2668                SET_ABCD(0x00000001, 0x00000001,
   2669                         0x00000100, 0x00000000); break;
   2670             case 0x00000001:
   2671                SET_ABCD(0x00000004, 0x00000004,
   2672                         0x00000201, 0x00000000); break;
   2673             default:
   2674                SET_ABCD(0x00000000, 0x00000000,
   2675                         old_ecx,    0x00000000); break;
   2676          }
   2677          break;
   2678       case 0x0000000c:
   2679          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2680          break;
   2681       case 0x0000000d:
   2682          switch (old_ecx) {
   2683             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
   2684                                       0x00000340, 0x00000000); break;
   2685             case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
   2686                                       0x00000000, 0x00000000); break;
   2687             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
   2688                                       0x00000000, 0x00000000); break;
   2689             default:         SET_ABCD(0x00000000, 0x00000000,
   2690                                       0x00000000, 0x00000000); break;
   2691          }
   2692          break;
   2693       case 0x0000000e:
   2694          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   2695          break;
   2696       case 0x0000000f:
   2697          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   2698          break;
   2699       case 0x80000000:
   2700          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2701          break;
   2702       case 0x80000001:
   2703          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   2704          break;
   2705       case 0x80000002:
   2706          SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
   2707          break;
   2708       case 0x80000003:
   2709          SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
   2710          break;
   2711       case 0x80000004:
   2712          SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
   2713          break;
   2714       case 0x80000005:
   2715          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2716          break;
   2717       case 0x80000006:
   2718          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   2719          break;
   2720       case 0x80000007:
   2721          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   2722          break;
   2723       case 0x80000008:
   2724          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2725          break;
   2726       default:
   2727          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   2728          break;
   2729    }
   2730 #  undef SET_ABCD
   2731 }
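
        /* Note on the tables above and below: CPUID leaf 0x0 returns the
           vendor string in EBX:EDX:ECX -- 0x756e6547 0x49656e69 0x6c65746e
           decode (little-endian) as "Genu" "ineI" "ntel" -- and leaves
           0x80000002..0x80000004 return the 48-character brand string, 16
           bytes per leaf, which is how the model names quoted in these
           comments are encoded. */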
   2732 
   2733 
   2734 ULong amd64g_calculate_RCR ( ULong arg,
   2735                              ULong rot_amt,
   2736                              ULong rflags_in,
   2737                              Long  szIN )
   2738 {
   2739    Bool  wantRflags = toBool(szIN < 0);
   2740    ULong sz         = wantRflags ? (-szIN) : szIN;
   2741    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   2742    ULong cf=0, of=0, tempcf;
   2743 
   2744    switch (sz) {
   2745       case 8:
   2746          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2747          of        = ((arg >> 63) ^ cf) & 1;
   2748          while (tempCOUNT > 0) {
   2749             tempcf = arg & 1;
   2750             arg    = (arg >> 1) | (cf << 63);
   2751             cf     = tempcf;
   2752             tempCOUNT--;
   2753          }
   2754          break;
   2755       case 4:
   2756          while (tempCOUNT >= 33) tempCOUNT -= 33;
   2757          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2758          of        = ((arg >> 31) ^ cf) & 1;
   2759          while (tempCOUNT > 0) {
   2760             tempcf = arg & 1;
   2761             arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
   2762             cf     = tempcf;
   2763             tempCOUNT--;
   2764          }
   2765          break;
   2766       case 2:
   2767          while (tempCOUNT >= 17) tempCOUNT -= 17;
   2768          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2769          of        = ((arg >> 15) ^ cf) & 1;
   2770          while (tempCOUNT > 0) {
   2771             tempcf = arg & 1;
   2772             arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
   2773             cf     = tempcf;
   2774             tempCOUNT--;
   2775          }
   2776          break;
   2777       case 1:
   2778          while (tempCOUNT >= 9) tempCOUNT -= 9;
   2779          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2780          of        = ((arg >> 7) ^ cf) & 1;
   2781          while (tempCOUNT > 0) {
   2782             tempcf = arg & 1;
   2783             arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
   2784             cf     = tempcf;
   2785             tempCOUNT--;
   2786          }
   2787          break;
   2788       default:
   2789          vpanic("calculate_RCR(amd64g): invalid size");
   2790    }
   2791 
   2792    cf &= 1;
   2793    of &= 1;
   2794    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   2795    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   2796 
   2797    /* caller can ask to have back either the resulting flags or
   2798       resulting value, but not both */
   2799    return wantRflags ? rflags_in : arg;
   2800 }
   2801 
   2802 ULong amd64g_calculate_RCL ( ULong arg,
   2803                              ULong rot_amt,
   2804                              ULong rflags_in,
   2805                              Long  szIN )
   2806 {
   2807    Bool  wantRflags = toBool(szIN < 0);
   2808    ULong sz         = wantRflags ? (-szIN) : szIN;
   2809    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   2810    ULong cf=0, of=0, tempcf;
   2811 
   2812    switch (sz) {
   2813       case 8:
   2814          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2815          while (tempCOUNT > 0) {
   2816             tempcf = (arg >> 63) & 1;
   2817             arg    = (arg << 1) | (cf & 1);
   2818             cf     = tempcf;
   2819             tempCOUNT--;
   2820          }
   2821          of = ((arg >> 63) ^ cf) & 1;
   2822          break;
   2823       case 4:
   2824          while (tempCOUNT >= 33) tempCOUNT -= 33;
   2825          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2826          while (tempCOUNT > 0) {
   2827             tempcf = (arg >> 31) & 1;
   2828             arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
   2829             cf     = tempcf;
   2830             tempCOUNT--;
   2831          }
   2832          of = ((arg >> 31) ^ cf) & 1;
   2833          break;
   2834       case 2:
   2835          while (tempCOUNT >= 17) tempCOUNT -= 17;
   2836          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2837          while (tempCOUNT > 0) {
   2838             tempcf = (arg >> 15) & 1;
   2839             arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
   2840             cf     = tempcf;
   2841             tempCOUNT--;
   2842          }
   2843          of = ((arg >> 15) ^ cf) & 1;
   2844          break;
   2845       case 1:
   2846          while (tempCOUNT >= 9) tempCOUNT -= 9;
   2847          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2848          while (tempCOUNT > 0) {
   2849             tempcf = (arg >> 7) & 1;
   2850             arg    = 0xFFULL & ((arg << 1) | (cf & 1));
   2851             cf     = tempcf;
   2852             tempCOUNT--;
   2853          }
   2854          of = ((arg >> 7) ^ cf) & 1;
   2855          break;
   2856       default:
   2857          vpanic("calculate_RCL(amd64g): invalid size");
   2858    }
   2859 
   2860    cf &= 1;
   2861    of &= 1;
   2862    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   2863    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   2864 
   2865    return wantRflags ? rflags_in : arg;
   2866 }
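
        /* A small usage sketch for the two rotate-through-carry helpers
           above (illustrative values only; nothing in the translator makes
           this exact call).  For an 8-bit RCL of 0xB1 by 1 with CF=0 in the
           incoming rflags:

              amd64g_calculate_RCL(0xB1, 1, 0,  1) == 0x62   (rotated value)
              amd64g_calculate_RCL(0xB1, 1, 0, -1)           (new rflags,
                                                              with CF=1, OF=1)

           since bit 7 of 0xB1 rotates out into CF, and OF is then
           result[7] ^ CF == 0 ^ 1 == 1. */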
   2867 
   2868 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
   2869  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
   2870  */
   2871 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
   2872 {
   2873    ULong hi, lo, tmp, A[16];
   2874 
   2875    A[0] = 0;            A[1] = a;
   2876    A[2] = A[1] << 1;    A[3] = A[2] ^ a;
   2877    A[4] = A[2] << 1;    A[5] = A[4] ^ a;
   2878    A[6] = A[3] << 1;    A[7] = A[6] ^ a;
   2879    A[8] = A[4] << 1;    A[9] = A[8] ^ a;
   2880    A[10] = A[5] << 1;   A[11] = A[10] ^ a;
   2881    A[12] = A[6] << 1;   A[13] = A[12] ^ a;
   2882    A[14] = A[7] << 1;   A[15] = A[14] ^ a;
   2883 
   2884    lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
   2885    hi = lo >> 56;
   2886    lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
   2887    hi = (hi << 8) | (lo >> 56);
   2888    lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
   2889    hi = (hi << 8) | (lo >> 56);
   2890    lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
   2891    hi = (hi << 8) | (lo >> 56);
   2892    lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
   2893    hi = (hi << 8) | (lo >> 56);
   2894    lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
   2895    hi = (hi << 8) | (lo >> 56);
   2896    lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
   2897    hi = (hi << 8) | (lo >> 56);
   2898    lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
   2899 
   2900    ULong m0 = -1;
   2901    m0 /= 255;
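           /* m0 == 0x0101010101010101, so (m0 * 0xfe) == 0xfefe..fe and so
              on: byte-replicated masks.  The seven lines below are the gf2x
              "repair" step: they recover product bits which overflow bit 63
              when a's top seven bits are multiplied -- contributions the
              64-bit table lookups above cannot represent -- and fold them
              into hi. */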
   2902    tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
   2903    tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
   2904    tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
   2905    tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
   2906    tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
   2907    tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
   2908    tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
   2909 
   2910    return which ? hi : lo;
   2911 }
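
        /* Sanity example: carry-less multiplication XORs partial products
           instead of adding them, so 0x3 * 0x3 == (x+1)^2 == x^2 + 1:
           amd64g_calculate_pclmul(3, 3, 0) == 5 (low half) and
           amd64g_calculate_pclmul(3, 3, 1) == 0 (high half). */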
   2912 
   2913 
   2914 /* CALLED FROM GENERATED CODE */
   2915 /* DIRTY HELPER (non-referentially-transparent) */
   2916 /* Horrible hack.  On non-amd64 platforms, return 1. */
   2917 ULong amd64g_dirtyhelper_RDTSC ( void )
   2918 {
   2919 #  if defined(__x86_64__)
   2920    UInt  eax, edx;
   2921    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
   2922    return (((ULong)edx) << 32) | ((ULong)eax);
   2923 #  else
   2924    return 1ULL;
   2925 #  endif
   2926 }
   2927 
   2928 /* CALLED FROM GENERATED CODE */
   2929 /* DIRTY HELPER (non-referentially-transparent) */
   2930 /* Horrible hack.  On non-amd64 platforms, return 1. */
   2931 /* This uses a different calling convention from _RDTSC just above,
   2932    only because of the difficulty of returning 96 bits from a C
   2933    function -- RDTSC returns just 64 bits and so, on amd64, is
   2934    simple by comparison. */
   2935 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
   2936 {
   2937 #  if defined(__x86_64__)
   2938    UInt eax, ecx, edx;
   2939    __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
   2940    st->guest_RAX = (ULong)eax;
   2941    st->guest_RCX = (ULong)ecx;
   2942    st->guest_RDX = (ULong)edx;
   2943 #  else
   2944    /* Do nothing. */
   2945 #  endif
   2946 }
   2947 
   2948 /* CALLED FROM GENERATED CODE */
   2949 /* DIRTY HELPER (non-referentially-transparent) */
   2950 /* Horrible hack.  On non-amd64 platforms, return 0. */
   2951 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
   2952 {
   2953 #  if defined(__x86_64__)
   2954    ULong r = 0;
   2955    portno &= 0xFFFF;
   2956    switch (sz) {
   2957       case 4:
   2958          __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
   2959                               : "=a" (r) : "Nd" (portno));
   2960          break;
   2961       case 2:
   2962          __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
   2963                               : "=a" (r) : "Nd" (portno));
   2964          break;
   2965       case 1:
   2966          __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
   2967                               : "=a" (r) : "Nd" (portno));
   2968          break;
   2969       default:
   2970          break; /* note: no 64-bit version of insn exists */
   2971    }
   2972    return r;
   2973 #  else
   2974    return 0;
   2975 #  endif
   2976 }
   2977 
   2978 
   2979 /* CALLED FROM GENERATED CODE */
   2980 /* DIRTY HELPER (non-referentially-transparent) */
   2981 /* Horrible hack.  On non-amd64 platforms, do nothing. */
   2982 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
   2983 {
   2984 #  if defined(__x86_64__)
   2985    portno &= 0xFFFF;
   2986    switch (sz) {
   2987       case 4:
   2988          __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
   2989                               : : "a" (data), "Nd" (portno));
   2990          break;
   2991       case 2:
   2992          __asm__ __volatile__("outw %w0, %w1"
   2993                               : : "a" (data), "Nd" (portno));
   2994          break;
   2995       case 1:
   2996          __asm__ __volatile__("outb %b0, %w1"
   2997                               : : "a" (data), "Nd" (portno));
   2998          break;
   2999       default:
   3000          break; /* note: no 64-bit version of insn exists */
   3001    }
   3002 #  else
   3003    /* do nothing */
   3004 #  endif
   3005 }
   3006 
   3007 /* CALLED FROM GENERATED CODE */
   3008 /* DIRTY HELPER (non-referentially-transparent) */
   3009 /* Horrible hack.  On non-amd64 platforms, do nothing. */
   3010 /* op = 0: call the native SGDT instruction.
   3011    op = 1: call the native SIDT instruction.
   3012 */
   3013 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
   3014 #  if defined(__x86_64__)
   3015    switch (op) {
   3016       case 0:
   3017          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
   3018          break;
   3019       case 1:
   3020          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
   3021          break;
   3022       default:
   3023          vpanic("amd64g_dirtyhelper_SxDT");
   3024    }
   3025 #  else
   3026    /* Fake a result instead: zero out the 10-byte descriptor. */
   3027    UChar* p = (UChar*)address;
   3028    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
   3029    p[6] = p[7] = p[8] = p[9] = 0;
   3030 #  endif
   3031 }
   3032 
   3033 /*---------------------------------------------------------------*/
   3034 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
   3035 /*---------------------------------------------------------------*/
   3036 
   3037 static inline UChar abdU8 ( UChar xx, UChar yy ) {
   3038    return toUChar(xx>yy ? xx-yy : yy-xx);
   3039 }
   3040 
   3041 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   3042    return (((ULong)w1) << 32) | ((ULong)w0);
   3043 }
   3044 
   3045 static inline UShort sel16x4_3 ( ULong w64 ) {
   3046    UInt hi32 = toUInt(w64 >> 32);
   3047    return toUShort(hi32 >> 16);
   3048 }
   3049 static inline UShort sel16x4_2 ( ULong w64 ) {
   3050    UInt hi32 = toUInt(w64 >> 32);
   3051    return toUShort(hi32);
   3052 }
   3053 static inline UShort sel16x4_1 ( ULong w64 ) {
   3054    UInt lo32 = toUInt(w64);
   3055    return toUShort(lo32 >> 16);
   3056 }
   3057 static inline UShort sel16x4_0 ( ULong w64 ) {
   3058    UInt lo32 = toUInt(w64);
   3059    return toUShort(lo32);
   3060 }
   3061 
   3062 static inline UChar sel8x8_7 ( ULong w64 ) {
   3063    UInt hi32 = toUInt(w64 >> 32);
   3064    return toUChar(hi32 >> 24);
   3065 }
   3066 static inline UChar sel8x8_6 ( ULong w64 ) {
   3067    UInt hi32 = toUInt(w64 >> 32);
   3068    return toUChar(hi32 >> 16);
   3069 }
   3070 static inline UChar sel8x8_5 ( ULong w64 ) {
   3071    UInt hi32 = toUInt(w64 >> 32);
   3072    return toUChar(hi32 >> 8);
   3073 }
   3074 static inline UChar sel8x8_4 ( ULong w64 ) {
   3075    UInt hi32 = toUInt(w64 >> 32);
   3076    return toUChar(hi32 >> 0);
   3077 }
   3078 static inline UChar sel8x8_3 ( ULong w64 ) {
   3079    UInt lo32 = toUInt(w64);
   3080    return toUChar(lo32 >> 24);
   3081 }
   3082 static inline UChar sel8x8_2 ( ULong w64 ) {
   3083    UInt lo32 = toUInt(w64);
   3084    return toUChar(lo32 >> 16);
   3085 }
   3086 static inline UChar sel8x8_1 ( ULong w64 ) {
   3087    UInt lo32 = toUInt(w64);
   3088    return toUChar(lo32 >> 8);
   3089 }
   3090 static inline UChar sel8x8_0 ( ULong w64 ) {
   3091    UInt lo32 = toUInt(w64);
   3092    return toUChar(lo32 >> 0);
   3093 }
   3094 
   3095 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3096 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
   3097 {
   3098    return
   3099       mk32x2(
   3100          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
   3101             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
   3102          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
   3103             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
   3104       );
   3105 }
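
        /* Example: with every 16-bit lane of xx equal to 2 and of yy equal
           to 3, each lane product is 6 and each sum of adjacent products is
           12, so the result is mk32x2(12,12) == 0x0000000C0000000C. */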
   3106 
   3107 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3108 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
   3109 {
   3110    UInt t = 0;
   3111    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   3112    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   3113    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   3114    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   3115    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3116    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3117    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3118    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3119    t &= 0xFFFF;
   3120    return (ULong)t;
   3121 }
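
        /* Example: amd64g_calculate_mmx_psadbw(0x0807060504030201ULL, 0)
           == 1+2+3+4+5+6+7+8 == 0x24, delivered in the low 16 bits. */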
   3122 
   3123 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3124 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
   3125 {
   3126    UShort t, min;
   3127    UInt   idx;
   3128    t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
   3129    t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
   3130    t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
   3131    t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
   3132    t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
   3133    t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
   3134    t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
   3135    t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
   3136    return ((ULong)(idx << 16)) | ((ULong)min);
   3137 }
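
        /* The result packs the minimum unsigned 16-bit lane into bits 15:0
           and its lane index into bits 18:16.  For instance, if the
           smallest value 0x0001 sits in the top lane of sHi (index 7), the
           result is 0x00070001. */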
   3138 
   3139 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3140 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
   3141 {
   3142    UInt  i;
   3143    ULong crc = (b & 0xFFULL) ^ crcIn;
   3144    for (i = 0; i < 8; i++)
   3145       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3146    return crc;
   3147 }
   3148 
   3149 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3150 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
   3151 {
   3152    UInt  i;
   3153    ULong crc = (w & 0xFFFFULL) ^ crcIn;
   3154    for (i = 0; i < 16; i++)
   3155       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3156    return crc;
   3157 }
   3158 
   3159 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3160 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
   3161 {
   3162    UInt i;
   3163    ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   3164    for (i = 0; i < 32; i++)
   3165       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3166    return crc;
   3167 }
   3168 
   3169 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3170 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
   3171 {
   3172    ULong crc = amd64g_calc_crc32l(crcIn, q);
   3173    return amd64g_calc_crc32l(crc, q >> 32);
   3174 }
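
        /* 0x82F63B78 is the bit-reflected CRC-32C (Castagnoli) polynomial
           used by the SSE4.2 crc32 instruction.  A minimal sketch of
           chaining the byte-wise helper over a buffer (hypothetical
           function, not part of this file's interface; note the insn
           itself neither pre- nor post-inverts, unlike many CRC32C
           protocols which start from 0xFFFFFFFF and invert at the end): */
        #if 0
        static ULong crc32c_of_buffer ( const UChar* buf, ULong len )
        {
           ULong i, crc = 0;
           for (i = 0; i < len; i++)
              crc = amd64g_calc_crc32b(crc, buf[i]);
           return crc;
        }
        #endif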
   3175 
   3176 
   3177 /* .. helper for next fn .. */
   3178 static inline ULong sad_8x4 ( ULong xx, ULong yy )
   3179 {
   3180    UInt t = 0;
   3181    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3182    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3183    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3184    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3185    return (ULong)t;
   3186 }
   3187 
   3188 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3189 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
   3190                             ULong dHi, ULong dLo,
   3191                             ULong imm_and_return_control_bit )
   3192 {
   3193    UInt imm8     = imm_and_return_control_bit & 7;
   3194    Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
   3195    UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
   3196    UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
   3197    /* For src we only need 32 bits, so get them into the
   3198       lower half of a 64 bit word. */
   3199    ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
   3200    /* For dst we need to get hold of 56 bits (7 bytes) from a total of
   3201       11 bytes.  If calculating the low part of the result, need bytes
   3202       dstOffsL * 4 + (0 .. 6); if calculating the high part,
   3203       dstOffsL * 4 + (4 .. 10). */
   3204    ULong dst;
   3205    /* dstOffsL = 0, Lo  ->  0 .. 6
   3206       dstOffsL = 1, Lo  ->  4 .. 10
   3207       dstOffsL = 0, Hi  ->  4 .. 10
   3208       dstOffsL = 1, Hi  ->  8 .. 14
   3209    */
   3210    if (calcHi && dstOffsL) {
   3211       /* 8 .. 14 */
   3212       dst = dHi & 0x00FFFFFFFFFFFFFFULL;
   3213    }
   3214    else if (!calcHi && !dstOffsL) {
   3215       /* 0 .. 6 */
   3216       dst = dLo & 0x00FFFFFFFFFFFFFFULL;
   3217    }
   3218    else {
   3219       /* 4 .. 10 */
   3220       dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
   3221    }
   3222    ULong r0  = sad_8x4( dst >>  0, src );
   3223    ULong r1  = sad_8x4( dst >>  8, src );
   3224    ULong r2  = sad_8x4( dst >> 16, src );
   3225    ULong r3  = sad_8x4( dst >> 24, src );
   3226    ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
   3227    return res;
   3228 }
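
        /* Summary of the imm_and_return_control_bit encoding used above:
           bits 1:0 select the 32-bit source chunk, bit 2 selects the
           destination offset, and bit 7 -- not part of the architectural
           imm8 -- asks for the high rather than the low 64 bits of the
           128-bit MPSADBW result. */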
   3229 
   3230 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3231 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
   3232 {
   3233    ULong dst = 0;
   3234    ULong src_bit;
   3235    ULong dst_bit = 1;
   3236    for (src_bit = 1; src_bit; src_bit <<= 1) {
   3237       if (mask & src_bit) {
   3238          if (src_masked & src_bit) dst |= dst_bit;
   3239          dst_bit <<= 1;
   3240       }
   3241    }
   3242    return dst;
   3243 }
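
        /* Example: amd64g_calculate_pext(0xABCD & 0xFF00, 0xFF00) == 0xAB:
           the source bits selected by the mask are packed into the low end
           of the result, as BMI2 PEXT specifies. */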
   3244 
   3245 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3246 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
   3247 {
   3248    ULong dst = 0;
   3249    ULong dst_bit;
   3250    ULong src_bit = 1;
   3251    for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
   3252       if (mask & dst_bit) {
   3253          if (src & src_bit) dst |= dst_bit;
   3254          src_bit <<= 1;
   3255       }
   3256    }
   3257    return dst;
   3258 }
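
        /* Example (the inverse operation): amd64g_calculate_pdep(0xAB,
           0xFF00) == 0xAB00: the low source bits are scattered to the
           positions of the set mask bits, as BMI2 PDEP specifies. */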
   3259 
   3260 /*---------------------------------------------------------------*/
   3261 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
   3262 /*---------------------------------------------------------------*/
   3263 
   3264 static UInt zmask_from_V128 ( V128* arg )
   3265 {
   3266    UInt i, res = 0;
   3267    for (i = 0; i < 16; i++) {
   3268       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
   3269    }
   3270    return res;
   3271 }
   3272 
   3273 static UInt zmask_from_V128_wide ( V128* arg )
   3274 {
   3275    UInt i, res = 0;
   3276    for (i = 0; i < 8; i++) {
   3277       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
   3278    }
   3279    return res;
   3280 }
   3281 
   3282 /* Helps with PCMP{I,E}STR{I,M}.
   3283 
   3284    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
   3285    actually it could be a clean helper, but for the fact that we can't
   3286    pass by value 2 x V128 to a clean helper, nor have one returned.)
   3287    Reads guest state, writes to guest state for the xSTRM cases,
   3288    makes no memory accesses, and is otherwise a pure function.
   3289 
   3290    opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte, so
   3291    the callee knows which I/E and I/M variant it is dealing with and
   3292    what the specific operation is.  4th byte of opcode is in the range
   3293    0x60 to 0x63:
   3294        istri  66 0F 3A 63
   3295        istrm  66 0F 3A 62
   3296        estri  66 0F 3A 61
   3297        estrm  66 0F 3A 60
   3298 
   3299    gstOffL and gstOffR are the guest state offsets for the two XMM
   3300    register inputs.  We never have to deal with the memory case since
   3301    that is handled by pre-loading the relevant value into the fake
   3302    XMM16 register.
   3303 
   3304    For ESTRx variants, edxIN and eaxIN hold the values of those two
   3305    registers.
   3306 
   3307    In all cases, the bottom 16 bits of the result contain the new
   3308    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
   3309    result hold the new %ecx value.  For xSTRM variants, the helper
   3310    writes the result directly to the guest XMM0.
   3311 
   3312    Declarable side effects: in all cases, reads guest state at
   3313    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
   3314    guest_XMM0.
   3315 
   3316    Is expected to be called with opc_and_imm combinations which have
   3317    actually been validated, and will assert otherwise.  The front
   3318    end should ensure we're only called with verified values.
   3319 */
   3320 ULong amd64g_dirtyhelper_PCMPxSTRx (
   3321           VexGuestAMD64State* gst,
   3322           HWord opc4_and_imm,
   3323           HWord gstOffL, HWord gstOffR,
   3324           HWord edxIN, HWord eaxIN
   3325        )
   3326 {
   3327    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
   3328    HWord imm8 = opc4_and_imm & 0xFF;
   3329    HWord isISTRx = opc4 & 2;
   3330    HWord isxSTRM = (opc4 & 1) ^ 1;
   3331    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   3332    HWord wide = (imm8 & 1);
   3333 
   3334    // where the args are
   3335    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3336    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3337 
   3338    /* Create the arg validity masks, either from the vectors
   3339       themselves or from the supplied edx/eax values. */
   3340    // Note: the masks are per-element -- 16 bits (one per byte) in
   3341    // the 8-bit cases, 8 bits (one per 16-bit lane) in the wide cases.
   3342    UInt zmaskL, zmaskR;
   3343 
   3344    // temp spot for the resulting flags and vector.
   3345    V128 resV;
   3346    UInt resOSZACP;
   3347 
   3348    // for checking whether case was handled
   3349    Bool ok = False;
   3350 
   3351    if (wide) {
   3352       if (isISTRx) {
   3353          zmaskL = zmask_from_V128_wide(argL);
   3354          zmaskR = zmask_from_V128_wide(argR);
   3355       } else {
   3356          Int tmp;
   3357          tmp = edxIN & 0xFFFFFFFF;
   3358          if (tmp < -8) tmp = -8;
   3359          if (tmp > 8)  tmp = 8;
   3360          if (tmp < 0)  tmp = -tmp;
   3361          vassert(tmp >= 0 && tmp <= 8);
   3362          zmaskL = (1 << tmp) & 0xFF;
   3363          tmp = eaxIN & 0xFFFFFFFF;
   3364          if (tmp < -8) tmp = -8;
   3365          if (tmp > 8)  tmp = 8;
   3366          if (tmp < 0)  tmp = -tmp;
   3367          vassert(tmp >= 0 && tmp <= 8);
   3368          zmaskR = (1 << tmp) & 0xFF;
   3369       }
   3370       // do the math
   3371       ok = compute_PCMPxSTRx_wide (
   3372               &resV, &resOSZACP, argL, argR,
   3373               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3374            );
   3375    } else {
   3376       if (isISTRx) {
   3377          zmaskL = zmask_from_V128(argL);
   3378          zmaskR = zmask_from_V128(argR);
   3379       } else {
   3380          Int tmp;
   3381          tmp = edxIN & 0xFFFFFFFF;
   3382          if (tmp < -16) tmp = -16;
   3383          if (tmp > 16)  tmp = 16;
   3384          if (tmp < 0)   tmp = -tmp;
   3385          vassert(tmp >= 0 && tmp <= 16);
   3386          zmaskL = (1 << tmp) & 0xFFFF;
   3387          tmp = eaxIN & 0xFFFFFFFF;
   3388          if (tmp < -16) tmp = -16;
   3389          if (tmp > 16)  tmp = 16;
   3390          if (tmp < 0)   tmp = -tmp;
   3391          vassert(tmp >= 0 && tmp <= 16);
   3392          zmaskR = (1 << tmp) & 0xFFFF;
   3393       }
   3394       // do the math
   3395       ok = compute_PCMPxSTRx (
   3396               &resV, &resOSZACP, argL, argR,
   3397               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3398            );
   3399    }
   3400 
   3401    // front end shouldn't pass us any imm8 variants we can't
   3402    // handle.  Hence:
   3403    vassert(ok);
   3404 
   3405    // So, finally we need to get the results back to the caller.
   3406    // In all cases, the new OSZACP value is the lowest 16 of
   3407    // the return value.
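           // (0x8D5 selects exactly the OF, SF, ZF, AF, PF and CF bit
           // positions within %rflags.)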
   3408    if (isxSTRM) {
   3409       gst->guest_YMM0[0] = resV.w32[0];
   3410       gst->guest_YMM0[1] = resV.w32[1];
   3411       gst->guest_YMM0[2] = resV.w32[2];
   3412       gst->guest_YMM0[3] = resV.w32[3];
   3413       return resOSZACP & 0x8D5;
   3414    } else {
   3415       UInt newECX = resV.w32[0] & 0xFFFF;
   3416       return (newECX << 16) | (resOSZACP & 0x8D5);
   3417    }
   3418 }
   3419 
   3420 /*---------------------------------------------------------------*/
   3421 /*--- AES primitives and helpers                              ---*/
   3422 /*---------------------------------------------------------------*/
   3423 /* a 16 x 16 matrix */
   3424 static const UChar sbox[256] = {                   // row nr
   3425    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   3426    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   3427    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   3428    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   3429    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   3430    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   3431    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   3432    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   3433    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   3434    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   3435    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   3436    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   3437    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   3438    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   3439    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   3440    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   3441    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   3442    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   3443    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   3444    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   3445    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   3446    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   3447    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   3448    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   3449    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   3450    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   3451    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   3452    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   3453    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   3454    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   3455    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   3456    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
   3457 };
   3458 static void SubBytes (V128* v)
   3459 {
   3460    V128 r;
   3461    UInt i;
   3462    for (i = 0; i < 16; i++)
   3463       r.w8[i] = sbox[v->w8[i]];
   3464    *v = r;
   3465 }
   3466 
   3467 /* a 16 x 16 matrix */
   3468 static const UChar invsbox[256] = {                // row nr
   3469    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   3470    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   3471    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   3472    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   3473    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   3474    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   3475    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   3476    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   3477    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   3478    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   3479    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   3480    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   3481    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   3482    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   3483    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   3484    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   3485    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   3486    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   3487    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   3488    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   3489    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   3490    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   3491    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   3492    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   3493    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   3494    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   3495    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   3496    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   3497    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   3498    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   3499    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   3500    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
   3501 };
   3502 static void InvSubBytes (V128* v)
   3503 {
   3504    V128 r;
   3505    UInt i;
   3506    for (i = 0; i < 16; i++)
   3507       r.w8[i] = invsbox[v->w8[i]];
   3508    *v = r;
   3509 }
   3510 
   3511 static const UChar ShiftRows_op[16] =
   3512    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
   3513 static void ShiftRows (V128* v)
   3514 {
   3515    V128 r;
   3516    UInt i;
   3517    for (i = 0; i < 16; i++)
   3518       r.w8[i] = v->w8[ShiftRows_op[15-i]];
   3519    *v = r;
   3520 }
   3521 
   3522 static const UChar InvShiftRows_op[16] =
   3523    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
   3524 static void InvShiftRows (V128* v)
   3525 {
   3526    V128 r;
   3527    UInt i;
   3528    for (i = 0; i < 16; i++)
   3529       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   3530    *v = r;
   3531 }
   3532 
   3533 /* Multiplication of the finite-field elements of AES.
   3534    See "A Specification for The AES Algorithm Rijndael
   3535         (by Joan Daemen & Vincent Rijmen)"
   3536         Dr. Brian Gladman, v3.1, 3rd March 2001. */
   3537 /* N values such that (hex) xy == 0x03^N.
   3538    0x00 has no such N; we store 0xff for it. */
   3539 /* a 16 x 16 matrix */
   3540 static const UChar Nxy[256] = {                    // row nr
   3541    0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   3542    0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   3543    0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   3544    0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   3545    0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   3546    0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   3547    0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   3548    0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   3549    0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   3550    0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   3551    0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   3552    0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   3553    0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   3554    0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   3555    0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   3556    0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   3557    0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   3558    0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   3559    0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   3560    0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   3561    0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   3562    0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   3563    0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   3564    0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   3565    0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   3566    0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   3567    0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   3568    0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   3569    0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   3570    0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   3571    0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   3572    0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
   3573 };
   3574 
   3575 /* E values so that E = 0x03^xy. */
   3576 static const UChar Exy[256] = {                    // row nr
   3577    0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   3578    0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   3579    0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   3580    0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   3581    0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   3582    0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   3583    0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   3584    0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   3585    0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   3586    0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   3587    0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   3588    0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   3589    0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   3590    0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   3591    0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   3592    0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   3593    0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   3594    0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   3595    0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   3596    0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   3597    0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   3598    0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   3599    0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   3600    0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   3601    0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   3602    0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   3603    0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   3604    0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   3605    0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   3606    0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   3607    0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   3608    0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
   3609 
   3610 static inline UChar ff_mul(UChar u1, UChar u2)
   3611 {
   3612    if ((u1 > 0) && (u2 > 0)) {
   3613       UInt ui = Nxy[u1] + Nxy[u2];
   3614       if (ui >= 255)
   3615          ui = ui - 255;
   3616       return Exy[ui];
   3617    } else {
   3618       return 0;
   3619    };
   3620 }
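
        /* ff_mul multiplies in GF(2^8) by way of the log/antilog tables
           above, with generator 0x03: Nxy is the log table and Exy its
           inverse.  For example, ff_mul(0x02, 0x02): Nxy[2] + Nxy[2] ==
           0x19 + 0x19 == 0x32, and Exy[0x32] == 0x04. */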
   3621 
   3622 static void MixColumns (V128* v)
   3623 {
   3624    V128 r;
   3625    Int j;
   3626 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   3627    for (j = 0; j < 4; j++) {
   3628       P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
   3629          ^ P(v,j,2) ^ P(v,j,3);
   3630       P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
   3631          ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
   3632       P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
   3633          ^ ff_mul(0x03, P(v,j,3) );
   3634       P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
   3635          ^ ff_mul( 0x02, P(v,j,3) );
   3636    }
   3637    *v = r;
   3638 #undef P
   3639 }
   3640 
   3641 static void InvMixColumns (V128* v)
   3642 {
   3643    V128 r;
   3644    Int j;
   3645 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   3646    for (j = 0; j < 4; j++) {
   3647       P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
   3648          ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
   3649       P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
   3650          ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
   3651       P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
   3652          ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
   3653       P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
   3654          ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
   3655    }
   3656    *v = r;
   3657 #undef P
   3658 
   3659 }
   3660 
   3661 /* For description, see definition in guest_amd64_defs.h */
   3662 void amd64g_dirtyhelper_AES (
   3663           VexGuestAMD64State* gst,
   3664           HWord opc4, HWord gstOffD,
   3665           HWord gstOffL, HWord gstOffR
   3666        )
   3667 {
   3668    // where the args are
   3669    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   3670    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3671    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3672    V128  r;
   3673 
   3674    switch (opc4) {
   3675       case 0xDC: /* AESENC */
   3676       case 0xDD: /* AESENCLAST */
   3677          r = *argR;
   3678          ShiftRows (&r);
   3679          SubBytes  (&r);
   3680          if (opc4 == 0xDC)
   3681             MixColumns (&r);
   3682          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   3683          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   3684          break;
   3685 
   3686       case 0xDE: /* AESDEC */
   3687       case 0xDF: /* AESDECLAST */
   3688          r = *argR;
   3689          InvShiftRows (&r);
   3690          InvSubBytes (&r);
   3691          if (opc4 == 0xDE)
   3692             InvMixColumns (&r);
   3693          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   3694          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   3695          break;
   3696 
   3697       case 0xDB: /* AESIMC */
   3698          *argD = *argL;
   3699          InvMixColumns (argD);
   3700          break;
   3701       default: vassert(0);
   3702    }
   3703 }
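
        /* Note: FIPS-197 specifies SubBytes before ShiftRows, whereas the
           encrypt path above does the reverse.  The two operations commute
           -- one substitutes bytes in place, the other only permutes byte
           positions -- so the result is identical. */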
   3704 
   3705 static inline UInt RotWord (UInt   w32)
   3706 {
   3707    return ((w32 >> 8) | (w32 << 24));
   3708 }
   3709 
   3710 static inline UInt SubWord (UInt   w32)
   3711 {
   3712    UChar *w8;
   3713    UChar *r8;
   3714    UInt res;
   3715    w8 = (UChar*) &w32;
   3716    r8 = (UChar*) &res;
   3717    r8[0] = sbox[w8[0]];
   3718    r8[1] = sbox[w8[1]];
   3719    r8[2] = sbox[w8[2]];
   3720    r8[3] = sbox[w8[3]];
   3721    return res;
   3722 }
   3723 
   3724 /* For description, see definition in guest_amd64_defs.h */
   3725 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
   3726           VexGuestAMD64State* gst,
   3727           HWord imm8,
   3728           HWord gstOffL, HWord gstOffR
   3729        )
   3730 {
   3731    // where the args are
   3732    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3733    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3734 
   3735    argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   3736    argR->w32[2] = SubWord (argL->w32[3]);
   3737    argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   3738    argR->w32[0] = SubWord (argL->w32[1]);
   3739 }
   3740 
   3741 
   3742 
   3743 /*---------------------------------------------------------------*/
   3744 /*--- Helpers for dealing with, and describing,               ---*/
   3745 /*--- guest state as a whole.                                 ---*/
   3746 /*---------------------------------------------------------------*/
   3747 
   3748 /* Initialise the entire amd64 guest state. */
   3749 /* VISIBLE TO LIBVEX CLIENT */
   3750 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
   3751 {
   3752    vex_state->host_EvC_FAILADDR = 0;
   3753    vex_state->host_EvC_COUNTER = 0;
   3754    vex_state->pad0 = 0;
   3755 
   3756    vex_state->guest_RAX = 0;
   3757    vex_state->guest_RCX = 0;
   3758    vex_state->guest_RDX = 0;
   3759    vex_state->guest_RBX = 0;
   3760    vex_state->guest_RSP = 0;
   3761    vex_state->guest_RBP = 0;
   3762    vex_state->guest_RSI = 0;
   3763    vex_state->guest_RDI = 0;
   3764    vex_state->guest_R8  = 0;
   3765    vex_state->guest_R9  = 0;
   3766    vex_state->guest_R10 = 0;
   3767    vex_state->guest_R11 = 0;
   3768    vex_state->guest_R12 = 0;
   3769    vex_state->guest_R13 = 0;
   3770    vex_state->guest_R14 = 0;
   3771    vex_state->guest_R15 = 0;
   3772 
   3773    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   3774    vex_state->guest_CC_DEP1 = 0;
   3775    vex_state->guest_CC_DEP2 = 0;
   3776    vex_state->guest_CC_NDEP = 0;
   3777 
   3778    vex_state->guest_DFLAG   = 1; /* forwards */
   3779    vex_state->guest_IDFLAG  = 0;
   3780    vex_state->guest_ACFLAG  = 0;
   3781 
   3782    /* HACK: represent the offset associated with %fs==0. This
   3783       assumes that %fs is only ever zero. */
   3784    vex_state->guest_FS_ZERO = 0;
   3785 
   3786    vex_state->guest_RIP = 0;
   3787 
   3788    /* Initialise the simulated FPU */
   3789    amd64g_dirtyhelper_FINIT( vex_state );
   3790 
   3791    /* Initialise the AVX state. */
   3792 #  define AVXZERO(_ymm) \
   3793       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
   3794            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
   3795       } while (0)
   3796    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   3797    AVXZERO(vex_state->guest_YMM0);
   3798    AVXZERO(vex_state->guest_YMM1);
   3799    AVXZERO(vex_state->guest_YMM2);
   3800    AVXZERO(vex_state->guest_YMM3);
   3801    AVXZERO(vex_state->guest_YMM4);
   3802    AVXZERO(vex_state->guest_YMM5);
   3803    AVXZERO(vex_state->guest_YMM6);
   3804    AVXZERO(vex_state->guest_YMM7);
   3805    AVXZERO(vex_state->guest_YMM8);
   3806    AVXZERO(vex_state->guest_YMM9);
   3807    AVXZERO(vex_state->guest_YMM10);
   3808    AVXZERO(vex_state->guest_YMM11);
   3809    AVXZERO(vex_state->guest_YMM12);
   3810    AVXZERO(vex_state->guest_YMM13);
   3811    AVXZERO(vex_state->guest_YMM14);
   3812    AVXZERO(vex_state->guest_YMM15);
   3813    AVXZERO(vex_state->guest_YMM16);
   3814 
   3815 #  undef AVXZERO
   3816 
   3817    vex_state->guest_EMNOTE = EmNote_NONE;
   3818 
   3819    /* These should not ever be either read or written, but we
   3820       initialise them anyway. */
   3821    vex_state->guest_CMSTART = 0;
   3822    vex_state->guest_CMLEN   = 0;
   3823 
   3824    vex_state->guest_NRADDR   = 0;
   3825    vex_state->guest_SC_CLASS = 0;
   3826    vex_state->guest_GS_0x60  = 0;
   3827 
   3828    vex_state->guest_IP_AT_SYSCALL = 0;
   3829    vex_state->pad1 = 0;
   3830 }
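
        /* A minimal client-side usage sketch (hypothetical values; the
           function is declared to clients in libvex_guest_amd64.h): */
        #if 0
        static void example_setup ( void )
        {
           VexGuestAMD64State gst;
           LibVEX_GuestAMD64_initialise(&gst);
           gst.guest_RIP = 0x400000ULL;    /* assumed entry point */
           gst.guest_RSP = 0x7FF00000ULL;  /* assumed initial stack pointer */
        }
        #endif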
   3831 
   3832 
   3833 /* Figure out if any part of the guest state contained in minoff
   3834    .. maxoff requires precise memory exceptions.  If in doubt return
   3835    True (but this generates significantly slower code).
   3836 
   3837    By default we enforce precise exns for guest %RSP, %RBP and %RIP
   3838    only.  These are the minimum needed to extract correct stack
   3839    backtraces from amd64 code.
   3840 
   3841    Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
   3842 */
   3843 Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
   3844                                                    Int maxoff)
   3845 {
   3846    Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   3847    Int rbp_max = rbp_min + 8 - 1;
   3848    Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   3849    Int rsp_max = rsp_min + 8 - 1;
   3850    Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   3851    Int rip_max = rip_min + 8 - 1;
   3852 
   3853    if (maxoff < rsp_min || minoff > rsp_max) {
   3854       /* no overlap with rsp */
   3855       if (vex_control.iropt_register_updates == VexRegUpdSpAtMemAccess)
   3856          return False; // Only the stack pointer needs checking.
   3857    } else {
   3858       return True;
   3859    }
   3860 
   3861    if (maxoff < rbp_min || minoff > rbp_max) {
   3862       /* no overlap with rbp */
   3863    } else {
   3864       return True;
   3865    }
   3866 
   3867    if (maxoff < rip_min || minoff > rip_max) {
   3868       /* no overlap with rip */
   3869    } else {
   3870       return True;
   3871    }
   3872 
   3873    return False;
   3874 }
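
        /* For example, a range covering only guest_RAX overlaps none of
           %rsp/%rbp/%rip and so yields False, while any range touching
           guest_RSP yields True; in VexRegUpdSpAtMemAccess mode only the
           %rsp check matters, as noted above. */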
   3875 
   3876 
   3877 #define ALWAYSDEFD(field)                             \
   3878     { offsetof(VexGuestAMD64State, field),            \
   3879       (sizeof ((VexGuestAMD64State*)0)->field) }
   3880 
   3881 VexGuestLayout
   3882    amd64guest_layout
   3883       = {
   3884           /* Total size of the guest state, in bytes. */
   3885           .total_sizeB = sizeof(VexGuestAMD64State),
   3886 
   3887           /* Describe the stack pointer. */
   3888           .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
   3889           .sizeof_SP = 8,
   3890 
   3891           /* Describe the frame pointer. */
   3892           .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
   3893           .sizeof_FP = 8,
   3894 
   3895           /* Describe the instruction pointer. */
   3896           .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
   3897           .sizeof_IP = 8,
   3898 
   3899           /* Describe any sections to be regarded by Memcheck as
   3900              'always-defined'. */
   3901           .n_alwaysDefd = 16,
   3902 
   3903           /* flags thunk: OP and NDEP are always defd, whereas DEP1
   3904              and DEP2 have to be tracked.  See detailed comment in
   3905              gdefs.h on meaning of thunk fields. */
   3906           .alwaysDefd
   3907              = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
   3908                  /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
   3909                  /*  2 */ ALWAYSDEFD(guest_DFLAG),
   3910                  /*  3 */ ALWAYSDEFD(guest_IDFLAG),
   3911                  /*  4 */ ALWAYSDEFD(guest_RIP),
   3912                  /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
   3913                  /*  6 */ ALWAYSDEFD(guest_FTOP),
   3914                  /*  7 */ ALWAYSDEFD(guest_FPTAG),
   3915                  /*  8 */ ALWAYSDEFD(guest_FPROUND),
   3916                  /*  9 */ ALWAYSDEFD(guest_FC3210),
   3917                  // /* */ ALWAYSDEFD(guest_CS),
   3918                  // /* */ ALWAYSDEFD(guest_DS),
   3919                  // /* */ ALWAYSDEFD(guest_ES),
   3920                  // /* */ ALWAYSDEFD(guest_FS),
   3921                  // /* */ ALWAYSDEFD(guest_GS),
   3922                  // /* */ ALWAYSDEFD(guest_SS),
   3923                  // /* */ ALWAYSDEFD(guest_LDT),
   3924                  // /* */ ALWAYSDEFD(guest_GDT),
   3925                  /* 10 */ ALWAYSDEFD(guest_EMNOTE),
   3926                  /* 11 */ ALWAYSDEFD(guest_SSEROUND),
   3927                  /* 12 */ ALWAYSDEFD(guest_CMSTART),
   3928                  /* 13 */ ALWAYSDEFD(guest_CMLEN),
   3929                  /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
   3930                  /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
   3931                }
   3932         };
   3933 
   3934 
   3935 /*---------------------------------------------------------------*/
   3936 /*--- end                               guest_amd64_helpers.c ---*/
   3937 /*---------------------------------------------------------------*/
   3938