
/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   const Long halfMask = 0xFFFFFFFFLL;
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = (Long)((ULong)u * (ULong)v);
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   const ULong halfMask = 0xFFFFFFFFULL;
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
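
/* Editor's note: a minimal self-check sketch (not part of the
   original file) for the two multipliers above, assuming a compiler
   that provides the non-standard __int128 types (gcc/clang on 64-bit
   hosts).  Enable locally to verify the half-word decomposition
   against widened arithmetic. */
#if 0
static void selftest_mull64 ( void )
{
   /* A few arbitrary vectors, including sign-boundary cases. */
   static const ULong vecs[6]
      = { 0ULL, 1ULL, 0x7FFFFFFFFFFFFFFFULL,
          0x8000000000000000ULL, 0xFFFFFFFFFFFFFFFFULL,
          0x123456789ABCDEF0ULL };
   Int i, j;
   for (i = 0; i < 6; i++) {
      for (j = 0; j < 6; j++) {
         ULong uHi, uLo;
         Long  sHi, sLo;
         unsigned __int128 ur = (unsigned __int128)vecs[i]
                                * (unsigned __int128)vecs[j];
         __int128          sr = (__int128)(Long)vecs[i]
                                * (__int128)(Long)vecs[j];
         mullU64(vecs[i], vecs[j], &uHi, &uLo);
         mullS64((Long)vecs[i], (Long)vecs[j], &sHi, &sLo);
         vassert(uLo == (ULong)ur && uHi == (ULong)(ur >> 64));
         vassert(sLo == (Long)sr  && sHi == (Long)(sr >> 64));
      }
   }
}
#endif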


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
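
/* Editor's note: the table above encodes the x86 parity-flag rule --
   PF is set iff the low byte of the result contains an even number of
   1 bits.  A sketch (not part of the original file) of how it could
   be regenerated: */
#if 0
static void regen_parity_table ( void )
{
   Int i;
   for (i = 0; i < 256; i++) {
      Int b, ones = 0;
      for (b = 0; b < 8; b++)
         ones += (i >> b) & 1;
      /* Even number of set bits => PF set. */
      vex_printf("%s, ", (ones & 1) ? "0" : "AMD64G_CC_MASK_P");
      if (i % 8 == 7) vex_printf("\n");
   }
}
#endif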

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return (ULong)x << n;
   else
      return x >> (-n);
}

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)					\
   /* const */ ULong DATA_MASK 					\
      = __data_bits==8                                          \
           ? 0xFFULL 					        \
           : (__data_bits==16                                   \
                ? 0xFFFFULL 		                        \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
   /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
   /* const */ ULong CC_NDEP = cc_ndep_formal;			\
   /* Four bogus assignments, which hopefully gcc can     */	\
   /* optimise away, and which stop it complaining about  */	\
   /* unused variables.                                   */	\
   SIGN_MASK = SIGN_MASK;					\
   DATA_MASK = DATA_MASK;					\
   CC_DEP2 = CC_DEP2;						\
   CC_NDEP = CC_NDEP;


/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
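
/* Editor's note: a sketch (not in the original file) of how the OF
   expression in ACTIONS_ADD can be checked.  For an addition, signed
   overflow happens iff both operands have the same sign and the
   result's sign differs, which is exactly
   ((argL ^ argR ^ -1) & (argL ^ res)) at the sign bit.  An exhaustive
   8-bit check against widened arithmetic: */
#if 0
static void check_add8_of ( void )
{
   Int a, b;
   for (a = 0; a < 256; a++) {
      for (b = 0; b < 256; b++) {
         ULong argL = (ULong)a, argR = (ULong)b;
         ULong res  = argL + argR;
         Int   wide = (Int)(Char)a + (Int)(Char)b;   /* true signed sum */
         Bool  refOF = wide < -128 || wide > 127;
         Bool  gotOF = (lshift((Long)((argL ^ argR ^ -1) & (argL ^ res)),
                               12 - 8) & AMD64G_CC_MASK_O) != 0;
         vassert(gotOF == refOF);
         /* CF: the 8-bit result wraps below argL iff a+b > 255. */
         vassert(((UChar)res < (UChar)argL) == (a + b > 255));
      }
   }
}
#endif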

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, oldC, res;		 		\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, oldC, res;	       			\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
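
/* Editor's note: in the ADC/SBB thunks above, CC_DEP2 evidently holds
   not argR itself but (argR ^ oldC), since the helper recovers argR
   by XORing with the old carry before redoing the operation.  A
   sketch (not in the original file) showing that the round trip
   reproduces a plain add-with-carry: */
#if 0
static void check_adc_encoding ( ULong argL, ULong argR, ULong oldC )
{
   vassert(oldC == 0 || oldC == 1);
   /* What the thunk would hold in CC_DEP2 ... */
   ULong dep2 = argR ^ oldC;
   /* ... and what ACTIONS_ADC recovers and computes from it. */
   ULong res  = (argL + (dep2 ^ oldC)) + oldC;
   vassert(res == argL + argR + oldC);
}
#endif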

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((ULong)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & CC_DEP1)			\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}
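
/* Editor's note: for signed multiplies the macro above sets CF (and
   OF) iff the high half is not simply the sign-extension of the low
   half, i.e. iff the full product does not fit in DATA_BITS bits.  A
   sketch (not in the original file) of an exhaustive 8-bit check: */
#if 0
static void check_smul8_cf ( void )
{
   Int a, b;
   for (a = -128; a < 128; a++) {
      for (b = -128; b < 128; b++) {
         Int  full = a * b;
         Char lo   = (Char)full;
         Char hi   = (Char)(full >> 8);
         Bool cf   = (hi != (lo >> 7));            /* rule used above */
         Bool ref  = full < -128 || full > 127;    /* doesn't fit */
         vassert(cf == ref);
      }
   }
}
#endif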

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { Long cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = 0;							\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME)		\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong ocf;	/* o or c */					\
     ULong argL, argR, oldOC, res;				\
     oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1;	\
     argL  = CC_DEP1;						\
     argR  = CC_DEP2 ^ oldOC;					\
     res   = (argL + argR) + oldOC;				\
     if (oldOC)							\
        ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME)		\
            | (ocf << AMD64G_CC_SHIFT_##FLAGNAME);		\
   }								\
}

/*-------------------------------------------------------------*/


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function; not called directly from generated code -- the
   CLEAN HELPER wrappers below are. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt,  C );
      case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );

      case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt,  O );
      case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
      //      case AMD64G_CC_OP_SUBL:
      //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBW:
      //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_SUBB:
      //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
      //                   ? AMD64G_CC_MASK_C : 0;
      //      case AMD64G_CC_OP_INCL:
      //      case AMD64G_CC_OP_DECL:
      //         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of,sf,zf,cf,pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}
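
/* Editor's note: an illustrative sketch (not in the original file) of
   how the helper above is typically invoked.  After e.g.
   "cmpq %rsi,%rdi ; jle ...", the thunk holds the SUBQ operands, and
   the taken/not-taken decision is: */
#if 0
static Bool would_take_jle ( ULong rdi, ULong rsi )
{
   /* cond=AMD64CondLE, op=SUBQ, dep1=dst, dep2=src, ndep unused. */
   return amd64g_calculate_condition(AMD64CondLE, AMD64G_CC_OP_SUBQ,
                                     rdi, rsi, 0) == 1;
}
#endif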


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);   /* DF */
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);   /* ID */
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);   /* AC */

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflags ( ULong rflags,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   /* D flag */
   if (rflags & AMD64G_CC_MASK_D) {
      vex_state->guest_DFLAG = -1;
      rflags &= ~AMD64G_CC_MASK_D;
   }
   else
      vex_state->guest_DFLAG = 1;

   /* ID flag */
   if (rflags & AMD64G_CC_MASK_ID) {
      vex_state->guest_IDFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_ID;
   }
   else
      vex_state->guest_IDFLAG = 0;

   /* AC flag */
   if (rflags & AMD64G_CC_MASK_AC) {
      vex_state->guest_ACFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_AC;
   }
   else
      vex_state->guest_ACFLAG = 0;

   UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
                  AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = rflags & cc_mask;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
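
/* Editor's note: a sketch (not in the original file) of how a LibVEX
   client might use the two functions above to flip a guest flag:
   read the materialised %rflags, modify it, and write it back, which
   leaves the thunk in the simple COPY state. */
#if 0
static void set_guest_DF ( /*MOD*/VexGuestAMD64State* st )
{
   ULong rf = LibVEX_GuestAMD64_get_rflags(st);
   LibVEX_GuestAMD64_put_rflags(rf | AMD64G_CC_MASK_D, st);
}
#endif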


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to the above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- ADDL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit add and jump to out-of-line code if
            an overflow occurs". */
         /* long add, then O (overflow)
            --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
            --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
            --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              unop(Iop_Not64,
                                   binop(Iop_Xor64, cc_dep1, cc_dep2)),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Add64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
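
      /* Editor's note: a sketch (not in the original file) checking
         that the specialised expression above agrees with the generic
         flag helper for a 32-bit add, assuming dep1/dep2 hold the
         values deposited in the thunk for AMD64G_CC_OP_ADDL: */
#if 0
      /*
      static void check_addl_of_spec ( ULong dep1, ULong dep2 )
      {
         // The specialised IR, evaluated directly.
         ULong spec = (((~(dep1 ^ dep2)) & (dep1 ^ (dep1 + dep2)))
                       >> 31) & 1;
         // The generic route: ask the helper for O after a 32-bit add.
         ULong ref  = amd64g_calculate_condition(
                         AMD64CondO, AMD64G_CC_OP_ADDL, dep1, dep2, 0);
         vassert(spec == ref);
      }
      */
#endif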

      /*---------------- SUBQ ----------------*/

      /* 0, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
         /* long long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return binop(Iop_Shr64,
                      binop(Iop_And64,
                            binop(Iop_Xor64, cc_dep1, cc_dep2),
                            binop(Iop_Xor64,
                                  cc_dep1,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2))),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
         /* long long sub/cmp, then S (negative)
            --> (dst-src)[63]
            --> (dst-src) >>u 63 */
         return binop(Iop_Shr64,
                      binop(Iop_Sub64, cc_dep1, cc_dep2),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
         /* long long sub/cmp, then NS (not negative)
            --> (dst-src)[63] ^ 1
            --> ((dst-src) >>u 63) ^ 1 */
         return binop(Iop_Xor64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(63)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
         /* long long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
         /* long long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
      }

      /*---------------- SUBL ----------------*/

      /* 0, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit subtract and jump to out-of-line
            code if an overflow occurs". */
         /* long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
            --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              binop(Iop_Xor64, cc_dep1, cc_dep2),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Sub64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative)
            --> (dst-src)[31]
            --> ((dst -64 src) >>u 31) & 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
         /* long sub/cmp, then NS (not negative)
            --> (dst-src)[31] ^ 1
            --> (((dst -64 src) >>u 31) & 1) ^ 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2),
                                  mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
   1330       }
   1331 
   1332       /* 14, 15 */
   1333       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
   1334          /* long sub/cmp, then LE (signed less than or equal)
   1335             --> test dst <=s src */
   1336          return unop(Iop_1Uto64,
   1337                      binop(Iop_CmpLE32S,
   1338                            unop(Iop_64to32, cc_dep1),
   1339                            unop(Iop_64to32, cc_dep2)));
    1341       }
   1342       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
   1343          /* long sub/cmp, then NLE (signed greater than)
   1344             --> test !(dst <=s src)
   1345             --> test (dst >s src)
   1346             --> test (src <s dst) */
   1347          return unop(Iop_1Uto64,
   1348                      binop(Iop_CmpLT32S,
   1349                            unop(Iop_64to32, cc_dep2),
   1350                            unop(Iop_64to32, cc_dep1)));
    1352       }
   1353 
   1354       /*---------------- SUBW ----------------*/
   1355 
   1356       /* 4, 5 */
   1357       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
   1358          /* word sub/cmp, then Z --> test dst==src */
   1359          return unop(Iop_1Uto64,
   1360                      binop(Iop_CmpEQ16,
   1361                            unop(Iop_64to16,cc_dep1),
   1362                            unop(Iop_64to16,cc_dep2)));
   1363       }
   1364       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
   1365          /* word sub/cmp, then NZ --> test dst!=src */
   1366          return unop(Iop_1Uto64,
   1367                      binop(Iop_CmpNE16,
   1368                            unop(Iop_64to16,cc_dep1),
   1369                            unop(Iop_64to16,cc_dep2)));
   1370       }
   1371 
   1372       /* 6, */
   1373       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
   1374          /* word sub/cmp, then BE (unsigned less than or equal)
   1375             --> test dst <=u src */
   1376          return unop(Iop_1Uto64,
   1377                      binop(Iop_CmpLE64U,
   1378                            binop(Iop_Shl64, cc_dep1, mkU8(48)),
   1379                            binop(Iop_Shl64, cc_dep2, mkU8(48))));
   1380       }
   1381 
   1382       /* 8, 9 */
   1383       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
   1384                                           && isU64(cc_dep2, 0)) {
   1385          /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
   1386                                          --> test dst <s 0
   1387                                          --> (ULong)dst[15]
   1388             This is yet another scheme by which clang figures out if the
   1389             top bit of a word is 1 or 0.  See also LOGICB/CondS below. */
   1390          /* Note: isU64(cc_dep2, 0) is correct, even though this is
    1391             for a 16-bit comparison, since the args to the helper
   1392             function are always U64s. */
   1393          return binop(Iop_And64,
   1394                       binop(Iop_Shr64,cc_dep1,mkU8(15)),
   1395                       mkU64(1));
   1396       }
   1397       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
   1398                                           && isU64(cc_dep2, 0)) {
   1399          /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
   1400                                           --> test !(dst <s 0)
   1401                                           --> (ULong) !dst[15]
   1402          */
   1403          return binop(Iop_Xor64,
   1404                       binop(Iop_And64,
   1405                             binop(Iop_Shr64,cc_dep1,mkU8(15)),
   1406                             mkU64(1)),
   1407                       mkU64(1));
   1408       }
   1409 
   1410       /* 14, */
   1411       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
   1412          /* word sub/cmp, then LE (signed less than or equal)
   1413             --> test dst <=s src */
   1414          return unop(Iop_1Uto64,
   1415                      binop(Iop_CmpLE64S,
   1416                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1417                            binop(Iop_Shl64,cc_dep2,mkU8(48))));
    1419       }
   1420 
   1421       /*---------------- SUBB ----------------*/
   1422 
   1423       /* 2, 3 */
   1424       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
   1425          /* byte sub/cmp, then B (unsigned less than)
   1426             --> test dst <u src */
   1427          return unop(Iop_1Uto64,
   1428                      binop(Iop_CmpLT64U,
   1429                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1430                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1431       }
   1432       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
   1433          /* byte sub/cmp, then NB (unsigned greater than or equal)
   1434             --> test src <=u dst */
   1435          /* Note, args are opposite way round from the usual */
   1436          return unop(Iop_1Uto64,
   1437                      binop(Iop_CmpLE64U,
   1438                            binop(Iop_And64, cc_dep2, mkU64(0xFF)),
   1439                            binop(Iop_And64, cc_dep1, mkU64(0xFF))));
   1440       }
   1441 
   1442       /* 4, 5 */
   1443       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
   1444          /* byte sub/cmp, then Z --> test dst==src */
   1445          return unop(Iop_1Uto64,
   1446                      binop(Iop_CmpEQ8,
   1447                            unop(Iop_64to8,cc_dep1),
   1448                            unop(Iop_64to8,cc_dep2)));
   1449       }
   1450       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
   1451          /* byte sub/cmp, then NZ --> test dst!=src */
   1452          return unop(Iop_1Uto64,
   1453                      binop(Iop_CmpNE8,
   1454                            unop(Iop_64to8,cc_dep1),
   1455                            unop(Iop_64to8,cc_dep2)));
   1456       }
   1457 
   1458       /* 6, */
   1459       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
   1460          /* byte sub/cmp, then BE (unsigned less than or equal)
   1461             --> test dst <=u src */
   1462          return unop(Iop_1Uto64,
   1463                      binop(Iop_CmpLE64U,
   1464                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1465                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1466       }
   1467 
   1468       /* 8, 9 */
   1469       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
   1470                                           && isU64(cc_dep2, 0)) {
   1471          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
   1472                                          --> test dst <s 0
   1473                                          --> (ULong)dst[7]
   1474             This is yet another scheme by which gcc figures out if the
   1475             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
   1476          /* Note: isU64(cc_dep2, 0) is correct, even though this is
   1477             for an 8-bit comparison, since the args to the helper
   1478             function are always U64s. */
   1479          return binop(Iop_And64,
   1480                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1481                       mkU64(1));
   1482       }
   1483       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
   1484                                           && isU64(cc_dep2, 0)) {
   1485          /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
   1486                                           --> test !(dst <s 0)
   1487                                           --> (ULong) !dst[7]
   1488          */
   1489          return binop(Iop_Xor64,
   1490                       binop(Iop_And64,
   1491                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1492                             mkU64(1)),
   1493                       mkU64(1));
   1494       }
   1495 
   1496       /*---------------- LOGICQ ----------------*/
   1497 
   1498       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
   1499          /* long long and/or/xor, then Z --> test dst==0 */
   1500          return unop(Iop_1Uto64,
   1501                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
   1502       }
   1503       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
   1504          /* long long and/or/xor, then NZ --> test dst!=0 */
   1505          return unop(Iop_1Uto64,
   1506                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
   1507       }
   1508 
   1509       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
   1510          /* long long and/or/xor, then L
   1511             LOGIC sets SF and ZF according to the
   1512             result and makes OF be zero.  L computes SF ^ OF, but
   1513             OF is zero, so this reduces to SF -- which will be 1 iff
   1514             the result is < signed 0.  Hence ...
   1515          */
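                  /* (E.g. after "testq %rax,%rax", CondL holds iff %rax <s 0.) */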
   1516          return unop(Iop_1Uto64,
   1517                      binop(Iop_CmpLT64S,
   1518                            cc_dep1,
   1519                            mkU64(0)));
   1520       }
   1521 
   1522       /*---------------- LOGICL ----------------*/
   1523 
   1524       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
   1525          /* long and/or/xor, then Z --> test dst==0 */
   1526          return unop(Iop_1Uto64,
   1527                      binop(Iop_CmpEQ32,
   1528                            unop(Iop_64to32, cc_dep1),
   1529                            mkU32(0)));
   1530       }
   1531       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
   1532          /* long and/or/xor, then NZ --> test dst!=0 */
   1533          return unop(Iop_1Uto64,
   1534                      binop(Iop_CmpNE32,
   1535                            unop(Iop_64to32, cc_dep1),
   1536                            mkU32(0)));
   1537       }
   1538 
   1539       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
   1540          /* long and/or/xor, then LE
   1541             This is pretty subtle.  LOGIC sets SF and ZF according to the
   1542             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
   1543             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
   1544             the result is <=signed 0.  Hence ...
   1545          */
   1546          return unop(Iop_1Uto64,
   1547                      binop(Iop_CmpLE32S,
   1548                            unop(Iop_64to32, cc_dep1),
   1549                            mkU32(0)));
   1550       }
   1551 
   1552       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
   1553          /* long and/or/xor, then S --> (ULong)result[31] */
   1554          return binop(Iop_And64,
   1555                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1556                       mkU64(1));
   1557       }
   1558       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
    1559          /* long and/or/xor, then NS --> (ULong) !result[31] */
   1560          return binop(Iop_Xor64,
   1561                 binop(Iop_And64,
   1562                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1563                       mkU64(1)),
   1564                 mkU64(1));
   1565       }
   1566 
   1567       /*---------------- LOGICW ----------------*/
   1568 
   1569       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
   1570          /* word and/or/xor, then Z --> test dst==0 */
   1571          return unop(Iop_1Uto64,
   1572                      binop(Iop_CmpEQ64,
   1573                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1574                            mkU64(0)));
   1575       }
   1576       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
   1577          /* word and/or/xor, then NZ --> test dst!=0 */
   1578          return unop(Iop_1Uto64,
   1579                      binop(Iop_CmpNE64,
   1580                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1581                            mkU64(0)));
   1582       }
   1583 
   1584       /*---------------- LOGICB ----------------*/
   1585 
   1586       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
   1587          /* byte and/or/xor, then Z --> test dst==0 */
   1588          return unop(Iop_1Uto64,
   1589                      binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1590                                         mkU64(0)));
   1591       }
   1592       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
   1593          /* byte and/or/xor, then NZ --> test dst!=0 */
   1594          return unop(Iop_1Uto64,
   1595                      binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1596                                         mkU64(0)));
   1597       }
   1598 
   1599       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
   1600          /* this is an idiom gcc sometimes uses to find out if the top
   1601             bit of a byte register is set: eg testb %al,%al; js ..
   1602             Since it just depends on the top bit of the byte, extract
   1603             that bit and explicitly get rid of all the rest.  This
   1604             helps memcheck avoid false positives in the case where any
   1605             of the other bits in the byte are undefined. */
   1606          /* byte and/or/xor, then S --> (UInt)result[7] */
   1607          return binop(Iop_And64,
   1608                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1609                       mkU64(1));
   1610       }
   1611       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
   1612          /* byte and/or/xor, then NS --> (UInt)!result[7] */
   1613          return binop(Iop_Xor64,
   1614                       binop(Iop_And64,
   1615                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1616                             mkU64(1)),
   1617                       mkU64(1));
   1618       }
   1619 
   1620       /*---------------- INCB ----------------*/
   1621 
   1622       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
   1623          /* 8-bit inc, then LE --> sign bit of the arg */
   1624          return binop(Iop_And64,
   1625                       binop(Iop_Shr64,
   1626                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
   1627                             mkU8(7)),
   1628                       mkU64(1));
   1629       }
   1630 
   1631       /*---------------- INCW ----------------*/
   1632 
   1633       if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
   1634          /* 16-bit inc, then Z --> test dst == 0 */
   1635          return unop(Iop_1Uto64,
   1636                      binop(Iop_CmpEQ64,
   1637                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1638                            mkU64(0)));
   1639       }
   1640 
   1641       /*---------------- DECL ----------------*/
   1642 
   1643       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
   1644          /* dec L, then Z --> test dst == 0 */
   1645          return unop(Iop_1Uto64,
   1646                      binop(Iop_CmpEQ32,
   1647                            unop(Iop_64to32, cc_dep1),
   1648                            mkU32(0)));
   1649       }
   1650 
   1651       /*---------------- DECW ----------------*/
   1652 
   1653       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
   1654          /* 16-bit dec, then NZ --> test dst != 0 */
   1655          return unop(Iop_1Uto64,
   1656                      binop(Iop_CmpNE64,
   1657                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1658                            mkU64(0)));
   1659       }
   1660 
   1661       /*---------------- SHRQ ----------------*/
   1662 
   1663       if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
   1664          /* SHRQ, then Z --> test dep1 == 0 */
   1665          return unop(Iop_1Uto64,
   1666                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
   1667       }
   1668       if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
   1669          /* SHRQ, then NZ --> test dep1 != 0 */
   1670          return unop(Iop_1Uto64,
   1671                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
   1672       }
   1673 
   1674       /*---------------- SHRL ----------------*/
   1675 
   1676       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
   1677          /* SHRL, then Z --> test dep1 == 0 */
   1678          return unop(Iop_1Uto64,
   1679                      binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
   1680                            mkU32(0)));
   1681       }
   1682       if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
   1683          /* SHRL, then NZ --> test dep1 != 0 */
   1684          return unop(Iop_1Uto64,
   1685                      binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
   1686                            mkU32(0)));
   1687       }
   1688 
   1689       /*---------------- COPY ----------------*/
   1690       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
   1691          jbe" for example. */
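            /* For OP_COPY, cc_dep1 holds the flag bits themselves, each at
               its AMD64G_CC_SHIFT_* position, so the conditions below are
               computed by extracting the relevant bits directly. */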
   1692 
   1693       if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
   1694           (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
   1695          /* COPY, then BE --> extract C and Z from dep1, and test (C
   1696             or Z == 1). */
   1697          /* COPY, then NBE --> extract C and Z from dep1, and test (C
   1698             or Z == 0). */
   1699          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
   1700          return
   1701             unop(
   1702                Iop_1Uto64,
   1703                binop(
   1704                   Iop_CmpEQ64,
   1705                   binop(
   1706                      Iop_And64,
   1707                      binop(
   1708                         Iop_Or64,
   1709                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1710                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
   1711                      ),
   1712                      mkU64(1)
   1713                   ),
   1714                   mkU64(nnn)
   1715                )
   1716             );
   1717       }
   1718 
   1719       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
    1720          /* COPY, then B --> extract C from dep1, and test (C == 1). */
   1721          return
   1722             unop(
   1723                Iop_1Uto64,
   1724                binop(
   1725                   Iop_CmpNE64,
   1726                   binop(
   1727                      Iop_And64,
   1728                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1729                      mkU64(1)
   1730                   ),
   1731                   mkU64(0)
   1732                )
   1733             );
   1734       }
   1735 
   1736       if (isU64(cc_op, AMD64G_CC_OP_COPY)
   1737           && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
   1738          /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
   1739          /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
   1740          UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
   1741          return
   1742             unop(
   1743                Iop_1Uto64,
   1744                binop(
   1745                   Iop_CmpEQ64,
   1746                   binop(
   1747                      Iop_And64,
   1748                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
   1749                      mkU64(1)
   1750                   ),
   1751                   mkU64(nnn)
   1752                )
   1753             );
   1754       }
   1755 
   1756       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
   1757          /* COPY, then P --> extract P from dep1, and test (P == 1). */
   1758          return
   1759             unop(
   1760                Iop_1Uto64,
   1761                binop(
   1762                   Iop_CmpNE64,
   1763                   binop(
   1764                      Iop_And64,
   1765                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
   1766                      mkU64(1)
   1767                   ),
   1768                   mkU64(0)
   1769                )
   1770             );
   1771       }
   1772 
   1773       return NULL;
   1774    }
   1775 
   1776    /* --------- specialising "amd64g_calculate_rflags_c" --------- */
   1777 
   1778    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
   1779       /* specialise calls to above "calculate_rflags_c" function */
   1780       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
   1781       vassert(arity == 4);
   1782       cc_op   = args[0];
   1783       cc_dep1 = args[1];
   1784       cc_dep2 = args[2];
   1785       cc_ndep = args[3];
   1786 
   1787       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
   1788          /* C after sub denotes unsigned less than */
   1789          return unop(Iop_1Uto64,
   1790                      binop(Iop_CmpLT64U,
   1791                            cc_dep1,
   1792                            cc_dep2));
   1793       }
   1794       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
   1795          /* C after sub denotes unsigned less than */
   1796          return unop(Iop_1Uto64,
   1797                      binop(Iop_CmpLT32U,
   1798                            unop(Iop_64to32, cc_dep1),
   1799                            unop(Iop_64to32, cc_dep2)));
   1800       }
   1801       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
   1802          /* C after sub denotes unsigned less than */
   1803          return unop(Iop_1Uto64,
   1804                      binop(Iop_CmpLT64U,
   1805                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
   1806                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
   1807       }
   1808       if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
    1809          /* C after add denotes sum <u either arg: an unsigned
                     wraparound leaves the sum below both addends */
   1810          return unop(Iop_1Uto64,
   1811                      binop(Iop_CmpLT64U,
   1812                            binop(Iop_Add64, cc_dep1, cc_dep2),
   1813                            cc_dep1));
   1814       }
   1815       if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
   1816          /* C after add denotes sum <u either arg */
   1817          return unop(Iop_1Uto64,
   1818                      binop(Iop_CmpLT32U,
   1819                            unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
   1820                            unop(Iop_64to32, cc_dep1)));
   1821       }
   1822       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
   1823           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
   1824           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
   1825           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
   1826          /* cflag after logic is zero */
   1827          return mkU64(0);
   1828       }
   1829       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
   1830           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
   1831          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
   1832          return cc_ndep;
   1833       }
   1834 
   1835 #     if 0
   1836       if (cc_op->tag == Iex_Const) {
   1837          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
   1838       }
   1839 #     endif
   1840 
   1841       return NULL;
   1842    }
   1843 
   1844 #  undef unop
   1845 #  undef binop
   1846 #  undef mkU64
   1847 #  undef mkU32
   1848 #  undef mkU8
   1849 
   1850    return NULL;
   1851 }
   1852 
   1853 
   1854 /*---------------------------------------------------------------*/
   1855 /*--- Supporting functions for x87 FPU activities.            ---*/
   1856 /*---------------------------------------------------------------*/
   1857 
   1858 static inline Bool host_is_little_endian ( void )
   1859 {
   1860    UInt x = 0x76543210;
   1861    UChar* p = (UChar*)(&x);
   1862    return toBool(*p == 0x10);
   1863 }
   1864 
   1865 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
   1866 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1867 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
   1868 {
   1869    Bool   mantissaIsZero;
   1870    Int    bexp;
   1871    UChar  sign;
   1872    UChar* f64;
   1873 
   1874    vassert(host_is_little_endian());
   1875 
   1876    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
   1877 
   1878    f64  = (UChar*)(&dbl);
   1879    sign = toUChar( (f64[7] >> 7) & 1 );
   1880 
   1881    /* First off, if the tag indicates the register was empty,
   1882       return 1,0,sign,1 */
   1883    if (tag == 0) {
   1884       /* vex_printf("Empty\n"); */
   1885       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
   1886                                    | AMD64G_FC_MASK_C0;
   1887    }
   1888 
   1889    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   1890    bexp &= 0x7FF;
   1891 
   1892    mantissaIsZero
   1893       = toBool(
   1894            (f64[6] & 0x0F) == 0
   1895            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
   1896         );
   1897 
   1898    /* If both exponent and mantissa are zero, the value is zero.
   1899       Return 1,0,sign,0. */
   1900    if (bexp == 0 && mantissaIsZero) {
   1901       /* vex_printf("Zero\n"); */
   1902       return AMD64G_FC_MASK_C3 | 0
   1903                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1904    }
   1905 
   1906    /* If exponent is zero but mantissa isn't, it's a denormal.
   1907       Return 1,1,sign,0. */
   1908    if (bexp == 0 && !mantissaIsZero) {
   1909       /* vex_printf("Denormal\n"); */
   1910       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
   1911                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1912    }
   1913 
   1914    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
   1915       Return 0,1,sign,1. */
   1916    if (bexp == 0x7FF && mantissaIsZero) {
   1917       /* vex_printf("Inf\n"); */
   1918       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
   1919                                    | AMD64G_FC_MASK_C0;
   1920    }
   1921 
   1922    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
   1923       Return 0,0,sign,1. */
   1924    if (bexp == 0x7FF && !mantissaIsZero) {
   1925       /* vex_printf("NaN\n"); */
   1926       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
   1927    }
   1928 
   1929    /* Uh, ok, we give up.  It must be a normal finite number.
   1930       Return 0,1,sign,0.
   1931    */
   1932    /* vex_printf("normal\n"); */
   1933    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1934 }
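
         /* Summary of the FXAM classifications produced above, with 's'
            denoting the operand's sign bit:

               class      C3 C2 C1 C0
               Empty       1  0  s  1
               Zero        1  0  s  0
               Denormal    1  1  s  0
               Infinity    0  1  s  1
               NaN         0  0  s  1
               Normal      0  1  s  0
         */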
   1935 
   1936 
   1937 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
   1938    appears to differ from the former only in that the 8 FP registers
   1939    themselves are not transferred into the guest state. */
   1940 static
   1941 VexEmNote do_put_x87 ( Bool moveRegs,
   1942                        /*IN*/Fpu_State* x87_state,
   1943                        /*OUT*/VexGuestAMD64State* vex_state )
   1944 {
   1945    Int        stno, preg;
   1946    UInt       tag;
   1947    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1948    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1949    UInt       ftop    = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
   1950    UInt       tagw    = x87_state->env[FP_ENV_TAG];
   1951    UInt       fpucw   = x87_state->env[FP_ENV_CTRL];
   1952    UInt       c3210   = x87_state->env[FP_ENV_STAT] & 0x4700;
   1953    VexEmNote  ew;
   1954    UInt       fpround;
   1955    ULong      pair;
   1956 
   1957    /* Copy registers and tags */
   1958    for (stno = 0; stno < 8; stno++) {
   1959       preg = (stno + ftop) & 7;
   1960       tag = (tagw >> (2*preg)) & 3;
   1961       if (tag == 3) {
   1962          /* register is empty */
   1963          /* hmm, if it's empty, does it still get written?  Probably
   1964             safer to say it does.  If we don't, memcheck could get out
   1965             of sync, in that it thinks all FP registers are defined by
   1966             this helper, but in reality some have not been updated. */
   1967          if (moveRegs)
   1968             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   1969          vexTags[preg] = 0;
   1970       } else {
   1971          /* register is non-empty */
   1972          if (moveRegs)
   1973             convert_f80le_to_f64le( &x87_state->reg[10*stno],
   1974                                     (UChar*)&vexRegs[preg] );
   1975          vexTags[preg] = 1;
   1976       }
   1977    }
   1978 
   1979    /* stack pointer */
   1980    vex_state->guest_FTOP = ftop;
   1981 
   1982    /* status word */
   1983    vex_state->guest_FC3210 = c3210;
   1984 
   1985    /* handle the control word, setting FPROUND and detecting any
   1986       emulation warnings. */
   1987    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   1988    fpround = (UInt)pair & 0xFFFFFFFFULL;
   1989    ew      = (VexEmNote)(pair >> 32);
   1990 
   1991    vex_state->guest_FPROUND = fpround & 3;
   1992 
   1993    /* emulation warnings --> caller */
   1994    return ew;
   1995 }
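
         /* Layout note: Fpu_State.env is 14 UShorts -- the 28-byte x87
            protected-mode environment.  The FP_ENV_CTRL/FP_ENV_STAT/
            FP_ENV_TAG indices used above come from guest_generic_x87.h.
            FTOP lives in bits 13:11 of the status word, the C3..C0
            condition bits are in the 0x4700 mask, and the tag word holds
            2 bits per physical register, 11 meaning "empty". */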
   1996 
   1997 
   1998 /* Create an x87 FPU state from the guest state, as close as
   1999    we can approximate it. */
   2000 static
   2001 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
   2002                   /*OUT*/Fpu_State* x87_state )
   2003 {
   2004    Int        i, stno, preg;
   2005    UInt       tagw;
   2006    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2007    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2008    UInt       ftop    = vex_state->guest_FTOP;
   2009    UInt       c3210   = vex_state->guest_FC3210;
   2010 
   2011    for (i = 0; i < 14; i++)
   2012       x87_state->env[i] = 0;
   2013 
   2014    x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
   2015       = x87_state->env[13] = 0xFFFF;
   2016    x87_state->env[FP_ENV_STAT]
   2017       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   2018    x87_state->env[FP_ENV_CTRL]
   2019       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   2020 
   2021    /* Dump the register stack in ST order. */
   2022    tagw = 0;
   2023    for (stno = 0; stno < 8; stno++) {
   2024       preg = (stno + ftop) & 7;
   2025       if (vexTags[preg] == 0) {
   2026          /* register is empty */
   2027          tagw |= (3 << (2*preg));
   2028          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2029                                  &x87_state->reg[10*stno] );
   2030       } else {
   2031          /* register is full. */
   2032          tagw |= (0 << (2*preg));
   2033          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2034                                  &x87_state->reg[10*stno] );
   2035       }
   2036    }
   2037    x87_state->env[FP_ENV_TAG] = toUShort(tagw);
   2038 }
   2039 
   2040 
   2041 /*---------------------------------------------------------------*/
   2042 /*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
   2043 /*---------------------------------------------------------------*/
   2044 
   2045 /* CALLED FROM GENERATED CODE */
   2046 /* DIRTY HELPER (reads guest state, writes guest mem) */
   2047 /* XSAVE component 0 is the x87 FPU state. */
   2048 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
   2049         ( VexGuestAMD64State* gst, HWord addr )
   2050 {
   2051    /* Derived from values obtained from
   2052       vendor_id       : AuthenticAMD
   2053       cpu family      : 15
   2054       model           : 12
   2055       model name      : AMD Athlon(tm) 64 Processor 3200+
   2056       stepping        : 0
   2057       cpu MHz         : 2200.000
   2058       cache size      : 512 KB
   2059    */
   2060    /* Somewhat roundabout, but at least it's simple. */
   2061    Fpu_State tmp;
   2062    UShort*   addrS = (UShort*)addr;
   2063    UChar*    addrC = (UChar*)addr;
   2064    UShort    fp_tags;
   2065    UInt      summary_tags;
   2066    Int       r, stno;
   2067    UShort    *srcS, *dstS;
   2068 
   2069    do_get_x87( gst, &tmp );
   2070 
   2071    /* Now build the proper fxsave x87 image from the fsave x87 image
   2072       we just made. */
   2073 
   2074    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
    2075    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
   2076 
   2077    /* set addrS[2] in an endian-independent way */
   2078    summary_tags = 0;
   2079    fp_tags = tmp.env[FP_ENV_TAG];
   2080    for (r = 0; r < 8; r++) {
   2081       if ( ((fp_tags >> (2*r)) & 3) != 3 )
   2082          summary_tags |= (1 << r);
   2083    }
   2084    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   2085    addrC[5]  = 0; /* pad */
   2086 
   2087    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
   2088       does not write this field. (?!) */
   2089    addrS[3]  = 0; /* BOGUS */
   2090 
   2091    /* RIP (Last x87 instruction pointer).  From experimentation, the
   2092       real CPU does not write this field. (?!) */
   2093    addrS[4]  = 0; /* BOGUS */
   2094    addrS[5]  = 0; /* BOGUS */
   2095    addrS[6]  = 0; /* BOGUS */
   2096    addrS[7]  = 0; /* BOGUS */
   2097 
   2098    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
   2099       does not write this field. (?!) */
   2100    addrS[8]  = 0; /* BOGUS */
   2101    addrS[9]  = 0; /* BOGUS */
   2102    addrS[10] = 0; /* BOGUS */
   2103    addrS[11] = 0; /* BOGUS */
   2104 
   2105    /* addrS[13,12] are MXCSR -- not written */
   2106    /* addrS[15,14] are MXCSR_MASK -- not written */
   2107 
   2108    /* Copy in the FP registers, in ST order. */
   2109    for (stno = 0; stno < 8; stno++) {
   2110       srcS = (UShort*)(&tmp.reg[10*stno]);
   2111       dstS = (UShort*)(&addrS[16 + 8*stno]);
   2112       dstS[0] = srcS[0];
   2113       dstS[1] = srcS[1];
   2114       dstS[2] = srcS[2];
   2115       dstS[3] = srcS[3];
   2116       dstS[4] = srcS[4];
   2117       dstS[5] = 0;
   2118       dstS[6] = 0;
   2119       dstS[7] = 0;
   2120    }
   2121 }
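
         /* Byte-offset recap of the image region written above (the first
            160 bytes of the FXSAVE/XSAVE area):

               0   FCW        2   FSW        4   FTW summary byte + pad
               6   FOP (zeroed)
               8   RIP (zeroed)             16  RDP (zeroed)
               24  MXCSR and MXCSR_MASK (written by component 1, not here)
               32  ST0..ST7 in ST order, 16 bytes each: 10 data + 6 zero  */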
   2122 
   2123 
   2124 /* CALLED FROM GENERATED CODE */
   2125 /* DIRTY HELPER (reads guest state, writes guest mem) */
   2126 /* XSAVE component 1 is the SSE state. */
   2127 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
   2128         ( VexGuestAMD64State* gst, HWord addr )
   2129 {
   2130    UShort* addrS = (UShort*)addr;
   2131    UInt    mxcsr;
   2132 
   2133    /* The only non-register parts of the SSE state are MXCSR and
   2134       MXCSR_MASK. */
   2135    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
   2136 
   2137    addrS[12] = toUShort(mxcsr);  /* MXCSR */
   2138    addrS[13] = toUShort(mxcsr >> 16);
   2139 
   2140    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
   2141    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
   2142 }
   2143 
   2144 
   2145 /* VISIBLE TO LIBVEX CLIENT */
   2146 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
   2147    the result at the given address which represents a buffer of at
   2148    least 416 bytes.
   2149 
   2150    This function is not called from generated code.  FXSAVE is dealt
   2151    with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
   2152    functions above plus some in-line IR.  This function is merely a
   2153    convenience function for VEX's users.
   2154 */
   2155 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
   2156                                 /*OUT*/HWord fp_state )
   2157 {
   2158    /* Do the x87 part */
   2159    amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
   2160 
   2161    /* And now the SSE part, except for the registers themselves. */
   2162    amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
   2163 
   2164    /* That's the first 160 bytes of the image done. */
   2165    /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
   2166       big-endian, these need to be byte-swapped. */
   2167    U128 *xmm = (U128 *)(fp_state + 160);
   2168    vassert(host_is_little_endian());
   2169 
   2170 #  define COPY_U128(_dst,_src)                       \
   2171       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   2172            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   2173       while (0)
   2174 
   2175    COPY_U128( xmm[0],  gst->guest_YMM0 );
   2176    COPY_U128( xmm[1],  gst->guest_YMM1 );
   2177    COPY_U128( xmm[2],  gst->guest_YMM2 );
   2178    COPY_U128( xmm[3],  gst->guest_YMM3 );
   2179    COPY_U128( xmm[4],  gst->guest_YMM4 );
   2180    COPY_U128( xmm[5],  gst->guest_YMM5 );
   2181    COPY_U128( xmm[6],  gst->guest_YMM6 );
   2182    COPY_U128( xmm[7],  gst->guest_YMM7 );
   2183    COPY_U128( xmm[8],  gst->guest_YMM8 );
   2184    COPY_U128( xmm[9],  gst->guest_YMM9 );
   2185    COPY_U128( xmm[10], gst->guest_YMM10 );
   2186    COPY_U128( xmm[11], gst->guest_YMM11 );
   2187    COPY_U128( xmm[12], gst->guest_YMM12 );
   2188    COPY_U128( xmm[13], gst->guest_YMM13 );
   2189    COPY_U128( xmm[14], gst->guest_YMM14 );
   2190    COPY_U128( xmm[15], gst->guest_YMM15 );
   2191 #  undef COPY_U128
   2192 }
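
         /* A minimal usage sketch (hypothetical client code, not compiled
            in).  'buf' stands for whatever buffer of at least 416 bytes the
            client supplies; the real FXSAVE instruction would additionally
            require 16-byte alignment. */
         #if 0
         static void example_fxsave ( VexGuestAMD64State* gst )
         {
            static UChar buf[416];
            LibVEX_GuestAMD64_fxsave( gst, (HWord)&buf[0] );
         }
         #endif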
   2193 
   2194 
   2195 /*---------------------------------------------------------------*/
   2196 /*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
   2197 /*---------------------------------------------------------------*/
   2198 
   2199 /* CALLED FROM GENERATED CODE */
   2200 /* DIRTY HELPER (writes guest state, reads guest mem) */
   2201 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
   2202              ( VexGuestAMD64State* gst, HWord addr )
   2203 {
   2204    Fpu_State tmp;
   2205    UShort*   addrS   = (UShort*)addr;
   2206    UChar*    addrC   = (UChar*)addr;
   2207    UShort    fp_tags;
   2208    Int       r, stno, i;
   2209 
   2210    /* Copy the x87 registers out of the image, into a temporary
   2211       Fpu_State struct. */
   2212    for (i = 0; i < 14; i++) tmp.env[i] = 0;
   2213    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   2214    /* fill in tmp.reg[0..7] */
   2215    for (stno = 0; stno < 8; stno++) {
   2216       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
   2217       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
   2218       dstS[0] = srcS[0];
   2219       dstS[1] = srcS[1];
   2220       dstS[2] = srcS[2];
   2221       dstS[3] = srcS[3];
   2222       dstS[4] = srcS[4];
   2223    }
   2224    /* fill in tmp.env[0..13] */
   2225    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
    2226    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
   2227 
   2228    fp_tags = 0;
   2229    for (r = 0; r < 8; r++) {
   2230       if (addrC[4] & (1<<r))
    2231          fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
   2232       else
    2233          fp_tags |= (3 << (2*r)); /* EMPTY */
   2234    }
   2235    tmp.env[FP_ENV_TAG] = fp_tags;
   2236 
   2237    /* Now write 'tmp' into the guest state. */
   2238    VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
   2239 
   2240    return warnX87;
   2241 }
   2242 
   2243 
   2244 /* CALLED FROM GENERATED CODE */
   2245 /* DIRTY HELPER (writes guest state, reads guest mem) */
   2246 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
   2247              ( VexGuestAMD64State* gst, HWord addr )
   2248 {
   2249    UShort* addrS = (UShort*)addr;
   2250    UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
   2251                    | ((((UInt)addrS[13]) & 0xFFFF) << 16);
   2252    ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
   2253 
   2254    VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
   2255 
   2256    gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
   2257    return warnXMM;
   2258 }
   2259 
   2260 
   2261 /* VISIBLE TO LIBVEX CLIENT */
    2262 /* Do FXRSTOR from the supplied address and write the values read
    2263    into the given VexGuestAMD64State structure.
   2264 
   2265    This function is not called from generated code.  FXRSTOR is dealt
   2266    with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
   2267    functions above plus some in-line IR.  This function is merely a
   2268    convenience function for VEX's users.
   2269 */
   2270 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
   2271                                       /*MOD*/VexGuestAMD64State* gst )
   2272 {
   2273    /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
   2274       to be byte-swapped. */
   2275    U128 *xmm = (U128 *)(fp_state + 160);
   2276 
   2277    vassert(host_is_little_endian());
   2278 
   2279 #  define COPY_U128(_dst,_src)                       \
   2280       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
   2281            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
   2282       while (0)
   2283 
   2284    COPY_U128( gst->guest_YMM0, xmm[0] );
   2285    COPY_U128( gst->guest_YMM1, xmm[1] );
   2286    COPY_U128( gst->guest_YMM2, xmm[2] );
   2287    COPY_U128( gst->guest_YMM3, xmm[3] );
   2288    COPY_U128( gst->guest_YMM4, xmm[4] );
   2289    COPY_U128( gst->guest_YMM5, xmm[5] );
   2290    COPY_U128( gst->guest_YMM6, xmm[6] );
   2291    COPY_U128( gst->guest_YMM7, xmm[7] );
   2292    COPY_U128( gst->guest_YMM8, xmm[8] );
   2293    COPY_U128( gst->guest_YMM9, xmm[9] );
   2294    COPY_U128( gst->guest_YMM10, xmm[10] );
   2295    COPY_U128( gst->guest_YMM11, xmm[11] );
   2296    COPY_U128( gst->guest_YMM12, xmm[12] );
   2297    COPY_U128( gst->guest_YMM13, xmm[13] );
   2298    COPY_U128( gst->guest_YMM14, xmm[14] );
   2299    COPY_U128( gst->guest_YMM15, xmm[15] );
   2300 
   2301 #  undef COPY_U128
   2302 
   2303    VexEmNote warnXMM
   2304       = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
   2305    VexEmNote warnX87
   2306       = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
   2307 
   2308    /* Prefer an X87 emwarn over an XMM one, if both exist. */
   2309    if (warnX87 != EmNote_NONE)
   2310       return warnX87;
   2311    else
   2312       return warnXMM;
   2313 }
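
         /* And the matching restore sketch (hypothetical, not compiled in):
            reload the image and check for emulation warnings. */
         #if 0
         static void example_fxrstor ( VexGuestAMD64State* gst, HWord buf )
         {
            VexEmNote ew = LibVEX_GuestAMD64_fxrstor( buf, gst );
            if (ew != EmNote_NONE) {
               /* the image requested FP behaviour VEX cannot emulate
                  exactly */
            }
         }
         #endif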
   2314 
   2315 
   2316 /*---------------------------------------------------------------*/
   2317 /*--- Supporting functions for FSAVE/FRSTOR                   ---*/
   2318 /*---------------------------------------------------------------*/
   2319 
   2320 /* DIRTY HELPER (writes guest state) */
   2321 /* Initialise the x87 FPU state as per 'finit'. */
   2322 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
   2323 {
   2324    Int i;
   2325    gst->guest_FTOP = 0;
   2326    for (i = 0; i < 8; i++) {
   2327       gst->guest_FPTAG[i] = 0; /* empty */
   2328       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   2329    }
   2330    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
   2331    gst->guest_FC3210  = 0;
   2332 }
   2333 
   2334 
   2335 /* CALLED FROM GENERATED CODE */
   2336 /* DIRTY HELPER (reads guest memory) */
   2337 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
   2338 {
   2339    ULong f64;
   2340    convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
   2341    return f64;
   2342 }
   2343 
   2344 /* CALLED FROM GENERATED CODE */
   2345 /* DIRTY HELPER (writes guest memory) */
   2346 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
   2347 {
   2348    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
   2349 }
   2350 
   2351 
   2352 /* CALLED FROM GENERATED CODE */
   2353 /* CLEAN HELPER */
   2354 /* mxcsr[15:0] contains a SSE native format MXCSR value.
   2355    Extract from it the required SSEROUND value and any resulting
   2356    emulation warning, and return (warn << 32) | sseround value.
   2357 */
   2358 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
   2359 {
   2360    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   2361    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2362    ULong rmode = (mxcsr >> 13) & 3;
   2363 
   2364    /* Detect any required emulation warnings. */
   2365    VexEmNote ew = EmNote_NONE;
   2366 
   2367    if ((mxcsr & 0x1F80) != 0x1F80) {
   2368       /* unmasked exceptions! */
   2369       ew = EmWarn_X86_sseExns;
   2370    }
   2371    else
   2372    if (mxcsr & (1<<15)) {
   2373       /* FZ is set */
   2374       ew = EmWarn_X86_fz;
   2375    }
   2376    else
   2377    if (mxcsr & (1<<6)) {
   2378       /* DAZ is set */
   2379       ew = EmWarn_X86_daz;
   2380    }
   2381 
   2382    return (((ULong)ew) << 32) | ((ULong)rmode);
   2383 }
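
         /* Callers unpack the returned pair like this (cf. the XRSTOR
            component-1 helper above, where w32 is the MXCSR value read
            from the image):

               ULong     pair  = amd64g_check_ldmxcsr( (ULong)w32 );
               ULong     rmode = pair & 0xFFFFFFFFULL;
               VexEmNote ew    = (VexEmNote)(pair >> 32);
         */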
   2384 
   2385 
   2386 /* CALLED FROM GENERATED CODE */
   2387 /* CLEAN HELPER */
   2388 /* Given sseround as an IRRoundingMode value, create a suitable SSE
   2389    native format MXCSR value. */
   2390 ULong amd64g_create_mxcsr ( ULong sseround )
   2391 {
   2392    sseround &= 3;
   2393    return 0x1F80 | (sseround << 13);
   2394 }
   2395 
   2396 
   2397 /* CLEAN HELPER */
   2398 /* fpucw[15:0] contains a x87 native format FPU control word.
   2399    Extract from it the required FPROUND value and any resulting
   2400    emulation warning, and return (warn << 32) | fpround value.
   2401 */
   2402 ULong amd64g_check_fldcw ( ULong fpucw )
   2403 {
   2404    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   2405    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2406    ULong rmode = (fpucw >> 10) & 3;
   2407 
   2408    /* Detect any required emulation warnings. */
   2409    VexEmNote ew = EmNote_NONE;
   2410 
   2411    if ((fpucw & 0x3F) != 0x3F) {
   2412       /* unmasked exceptions! */
   2413       ew = EmWarn_X86_x87exns;
   2414    }
   2415    else
   2416    if (((fpucw >> 8) & 3) != 3) {
   2417       /* unsupported precision */
   2418       ew = EmWarn_X86_x87precision;
   2419    }
   2420 
   2421    return (((ULong)ew) << 32) | ((ULong)rmode);
   2422 }
   2423 
   2424 
   2425 /* CLEAN HELPER */
   2426 /* Given fpround as an IRRoundingMode value, create a suitable x87
   2427    native format FPU control word. */
   2428 ULong amd64g_create_fpucw ( ULong fpround )
   2429 {
   2430    fpround &= 3;
   2431    return 0x037F | (fpround << 10);
   2432 }
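
         /* Note the round trip: for fpround in 0..3,
            amd64g_check_fldcw(amd64g_create_fpucw(fpround)) yields fpround
            back with no warning, since 0x037F masks all exceptions
            (bits 5:0 set) and keeps the supported 64-bit-significand
            precision (bits 9:8 == 11). */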
   2433 
   2434 
   2435 /* This is used to implement 'fldenv'.
   2436    Reads 28 bytes at x87_state[0 .. 27]. */
   2437 /* CALLED FROM GENERATED CODE */
   2438 /* DIRTY HELPER */
   2439 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
   2440                                       /*IN*/HWord x87_state)
   2441 {
   2442    return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
   2443 }
   2444 
   2445 
   2446 /* CALLED FROM GENERATED CODE */
   2447 /* DIRTY HELPER */
   2448 /* Create an x87 FPU env from the guest state, as close as we can
   2449    approximate it.  Writes 28 bytes at x87_state[0..27]. */
   2450 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
   2451                                  /*OUT*/HWord x87_state )
   2452 {
   2453    Int        i, stno, preg;
   2454    UInt       tagw;
   2455    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2456    Fpu_State* x87     = (Fpu_State*)x87_state;
   2457    UInt       ftop    = vex_state->guest_FTOP;
   2458    ULong      c3210   = vex_state->guest_FC3210;
   2459 
   2460    for (i = 0; i < 14; i++)
   2461       x87->env[i] = 0;
   2462 
   2463    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   2464    x87->env[FP_ENV_STAT]
   2465       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
   2466    x87->env[FP_ENV_CTRL]
   2467       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
   2468 
   2469    /* Compute the x87 tag word. */
   2470    tagw = 0;
   2471    for (stno = 0; stno < 8; stno++) {
   2472       preg = (stno + ftop) & 7;
   2473       if (vexTags[preg] == 0) {
   2474          /* register is empty */
   2475          tagw |= (3 << (2*preg));
   2476       } else {
   2477          /* register is full. */
   2478          tagw |= (0 << (2*preg));
   2479       }
   2480    }
   2481    x87->env[FP_ENV_TAG] = toUShort(tagw);
   2482 
   2483    /* We don't dump the x87 registers, tho. */
   2484 }
   2485 
   2486 
   2487 /* This is used to implement 'fnsave'.
   2488    Writes 108 bytes at x87_state[0 .. 107]. */
   2489 /* CALLED FROM GENERATED CODE */
   2490 /* DIRTY HELPER */
   2491 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
   2492                                  /*OUT*/HWord x87_state)
   2493 {
   2494    do_get_x87( vex_state, (Fpu_State*)x87_state );
   2495 }
   2496 
   2497 
   2498 /* This is used to implement 'fnsaves'.
   2499    Writes 94 bytes at x87_state[0 .. 93]. */
   2500 /* CALLED FROM GENERATED CODE */
   2501 /* DIRTY HELPER */
   2502 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
   2503                                   /*OUT*/HWord x87_state)
   2504 {
   2505    Int           i, stno, preg;
   2506    UInt          tagw;
   2507    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2508    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2509    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2510    UInt          ftop    = vex_state->guest_FTOP;
   2511    UInt          c3210   = vex_state->guest_FC3210;
   2512 
   2513    for (i = 0; i < 7; i++)
   2514       x87->env[i] = 0;
   2515 
   2516    x87->env[FPS_ENV_STAT]
   2517       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   2518    x87->env[FPS_ENV_CTRL]
   2519       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   2520 
   2521    /* Dump the register stack in ST order. */
   2522    tagw = 0;
   2523    for (stno = 0; stno < 8; stno++) {
   2524       preg = (stno + ftop) & 7;
   2525       if (vexTags[preg] == 0) {
   2526          /* register is empty */
   2527          tagw |= (3 << (2*preg));
   2528          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2529                                  &x87->reg[10*stno] );
   2530       } else {
   2531          /* register is full. */
   2532          tagw |= (0 << (2*preg));
   2533          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2534                                  &x87->reg[10*stno] );
   2535       }
   2536    }
   2537    x87->env[FPS_ENV_TAG] = toUShort(tagw);
   2538 }
   2539 
   2540 
   2541 /* This is used to implement 'frstor'.
   2542    Reads 108 bytes at x87_state[0 .. 107]. */
   2543 /* CALLED FROM GENERATED CODE */
   2544 /* DIRTY HELPER */
   2545 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
   2546                                       /*IN*/HWord x87_state)
   2547 {
   2548    return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
   2549 }
   2550 
   2551 
   2552 /* This is used to implement 'frstors'.
   2553    Reads 94 bytes at x87_state[0 .. 93]. */
   2554 /* CALLED FROM GENERATED CODE */
   2555 /* DIRTY HELPER */
   2556 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
   2557                                        /*IN*/HWord x87_state)
   2558 {
   2559    Int           stno, preg;
   2560    UInt          tag;
   2561    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2562    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2563    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2564    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
   2565    UInt          tagw    = x87->env[FPS_ENV_TAG];
   2566    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
   2567    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
   2568    VexEmNote     ew;
   2569    UInt          fpround;
   2570    ULong         pair;
   2571 
   2572    /* Copy registers and tags */
   2573    for (stno = 0; stno < 8; stno++) {
   2574       preg = (stno + ftop) & 7;
   2575       tag = (tagw >> (2*preg)) & 3;
   2576       if (tag == 3) {
   2577          /* register is empty */
   2578          /* hmm, if it's empty, does it still get written?  Probably
   2579             safer to say it does.  If we don't, memcheck could get out
   2580             of sync, in that it thinks all FP registers are defined by
   2581             this helper, but in reality some have not been updated. */
   2582          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   2583          vexTags[preg] = 0;
   2584       } else {
   2585          /* register is non-empty */
   2586          convert_f80le_to_f64le( &x87->reg[10*stno],
   2587                                  (UChar*)&vexRegs[preg] );
   2588          vexTags[preg] = 1;
   2589       }
   2590    }
   2591 
   2592    /* stack pointer */
   2593    vex_state->guest_FTOP = ftop;
   2594 
   2595    /* status word */
   2596    vex_state->guest_FC3210 = c3210;
   2597 
   2598    /* handle the control word, setting FPROUND and detecting any
   2599       emulation warnings. */
   2600    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   2601    fpround = (UInt)pair & 0xFFFFFFFFULL;
   2602    ew      = (VexEmNote)(pair >> 32);
   2603 
   2604    vex_state->guest_FPROUND = fpround & 3;
   2605 
   2606    /* emulation warnings --> caller */
   2607    return ew;
   2608 }
   2609 
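        /* Aside: the status-word packing assumed by both FNSAVES and
           FRSTORS: TOP lives in bits 13:11, and the condition codes
           C3,C2,C1,C0 in bits 14,10,9,8 -- exactly the bits the 0x4700
           mask selects:

              UShort stat  = x87->env[FPS_ENV_STAT];
              UInt   ftop  = (stat >> 11) & 7;     // bits 13:11
              UInt   c3210 = stat & 0x4700;        // bits 14,10,9,8
        */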
   2610 
   2611 /*---------------------------------------------------------------*/
   2612 /*--- CPUID helpers.                                          ---*/
   2613 /*---------------------------------------------------------------*/
   2614 
   2615 /* Claim to be the following CPU, which is probably representative of
   2616    the lowliest (earliest) amd64 offerings.  It supports neither
   2617    sse3 nor cx16.
   2618 
   2619    vendor_id       : AuthenticAMD
   2620    cpu family      : 15
   2621    model           : 5
   2622    model name      : AMD Opteron (tm) Processor 848
   2623    stepping        : 10
   2624    cpu MHz         : 1797.682
   2625    cache size      : 1024 KB
   2626    fpu             : yes
   2627    fpu_exception   : yes
   2628    cpuid level     : 1
   2629    wp              : yes
   2630    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2631                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
   2632                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
   2633    bogomips        : 3600.62
   2634    TLB size        : 1088 4K pages
   2635    clflush size    : 64
   2636    cache_alignment : 64
   2637    address sizes   : 40 bits physical, 48 bits virtual
   2638    power management: ts fid vid ttp
   2639 
   2640    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
   2641    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
   2642    and 3dnowext is 80000001.EDX.30.
   2643 */
   2644 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
   2645 {
   2646 #  define SET_ABCD(_a,_b,_c,_d)                \
   2647       do { st->guest_RAX = (ULong)(_a);        \
   2648            st->guest_RBX = (ULong)(_b);        \
   2649            st->guest_RCX = (ULong)(_c);        \
   2650            st->guest_RDX = (ULong)(_d);        \
   2651       } while (0)
   2652 
   2653    switch (0xFFFFFFFF & st->guest_RAX) {
   2654       case 0x00000000:
   2655          SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
   2656          break;
   2657       case 0x00000001:
   2658          SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
   2659          break;
   2660       case 0x80000000:
   2661          SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
   2662          break;
   2663       case 0x80000001:
   2664          /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
   2665             the original it-is-supported value that the h/w provides.
   2666             See #291568. */
   2667          SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
   2668                                                       0x21d3fbff);
   2669          break;
   2670       case 0x80000002:
   2671          SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
   2672          break;
   2673       case 0x80000003:
   2674          SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
   2675          break;
   2676       case 0x80000004:
   2677          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2678          break;
   2679       case 0x80000005:
   2680          SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
   2681          break;
   2682       case 0x80000006:
   2683          SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
   2684          break;
   2685       case 0x80000007:
   2686          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
   2687          break;
   2688       case 0x80000008:
   2689          SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
   2690          break;
   2691       default:
   2692          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2693          break;
   2694    }
   2695 #  undef SET_ABCD
   2696 }
   2697 
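        /* For reference, the EBX/EDX/ECX values returned for leaf 0
           above are the vendor string packed as little-endian dwords.
           A sketch of the decode (assumes a little-endian host and a
           memcpy from <string.h>):

              UInt  ebx = 0x68747541, edx = 0x69746e65, ecx = 0x444d4163;
              UChar vendor[13];
              memcpy(vendor+0, &ebx, 4);    // "Auth"
              memcpy(vendor+4, &edx, 4);    // "enti"
              memcpy(vendor+8, &ecx, 4);    // "cAMD"
              vendor[12] = 0;               // "AuthenticAMD"
        */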
   2698 
   2699 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
   2700    capable.
   2701 
   2702    vendor_id       : GenuineIntel
   2703    cpu family      : 6
   2704    model           : 15
   2705    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
   2706    stepping        : 6
   2707    cpu MHz         : 2394.000
   2708    cache size      : 4096 KB
   2709    physical id     : 0
   2710    siblings        : 2
   2711    core id         : 0
   2712    cpu cores       : 2
   2713    fpu             : yes
   2714    fpu_exception   : yes
   2715    cpuid level     : 10
   2716    wp              : yes
   2717    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2718                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2719                      mmx fxsr sse sse2 ss ht tm syscall nx lm
   2720                      constant_tsc pni monitor ds_cpl vmx est tm2
   2721                      cx16 xtpr lahf_lm
   2722    bogomips        : 4798.78
   2723    clflush size    : 64
   2724    cache_alignment : 64
   2725    address sizes   : 36 bits physical, 48 bits virtual
   2726    power management:
   2727 */
   2728 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
   2729 {
   2730 #  define SET_ABCD(_a,_b,_c,_d)                \
   2731       do { st->guest_RAX = (ULong)(_a);        \
   2732            st->guest_RBX = (ULong)(_b);        \
   2733            st->guest_RCX = (ULong)(_c);        \
   2734            st->guest_RDX = (ULong)(_d);        \
   2735       } while (0)
   2736 
   2737    switch (0xFFFFFFFF & st->guest_RAX) {
   2738       case 0x00000000:
   2739          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
   2740          break;
   2741       case 0x00000001:
   2742          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
   2743          break;
   2744       case 0x00000002:
   2745          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
   2746          break;
   2747       case 0x00000003:
   2748          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2749          break;
   2750       case 0x00000004: {
   2751          switch (0xFFFFFFFF & st->guest_RCX) {
   2752             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
   2753                                       0x0000003f, 0x00000001); break;
   2754             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
   2755                                       0x0000003f, 0x00000001); break;
   2756             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
   2757                                       0x00000fff, 0x00000001); break;
   2758             default:         SET_ABCD(0x00000000, 0x00000000,
   2759                                       0x00000000, 0x00000000); break;
   2760          }
   2761          break;
   2762       }
   2763       case 0x00000005:
   2764          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
   2765          break;
   2766       case 0x00000006:
   2767          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
   2768          break;
   2769       case 0x00000007:
   2770          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2771          break;
   2772       case 0x00000008:
   2773          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
   2774          break;
   2775       case 0x00000009:
   2776          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2777          break;
   2778       case 0x0000000a:
   2779       unhandled_eax_value:
   2780          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
   2781          break;
   2782       case 0x80000000:
   2783          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2784          break;
   2785       case 0x80000001:
   2786          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
   2787          break;
   2788       case 0x80000002:
   2789          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2790          break;
   2791       case 0x80000003:
   2792          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
   2793          break;
   2794       case 0x80000004:
   2795          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
   2796          break;
   2797       case 0x80000005:
   2798          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2799          break;
   2800       case 0x80000006:
   2801          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
   2802          break;
   2803       case 0x80000007:
   2804          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2805          break;
   2806       case 0x80000008:
   2807          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2808          break;
   2809       default:
   2810          goto unhandled_eax_value;
   2811    }
   2812 #  undef SET_ABCD
   2813 }
   2814 
   2815 
   2816 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
   2817    capable.
   2818 
   2819    vendor_id       : GenuineIntel
   2820    cpu family      : 6
   2821    model           : 37
   2822    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
   2823    stepping        : 2
   2824    cpu MHz         : 3334.000
   2825    cache size      : 4096 KB
   2826    physical id     : 0
   2827    siblings        : 4
   2828    core id         : 0
   2829    cpu cores       : 2
   2830    apicid          : 0
   2831    initial apicid  : 0
   2832    fpu             : yes
   2833    fpu_exception   : yes
   2834    cpuid level     : 11
   2835    wp              : yes
   2836    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2837                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2838                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2839                      lm constant_tsc arch_perfmon pebs bts rep_good
   2840                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2841                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
   2842                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
   2843                      arat tpr_shadow vnmi flexpriority ept vpid
   2844    bogomips        : 6957.57
   2845    clflush size    : 64
   2846    cache_alignment : 64
   2847    address sizes   : 36 bits physical, 48 bits virtual
   2848    power management:
   2849 */
   2850 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
   2851 {
   2852 #  define SET_ABCD(_a,_b,_c,_d)                \
   2853       do { st->guest_RAX = (ULong)(_a);        \
   2854            st->guest_RBX = (ULong)(_b);        \
   2855            st->guest_RCX = (ULong)(_c);        \
   2856            st->guest_RDX = (ULong)(_d);        \
   2857       } while (0)
   2858 
   2859    UInt old_eax = (UInt)st->guest_RAX;
   2860    UInt old_ecx = (UInt)st->guest_RCX;
   2861 
   2862    switch (old_eax) {
   2863       case 0x00000000:
   2864          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
   2865          break;
   2866       case 0x00000001:
   2867          SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
   2868          break;
   2869       case 0x00000002:
   2870          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
   2871          break;
   2872       case 0x00000003:
   2873          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2874          break;
   2875       case 0x00000004:
   2876          switch (old_ecx) {
   2877             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2878                                       0x0000003f, 0x00000000); break;
   2879             case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
   2880                                       0x0000007f, 0x00000000); break;
   2881             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2882                                       0x000001ff, 0x00000000); break;
   2883             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
   2884                                       0x00000fff, 0x00000002); break;
   2885             default:         SET_ABCD(0x00000000, 0x00000000,
   2886                                       0x00000000, 0x00000000); break;
   2887          }
   2888          break;
   2889       case 0x00000005:
   2890          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2891          break;
   2892       case 0x00000006:
   2893          SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
   2894          break;
   2895       case 0x00000007:
   2896          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2897          break;
   2898       case 0x00000008:
   2899          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2900          break;
   2901       case 0x00000009:
   2902          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2903          break;
   2904       case 0x0000000a:
   2905          SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
   2906          break;
   2907       case 0x0000000b:
   2908          switch (old_ecx) {
   2909             case 0x00000000:
   2910                SET_ABCD(0x00000001, 0x00000002,
   2911                         0x00000100, 0x00000000); break;
   2912             case 0x00000001:
   2913                SET_ABCD(0x00000004, 0x00000004,
   2914                         0x00000201, 0x00000000); break;
   2915             default:
   2916                SET_ABCD(0x00000000, 0x00000000,
   2917                         old_ecx,    0x00000000); break;
   2918          }
   2919          break;
   2920       case 0x0000000c:
   2921          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2922          break;
   2923       case 0x0000000d:
   2924          switch (old_ecx) {
   2925             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
   2926                                       0x00000100, 0x00000000); break;
   2927             case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
   2928                                       0x00000201, 0x00000000); break;
   2929             default:         SET_ABCD(0x00000000, 0x00000000,
   2930                                       old_ecx,    0x00000000); break;
   2931          }
   2932          break;
   2933       case 0x80000000:
   2934          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2935          break;
   2936       case 0x80000001:
   2937          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   2938          break;
   2939       case 0x80000002:
   2940          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2941          break;
   2942       case 0x80000003:
   2943          SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
   2944          break;
   2945       case 0x80000004:
   2946          SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
   2947          break;
   2948       case 0x80000005:
   2949          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2950          break;
   2951       case 0x80000006:
   2952          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   2953          break;
   2954       case 0x80000007:
   2955          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   2956          break;
   2957       case 0x80000008:
   2958          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2959          break;
   2960       default:
   2961          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2962          break;
   2963    }
   2964 #  undef SET_ABCD
   2965 }
   2966 
   2967 
   2968 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
   2969    capable.  Plus (kludge!) it "supports" HTM.
   2970 
   2971    Also with the following change: claim that XSaveOpt is not
   2972    available, by having cpuid(eax=0xD,ecx=1).eax[0] return 0 rather than
   2973    1 as on the real CPU.  Consequently, programs that correctly observe
   2974    these CPUID values should only try to use 3 of the 8 XSave-family
   2975    instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
   2976    having to implement the compacted or optimised save/restore
   2977    variants.
   2978 
   2979    vendor_id       : GenuineIntel
   2980    cpu family      : 6
   2981    model           : 42
   2982    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
   2983    stepping        : 7
   2984    cpu MHz         : 1600.000
   2985    cache size      : 6144 KB
   2986    physical id     : 0
   2987    siblings        : 4
   2988    core id         : 3
   2989    cpu cores       : 4
   2990    apicid          : 6
   2991    initial apicid  : 6
   2992    fpu             : yes
   2993    fpu_exception   : yes
   2994    cpuid level     : 13
   2995    wp              : yes
   2996    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2997                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2998                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2999                      lm constant_tsc arch_perfmon pebs bts rep_good
   3000                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
   3001                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
   3002                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
   3003                      lahf_lm ida arat epb xsaveopt pln pts dts
   3004                      tpr_shadow vnmi flexpriority ept vpid
   3005 
   3006    bogomips        : 5768.94
   3007    clflush size    : 64
   3008    cache_alignment : 64
   3009    address sizes   : 36 bits physical, 48 bits virtual
   3010    power management:
   3011 */
   3012 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
   3013 {
   3014 #  define SET_ABCD(_a,_b,_c,_d)                \
   3015       do { st->guest_RAX = (ULong)(_a);        \
   3016            st->guest_RBX = (ULong)(_b);        \
   3017            st->guest_RCX = (ULong)(_c);        \
   3018            st->guest_RDX = (ULong)(_d);        \
   3019       } while (0)
   3020 
   3021    UInt old_eax = (UInt)st->guest_RAX;
   3022    UInt old_ecx = (UInt)st->guest_RCX;
   3023 
   3024    switch (old_eax) {
   3025       case 0x00000000:
   3026          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
   3027          break;
   3028       case 0x00000001:
   3029          SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
   3030          break;
   3031       case 0x00000002:
   3032          SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
   3033          break;
   3034       case 0x00000003:
   3035          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3036          break;
   3037       case 0x00000004:
   3038          switch (old_ecx) {
   3039             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   3040                                       0x0000003f, 0x00000000); break;
   3041             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
   3042                                       0x0000003f, 0x00000000); break;
   3043             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   3044                                       0x000001ff, 0x00000000); break;
   3045             case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
   3046                                       0x00001fff, 0x00000006); break;
   3047             default:         SET_ABCD(0x00000000, 0x00000000,
   3048                                       0x00000000, 0x00000000); break;
   3049          }
   3050          break;
   3051       case 0x00000005:
   3052          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   3053          break;
   3054       case 0x00000006:
   3055          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
   3056          break;
   3057       case 0x00000007:
   3058          SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
   3059          break;
   3060       case 0x00000008:
   3061          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3062          break;
   3063       case 0x00000009:
   3064          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3065          break;
   3066       case 0x0000000a:
   3067          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
   3068          break;
   3069       case 0x0000000b:
   3070          switch (old_ecx) {
   3071             case 0x00000000:
   3072                SET_ABCD(0x00000001, 0x00000001,
   3073                         0x00000100, 0x00000000); break;
   3074             case 0x00000001:
   3075                SET_ABCD(0x00000004, 0x00000004,
   3076                         0x00000201, 0x00000000); break;
   3077             default:
   3078                SET_ABCD(0x00000000, 0x00000000,
   3079                         old_ecx,    0x00000000); break;
   3080          }
   3081          break;
   3082       case 0x0000000c:
   3083          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3084          break;
   3085       case 0x0000000d:
   3086          switch (old_ecx) {
   3087             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
   3088                                       0x00000340, 0x00000000); break;
   3089             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
   3090                                       0x00000000, 0x00000000); break;
   3091             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
   3092                                       0x00000000, 0x00000000); break;
   3093             default:         SET_ABCD(0x00000000, 0x00000000,
   3094                                       0x00000000, 0x00000000); break;
   3095          }
   3096          break;
   3097       case 0x0000000e:
   3098          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3099          break;
   3100       case 0x0000000f:
   3101          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3102          break;
   3103       case 0x80000000:
   3104          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   3105          break;
   3106       case 0x80000001:
   3107          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   3108          break;
   3109       case 0x80000002:
   3110          SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
   3111          break;
   3112       case 0x80000003:
   3113          SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
   3114          break;
   3115       case 0x80000004:
   3116          SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
   3117          break;
   3118       case 0x80000005:
   3119          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3120          break;
   3121       case 0x80000006:
   3122          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   3123          break;
   3124       case 0x80000007:
   3125          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   3126          break;
   3127       case 0x80000008:
   3128          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   3129          break;
   3130       default:
   3131          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3132          break;
   3133    }
   3134 #  undef SET_ABCD
   3135 }
   3136 
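        /* Under the values above, an XSaveOpt feature probe comes back
           negative, as intended.  A sketch, where cpuid() stands for a
           hypothetical wrapper that runs CPUID on the given leaf and
           subleaf and hands back the four result registers:

              UInt eax, ebx, ecx, edx;
              cpuid(0xD, 0x1, &eax, &ebx, &ecx, &edx);
              Bool have_xsaveopt = (eax & 1) != 0; // eax bit 0 = XSAVEOPT

           Leaf 0xD subleaf 1 returns all zeroes here, so have_xsaveopt
           is False and well-behaved guests stick to plain
           XGETBV/XSAVE/XRSTOR. */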
   3137 
   3138 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
   3139 
   3140    With the following change: claim that XSaveOpt is not available, by
   3141    having cpuid(eax=0xD,ecx=1).eax[0] return 0 rather than 1 as on the
   3142    real CPU.  Consequently, programs that correctly observe these CPUID
   3143    values should only try to use 3 of the 8 XSave-family instructions:
   3144    XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
   3145    implement the compacted or optimised save/restore variants.
   3146 
   3147    vendor_id       : GenuineIntel
   3148    cpu family      : 6
   3149    model           : 60
   3150    model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
   3151    stepping        : 3
   3152    microcode       : 0x1c
   3153    cpu MHz         : 919.957
   3154    cache size      : 8192 KB
   3155    physical id     : 0
   3156    siblings        : 4
   3157    core id         : 3
   3158    cpu cores       : 4
   3159    apicid          : 6
   3160    initial apicid  : 6
   3161    fpu             : yes
   3162    fpu_exception   : yes
   3163    cpuid level     : 13
   3164    wp              : yes
   3165    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
   3166                      cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
   3167                      tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
   3168                      arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
   3169                      aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
   3170                      vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
   3171                      sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
   3172                      avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
   3173                      tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
   3174                      bmi1 avx2 smep bmi2 erms invpcid xsaveopt
   3175    bugs            :
   3176    bogomips        : 5786.68
   3177    clflush size    : 64
   3178    cache_alignment : 64
   3179    address sizes   : 39 bits physical, 48 bits virtual
   3180    power management:
   3181 */
   3182 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
   3183 {
   3184 #  define SET_ABCD(_a,_b,_c,_d)                \
   3185       do { st->guest_RAX = (ULong)(_a);        \
   3186            st->guest_RBX = (ULong)(_b);        \
   3187            st->guest_RCX = (ULong)(_c);        \
   3188            st->guest_RDX = (ULong)(_d);        \
   3189       } while (0)
   3190 
   3191    UInt old_eax = (UInt)st->guest_RAX;
   3192    UInt old_ecx = (UInt)st->guest_RCX;
   3193 
   3194    switch (old_eax) {
   3195       case 0x00000000:
   3196          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
   3197          break;
   3198       case 0x00000001:
   3199          /* Don't advertise RDRAND support, bit 30 in ECX.  */
   3200          SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
   3201          break;
   3202       case 0x00000002:
   3203          SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
   3204          break;
   3205       case 0x00000003:
   3206          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3207          break;
   3208       case 0x00000004:
   3209          switch (old_ecx) {
   3210             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   3211                                       0x0000003f, 0x00000000); break;
   3212             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
   3213                                       0x0000003f, 0x00000000); break;
   3214             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   3215                                       0x000001ff, 0x00000000); break;
   3216             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
   3217                                       0x00001fff, 0x00000006); break;
   3218             default:         SET_ABCD(0x00000000, 0x00000000,
   3219                                       0x00000000, 0x00000000); break;
   3220          }
   3221          break;
   3222       case 0x00000005:
   3223          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
   3224          break;
   3225       case 0x00000006:
   3226          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
   3227          break;
   3228       case 0x00000007:
   3229          switch (old_ecx) {
   3230             case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
   3231                                       0x00000000, 0x00000000); break;
   3232             default:         SET_ABCD(0x00000000, 0x00000000,
   3233                                       0x00000000, 0x00000000); break;
   3234          }
   3235          break;
   3236       case 0x00000008:
   3237          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3238          break;
   3239       case 0x00000009:
   3240          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3241          break;
   3242       case 0x0000000a:
   3243          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
   3244          break;
   3245       case 0x0000000b:
   3246          switch (old_ecx) {
   3247             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
   3248                                       0x00000100, 0x00000002); break;
   3249             case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
   3250                                       0x00000201, 0x00000002); break;
   3251             default:         SET_ABCD(0x00000000, 0x00000000,
   3252                                       old_ecx,    0x00000002); break;
   3253          }
   3254          break;
   3255       case 0x0000000c:
   3256          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3257          break;
   3258       case 0x0000000d:
   3259          switch (old_ecx) {
   3260             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
   3261                                       0x00000340, 0x00000000); break;
   3262             case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
   3263                                       0x00000000, 0x00000000); break;
   3264             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
   3265                                       0x00000000, 0x00000000); break;
   3266             default:         SET_ABCD(0x00000000, 0x00000000,
   3267                                       0x00000000, 0x00000000); break;
   3268          }
   3269          break;
   3270       case 0x80000000:
   3271          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   3272          break;
   3273       case 0x80000001:
   3274          SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
   3275          break;
   3276       case 0x80000002:
   3277          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   3278          break;
   3279       case 0x80000003:
   3280          SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
   3281          break;
   3282       case 0x80000004:
   3283          SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
   3284          break;
   3285       case 0x80000005:
   3286          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   3287          break;
   3288       case 0x80000006:
   3289          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   3290          break;
   3291       case 0x80000007:
   3292          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   3293          break;
   3294       case 0x80000008:
   3295          SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
   3296          break;
   3297       default:
   3298          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   3299          break;
   3300    }
   3301 #  undef SET_ABCD
   3302 }
   3303 
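        /* Likewise for the two deliberate omissions above: RDRAND is
           leaf 1 ECX bit 30, which is clear in the 0x3ffafbff value
           returned, and AVX2 is leaf 7 (subleaf 0) EBX bit 5, which is
           set in 0x000027ab:

              Bool have_rdrand = (0x3ffafbffU >> 30) & 1;  // 0: not claimed
              Bool have_avx2   = (0x000027abU >>  5) & 1;  // 1: claimed
        */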
   3304 
   3305 /*---------------------------------------------------------------*/
   3306 /*--- Misc integer helpers, including rotates and crypto.     ---*/
   3307 /*---------------------------------------------------------------*/
   3308 
   3309 ULong amd64g_calculate_RCR ( ULong arg,
   3310                              ULong rot_amt,
   3311                              ULong rflags_in,
   3312                              Long  szIN )
   3313 {
   3314    Bool  wantRflags = toBool(szIN < 0);
   3315    ULong sz         = wantRflags ? (-szIN) : szIN;
   3316    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   3317    ULong cf=0, of=0, tempcf;
   3318 
   3319    switch (sz) {
   3320       case 8:
   3321          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3322          of        = ((arg >> 63) ^ cf) & 1;
   3323          while (tempCOUNT > 0) {
   3324             tempcf = arg & 1;
   3325             arg    = (arg >> 1) | (cf << 63);
   3326             cf     = tempcf;
   3327             tempCOUNT--;
   3328          }
   3329          break;
   3330       case 4:
   3331          while (tempCOUNT >= 33) tempCOUNT -= 33;
   3332          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3333          of        = ((arg >> 31) ^ cf) & 1;
   3334          while (tempCOUNT > 0) {
   3335             tempcf = arg & 1;
   3336             arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
   3337             cf     = tempcf;
   3338             tempCOUNT--;
   3339          }
   3340          break;
   3341       case 2:
   3342          while (tempCOUNT >= 17) tempCOUNT -= 17;
   3343          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3344          of        = ((arg >> 15) ^ cf) & 1;
   3345          while (tempCOUNT > 0) {
   3346             tempcf = arg & 1;
   3347             arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
   3348             cf     = tempcf;
   3349             tempCOUNT--;
   3350          }
   3351          break;
   3352       case 1:
   3353          while (tempCOUNT >= 9) tempCOUNT -= 9;
   3354          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3355          of        = ((arg >> 7) ^ cf) & 1;
   3356          while (tempCOUNT > 0) {
   3357             tempcf = arg & 1;
   3358             arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
   3359             cf     = tempcf;
   3360             tempCOUNT--;
   3361          }
   3362          break;
   3363       default:
   3364          vpanic("calculate_RCR(amd64g): invalid size");
   3365    }
   3366 
   3367    cf &= 1;
   3368    of &= 1;
   3369    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   3370    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   3371 
   3372    /* caller can ask to have back either the resulting flags or
   3373       resulting value, but not both */
   3374    return wantRflags ? rflags_in : arg;
   3375 }
   3376 
   3377 ULong amd64g_calculate_RCL ( ULong arg,
   3378                              ULong rot_amt,
   3379                              ULong rflags_in,
   3380                              Long  szIN )
   3381 {
   3382    Bool  wantRflags = toBool(szIN < 0);
   3383    ULong sz         = wantRflags ? (-szIN) : szIN;
   3384    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   3385    ULong cf=0, of=0, tempcf;
   3386 
   3387    switch (sz) {
   3388       case 8:
   3389          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3390          while (tempCOUNT > 0) {
   3391             tempcf = (arg >> 63) & 1;
   3392             arg    = (arg << 1) | (cf & 1);
   3393             cf     = tempcf;
   3394             tempCOUNT--;
   3395          }
   3396          of = ((arg >> 63) ^ cf) & 1;
   3397          break;
   3398       case 4:
   3399          while (tempCOUNT >= 33) tempCOUNT -= 33;
   3400          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3401          while (tempCOUNT > 0) {
   3402             tempcf = (arg >> 31) & 1;
   3403             arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
   3404             cf     = tempcf;
   3405             tempCOUNT--;
   3406          }
   3407          of = ((arg >> 31) ^ cf) & 1;
   3408          break;
   3409       case 2:
   3410          while (tempCOUNT >= 17) tempCOUNT -= 17;
   3411          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3412          while (tempCOUNT > 0) {
   3413             tempcf = (arg >> 15) & 1;
   3414             arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
   3415             cf     = tempcf;
   3416             tempCOUNT--;
   3417          }
   3418          of = ((arg >> 15) ^ cf) & 1;
   3419          break;
   3420       case 1:
   3421          while (tempCOUNT >= 9) tempCOUNT -= 9;
   3422          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   3423          while (tempCOUNT > 0) {
   3424             tempcf = (arg >> 7) & 1;
   3425             arg    = 0xFFULL & ((arg << 1) | (cf & 1));
   3426             cf     = tempcf;
   3427             tempCOUNT--;
   3428          }
   3429          of = ((arg >> 7) ^ cf) & 1;
   3430          break;
   3431       default:
   3432          vpanic("calculate_RCL(amd64g): invalid size");
   3433    }
   3434 
   3435    cf &= 1;
   3436    of &= 1;
   3437    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   3438    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   3439 
   3440    return wantRflags ? rflags_in : arg;
   3441 }
   3442 
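        /* The szIN sign convention used by both rotate helpers above: a
           positive size returns the rotated value, a negated size
           returns the updated rflags.  For example, an 8-bit RCR by 1
           of 0x01 with CF set on entry:

              ULong rf_in = 1ULL << AMD64G_CC_SHIFT_C;
              ULong val = amd64g_calculate_RCR(0x01, 1, rf_in,  1); // 0x80
              ULong rf  = amd64g_calculate_RCR(0x01, 1, rf_in, -1); // C=1, O=1
        */
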
   3443 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
   3444  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
   3445  */
   3446 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
   3447 {
   3448    ULong hi, lo, tmp, A[16];
   3449 
   3450    A[0] = 0;            A[1] = a;
   3451    A[2] = A[1] << 1;    A[3] = A[2] ^ a;
   3452    A[4] = A[2] << 1;    A[5] = A[4] ^ a;
   3453    A[6] = A[3] << 1;    A[7] = A[6] ^ a;
   3454    A[8] = A[4] << 1;    A[9] = A[8] ^ a;
   3455    A[10] = A[5] << 1;   A[11] = A[10] ^ a;
   3456    A[12] = A[6] << 1;   A[13] = A[12] ^ a;
   3457    A[14] = A[7] << 1;   A[15] = A[14] ^ a;
   3458 
   3459    lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
   3460    hi = lo >> 56;
   3461    lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
   3462    hi = (hi << 8) | (lo >> 56);
   3463    lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
   3464    hi = (hi << 8) | (lo >> 56);
   3465    lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
   3466    hi = (hi << 8) | (lo >> 56);
   3467    lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
   3468    hi = (hi << 8) | (lo >> 56);
   3469    lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
   3470    hi = (hi << 8) | (lo >> 56);
   3471    lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
   3472    hi = (hi << 8) | (lo >> 56);
   3473    lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
   3474 
   3475    ULong m0 = -1;
   3476    m0 /= 255;
   3477    tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
   3478    tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
   3479    tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
   3480    tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
   3481    tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
   3482    tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
   3483    tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
   3484 
   3485    return which ? hi : lo;
   3486 }
   3487 
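        /* Reference semantics for the above, as a minimal bit-at-a-time
           sketch: the 128-bit carry-less (GF(2)[x]) product of a and b,
           with 'which' selecting the high or low 64-bit half:

              ULong hi = 0, lo = 0;
              UInt  i;
              for (i = 0; i < 64; i++) {
                 if ((b >> i) & 1) {
                    lo ^= a << i;
                    if (i > 0)
                       hi ^= a >> (64 - i);
                 }
              }

           The table-driven version above computes the same product four
           bits of b at a time, with the final mask-based pass fixing up
           the contributions of a's top bits to the high half. */
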
   3488 
   3489 /* CALLED FROM GENERATED CODE */
   3490 /* DIRTY HELPER (non-referentially-transparent) */
   3491 /* Horrible hack.  On non-amd64 platforms, return 1. */
   3492 ULong amd64g_dirtyhelper_RDTSC ( void )
   3493 {
   3494 #  if defined(__x86_64__)
   3495    UInt  eax, edx;
   3496    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
   3497    return (((ULong)edx) << 32) | ((ULong)eax);
   3498 #  else
   3499    return 1ULL;
   3500 #  endif
   3501 }
   3502 
   3503 /* CALLED FROM GENERATED CODE */
   3504 /* DIRTY HELPER (non-referentially-transparent) */
   3505 /* Horrible hack.  On non-amd64 platforms, return 1. */
   3506 /* This uses a different calling convention from _RDTSC just above
   3507    only because of the difficulty of returning 96 bits (the EAX, ECX
   3508    and EDX results) from a C function -- RDTSC returns just 64 bits
   3509    and so is simple by comparison. */
   3510 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
   3511 {
   3512 #  if defined(__x86_64__)
   3513    UInt eax, ecx, edx;
   3514    __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
   3515    st->guest_RAX = (ULong)eax;
   3516    st->guest_RCX = (ULong)ecx;
   3517    st->guest_RDX = (ULong)edx;
   3518 #  else
   3519    /* Do nothing. */
   3520 #  endif
   3521 }
   3522 
   3523 /* CALLED FROM GENERATED CODE */
   3524 /* DIRTY HELPER (non-referentially-transparent) */
   3525 /* Horrible hack.  On non-amd64 platforms, return 0. */
   3526 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
   3527 {
   3528 #  if defined(__x86_64__)
   3529    ULong r = 0;
   3530    portno &= 0xFFFF;
   3531    switch (sz) {
   3532       case 4:
   3533          __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
   3534                               : "=a" (r) : "Nd" (portno));
   3535          break;
   3536       case 2:
   3537          __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
   3538                               : "=a" (r) : "Nd" (portno));
   3539          break;
   3540       case 1:
   3541          __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
   3542                               : "=a" (r) : "Nd" (portno));
   3543          break;
   3544       default:
   3545          break; /* note: no 64-bit version of insn exists */
   3546    }
   3547    return r;
   3548 #  else
   3549    return 0;
   3550 #  endif
   3551 }
   3552 
   3553 
   3554 /* CALLED FROM GENERATED CODE */
   3555 /* DIRTY HELPER (non-referentially-transparent) */
   3556 /* Horrible hack.  On non-amd64 platforms, do nothing. */
   3557 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
   3558 {
   3559 #  if defined(__x86_64__)
   3560    portno &= 0xFFFF;
   3561    switch (sz) {
   3562       case 4:
   3563          __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
   3564                               : : "a" (data), "Nd" (portno));
   3565          break;
   3566       case 2:
   3567          __asm__ __volatile__("outw %w0, %w1"
   3568                               : : "a" (data), "Nd" (portno));
   3569          break;
   3570       case 1:
   3571          __asm__ __volatile__("outb %b0, %w1"
   3572                               : : "a" (data), "Nd" (portno));
   3573          break;
   3574       default:
   3575          break; /* note: no 64-bit version of insn exists */
   3576    }
   3577 #  else
   3578    /* do nothing */
   3579 #  endif
   3580 }
   3581 
   3582 /* CALLED FROM GENERATED CODE */
   3583 /* DIRTY HELPER (non-referentially-transparent) */
   3584 /* Horrible hack.  On non-amd64 platforms, write a zeroed descriptor. */
   3585 /* op = 0: call the native SGDT instruction.
   3586    op = 1: call the native SIDT instruction.
   3587 */
   3588 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
   3589 #  if defined(__x86_64__)
   3590    switch (op) {
   3591       case 0:
   3592          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
   3593          break;
   3594       case 1:
   3595          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
   3596          break;
   3597       default:
   3598          vpanic("amd64g_dirtyhelper_SxDT");
   3599    }
   3600 #  else
   3601    /* Can't run SGDT/SIDT here; instead zero the 10-byte result
              (2-byte limit + 8-byte base) so callers see defined data. */
   3602    UChar* p = (UChar*)address;
   3603    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
   3604    p[6] = p[7] = p[8] = p[9] = 0;
   3605 #  endif
   3606 }
   3607 
   3608 /*---------------------------------------------------------------*/
   3609 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
   3610 /*---------------------------------------------------------------*/
   3611 
   3612 static inline UChar abdU8 ( UChar xx, UChar yy ) {
   3613    return toUChar(xx>yy ? xx-yy : yy-xx);
   3614 }
   3615 
   3616 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   3617    return (((ULong)w1) << 32) | ((ULong)w0);
   3618 }
   3619 
   3620 static inline UShort sel16x4_3 ( ULong w64 ) {
   3621    UInt hi32 = toUInt(w64 >> 32);
   3622    return toUShort(hi32 >> 16);
   3623 }
   3624 static inline UShort sel16x4_2 ( ULong w64 ) {
   3625    UInt hi32 = toUInt(w64 >> 32);
   3626    return toUShort(hi32);
   3627 }
   3628 static inline UShort sel16x4_1 ( ULong w64 ) {
   3629    UInt lo32 = toUInt(w64);
   3630    return toUShort(lo32 >> 16);
   3631 }
   3632 static inline UShort sel16x4_0 ( ULong w64 ) {
   3633    UInt lo32 = toUInt(w64);
   3634    return toUShort(lo32);
   3635 }
   3636 
   3637 static inline UChar sel8x8_7 ( ULong w64 ) {
   3638    UInt hi32 = toUInt(w64 >> 32);
   3639    return toUChar(hi32 >> 24);
   3640 }
   3641 static inline UChar sel8x8_6 ( ULong w64 ) {
   3642    UInt hi32 = toUInt(w64 >> 32);
   3643    return toUChar(hi32 >> 16);
   3644 }
   3645 static inline UChar sel8x8_5 ( ULong w64 ) {
   3646    UInt hi32 = toUInt(w64 >> 32);
   3647    return toUChar(hi32 >> 8);
   3648 }
   3649 static inline UChar sel8x8_4 ( ULong w64 ) {
   3650    UInt hi32 = toUInt(w64 >> 32);
   3651    return toUChar(hi32 >> 0);
   3652 }
   3653 static inline UChar sel8x8_3 ( ULong w64 ) {
   3654    UInt lo32 = toUInt(w64);
   3655    return toUChar(lo32 >> 24);
   3656 }
   3657 static inline UChar sel8x8_2 ( ULong w64 ) {
   3658    UInt lo32 = toUInt(w64);
   3659    return toUChar(lo32 >> 16);
   3660 }
   3661 static inline UChar sel8x8_1 ( ULong w64 ) {
   3662    UInt lo32 = toUInt(w64);
   3663    return toUChar(lo32 >> 8);
   3664 }
   3665 static inline UChar sel8x8_0 ( ULong w64 ) {
   3666    UInt lo32 = toUInt(w64);
   3667    return toUChar(lo32 >> 0);
   3668 }
   3669 
   3670 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3671 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
   3672 {
   3673    return
   3674       mk32x2(
   3675          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
   3676             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
   3677          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
   3678             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
   3679       );
   3680 }
   3681 
   3682 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3683 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
   3684 {
   3685    UInt t = 0;
   3686    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   3687    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   3688    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   3689    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   3690    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3691    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3692    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3693    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3694    t &= 0xFFFF;
   3695    return (ULong)t;
   3696 }
   3697 
   3698 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3699 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
   3700 {
   3701    UShort t, min;
   3702    UInt   idx;
   3703    t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
   3704    t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
   3705    t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
   3706    t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
   3707    t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
   3708    t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
   3709    t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
   3710    t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
   3711    return ((ULong)(idx << 16)) | ((ULong)min);
   3712 }
   3713 
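        /* The result layout matches PHMINPOSUW's destination: minimum in
           bits 15:0, its lane index in bits 18:16, all higher bits zero.
           A caller unpacks it as:

              UShort minv = res & 0xFFFF;
              UInt   idx  = (res >> 16) & 7;
        */
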
   3714 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3715 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
   3716 {
   3717    UInt  i;
   3718    ULong crc = (b & 0xFFULL) ^ crcIn;
   3719    for (i = 0; i < 8; i++)
   3720       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3721    return crc;
   3722 }
   3723 
   3724 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3725 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
   3726 {
   3727    UInt  i;
   3728    ULong crc = (w & 0xFFFFULL) ^ crcIn;
   3729    for (i = 0; i < 16; i++)
   3730       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3731    return crc;
   3732 }
   3733 
   3734 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3735 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
   3736 {
   3737    UInt i;
   3738    ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   3739    for (i = 0; i < 32; i++)
   3740       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3741    return crc;
   3742 }
   3743 
   3744 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3745 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
   3746 {
   3747    ULong crc = amd64g_calc_crc32l(crcIn, q);
   3748    return amd64g_calc_crc32l(crc, q >> 32);
   3749 }
   3750 
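        /* These four implement the update step of the SSE4.2 CRC32
           instruction: bit-reflected CRC-32C (Castagnoli polynomial
           0x1EDC6F41, whose reflected form is the 0x82f63b78 constant
           above).  A sketch of a caller-side accumulation, given some
           const UChar* buf of length len -- the initial ~0 and final
           xor are the caller's convention, not the instruction's:

              UInt  i;
              ULong crc = 0xFFFFFFFFULL;
              for (i = 0; i < len; i++)
                 crc = amd64g_calc_crc32b(crc, buf[i]);
              crc ^= 0xFFFFFFFFULL;
        */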
   3751 
   3752 /* .. helper for next fn .. */
   3753 static inline ULong sad_8x4 ( ULong xx, ULong yy )
   3754 {
   3755    UInt t = 0;
   3756    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3757    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3758    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3759    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3760    return (ULong)t;
   3761 }
   3762 
   3763 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3764 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
   3765                             ULong dHi, ULong dLo,
   3766                             ULong imm_and_return_control_bit )
   3767 {
   3768    UInt imm8     = imm_and_return_control_bit & 7;
   3769    Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
   3770    UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
   3771    UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
   3772    /* For src we only need 32 bits, so get them into the
   3773       lower half of a 64 bit word. */
   3774    ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
   3775    /* For dst we need to get hold of 56 bits (7 bytes) from a total of
   3776       11 bytes.  If calculating the low part of the result, need bytes
   3777       dstOffsL * 4 + (0 .. 6); if calculating the high part,
   3778       dstOffsL * 4 + (4 .. 10). */
   3779    ULong dst;
   3780    /* dstOffL = 0, Lo  ->  0 .. 6
   3781       dstOffL = 1, Lo  ->  4 .. 10
   3782       dstOffL = 0, Hi  ->  4 .. 10
   3783       dstOffL = 1, Hi  ->  8 .. 14
   3784    */
   3785    if (calcHi && dstOffsL) {
   3786       /* 8 .. 14 */
   3787       dst = dHi & 0x00FFFFFFFFFFFFFFULL;
   3788    }
   3789    else if (!calcHi && !dstOffsL) {
   3790       /* 0 .. 6 */
   3791       dst = dLo & 0x00FFFFFFFFFFFFFFULL;
   3792    }
   3793    else {
   3794       /* 4 .. 10 */
   3795       dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
   3796    }
   3797    ULong r0  = sad_8x4( dst >>  0, src );
   3798    ULong r1  = sad_8x4( dst >>  8, src );
   3799    ULong r2  = sad_8x4( dst >> 16, src );
   3800    ULong r3  = sad_8x4( dst >> 24, src );
   3801    ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
   3802    return res;
   3803 }
   3804 
   3805 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3806 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
   3807 {
   3808    ULong dst = 0;
   3809    ULong src_bit;
   3810    ULong dst_bit = 1;
   3811    for (src_bit = 1; src_bit; src_bit <<= 1) {
   3812       if (mask & src_bit) {
   3813          if (src_masked & src_bit) dst |= dst_bit;
   3814          dst_bit <<= 1;
   3815       }
   3816    }
   3817    return dst;
   3818 }
   3819 
   3820 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3821 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
   3822 {
   3823    ULong dst = 0;
   3824    ULong dst_bit;
   3825    ULong src_bit = 1;
   3826    for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
   3827       if (mask & dst_bit) {
   3828          if (src & src_bit) dst |= dst_bit;
   3829          src_bit <<= 1;
   3830       }
   3831    }
   3832    return dst;
   3833 }
   3834 
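        /* Worked example: with mask 0xF0, amd64g_calculate_pext gathers
           bits 7:4 of the source down into bits 3:0 of the result, and
           amd64g_calculate_pdep scatters bits 3:0 back up into bits 7:4,
           so the two are inverses over the masked bits:

              amd64g_calculate_pext(0xA5, 0xF0) == 0x0A
              amd64g_calculate_pdep(0x0A, 0xF0) == 0xA0
        */
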
   3835 /*---------------------------------------------------------------*/
   3836 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
   3837 /*---------------------------------------------------------------*/
   3838 
   3839 static UInt zmask_from_V128 ( V128* arg )
   3840 {
   3841    UInt i, res = 0;
   3842    for (i = 0; i < 16; i++) {
   3843       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
   3844    }
   3845    return res;
   3846 }
   3847 
   3848 static UInt zmask_from_V128_wide ( V128* arg )
   3849 {
   3850    UInt i, res = 0;
   3851    for (i = 0; i < 8; i++) {
   3852       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
   3853    }
   3854    return res;
   3855 }
   3856 
   3857 /* Helps with PCMP{I,E}STR{I,M}.
   3858 
   3859    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
   3860    actually it could be a clean helper, but for the fact that we can't
   3861    pass by value 2 x V128 to a clean helper, nor have one returned.)
   3862    Reads guest state, writes to guest state for the xSTRM cases, does
   3863    not access memory, and is otherwise a pure function.
   3864 
   3865    opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
   3866    the callee knows which I/E and I/M variant it is dealing with and
   3867    what the specific operation is.  4th byte of opcode is in the range
   3868    0x60 to 0x63:
   3869        istri  66 0F 3A 63
   3870        istrm  66 0F 3A 62
   3871        estri  66 0F 3A 61
   3872        estrm  66 0F 3A 60
   3873 
   3874    gstOffL and gstOffR are the guest state offsets for the two XMM
   3875    register inputs.  We never have to deal with the memory case since
   3876    that is handled by pre-loading the relevant value into the fake
   3877    XMM16 register.
   3878 
   3879    For ESTRx variants, edxIN and eaxIN hold the values of those two
   3880    registers.
   3881 
   3882    In all cases, the bottom 16 bits of the result contain the new
   3883    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
   3884    result hold the new %ecx value.  For xSTRM variants, the helper
   3885    writes the result directly to the guest XMM0.
   3886 
   3887    Declarable side effects: in all cases, reads guest state at
   3888    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
   3889    guest_XMM0.
   3890 
   3891    Is expected to be called with opc4_and_imm combinations which have
   3892    actually been validated, and will assert if otherwise.  The front
   3893    end should ensure we're only called with verified values.
   3894 */
   3895 ULong amd64g_dirtyhelper_PCMPxSTRx (
   3896           VexGuestAMD64State* gst,
   3897           HWord opc4_and_imm,
   3898           HWord gstOffL, HWord gstOffR,
   3899           HWord edxIN, HWord eaxIN
   3900        )
   3901 {
   3902    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
   3903    HWord imm8 = opc4_and_imm & 0xFF;
   3904    HWord isISTRx = opc4 & 2;
   3905    HWord isxSTRM = (opc4 & 1) ^ 1;
   3906    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   3907    HWord wide = (imm8 & 1);
   3908 
   3909    // where the args are
   3910    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3911    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3912 
    3913    /* Create the arg validity masks, either from the vectors
    3914       themselves (ISTRx variants) or from the supplied edx/eax values
    3915       (ESTRx variants).  Both the 8-bit and the 16-bit element cases
    3916       are handled below. */
   3917    UInt zmaskL, zmaskR;
   3918 
   3919    // temp spot for the resulting flags and vector.
   3920    V128 resV;
   3921    UInt resOSZACP;
   3922 
   3923    // for checking whether case was handled
   3924    Bool ok = False;
   3925 
   3926    if (wide) {
   3927       if (isISTRx) {
   3928          zmaskL = zmask_from_V128_wide(argL);
   3929          zmaskR = zmask_from_V128_wide(argR);
   3930       } else {
   3931          Int tmp;
   3932          tmp = edxIN & 0xFFFFFFFF;
   3933          if (tmp < -8) tmp = -8;
   3934          if (tmp > 8)  tmp = 8;
   3935          if (tmp < 0)  tmp = -tmp;
   3936          vassert(tmp >= 0 && tmp <= 8);
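                  /* (1 << tmp) sets only the bit at the index of the first
                     invalid element; when tmp == 8 (all eight elements
                     valid) the bit is masked away, leaving no terminator in
                     zmaskL.  The 16-element case below works the same way
                     with a 0xFFFF mask. */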
   3937          zmaskL = (1 << tmp) & 0xFF;
   3938          tmp = eaxIN & 0xFFFFFFFF;
   3939          if (tmp < -8) tmp = -8;
   3940          if (tmp > 8)  tmp = 8;
   3941          if (tmp < 0)  tmp = -tmp;
   3942          vassert(tmp >= 0 && tmp <= 8);
   3943          zmaskR = (1 << tmp) & 0xFF;
   3944       }
    3945       // do the math
   3946       ok = compute_PCMPxSTRx_wide (
   3947               &resV, &resOSZACP, argL, argR,
   3948               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3949            );
   3950    } else {
   3951       if (isISTRx) {
   3952          zmaskL = zmask_from_V128(argL);
   3953          zmaskR = zmask_from_V128(argR);
   3954       } else {
   3955          Int tmp;
   3956          tmp = edxIN & 0xFFFFFFFF;
   3957          if (tmp < -16) tmp = -16;
   3958          if (tmp > 16)  tmp = 16;
   3959          if (tmp < 0)   tmp = -tmp;
   3960          vassert(tmp >= 0 && tmp <= 16);
   3961          zmaskL = (1 << tmp) & 0xFFFF;
   3962          tmp = eaxIN & 0xFFFFFFFF;
   3963          if (tmp < -16) tmp = -16;
   3964          if (tmp > 16)  tmp = 16;
   3965          if (tmp < 0)   tmp = -tmp;
   3966          vassert(tmp >= 0 && tmp <= 16);
   3967          zmaskR = (1 << tmp) & 0xFFFF;
   3968       }
    3969       // do the math
   3970       ok = compute_PCMPxSTRx (
   3971               &resV, &resOSZACP, argL, argR,
   3972               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3973            );
   3974    }
   3975 
    3976    // The front end shouldn't pass us any imm8 variants we can't
    3977    // handle.  Hence:
   3978    vassert(ok);
   3979 
    3980    // Finally, get the results back to the caller.  In all cases,
    3981    // the new OSZACP value occupies the lowest 16 bits of the
    3982    // return value.
   3983    if (isxSTRM) {
   3984       gst->guest_YMM0[0] = resV.w32[0];
   3985       gst->guest_YMM0[1] = resV.w32[1];
   3986       gst->guest_YMM0[2] = resV.w32[2];
   3987       gst->guest_YMM0[3] = resV.w32[3];
   3988       return resOSZACP & 0x8D5;
   3989    } else {
   3990       UInt newECX = resV.w32[0] & 0xFFFF;
   3991       return (newECX << 16) | (resOSZACP & 0x8D5);
   3992    }
   3993 }
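
         /* Illustrative only (not part of the build): how a caller would
            unpack the returned value for an xSTRI variant:

               ULong res    = amd64g_dirtyhelper_PCMPxSTRx(gst, opc4_and_imm,
                                                           offL, offR, edx, eax);
               UInt  oszacp = res & 0xFFFF;          // new OSZACP flag bits
               UInt  newECX = (res >> 16) & 0xFFFF;  // new %ecx (xSTRI only)

            For xSTRM variants the vector result has already been written to
            guest XMM0, so only the low 16 bits of res are meaningful.  The
            argument names offL/offR/edx/eax here are placeholders. */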
   3994 
   3995 /*---------------------------------------------------------------*/
   3996 /*--- AES primitives and helpers                              ---*/
   3997 /*---------------------------------------------------------------*/
   3998 /* a 16 x 16 matrix */
   3999 static const UChar sbox[256] = {                   // row nr
   4000    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   4001    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   4002    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   4003    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   4004    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   4005    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   4006    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   4007    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   4008    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   4009    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   4010    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   4011    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   4012    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   4013    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   4014    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   4015    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   4016    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   4017    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   4018    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   4019    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   4020    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   4021    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   4022    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   4023    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   4024    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   4025    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   4026    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   4027    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   4028    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   4029    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   4030    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   4031    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
   4032 };
   4033 static void SubBytes (V128* v)
   4034 {
   4035    V128 r;
   4036    UInt i;
   4037    for (i = 0; i < 16; i++)
   4038       r.w8[i] = sbox[v->w8[i]];
   4039    *v = r;
   4040 }
   4041 
   4042 /* a 16 x 16 matrix */
   4043 static const UChar invsbox[256] = {                // row nr
   4044    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   4045    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   4046    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   4047    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   4048    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   4049    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   4050    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   4051    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   4052    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   4053    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   4054    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   4055    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   4056    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   4057    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   4058    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   4059    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   4060    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   4061    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   4062    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   4063    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   4064    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   4065    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   4066    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   4067    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   4068    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   4069    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   4070    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   4071    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   4072    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   4073    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   4074    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   4075    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
   4076 };
   4077 static void InvSubBytes (V128* v)
   4078 {
   4079    V128 r;
   4080    UInt i;
   4081    for (i = 0; i < 16; i++)
   4082       r.w8[i] = invsbox[v->w8[i]];
   4083    *v = r;
   4084 }
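
         /* A minimal sanity-check sketch, deliberately not compiled in:
            SubBytes followed by InvSubBytes should be the identity on any
            vector, since invsbox is the inverse permutation of sbox. */
         #if 0
         static void check_SubBytes_roundtrip ( void )
         {
            V128 v, orig;
            UInt i;
            for (i = 0; i < 16; i++) v.w8[i] = (UChar)(i * 17); /* arbitrary */
            orig = v;
            SubBytes(&v);
            InvSubBytes(&v);
            for (i = 0; i < 16; i++) vassert(v.w8[i] == orig.w8[i]);
         }
         #endif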
   4085 
   4086 static const UChar ShiftRows_op[16] =
   4087    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
   4088 static void ShiftRows (V128* v)
   4089 {
   4090    V128 r;
   4091    UInt i;
   4092    for (i = 0; i < 16; i++)
   4093       r.w8[i] = v->w8[ShiftRows_op[15-i]];
   4094    *v = r;
   4095 }
   4096 
   4097 static const UChar InvShiftRows_op[16] =
   4098    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
   4099 static void InvShiftRows (V128* v)
   4100 {
   4101    V128 r;
   4102    UInt i;
   4103    for (i = 0; i < 16; i++)
   4104       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   4105    *v = r;
   4106 }
   4107 
   4108 /* Multiplication of the finite fields elements of AES.
   4109    See "A Specification for The AES Algorithm Rijndael
   4110         (by Joan Daemen & Vincent Rijmen)"
   4111         Dr. Brian Gladman, v3.1, 3rd March 2001. */
    4112 /* Discrete-log table: N values such that (hex) xy = 0x03^N.
    4113    0x00 has no logarithm; we put 0xff in that slot. */
   4114 /* a 16 x 16 matrix */
   4115 static const UChar Nxy[256] = {                    // row nr
   4116    0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   4117    0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   4118    0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   4119    0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   4120    0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   4121    0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   4122    0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   4123    0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   4124    0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   4125    0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   4126    0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   4127    0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   4128    0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   4129    0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   4130    0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   4131    0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   4132    0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   4133    0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   4134    0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   4135    0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   4136    0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   4137    0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   4138    0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   4139    0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   4140    0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   4141    0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   4142    0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   4143    0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   4144    0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   4145    0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   4146    0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   4147    0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
   4148 };
   4149 
   4150 /* E values so that E = 0x03^xy. */
   4151 static const UChar Exy[256] = {                    // row nr
   4152    0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   4153    0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   4154    0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   4155    0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   4156    0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   4157    0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   4158    0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   4159    0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   4160    0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   4161    0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   4162    0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   4163    0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   4164    0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   4165    0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   4166    0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   4167    0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   4168    0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   4169    0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   4170    0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   4171    0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   4172    0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   4173    0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   4174    0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   4175    0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   4176    0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   4177    0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   4178    0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   4179    0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   4180    0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   4181    0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   4182    0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   4183    0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
   4184 
   4185 static inline UChar ff_mul(UChar u1, UChar u2)
   4186 {
   4187    if ((u1 > 0) && (u2 > 0)) {
   4188       UInt ui = Nxy[u1] + Nxy[u2];
   4189       if (ui >= 255)
   4190          ui = ui - 255;
   4191       return Exy[ui];
   4192    } else {
   4193       return 0;
    4194    }
   4195 }
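
         /* Worked example: ff_mul(0x02, 0x03)
            = Exy[Nxy[0x02] + Nxy[0x03]] = Exy[0x19 + 0x01] = Exy[0x1a] = 0x06,
            which agrees with x * (x+1) = x^2 + x in GF(2^8). */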
   4196 
   4197 static void MixColumns (V128* v)
   4198 {
   4199    V128 r;
   4200    Int j;
   4201 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   4202    for (j = 0; j < 4; j++) {
   4203       P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
   4204          ^ P(v,j,2) ^ P(v,j,3);
   4205       P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
   4206          ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
   4207       P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
   4208          ^ ff_mul(0x03, P(v,j,3) );
   4209       P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
   4210          ^ ff_mul( 0x02, P(v,j,3) );
   4211    }
   4212    *v = r;
   4213 #undef P
   4214 }
   4215 
   4216 static void InvMixColumns (V128* v)
   4217 {
   4218    V128 r;
   4219    Int j;
   4220 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   4221    for (j = 0; j < 4; j++) {
   4222       P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
   4223          ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
   4224       P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
   4225          ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
   4226       P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
   4227          ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
   4228       P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
   4229          ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
   4230    }
   4231    *v = r;
   4232 #undef P
   4233 
   4234 }
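
         /* As with SubBytes, a sanity-check sketch that is deliberately not
            compiled in: each forward transformation composed with its
            inverse should give back the original vector. */
         #if 0
         static void check_round_primitives ( void )
         {
            V128 v, orig;
            UInt i;
            for (i = 0; i < 16; i++) v.w8[i] = (UChar)(251 * i + 3); /* arbitrary */
            orig = v;
            ShiftRows(&v);  InvShiftRows(&v);
            MixColumns(&v); InvMixColumns(&v);
            for (i = 0; i < 16; i++) vassert(v.w8[i] == orig.w8[i]);
         }
         #endif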
   4235 
   4236 /* For description, see definition in guest_amd64_defs.h */
   4237 void amd64g_dirtyhelper_AES (
   4238           VexGuestAMD64State* gst,
   4239           HWord opc4, HWord gstOffD,
   4240           HWord gstOffL, HWord gstOffR
   4241        )
   4242 {
   4243    // where the args are
   4244    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   4245    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   4246    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   4247    V128  r;
   4248 
   4249    switch (opc4) {
   4250       case 0xDC: /* AESENC */
   4251       case 0xDD: /* AESENCLAST */
   4252          r = *argR;
   4253          ShiftRows (&r);
   4254          SubBytes  (&r);
   4255          if (opc4 == 0xDC)
   4256             MixColumns (&r);
   4257          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   4258          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   4259          break;
   4260 
   4261       case 0xDE: /* AESDEC */
   4262       case 0xDF: /* AESDECLAST */
   4263          r = *argR;
   4264          InvShiftRows (&r);
   4265          InvSubBytes (&r);
   4266          if (opc4 == 0xDE)
   4267             InvMixColumns (&r);
   4268          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   4269          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   4270          break;
   4271 
   4272       case 0xDB: /* AESIMC */
   4273          *argD = *argL;
   4274          InvMixColumns (argD);
   4275          break;
   4276       default: vassert(0);
   4277    }
   4278 }
   4279 
   4280 static inline UInt RotWord (UInt   w32)
   4281 {
   4282    return ((w32 >> 8) | (w32 << 24));
   4283 }
   4284 
   4285 static inline UInt SubWord (UInt   w32)
   4286 {
   4287    UChar *w8;
   4288    UChar *r8;
   4289    UInt res;
   4290    w8 = (UChar*) &w32;
   4291    r8 = (UChar*) &res;
   4292    r8[0] = sbox[w8[0]];
   4293    r8[1] = sbox[w8[1]];
   4294    r8[2] = sbox[w8[2]];
   4295    r8[3] = sbox[w8[3]];
   4296    return res;
   4297 }
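
         /* Illustrative values: RotWord(0x01234567) == 0x67012345 (a
            one-byte rotate), and SubWord(0x00000000) == 0x63636363, since
            sbox[0x00] is 0x63. */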
   4298 
   4299 /* For description, see definition in guest_amd64_defs.h */
   4300 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
   4301           VexGuestAMD64State* gst,
   4302           HWord imm8,
   4303           HWord gstOffL, HWord gstOffR
   4304        )
   4305 {
   4306    // where the args are
   4307    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   4308    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   4309 
   4310    // We have to create the result in a temporary in the
   4311    // case where the src and dst regs are the same.  See #341698.
   4312    V128 tmp;
   4313 
   4314    tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   4315    tmp.w32[2] = SubWord (argL->w32[3]);
   4316    tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   4317    tmp.w32[0] = SubWord (argL->w32[1]);
   4318 
   4319    argR->w32[3] = tmp.w32[3];
   4320    argR->w32[2] = tmp.w32[2];
   4321    argR->w32[1] = tmp.w32[1];
   4322    argR->w32[0] = tmp.w32[0];
   4323 }
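
         /* In the Intel pseudocode for AESKEYGENASSIST, w32[1] is
            X1 = SRC[63:32] and w32[3] is X3 = SRC[127:96], so the above
            computes
               DEST[31:0]   = SubWord(X1)
               DEST[63:32]  = RotWord(SubWord(X1)) ^ RCON
               DEST[95:64]  = SubWord(X3)
               DEST[127:96] = RotWord(SubWord(X3)) ^ RCON
            with RCON = imm8. */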
   4324 
   4325 
   4326 
   4327 /*---------------------------------------------------------------*/
   4328 /*--- Helpers for dealing with, and describing,               ---*/
   4329 /*--- guest state as a whole.                                 ---*/
   4330 /*---------------------------------------------------------------*/
   4331 
   4332 /* Initialise the entire amd64 guest state. */
   4333 /* VISIBLE TO LIBVEX CLIENT */
   4334 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
   4335 {
   4336    vex_state->host_EvC_FAILADDR = 0;
   4337    vex_state->host_EvC_COUNTER = 0;
   4338    vex_state->pad0 = 0;
   4339 
   4340    vex_state->guest_RAX = 0;
   4341    vex_state->guest_RCX = 0;
   4342    vex_state->guest_RDX = 0;
   4343    vex_state->guest_RBX = 0;
   4344    vex_state->guest_RSP = 0;
   4345    vex_state->guest_RBP = 0;
   4346    vex_state->guest_RSI = 0;
   4347    vex_state->guest_RDI = 0;
   4348    vex_state->guest_R8  = 0;
   4349    vex_state->guest_R9  = 0;
   4350    vex_state->guest_R10 = 0;
   4351    vex_state->guest_R11 = 0;
   4352    vex_state->guest_R12 = 0;
   4353    vex_state->guest_R13 = 0;
   4354    vex_state->guest_R14 = 0;
   4355    vex_state->guest_R15 = 0;
   4356 
   4357    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   4358    vex_state->guest_CC_DEP1 = 0;
   4359    vex_state->guest_CC_DEP2 = 0;
   4360    vex_state->guest_CC_NDEP = 0;
   4361 
   4362    vex_state->guest_DFLAG   = 1; /* forwards */
   4363    vex_state->guest_IDFLAG  = 0;
   4364    vex_state->guest_ACFLAG  = 0;
   4365 
   4366    /* HACK: represent the offset associated with a constant %fs.
    4367       Typically, on Linux, this assumes that %fs is only ever zero (main
   4368       thread) or 0x63. */
   4369    vex_state->guest_FS_CONST = 0;
   4370 
   4371    vex_state->guest_RIP = 0;
   4372 
   4373    /* Initialise the simulated FPU */
   4374    amd64g_dirtyhelper_FINIT( vex_state );
   4375 
   4376    /* Initialise the AVX state. */
   4377 #  define AVXZERO(_ymm) \
   4378       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
   4379            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
   4380       } while (0)
   4381    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   4382    AVXZERO(vex_state->guest_YMM0);
   4383    AVXZERO(vex_state->guest_YMM1);
   4384    AVXZERO(vex_state->guest_YMM2);
   4385    AVXZERO(vex_state->guest_YMM3);
   4386    AVXZERO(vex_state->guest_YMM4);
   4387    AVXZERO(vex_state->guest_YMM5);
   4388    AVXZERO(vex_state->guest_YMM6);
   4389    AVXZERO(vex_state->guest_YMM7);
   4390    AVXZERO(vex_state->guest_YMM8);
   4391    AVXZERO(vex_state->guest_YMM9);
   4392    AVXZERO(vex_state->guest_YMM10);
   4393    AVXZERO(vex_state->guest_YMM11);
   4394    AVXZERO(vex_state->guest_YMM12);
   4395    AVXZERO(vex_state->guest_YMM13);
   4396    AVXZERO(vex_state->guest_YMM14);
   4397    AVXZERO(vex_state->guest_YMM15);
   4398    AVXZERO(vex_state->guest_YMM16);
   4399 
   4400 #  undef AVXZERO
   4401 
   4402    vex_state->guest_EMNOTE = EmNote_NONE;
   4403 
    4404    /* These should never be either read or written, but we
    4405       initialise them anyway. */
   4406    vex_state->guest_CMSTART = 0;
   4407    vex_state->guest_CMLEN   = 0;
   4408 
   4409    vex_state->guest_NRADDR   = 0;
   4410    vex_state->guest_SC_CLASS = 0;
   4411    vex_state->guest_GS_CONST = 0;
   4412 
   4413    vex_state->guest_IP_AT_SYSCALL = 0;
   4414    vex_state->pad1 = 0;
   4415 }
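
         /* A minimal client-side sketch, not compiled in (the register
            values are hypothetical): initialise a fresh guest state, then
            give it an entry point and a stack. */
         #if 0
         static void example_client_setup ( void )
         {
            VexGuestAMD64State st;
            LibVEX_GuestAMD64_initialise(&st);
            st.guest_RIP = 0x400000;    /* hypothetical entry point */
            st.guest_RSP = 0x7FFF0000;  /* hypothetical stack top */
         }
         #endif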
   4416 
   4417 
   4418 /* Figure out if any part of the guest state contained in minoff
   4419    .. maxoff requires precise memory exceptions.  If in doubt return
   4420    True (but this generates significantly slower code).
   4421 
   4422    By default we enforce precise exns for guest %RSP, %RBP and %RIP
   4423    only.  These are the minimum needed to extract correct stack
   4424    backtraces from amd64 code.
   4425 
   4426    Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
   4427 */
   4428 Bool guest_amd64_state_requires_precise_mem_exns (
   4429         Int minoff, Int maxoff, VexRegisterUpdates pxControl
   4430      )
   4431 {
   4432    Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   4433    Int rbp_max = rbp_min + 8 - 1;
   4434    Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   4435    Int rsp_max = rsp_min + 8 - 1;
   4436    Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   4437    Int rip_max = rip_min + 8 - 1;
   4438 
   4439    if (maxoff < rsp_min || minoff > rsp_max) {
   4440       /* no overlap with rsp */
   4441       if (pxControl == VexRegUpdSpAtMemAccess)
    4442          return False; // We only need to check the stack pointer.
   4443    } else {
   4444       return True;
   4445    }
   4446 
   4447    if (maxoff < rbp_min || minoff > rbp_max) {
   4448       /* no overlap with rbp */
   4449    } else {
   4450       return True;
   4451    }
   4452 
   4453    if (maxoff < rip_min || minoff > rip_max) {
    4454       /* no overlap with rip */
   4455    } else {
   4456       return True;
   4457    }
   4458 
   4459    return False;
   4460 }
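
         /* Example: a range covering only guest_RAX overlaps none of
            RSP/RBP/RIP, so this returns False for any pxControl; any range
            touching guest_RSP returns True.  Under VexRegUpdSpAtMemAccess,
            RSP is the only register that forces True. */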
   4461 
   4462 
   4463 #define ALWAYSDEFD(field)                             \
   4464     { offsetof(VexGuestAMD64State, field),            \
   4465       (sizeof ((VexGuestAMD64State*)0)->field) }
   4466 
   4467 VexGuestLayout
   4468    amd64guest_layout
   4469       = {
   4470           /* Total size of the guest state, in bytes. */
   4471           .total_sizeB = sizeof(VexGuestAMD64State),
   4472 
   4473           /* Describe the stack pointer. */
   4474           .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
   4475           .sizeof_SP = 8,
   4476 
   4477           /* Describe the frame pointer. */
   4478           .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
   4479           .sizeof_FP = 8,
   4480 
   4481           /* Describe the instruction pointer. */
   4482           .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
   4483           .sizeof_IP = 8,
   4484 
   4485           /* Describe any sections to be regarded by Memcheck as
   4486              'always-defined'. */
   4487           .n_alwaysDefd = 16,
   4488 
   4489           /* flags thunk: OP and NDEP are always defd, whereas DEP1
   4490              and DEP2 have to be tracked.  See detailed comment in
   4491              gdefs.h on meaning of thunk fields. */
   4492           .alwaysDefd
   4493              = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
   4494                  /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
    4495                  /*  2 */ ALWAYSDEFD(guest_DFLAG),
   4496                  /*  3 */ ALWAYSDEFD(guest_IDFLAG),
   4497                  /*  4 */ ALWAYSDEFD(guest_RIP),
   4498                  /*  5 */ ALWAYSDEFD(guest_FS_CONST),
   4499                  /*  6 */ ALWAYSDEFD(guest_FTOP),
   4500                  /*  7 */ ALWAYSDEFD(guest_FPTAG),
   4501                  /*  8 */ ALWAYSDEFD(guest_FPROUND),
   4502                  /*  9 */ ALWAYSDEFD(guest_FC3210),
   4503                  // /* */ ALWAYSDEFD(guest_CS),
   4504                  // /* */ ALWAYSDEFD(guest_DS),
   4505                  // /* */ ALWAYSDEFD(guest_ES),
   4506                  // /* */ ALWAYSDEFD(guest_FS),
   4507                  // /* */ ALWAYSDEFD(guest_GS),
   4508                  // /* */ ALWAYSDEFD(guest_SS),
   4509                  // /* */ ALWAYSDEFD(guest_LDT),
   4510                  // /* */ ALWAYSDEFD(guest_GDT),
   4511                  /* 10 */ ALWAYSDEFD(guest_EMNOTE),
   4512                  /* 11 */ ALWAYSDEFD(guest_SSEROUND),
   4513                  /* 12 */ ALWAYSDEFD(guest_CMSTART),
   4514                  /* 13 */ ALWAYSDEFD(guest_CMLEN),
   4515                  /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
   4516                  /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
   4517                }
   4518         };
   4519 
   4520 
   4521 /*---------------------------------------------------------------*/
   4522 /*--- end                               guest_amd64_helpers.c ---*/
   4523 /*---------------------------------------------------------------*/
   4524