
/*---------------------------------------------------------------*/
/*--- begin                             guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
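
/* For intuition: both routines use the schoolbook decomposition into
   32-bit halves, accumulating the partial products so that no
   intermediate term overflows 64 bits.  A minimal sanity check,
   illustrative only and deliberately excluded from the build (the
   helper name below is hypothetical): */
#if 0
static void mullU64_selftest ( void )
{
   ULong hi, lo;
   /* (2^64 - 1) * 2 == 2^65 - 2, i.e. hi = 1, lo = 0xFFFF...FFFE */
   mullU64( 0xFFFFFFFFFFFFFFFFULL, 2ULL, &hi, &lo );
   vassert(hi == 1 && lo == 0xFFFFFFFFFFFFFFFEULL);
}
#endif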


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
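
/* Invariant: parity_table[b] == AMD64G_CC_MASK_P exactly when b has
   an even number of set bits, which is the x86 definition of PF over
   the low 8 bits of a result.  An equivalent fold-down computation,
   shown only to document the invariant (not used anywhere; the
   function name is hypothetical): */
#if 0
static UChar parity_of ( UChar b )
{
   b ^= (b >> 4);
   b ^= (b >> 2);
   b ^= (b >> 1);
   /* low bit is now 1 iff b originally had an odd number of bits */
   return (b & 1) ? 0 : AMD64G_CC_MASK_P;
}
#endif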

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return (ULong)x << n;
   else
      return x >> (-n);
}
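
/* Hence lshift(x, 3) == x << 3, while lshift(x, -3) == x >> 3 with
   sign propagation, x being signed.  The negative case is what lets
   the ACTIONS_* macros below use shift amounts such as
   "8 - DATA_BITS" and "12 - DATA_BITS" uniformly for every operand
   size. */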

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)					\
   /* const */ ULong DATA_MASK 					\
      = __data_bits==8                                          \
           ? 0xFFULL 					        \
           : (__data_bits==16                                   \
                ? 0xFFFFULL 		                        \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;			\
   /* const */ ULong CC_DEP2 = cc_dep2_formal;			\
   /* const */ ULong CC_NDEP = cc_ndep_formal;			\
   /* Four bogus assignments, which hopefully gcc can     */	\
   /* optimise away, and which stop it complaining about  */	\
   /* unused variables.                                   */	\
   SIGN_MASK = SIGN_MASK;					\
   DATA_MASK = DATA_MASK;					\
   CC_DEP2 = CC_DEP2;						\
   CC_NDEP = CC_NDEP;


/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
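
/* Worked instance, 8 bits wide: argL = 0xFF, argR = 0x01 gives
   res = 0x00.  Then cf = 1 (unsigned wrap), zf = 1 << 6, af = 0x10,
   pf is set (zero has even parity), sf = 0, and of = 0: the term
   (argL ^ argR ^ -1) & (argL ^ res) equals 0x01, whose bit 7 is
   clear, matching the fact that -1 + 1 does not overflow as a signed
   addition. */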

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, oldC, res;		 		\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
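
/* Note on "argR = CC_DEP2 ^ oldC" above: for ADC (and SBB below) the
   thunk built by guest-amd64/toIR.c stores the second argument
   xor'ed with the old carry bit in CC_DEP2, so the macro xors once
   more to recover the original argR before redoing the operation
   (presumably so that CC_DEP2 stays a plain data dependency of the
   result for tools that inspect the thunk). */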

/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, oldC, res;	       			\
     oldC = CC_NDEP & AMD64G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     ULong argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & AMD64G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((ULong)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & AMD64G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}
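
/* For both SHL and SHR the thunk convention (set up in
   guest-amd64/toIR.c) is that CC_DEP1 holds the final shifted result
   and CC_DEP2 holds the value shifted by one bit less.  That is why
   cf can be read straight off CC_DEP2: its top bit (for SHL) or
   bottom bit (for SHR) is exactly the last bit shifted out. */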

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & CC_DEP1)			\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong fl 							\
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))	\
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, 		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}
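
/* The signed-multiply flag rule above: CF and OF are set when the
   high half is not simply the sign-extension of the low half, i.e.
   when the full product does not fit in the narrow result.  For
   example 0x100000000 * 0x100000000 gives lo = 0 and hi = 1, so
   hi != (lo >>s 63) and hence CF = OF = 1. */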

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 != 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = 0;							\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { ULong cf, pf, af, zf, sf, of;				\
     cf = ((DATA_UTYPE)CC_DEP2 == 0);				\
     pf = 0;							\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/


#if PROFILE_RFLAGS

static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf("     ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* Calculate all the 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt   );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt   );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong  );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}
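
/* A concrete trip through the worker, for orientation: after
   "cmpq $2, %rax" with %rax == 1, toIR leaves cc_op =
   AMD64G_CC_OP_SUBQ, cc_dep1 = 1, cc_dep2 = 2.  ACTIONS_SUB then
   yields res = -1, so CF (1 < 2), PF (0xFF has even parity), AF and
   SF are set while ZF and OF are clear.  Illustrative sketch only,
   not part of the build; the function name is hypothetical: */
#if 0
static void rflags_SUBQ_example ( void )
{
   ULong fl = amd64g_calculate_rflags_all_WRK(AMD64G_CC_OP_SUBQ, 1, 2, 0);
   vassert(fl == (AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                  | AMD64G_CC_MASK_A | AMD64G_CC_MASK_S));
}
#endif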


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
	 //      case AMD64G_CC_OP_SUBL:
	 //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
	 //                   ? AMD64G_CC_MASK_C : 0;
	 //      case AMD64G_CC_OP_SUBW:
	 //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
	 //                   ? AMD64G_CC_MASK_C : 0;
	 //      case AMD64G_CC_OP_SUBB:
	 //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
	 //                   ? AMD64G_CC_MASK_C : 0;
	 //      case AMD64G_CC_OP_INCL:
	 //      case AMD64G_CC_OP_DECL:
	 //         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of,sf,zf,cf,pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);
         break;

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));
         break;

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));
         break;

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));
         break;

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}
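
/* The condition encoding does half the work here: codes come in
   even/odd pairs (O/NO, B/NB, ...) where the odd member is the
   negation, so "inv = cond & 1" folds each pair into a single case.
   For instance, with the SUBQ thunk on equal operands, AMD64CondZ
   returns 1 & (0 ^ zf) == 1 and AMD64CondNZ returns
   1 & (1 ^ zf) == 0. */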


/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
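
/* A plausible client-side round trip, illustrative sketch only
   (assumes the usual LibVEX_GuestAMD64_initialise from
   libvex_guest_amd64.h; the function name below is hypothetical): */
#if 0
static void put_rflag_c_example ( void )
{
   VexGuestAMD64State st;
   LibVEX_GuestAMD64_initialise(&st);
   LibVEX_GuestAMD64_put_rflag_c(1, &st);
   /* the thunk is now in OP_COPY form; reading it back shows CF set */
   vassert(LibVEX_GuestAMD64_get_rflags(&st) & AMD64G_CC_MASK_C);
}
#endif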


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- ADDL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit add and jump to out-of-line code if
            an overflow occurs". */
         /* long add, then O (overflow)
            --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
            --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
            --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              unop(Iop_Not64,
                                   binop(Iop_Xor64, cc_dep1, cc_dep2)),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Add64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));

      }

      /*---------------- SUBQ ----------------*/

      /* 0, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
         /* long long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return binop(Iop_Shr64,
                      binop(Iop_And64,
                            binop(Iop_Xor64, cc_dep1, cc_dep2),
                            binop(Iop_Xor64,
                                  cc_dep1,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2))),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
         /* long long sub/cmp, then S (negative)
            --> (dst-src)[63]
            --> (dst-src) >>u 63 */
         return binop(Iop_Shr64,
                      binop(Iop_Sub64, cc_dep1, cc_dep2),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
         /* long long sub/cmp, then NS (not negative)
            --> (dst-src)[63] ^ 1
            --> ((dst-src) >>u 63) ^ 1 */
         return binop(Iop_Xor64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(63)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
         /* long long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
         /* long long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));

      }

      /*---------------- SUBL ----------------*/

      /* 0, */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit subtract and jump to out-of-line
            code if an overflow occurs". */
         /* long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
            --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              binop(Iop_Xor64, cc_dep1, cc_dep2),
                              binop(Iop_Xor64,
                                    cc_dep1,
                                    binop(Iop_Sub64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      /* 2, 3 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 4, 5 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /* 6, 7 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 8, 9 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative)
            --> (dst-src)[31]
            --> ((dst -64 src) >>u 31) & 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
         /* long sub/cmp, then NS (not negative)
            --> (dst-src)[31] ^ 1
            --> (((dst -64 src) >>u 31) & 1) ^ 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2),
                                  mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /* 12, 13 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      /* 14, 15 */
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));

      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));

      }
   1288 
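              /* By way of example of how these rules fire: for guest code
                 "cmpl %esi, %edi ; jbe ...", toIR records CC_OP =
                 AMD64G_CC_OP_SUBL with CC_DEP1 = %edi and CC_DEP2 = %esi,
                 and the CondBE rule above then folds the whole thunk to
                 1Uto64(CmpLE32U(64to32(dep1), 64to32(dep2))), so no call
                 to amd64g_calculate_condition is made at run time. */
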
   1289       /*---------------- SUBW ----------------*/
   1290 
   1291       /* 4, 5 */
   1292       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
   1293          /* word sub/cmp, then Z --> test dst==src */
   1294          return unop(Iop_1Uto64,
   1295                      binop(Iop_CmpEQ16,
   1296                            unop(Iop_64to16,cc_dep1),
   1297                            unop(Iop_64to16,cc_dep2)));
   1298       }
   1299       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
   1300          /* word sub/cmp, then NZ --> test dst!=src */
   1301          return unop(Iop_1Uto64,
   1302                      binop(Iop_CmpNE16,
   1303                            unop(Iop_64to16,cc_dep1),
   1304                            unop(Iop_64to16,cc_dep2)));
   1305       }
   1306 
   1307       /* 6, */
   1308       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
   1309          /* word sub/cmp, then BE (unsigned less than or equal)
   1310             --> test dst <=u src */
   1311          return unop(Iop_1Uto64,
   1312                      binop(Iop_CmpLE64U,
   1313                            binop(Iop_Shl64, cc_dep1, mkU8(48)),
   1314                            binop(Iop_Shl64, cc_dep2, mkU8(48))));
   1315       }
   1316 
   1317       /* 14, */
   1318       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
   1319          /* word sub/cmp, then LE (signed less than or equal)
   1320             --> test dst <=s src */
   1321          return unop(Iop_1Uto64,
   1322                      binop(Iop_CmpLE64S,
   1323                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1324                            binop(Iop_Shl64,cc_dep2,mkU8(48))));
   1325 
   1326       }
   1327 
   1328       /*---------------- SUBB ----------------*/
   1329 
   1330       /* 2, 3 */
   1331       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
   1332          /* byte sub/cmp, then B (unsigned less than)
   1333             --> test dst <u src */
   1334          return unop(Iop_1Uto64,
   1335                      binop(Iop_CmpLT64U,
   1336                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1337                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1338       }
   1339       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
   1340          /* byte sub/cmp, then NB (unsigned greater than or equal)
   1341             --> test src <=u dst */
   1342          /* Note, args are opposite way round from the usual */
   1343          return unop(Iop_1Uto64,
   1344                      binop(Iop_CmpLE64U,
   1345                            binop(Iop_And64, cc_dep2, mkU64(0xFF)),
   1346                            binop(Iop_And64, cc_dep1, mkU64(0xFF))));
   1347       }
   1348 
   1349       /* 4, 5 */
   1350       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
   1351          /* byte sub/cmp, then Z --> test dst==src */
   1352          return unop(Iop_1Uto64,
   1353                      binop(Iop_CmpEQ8,
   1354                            unop(Iop_64to8,cc_dep1),
   1355                            unop(Iop_64to8,cc_dep2)));
   1356       }
   1357       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
   1358          /* byte sub/cmp, then NZ --> test dst!=src */
   1359          return unop(Iop_1Uto64,
   1360                      binop(Iop_CmpNE8,
   1361                            unop(Iop_64to8,cc_dep1),
   1362                            unop(Iop_64to8,cc_dep2)));
   1363       }
   1364 
   1365       /* 6, */
   1366       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
   1367          /* byte sub/cmp, then BE (unsigned less than or equal)
   1368             --> test dst <=u src */
   1369          return unop(Iop_1Uto64,
   1370                      binop(Iop_CmpLE64U,
   1371                            binop(Iop_And64, cc_dep1, mkU64(0xFF)),
   1372                            binop(Iop_And64, cc_dep2, mkU64(0xFF))));
   1373       }
   1374 
   1375       /* 8, 9 */
   1376       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
   1377                                           && isU64(cc_dep2, 0)) {
   1378          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
   1379                                          --> test dst <s 0
   1380                                          --> (ULong)dst[7]
   1381             This is yet another scheme by which gcc figures out if the
   1382             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
   1383          /* Note: isU64(cc_dep2, 0) is correct, even though this is
   1384             for an 8-bit comparison, since the args to the helper
   1385             function are always U64s. */
   1386          return binop(Iop_And64,
   1387                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1388                       mkU64(1));
   1389       }
   1390       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
   1391                                           && isU64(cc_dep2, 0)) {
   1392          /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
   1393                                           --> test !(dst <s 0)
   1394                                           --> (ULong) !dst[7]
   1395          */
   1396          return binop(Iop_Xor64,
   1397                       binop(Iop_And64,
   1398                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1399                             mkU64(1)),
   1400                       mkU64(1));
   1401       }
   1402 
   1403       /*---------------- LOGICQ ----------------*/
   1404 
   1405       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
   1406          /* long long and/or/xor, then Z --> test dst==0 */
   1407          return unop(Iop_1Uto64,
   1408                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
   1409       }
   1410       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
   1411          /* long long and/or/xor, then NZ --> test dst!=0 */
   1412          return unop(Iop_1Uto64,
   1413                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
   1414       }
   1415 
   1416       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
   1417          /* long long and/or/xor, then L
   1418             LOGIC sets SF and ZF according to the
   1419             result and makes OF be zero.  L computes SF ^ OF, but
   1420             OF is zero, so this reduces to SF -- which will be 1 iff
   1421             the result is < signed 0.  Hence ...
   1422          */
   1423          return unop(Iop_1Uto64,
   1424                      binop(Iop_CmpLT64S,
   1425                            cc_dep1,
   1426                            mkU64(0)));
   1427       }
   1428 
   1429       /*---------------- LOGICL ----------------*/
   1430 
   1431       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
   1432          /* long and/or/xor, then Z --> test dst==0 */
   1433          return unop(Iop_1Uto64,
   1434                      binop(Iop_CmpEQ32,
   1435                            unop(Iop_64to32, cc_dep1),
   1436                            mkU32(0)));
   1437       }
   1438       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
   1439          /* long and/or/xor, then NZ --> test dst!=0 */
   1440          return unop(Iop_1Uto64,
   1441                      binop(Iop_CmpNE32,
   1442                            unop(Iop_64to32, cc_dep1),
   1443                            mkU32(0)));
   1444       }
   1445 
   1446       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
   1447          /* long and/or/xor, then LE
   1448             This is pretty subtle.  LOGIC sets SF and ZF according to the
   1449             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
   1450             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
   1451             the result is <=signed 0.  Hence ...
   1452          */
   1453          return unop(Iop_1Uto64,
   1454                      binop(Iop_CmpLE32S,
   1455                            unop(Iop_64to32, cc_dep1),
   1456                            mkU32(0)));
   1457       }
   1458 
   1459       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
   1460          /* long and/or/xor, then S --> (ULong)result[31] */
   1461          return binop(Iop_And64,
   1462                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1463                       mkU64(1));
   1464       }
   1465       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
   1466          /* long and/or/xor, then NS --> (ULong) !result[31] */
   1467          return binop(Iop_Xor64,
   1468                 binop(Iop_And64,
   1469                       binop(Iop_Shr64, cc_dep1, mkU8(31)),
   1470                       mkU64(1)),
   1471                 mkU64(1));
   1472       }
   1473 
   1474       /*---------------- LOGICW ----------------*/
   1475 
   1476       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
   1477          /* word and/or/xor, then Z --> test dst==0 */
   1478          return unop(Iop_1Uto64,
   1479                      binop(Iop_CmpEQ64,
   1480                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1481                            mkU64(0)));
   1482       }
   1483       if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
   1484          /* word and/or/xor, then NZ --> test dst!=0 */
   1485          return unop(Iop_1Uto64,
   1486                      binop(Iop_CmpNE64,
   1487                            binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
   1488                            mkU64(0)));
   1489       }
   1490 
   1491       /*---------------- LOGICB ----------------*/
   1492 
   1493       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
   1494          /* byte and/or/xor, then Z --> test dst==0 */
   1495          return unop(Iop_1Uto64,
   1496                      binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1497                                         mkU64(0)));
   1498       }
   1499       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
   1500          /* byte and/or/xor, then NZ --> test dst!=0 */
   1501          return unop(Iop_1Uto64,
   1502                      binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
   1503                                         mkU64(0)));
   1504       }
   1505 
   1506       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
   1507          /* this is an idiom gcc sometimes uses to find out if the top
   1508             bit of a byte register is set: eg testb %al,%al; js ..
   1509             Since it just depends on the top bit of the byte, extract
   1510             that bit and explicitly get rid of all the rest.  This
   1511             helps memcheck avoid false positives in the case where any
   1512             of the other bits in the byte are undefined. */
   1513          /* byte and/or/xor, then S --> (UInt)result[7] */
   1514          return binop(Iop_And64,
   1515                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1516                       mkU64(1));
   1517       }
   1518       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
   1519          /* byte and/or/xor, then NS --> (UInt)!result[7] */
   1520          return binop(Iop_Xor64,
   1521                       binop(Iop_And64,
   1522                             binop(Iop_Shr64,cc_dep1,mkU8(7)),
   1523                             mkU64(1)),
   1524                       mkU64(1));
   1525       }
   1526 
   1527       /*---------------- INCB ----------------*/
   1528 
   1529       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
   1530          /* 8-bit inc, then LE --> sign bit of the arg */
   1531          return binop(Iop_And64,
   1532                       binop(Iop_Shr64,
   1533                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
   1534                             mkU8(7)),
   1535                       mkU64(1));
   1536       }
   1537 
   1538       /*---------------- INCW ----------------*/
   1539 
   1540       if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
   1541          /* 16-bit inc, then Z --> test dst == 0 */
   1542          return unop(Iop_1Uto64,
   1543                      binop(Iop_CmpEQ64,
   1544                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1545                            mkU64(0)));
   1546       }
   1547 
   1548       /*---------------- DECL ----------------*/
   1549 
   1550       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
   1551          /* dec L, then Z --> test dst == 0 */
   1552          return unop(Iop_1Uto64,
   1553                      binop(Iop_CmpEQ32,
   1554                            unop(Iop_64to32, cc_dep1),
   1555                            mkU32(0)));
   1556       }
   1557 
   1558       /*---------------- DECW ----------------*/
   1559 
   1560       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
   1561          /* 16-bit dec, then NZ --> test dst != 0 */
   1562          return unop(Iop_1Uto64,
   1563                      binop(Iop_CmpNE64,
   1564                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
   1565                            mkU64(0)));
   1566       }
   1567 
   1568       /*---------------- COPY ----------------*/
   1569       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
   1570          jbe" for example. */
   1571 
   1572       if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
   1573           (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
   1574          /* COPY, then BE --> extract C and Z from dep1, and test (C
   1575             or Z == 1). */
   1576          /* COPY, then NBE --> extract C and Z from dep1, and test (C
   1577             or Z == 0). */
   1578          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
   1579          return
   1580             unop(
   1581                Iop_1Uto64,
   1582                binop(
   1583                   Iop_CmpEQ64,
   1584                   binop(
   1585                      Iop_And64,
   1586                      binop(
   1587                         Iop_Or64,
   1588                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1589                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
   1590                      ),
   1591                      mkU64(1)
   1592                   ),
   1593                   mkU64(nnn)
   1594                )
   1595             );
   1596       }
   1597 
   1598       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
   1599          /* COPY, then B --> extract C from dep1, and test (C == 1). */
   1600          return
   1601             unop(
   1602                Iop_1Uto64,
   1603                binop(
   1604                   Iop_CmpNE64,
   1605                   binop(
   1606                      Iop_And64,
   1607                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
   1608                      mkU64(1)
   1609                   ),
   1610                   mkU64(0)
   1611                )
   1612             );
   1613       }
   1614 
   1615       if (isU64(cc_op, AMD64G_CC_OP_COPY)
   1616           && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
   1617          /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
   1618          /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
   1619          UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
   1620          return
   1621             unop(
   1622                Iop_1Uto64,
   1623                binop(
   1624                   Iop_CmpEQ64,
   1625                   binop(
   1626                      Iop_And64,
   1627                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
   1628                      mkU64(1)
   1629                   ),
   1630                   mkU64(nnn)
   1631                )
   1632             );
   1633       }
   1634 
   1635       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
   1636          /* COPY, then P --> extract P from dep1, and test (P == 1). */
   1637          return
   1638             unop(
   1639                Iop_1Uto64,
   1640                binop(
   1641                   Iop_CmpNE64,
   1642                   binop(
   1643                      Iop_And64,
   1644                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
   1645                      mkU64(1)
   1646                   ),
   1647                   mkU64(0)
   1648                )
   1649             );
   1650       }
   1651 
   1652       return NULL;
   1653    }
   1654 
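           /* A NULL return from any of the branches above or below simply
              means "no folding rule matched"; the original, unspecialised
              call to the helper is then left in place. */
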
   1655    /* --------- specialising "amd64g_calculate_rflags_c" --------- */
   1656 
   1657    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
   1658       /* specialise calls to above "calculate_rflags_c" function */
   1659       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
   1660       vassert(arity == 4);
   1661       cc_op   = args[0];
   1662       cc_dep1 = args[1];
   1663       cc_dep2 = args[2];
   1664       cc_ndep = args[3];
   1665 
   1666       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
   1667          /* C after sub denotes unsigned less than */
   1668          return unop(Iop_1Uto64,
   1669                      binop(Iop_CmpLT64U,
   1670                            cc_dep1,
   1671                            cc_dep2));
   1672       }
   1673       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
   1674          /* C after sub denotes unsigned less than */
   1675          return unop(Iop_1Uto64,
   1676                      binop(Iop_CmpLT32U,
   1677                            unop(Iop_64to32, cc_dep1),
   1678                            unop(Iop_64to32, cc_dep2)));
   1679       }
   1680       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
   1681          /* C after sub denotes unsigned less than */
   1682          return unop(Iop_1Uto64,
   1683                      binop(Iop_CmpLT64U,
   1684                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
   1685                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
   1686       }
   1687       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
   1688           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
   1689           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
   1690           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
   1691          /* cflag after logic is zero */
   1692          return mkU64(0);
   1693       }
   1694       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
   1695           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
   1696          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
   1697          return cc_ndep;
   1698       }
   1699 
   1700 #     if 0
   1701       if (cc_op->tag == Iex_Const) {
   1702          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
   1703       }
   1704 #     endif
   1705 
   1706       return NULL;
   1707    }
   1708 
   1709 #  undef unop
   1710 #  undef binop
   1711 #  undef mkU64
   1712 #  undef mkU32
   1713 #  undef mkU8
   1714 
   1715    return NULL;
   1716 }
   1717 
   1718 
   1719 /*---------------------------------------------------------------*/
   1720 /*--- Supporting functions for x87 FPU activities.            ---*/
   1721 /*---------------------------------------------------------------*/
   1722 
   1723 static inline Bool host_is_little_endian ( void )
   1724 {
   1725    UInt x = 0x76543210;
   1726    UChar* p = (UChar*)(&x);
   1727    return toBool(*p == 0x10);
   1728 }
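
        /* (0x10 is the least-significant byte of 0x76543210, so it sits
           at the lowest address exactly when the host is little-endian.) */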
   1729 
   1730 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
   1731 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   1732 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
   1733 {
   1734    Bool   mantissaIsZero;
   1735    Int    bexp;
   1736    UChar  sign;
   1737    UChar* f64;
   1738 
   1739    vassert(host_is_little_endian());
   1740 
   1741    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
   1742 
   1743    f64  = (UChar*)(&dbl);
   1744    sign = toUChar( (f64[7] >> 7) & 1 );
   1745 
   1746    /* First off, if the tag indicates the register was empty,
   1747       return 1,0,sign,1 */
   1748    if (tag == 0) {
   1749       /* vex_printf("Empty\n"); */
   1750       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
   1751                                    | AMD64G_FC_MASK_C0;
   1752    }
   1753 
   1754    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   1755    bexp &= 0x7FF;
   1756 
   1757    mantissaIsZero
   1758       = toBool(
   1759            (f64[6] & 0x0F) == 0
   1760            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
   1761         );
   1762 
   1763    /* If both exponent and mantissa are zero, the value is zero.
   1764       Return 1,0,sign,0. */
   1765    if (bexp == 0 && mantissaIsZero) {
   1766       /* vex_printf("Zero\n"); */
   1767       return AMD64G_FC_MASK_C3 | 0
   1768                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1769    }
   1770 
   1771    /* If exponent is zero but mantissa isn't, it's a denormal.
   1772       Return 1,1,sign,0. */
   1773    if (bexp == 0 && !mantissaIsZero) {
   1774       /* vex_printf("Denormal\n"); */
   1775       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
   1776                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1777    }
   1778 
   1779    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
   1780       Return 0,1,sign,1. */
   1781    if (bexp == 0x7FF && mantissaIsZero) {
   1782       /* vex_printf("Inf\n"); */
   1783       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
   1784                                    | AMD64G_FC_MASK_C0;
   1785    }
   1786 
   1787    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
   1788       Return 0,0,sign,1. */
   1789    if (bexp == 0x7FF && !mantissaIsZero) {
   1790       /* vex_printf("NaN\n"); */
   1791       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
   1792    }
   1793 
   1794    /* Uh, ok, we give up.  It must be a normal finite number.
   1795       Return 0,1,sign,0.
   1796    */
   1797    /* vex_printf("normal\n"); */
   1798    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
   1799 }
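
        /* For reference, the C3,C2,C1,C0 patterns produced above (C1 is
           always just the sign bit of the value):
              empty     1,0,sign,1
              zero      1,0,sign,0
              denormal  1,1,sign,0
              infinity  0,1,sign,1
              NaN       0,0,sign,1
              normal    0,1,sign,0
        */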
   1800 
   1801 
   1802 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
   1803    appears to differ from the former only in that the 8 FP registers
   1804    themselves are not transferred into the guest state. */
   1805 static
   1806 VexEmNote do_put_x87 ( Bool moveRegs,
   1807                        /*IN*/UChar* x87_state,
   1808                        /*OUT*/VexGuestAMD64State* vex_state )
   1809 {
   1810    Int        stno, preg;
   1811    UInt       tag;
   1812    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1813    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1814    Fpu_State* x87     = (Fpu_State*)x87_state;
   1815    UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
   1816    UInt       tagw    = x87->env[FP_ENV_TAG];
   1817    UInt       fpucw   = x87->env[FP_ENV_CTRL];
   1818    UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
   1819    VexEmNote  ew;
   1820    UInt       fpround;
   1821    ULong      pair;
   1822 
   1823    /* Copy registers and tags */
   1824    for (stno = 0; stno < 8; stno++) {
   1825       preg = (stno + ftop) & 7;
   1826       tag = (tagw >> (2*preg)) & 3;
   1827       if (tag == 3) {
   1828          /* register is empty */
   1829          /* hmm, if it's empty, does it still get written?  Probably
   1830             safer to say it does.  If we don't, memcheck could get out
   1831             of sync, in that it thinks all FP registers are defined by
   1832             this helper, but in reality some have not been updated. */
   1833          if (moveRegs)
   1834             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   1835          vexTags[preg] = 0;
   1836       } else {
   1837          /* register is non-empty */
   1838          if (moveRegs)
   1839             convert_f80le_to_f64le( &x87->reg[10*stno],
   1840                                     (UChar*)&vexRegs[preg] );
   1841          vexTags[preg] = 1;
   1842       }
   1843    }
   1844 
   1845    /* stack pointer */
   1846    vex_state->guest_FTOP = ftop;
   1847 
   1848    /* status word */
   1849    vex_state->guest_FC3210 = c3210;
   1850 
   1851    /* handle the control word, setting FPROUND and detecting any
   1852       emulation warnings. */
   1853    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   1854    fpround = (UInt)pair & 0xFFFFFFFFULL;
   1855    ew      = (VexEmNote)(pair >> 32);
   1856 
   1857    vex_state->guest_FPROUND = fpround & 3;
   1858 
   1859    /* emulation warnings --> caller */
   1860    return ew;
   1861 }
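
        /* For reference: the hardware tag word has 2 bits per physical
           register (00 = valid, 01 = zero, 10 = special, 11 = empty).
           do_put_x87 only distinguishes empty (3) from everything else,
           which is all the 1-bit guest_FPTAG[] representation can hold. */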
   1862 
   1863 
   1864 /* Create an x87 FPU state from the guest state, as close as
   1865    we can approximate it. */
   1866 static
   1867 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
   1868                   /*OUT*/UChar* x87_state )
   1869 {
   1870    Int        i, stno, preg;
   1871    UInt       tagw;
   1872    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   1873    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   1874    Fpu_State* x87     = (Fpu_State*)x87_state;
   1875    UInt       ftop    = vex_state->guest_FTOP;
   1876    UInt       c3210   = vex_state->guest_FC3210;
   1877 
   1878    for (i = 0; i < 14; i++)
   1879       x87->env[i] = 0;
   1880 
   1881    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   1882    x87->env[FP_ENV_STAT]
   1883       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   1884    x87->env[FP_ENV_CTRL]
   1885       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   1886 
   1887    /* Dump the register stack in ST order. */
   1888    tagw = 0;
   1889    for (stno = 0; stno < 8; stno++) {
   1890       preg = (stno + ftop) & 7;
   1891       if (vexTags[preg] == 0) {
   1892          /* register is empty */
   1893          tagw |= (3 << (2*preg));
   1894          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1895                                  &x87->reg[10*stno] );
   1896       } else {
   1897          /* register is full. */
   1898          tagw |= (0 << (2*preg));
   1899          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   1900                                  &x87->reg[10*stno] );
   1901       }
   1902    }
   1903    x87->env[FP_ENV_TAG] = toUShort(tagw);
   1904 }
   1905 
   1906 
   1907 /* CALLED FROM GENERATED CODE */
   1908 /* DIRTY HELPER (reads guest state, writes guest mem) */
   1909 /* NOTE: only handles 32-bit format (no REX.W on the insn) */
   1910 void amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
   1911                                                 HWord addr )
   1912 {
   1913    /* Derived from values obtained from
   1914       vendor_id       : AuthenticAMD
   1915       cpu family      : 15
   1916       model           : 12
   1917       model name      : AMD Athlon(tm) 64 Processor 3200+
   1918       stepping        : 0
   1919       cpu MHz         : 2200.000
   1920       cache size      : 512 KB
   1921    */
   1922    /* Somewhat roundabout, but at least it's simple. */
   1923    Fpu_State tmp;
   1924    UShort*   addrS = (UShort*)addr;
   1925    UChar*    addrC = (UChar*)addr;
   1926    UInt      mxcsr;
   1927    UShort    fp_tags;
   1928    UInt      summary_tags;
   1929    Int       r, stno;
   1930    UShort    *srcS, *dstS;
   1931 
   1932    do_get_x87( gst, (UChar*)&tmp );
   1933    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
   1934 
   1935    /* Now build the proper fxsave image from the x87 image we just
   1936       made. */
   1937 
   1938    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   1939    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
   1940 
   1941    /* set addrS[2] in an endian-independent way */
   1942    summary_tags = 0;
   1943    fp_tags = tmp.env[FP_ENV_TAG];
   1944    for (r = 0; r < 8; r++) {
   1945       if ( ((fp_tags >> (2*r)) & 3) != 3 )
   1946          summary_tags |= (1 << r);
   1947    }
   1948    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   1949    addrC[5]  = 0; /* pad */
   1950 
   1951    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
   1952       does not write this field. (?!) */
   1953    addrS[3]  = 0; /* BOGUS */
   1954 
   1955    /* RIP (Last x87 instruction pointer).  From experimentation, the
   1956       real CPU does not write this field. (?!) */
   1957    addrS[4]  = 0; /* BOGUS */
   1958    addrS[5]  = 0; /* BOGUS */
   1959    addrS[6]  = 0; /* BOGUS */
   1960    addrS[7]  = 0; /* BOGUS */
   1961 
   1962    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
   1963       does not write this field. (?!) */
   1964    addrS[8]  = 0; /* BOGUS */
   1965    addrS[9]  = 0; /* BOGUS */
   1966    addrS[10] = 0; /* BOGUS */
   1967    addrS[11] = 0; /* BOGUS */
   1968 
   1969    addrS[12] = toUShort(mxcsr);  /* MXCSR */
   1970    addrS[13] = toUShort(mxcsr >> 16);
   1971 
   1972    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
   1973    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
   1974 
   1975    /* Copy in the FP registers, in ST order. */
   1976    for (stno = 0; stno < 8; stno++) {
   1977       srcS = (UShort*)(&tmp.reg[10*stno]);
   1978       dstS = (UShort*)(&addrS[16 + 8*stno]);
   1979       dstS[0] = srcS[0];
   1980       dstS[1] = srcS[1];
   1981       dstS[2] = srcS[2];
   1982       dstS[3] = srcS[3];
   1983       dstS[4] = srcS[4];
   1984       dstS[5] = 0;
   1985       dstS[6] = 0;
   1986       dstS[7] = 0;
   1987    }
   1988 
   1989    /* That's the first 160 bytes of the image done.  Now only %xmm0
   1990       .. %xmm15 remain to be copied, and we let the generated IR do
   1991       that, so as to make Memcheck's definedness flow for the non-XMM
   1992       parts independent from that of all the other control and
   1993       status words in the structure.  This avoids the false positives
   1994       shown in #291310. */
   1995 }
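
        /* In outline, the image layout implied by the stores above, as
           byte offsets from 'addr':
              0..1    FCW            2..3    FSW
              4       FTW summary    5       pad
              6..7    FOP (zero)     8..15   RIP (zero)
              16..23  RDP (zero)     24..27  MXCSR
              28..31  MXCSR mask     32+16*n ST(n): 10 bytes + 6 zero
           Bytes 160 onwards are %xmm0..%xmm15, not written here. */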
   1996 
   1997 
   1998 /* CALLED FROM GENERATED CODE */
   1999 /* DIRTY HELPER (writes guest state, reads guest mem) */
   2000 VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
   2001                                                       HWord addr )
   2002 {
   2003    Fpu_State tmp;
   2004    VexEmNote warnX87 = EmNote_NONE;
   2005    VexEmNote warnXMM = EmNote_NONE;
   2006    UShort*   addrS   = (UShort*)addr;
   2007    UChar*    addrC   = (UChar*)addr;
   2008    UShort    fp_tags;
   2009    Int       r, stno, i;
   2010 
   2011    /* Don't restore %xmm0 .. %xmm15, for the same reasons that
   2012       amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM doesn't save them.  See
   2013       comment in that function for details. */
   2014 
   2015    /* Copy the x87 registers out of the image, into a temporary
   2016       Fpu_State struct. */
   2017    for (i = 0; i < 14; i++) tmp.env[i] = 0;
   2018    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   2019    /* fill in tmp.reg[0..7] */
   2020    for (stno = 0; stno < 8; stno++) {
   2021       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
   2022       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
   2023       dstS[0] = srcS[0];
   2024       dstS[1] = srcS[1];
   2025       dstS[2] = srcS[2];
   2026       dstS[3] = srcS[3];
   2027       dstS[4] = srcS[4];
   2028    }
   2029    /* fill in tmp.env[0..13] */
   2030    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
   2031    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
   2032 
   2033    fp_tags = 0;
   2034    for (r = 0; r < 8; r++) {
   2035       if (addrC[4] & (1<<r))
   2036          fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
   2037       else
   2038          fp_tags |= (3 << (2*r)); /* EMPTY */
   2039    }
   2040    tmp.env[FP_ENV_TAG] = fp_tags;
   2041 
   2042    /* Now write 'tmp' into the guest state. */
   2043    warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
   2044 
   2045    { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
   2046                 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
   2047      ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
   2048 
   2049      warnXMM = (VexEmNote)(w64 >> 32);
   2050 
   2051      gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
   2052    }
   2053 
   2054    /* Prefer an X87 emwarn over an XMM one, if both exist. */
   2055    if (warnX87 != EmNote_NONE)
   2056       return warnX87;
   2057    else
   2058       return warnXMM;
   2059 }
   2060 
   2061 
   2062 /* DIRTY HELPER (writes guest state) */
   2063 /* Initialise the x87 FPU state as per 'finit'. */
   2064 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
   2065 {
   2066    Int i;
   2067    gst->guest_FTOP = 0;
   2068    for (i = 0; i < 8; i++) {
   2069       gst->guest_FPTAG[i] = 0; /* empty */
   2070       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   2071    }
   2072    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
   2073    gst->guest_FC3210  = 0;
   2074 }
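
        /* (The real 'finit' sets FCW to 0x037F; since VEX models only the
           rounding-control field of the FCW, setting guest_FPROUND to
           Irrm_NEAREST -- the RC value inside 0x037F -- is the
           corresponding action here.) */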
   2075 
   2076 
   2077 /* CALLED FROM GENERATED CODE */
   2078 /* DIRTY HELPER (reads guest memory) */
   2079 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
   2080 {
   2081    ULong f64;
   2082    convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
   2083    return f64;
   2084 }
   2085 
   2086 /* CALLED FROM GENERATED CODE */
   2087 /* DIRTY HELPER (writes guest memory) */
   2088 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
   2089 {
   2090    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
   2091 }
   2092 
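        /* Note on precision: the f64 -> f80 widening done by the store
           helper is exact, whereas loadF80le narrows the 10-byte value
           to a 64-bit double, so any value with more than 53 significand
           bits loses precision at the load. */
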
   2093 
   2094 /* CALLED FROM GENERATED CODE */
   2095 /* CLEAN HELPER */
   2096 /* mxcsr[15:0] contains an SSE native format MXCSR value.
   2097    Extract from it the required SSEROUND value and any resulting
   2098    emulation warning, and return (warn << 32) | sseround value.
   2099 */
   2100 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
   2101 {
   2102    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   2103    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2104    ULong rmode = (mxcsr >> 13) & 3;
   2105 
   2106    /* Detect any required emulation warnings. */
   2107    VexEmNote ew = EmNote_NONE;
   2108 
   2109    if ((mxcsr & 0x1F80) != 0x1F80) {
   2110       /* unmasked exceptions! */
   2111       ew = EmWarn_X86_sseExns;
   2112    }
   2113    else
   2114    if (mxcsr & (1<<15)) {
   2115       /* FZ is set */
   2116       ew = EmWarn_X86_fz;
   2117    }
   2118    else
   2119    if (mxcsr & (1<<6)) {
   2120       /* DAZ is set */
   2121       ew = EmWarn_X86_daz;
   2122    }
   2123 
   2124    return (((ULong)ew) << 32) | ((ULong)rmode);
   2125 }
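
        /* Worked example: mxcsr = 0x1F80 (the power-on value) has all six
           exception mask bits set, FZ = 0, DAZ = 0 and bits [14:13] = 00,
           so this returns rounding mode 0 (to nearest) and no warning;
           0x3F80 differs only in having bits [14:13] = 01 and so yields
           mode 1 (towards minus infinity). */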
   2126 
   2127 
   2128 /* CALLED FROM GENERATED CODE */
   2129 /* CLEAN HELPER */
   2130 /* Given sseround as an IRRoundingMode value, create a suitable SSE
   2131    native format MXCSR value. */
   2132 ULong amd64g_create_mxcsr ( ULong sseround )
   2133 {
   2134    sseround &= 3;
   2135    return 0x1F80 | (sseround << 13);
   2136 }
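
        /* Note the round trip: for any sseround in 0..3,
           amd64g_check_ldmxcsr(amd64g_create_mxcsr(sseround)) returns
           exactly sseround with no warning, since 0x1F80 masks all the
           exceptions and leaves FZ and DAZ clear. */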
   2137 
   2138 
   2139 /* CLEAN HELPER */
   2140 /* fpucw[15:0] contains an x87 native format FPU control word.
   2141    Extract from it the required FPROUND value and any resulting
   2142    emulation warning, and return (warn << 32) | fpround value.
   2143 */
   2144 ULong amd64g_check_fldcw ( ULong fpucw )
   2145 {
   2146    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   2147    /* NOTE, encoded exactly as per enum IRRoundingMode. */
   2148    ULong rmode = (fpucw >> 10) & 3;
   2149 
   2150    /* Detect any required emulation warnings. */
   2151    VexEmNote ew = EmNote_NONE;
   2152 
   2153    if ((fpucw & 0x3F) != 0x3F) {
   2154       /* unmasked exceptions! */
   2155       ew = EmWarn_X86_x87exns;
   2156    }
   2157    else
   2158    if (((fpucw >> 8) & 3) != 3) {
   2159       /* unsupported precision */
   2160       ew = EmWarn_X86_x87precision;
   2161    }
   2162 
   2163    return (((ULong)ew) << 32) | ((ULong)rmode);
   2164 }
   2165 
   2166 
   2167 /* CLEAN HELPER */
   2168 /* Given fpround as an IRRoundingMode value, create a suitable x87
   2169    native format FPU control word. */
   2170 ULong amd64g_create_fpucw ( ULong fpround )
   2171 {
   2172    fpround &= 3;
   2173    return 0x037F | (fpround << 10);
   2174 }
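
        /* As with the MXCSR pair above, the round trip holds: 0x037F masks
           all six exceptions and selects 64-bit precision (bits [9:8] =
           11), so amd64g_check_fldcw(amd64g_create_fpucw(r)) gives back r
           with no warning, for any r in 0..3. */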
   2175 
   2176 
   2177 /* This is used to implement 'fldenv'.
   2178    Reads 28 bytes at x87_state[0 .. 27]. */
   2179 /* CALLED FROM GENERATED CODE */
   2180 /* DIRTY HELPER */
   2181 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
   2182                                       /*IN*/HWord x87_state)
   2183 {
   2184    return do_put_x87( False, (UChar*)x87_state, vex_state );
   2185 }
   2186 
   2187 
   2188 /* CALLED FROM GENERATED CODE */
   2189 /* DIRTY HELPER */
   2190 /* Create an x87 FPU env from the guest state, as close as we can
   2191    approximate it.  Writes 28 bytes at x87_state[0..27]. */
   2192 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
   2193                                  /*OUT*/HWord x87_state )
   2194 {
   2195    Int        i, stno, preg;
   2196    UInt       tagw;
   2197    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2198    Fpu_State* x87     = (Fpu_State*)x87_state;
   2199    UInt       ftop    = vex_state->guest_FTOP;
   2200    ULong      c3210   = vex_state->guest_FC3210;
   2201 
   2202    for (i = 0; i < 14; i++)
   2203       x87->env[i] = 0;
   2204 
   2205    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   2206    x87->env[FP_ENV_STAT]
   2207       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
   2208    x87->env[FP_ENV_CTRL]
   2209       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
   2210 
   2211    /* Compute the x87 tag word. */
   2212    tagw = 0;
   2213    for (stno = 0; stno < 8; stno++) {
   2214       preg = (stno + ftop) & 7;
   2215       if (vexTags[preg] == 0) {
   2216          /* register is empty */
   2217          tagw |= (3 << (2*preg));
   2218       } else {
   2219          /* register is full. */
   2220          tagw |= (0 << (2*preg));
   2221       }
   2222    }
   2223    x87->env[FP_ENV_TAG] = toUShort(tagw);
   2224 
   2225    /* We don't dump the x87 registers, though. */
   2226 }
   2227 
   2228 
   2229 /* This is used to implement 'fnsave'.
   2230    Writes 108 bytes at x87_state[0 .. 107]. */
   2231 /* CALLED FROM GENERATED CODE */
   2232 /* DIRTY HELPER */
   2233 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
   2234                                  /*OUT*/HWord x87_state)
   2235 {
   2236    do_get_x87( vex_state, (UChar*)x87_state );
   2237 }
   2238 
   2239 
   2240 /* This is used to implement 'fnsaves'.
   2241    Writes 94 bytes at x87_state[0 .. 93]. */
   2242 /* CALLED FROM GENERATED CODE */
   2243 /* DIRTY HELPER */
   2244 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
   2245                                   /*OUT*/HWord x87_state)
   2246 {
   2247    Int           i, stno, preg;
   2248    UInt          tagw;
   2249    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2250    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2251    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2252    UInt          ftop    = vex_state->guest_FTOP;
   2253    UInt          c3210   = vex_state->guest_FC3210;
   2254 
   2255    for (i = 0; i < 7; i++)
   2256       x87->env[i] = 0;
   2257 
   2258    x87->env[FPS_ENV_STAT]
   2259       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   2260    x87->env[FPS_ENV_CTRL]
   2261       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
   2262 
   2263    /* Dump the register stack in ST order. */
   2264    tagw = 0;
   2265    for (stno = 0; stno < 8; stno++) {
   2266       preg = (stno + ftop) & 7;
   2267       if (vexTags[preg] == 0) {
   2268          /* register is empty */
   2269          tagw |= (3 << (2*preg));
   2270          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2271                                  &x87->reg[10*stno] );
   2272       } else {
   2273          /* register is full. */
   2274          tagw |= (0 << (2*preg));
   2275          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
   2276                                  &x87->reg[10*stno] );
   2277       }
   2278    }
   2279    x87->env[FPS_ENV_TAG] = toUShort(tagw);
   2280 }
   2281 
   2282 
   2283 /* This is used to implement 'frstor'.
   2284    Reads 108 bytes at x87_state[0 .. 107]. */
   2285 /* CALLED FROM GENERATED CODE */
   2286 /* DIRTY HELPER */
   2287 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
   2288                                       /*IN*/HWord x87_state)
   2289 {
   2290    return do_put_x87( True, (UChar*)x87_state, vex_state );
   2291 }
   2292 
   2293 
   2294 /* This is used to implement 'frstors'.
   2295    Reads 94 bytes at x87_state[0 .. 93]. */
   2296 /* CALLED FROM GENERATED CODE */
   2297 /* DIRTY HELPER */
   2298 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
   2299                                        /*IN*/HWord x87_state)
   2300 {
   2301    Int           stno, preg;
   2302    UInt          tag;
   2303    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   2304    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   2305    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   2306    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
   2307    UInt          tagw    = x87->env[FPS_ENV_TAG];
   2308    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
   2309    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
   2310    VexEmNote     ew;
   2311    UInt          fpround;
   2312    ULong         pair;
   2313 
   2314    /* Copy registers and tags */
   2315    for (stno = 0; stno < 8; stno++) {
   2316       preg = (stno + ftop) & 7;
   2317       tag = (tagw >> (2*preg)) & 3;
   2318       if (tag == 3) {
   2319          /* register is empty */
   2320          /* hmm, if it's empty, does it still get written?  Probably
   2321             safer to say it does.  If we don't, memcheck could get out
   2322             of sync, in that it thinks all FP registers are defined by
   2323             this helper, but in reality some have not been updated. */
   2324          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
   2325          vexTags[preg] = 0;
   2326       } else {
   2327          /* register is non-empty */
   2328          convert_f80le_to_f64le( &x87->reg[10*stno],
   2329                                  (UChar*)&vexRegs[preg] );
   2330          vexTags[preg] = 1;
   2331       }
   2332    }
   2333 
   2334    /* stack pointer */
   2335    vex_state->guest_FTOP = ftop;
   2336 
   2337    /* status word */
   2338    vex_state->guest_FC3210 = c3210;
   2339 
   2340    /* handle the control word, setting FPROUND and detecting any
   2341       emulation warnings. */
   2342    pair    = amd64g_check_fldcw ( (ULong)fpucw );
   2343    fpround = (UInt)pair & 0xFFFFFFFFULL;
   2344    ew      = (VexEmNote)(pair >> 32);
   2345 
   2346    vex_state->guest_FPROUND = fpround & 3;
   2347 
   2348    /* emulation warnings --> caller */
   2349    return ew;
   2350 }
   2351 
   2352 
   2353 /*---------------------------------------------------------------*/
   2354 /*--- Misc integer helpers, including rotates and CPUID.      ---*/
   2355 /*---------------------------------------------------------------*/
   2356 
   2357 /* Claim to be the following CPU, which is probably representative of
   2358    the lowliest (earliest) amd64 offerings.  It can do neither sse3
   2359    nor cx16.
   2360 
   2361    vendor_id       : AuthenticAMD
   2362    cpu family      : 15
   2363    model           : 5
   2364    model name      : AMD Opteron (tm) Processor 848
   2365    stepping        : 10
   2366    cpu MHz         : 1797.682
   2367    cache size      : 1024 KB
   2368    fpu             : yes
   2369    fpu_exception   : yes
   2370    cpuid level     : 1
   2371    wp              : yes
   2372    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2373                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
   2374                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
   2375    bogomips        : 3600.62
   2376    TLB size        : 1088 4K pages
   2377    clflush size    : 64
   2378    cache_alignment : 64
   2379    address sizes   : 40 bits physical, 48 bits virtual
   2380    power management: ts fid vid ttp
   2381 
   2382    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
   2383    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
   2384    and 3dnowext is 80000001.EDX.30.
   2385 */
   2386 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
   2387 {
   2388 #  define SET_ABCD(_a,_b,_c,_d)                \
   2389       do { st->guest_RAX = (ULong)(_a);        \
   2390            st->guest_RBX = (ULong)(_b);        \
   2391            st->guest_RCX = (ULong)(_c);        \
   2392            st->guest_RDX = (ULong)(_d);        \
   2393       } while (0)
   2394 
   2395    switch (0xFFFFFFFF & st->guest_RAX) {
   2396       case 0x00000000:
   2397          SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
   2398          break;
   2399       case 0x00000001:
   2400          SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
   2401          break;
   2402       case 0x80000000:
   2403          SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
   2404          break;
   2405       case 0x80000001:
   2406          /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
   2407             the original it-is-supported value that the h/w provides.
   2408             See #291568. */
   2409          SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
   2410                                                       0x21d3fbff);
   2411          break;
   2412       case 0x80000002:
   2413          SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
   2414          break;
   2415       case 0x80000003:
   2416          SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
   2417          break;
   2418       case 0x80000004:
   2419          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2420          break;
   2421       case 0x80000005:
   2422          SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
   2423          break;
   2424       case 0x80000006:
   2425          SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
   2426          break;
   2427       case 0x80000007:
   2428          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
   2429          break;
   2430       case 0x80000008:
   2431          SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
   2432          break;
   2433       default:
   2434          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2435          break;
   2436    }
   2437 #  undef SET_ABCD
   2438 }
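
        /* Decoding the leaf-0 result above: the vendor string lives in
           EBX:EDX:ECX, and 0x68747541, 0x69746e65, 0x444d4163 are the
           little-endian bytes of "Auth", "enti" and "cAMD" respectively,
           i.e. "AuthenticAMD".  Leaves 0x80000002..4 spell out the model
           name the same way. */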
   2439 
   2440 
   2441 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
   2442    capable.
   2443 
   2444    vendor_id       : GenuineIntel
   2445    cpu family      : 6
   2446    model           : 15
   2447    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
   2448    stepping        : 6
   2449    cpu MHz         : 2394.000
   2450    cache size      : 4096 KB
   2451    physical id     : 0
   2452    siblings        : 2
   2453    core id         : 0
   2454    cpu cores       : 2
   2455    fpu             : yes
   2456    fpu_exception   : yes
   2457    cpuid level     : 10
   2458    wp              : yes
   2459    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2460                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2461                      mmx fxsr sse sse2 ss ht tm syscall nx lm
   2462                      constant_tsc pni monitor ds_cpl vmx est tm2
   2463                      cx16 xtpr lahf_lm
   2464    bogomips        : 4798.78
   2465    clflush size    : 64
   2466    cache_alignment : 64
   2467    address sizes   : 36 bits physical, 48 bits virtual
   2468    power management:
   2469 */
   2470 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
   2471 {
   2472 #  define SET_ABCD(_a,_b,_c,_d)                \
   2473       do { st->guest_RAX = (ULong)(_a);        \
   2474            st->guest_RBX = (ULong)(_b);        \
   2475            st->guest_RCX = (ULong)(_c);        \
   2476            st->guest_RDX = (ULong)(_d);        \
   2477       } while (0)
   2478 
   2479    switch (0xFFFFFFFF & st->guest_RAX) {
   2480       case 0x00000000:
   2481          SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
   2482          break;
   2483       case 0x00000001:
   2484          SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
   2485          break;
   2486       case 0x00000002:
   2487          SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
   2488          break;
   2489       case 0x00000003:
   2490          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2491          break;
   2492       case 0x00000004: {
   2493          switch (0xFFFFFFFF & st->guest_RCX) {
   2494             case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
   2495                                       0x0000003f, 0x00000001); break;
   2496             case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
   2497                                       0x0000003f, 0x00000001); break;
   2498             case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
   2499                                       0x00000fff, 0x00000001); break;
   2500             default:         SET_ABCD(0x00000000, 0x00000000,
   2501                                       0x00000000, 0x00000000); break;
   2502          }
   2503          break;
   2504       }
   2505       case 0x00000005:
   2506          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
   2507          break;
   2508       case 0x00000006:
   2509          SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
   2510          break;
   2511       case 0x00000007:
   2512          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2513          break;
   2514       case 0x00000008:
   2515          SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
   2516          break;
   2517       case 0x00000009:
   2518          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2519          break;
   2520       case 0x0000000a:
   2521       unhandled_eax_value:
   2522          SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
   2523          break;
   2524       case 0x80000000:
   2525          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2526          break;
   2527       case 0x80000001:
   2528          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
   2529          break;
   2530       case 0x80000002:
   2531          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2532          break;
   2533       case 0x80000003:
   2534          SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
   2535          break;
   2536       case 0x80000004:
   2537          SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
   2538          break;
   2539       case 0x80000005:
   2540          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2541          break;
   2542       case 0x80000006:
   2543          SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
   2544          break;
   2545       case 0x80000007:
   2546          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2547          break;
   2548       case 0x80000008:
   2549          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2550          break;
   2551       default:
   2552          goto unhandled_eax_value;
   2553    }
   2554 #  undef SET_ABCD
   2555 }
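
        /* In the leaf-1 result above, ECX = 0x0000e3bd has bit 0
           (pni/sse3) and bit 13 (cx16) set -- precisely the extra
           features this variant claims relative to the baseline one,
           which returns ECX = 0 for that leaf. */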
   2556 
   2557 
   2558 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
   2559    capable.
   2560 
   2561    vendor_id       : GenuineIntel
   2562    cpu family      : 6
   2563    model           : 37
   2564    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
   2565    stepping        : 2
   2566    cpu MHz         : 3334.000
   2567    cache size      : 4096 KB
   2568    physical id     : 0
   2569    siblings        : 4
   2570    core id         : 0
   2571    cpu cores       : 2
   2572    apicid          : 0
   2573    initial apicid  : 0
   2574    fpu             : yes
   2575    fpu_exception   : yes
   2576    cpuid level     : 11
   2577    wp              : yes
   2578    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2579                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2580                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2581                      lm constant_tsc arch_perfmon pebs bts rep_good
   2582                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2583                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
   2584                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
   2585                      arat tpr_shadow vnmi flexpriority ept vpid
   2586    bogomips        : 6957.57
   2587    clflush size    : 64
   2588    cache_alignment : 64
   2589    address sizes   : 36 bits physical, 48 bits virtual
   2590    power management:
   2591 */
   2592 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
   2593 {
   2594 #  define SET_ABCD(_a,_b,_c,_d)                \
   2595       do { st->guest_RAX = (ULong)(_a);        \
   2596            st->guest_RBX = (ULong)(_b);        \
   2597            st->guest_RCX = (ULong)(_c);        \
   2598            st->guest_RDX = (ULong)(_d);        \
   2599       } while (0)
   2600 
   2601    UInt old_eax = (UInt)st->guest_RAX;
   2602    UInt old_ecx = (UInt)st->guest_RCX;
   2603 
   2604    switch (old_eax) {
   2605       case 0x00000000:
   2606          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
   2607          break;
   2608       case 0x00000001:
   2609          SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
   2610          break;
   2611       case 0x00000002:
   2612          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
   2613          break;
   2614       case 0x00000003:
   2615          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2616          break;
   2617       case 0x00000004:
   2618          switch (old_ecx) {
   2619             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2620                                       0x0000003f, 0x00000000); break;
   2621             case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
   2622                                       0x0000007f, 0x00000000); break;
   2623             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2624                                       0x000001ff, 0x00000000); break;
   2625             case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
   2626                                       0x00000fff, 0x00000002); break;
   2627             default:         SET_ABCD(0x00000000, 0x00000000,
   2628                                       0x00000000, 0x00000000); break;
   2629          }
   2630          break;
   2631       case 0x00000005:
   2632          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2633          break;
   2634       case 0x00000006:
   2635          SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
   2636          break;
   2637       case 0x00000007:
   2638          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2639          break;
   2640       case 0x00000008:
   2641          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2642          break;
   2643       case 0x00000009:
   2644          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2645          break;
   2646       case 0x0000000a:
   2647          SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
   2648          break;
   2649       case 0x0000000b:
   2650          switch (old_ecx) {
   2651             case 0x00000000:
   2652                SET_ABCD(0x00000001, 0x00000002,
   2653                         0x00000100, 0x00000000); break;
   2654             case 0x00000001:
   2655                SET_ABCD(0x00000004, 0x00000004,
   2656                         0x00000201, 0x00000000); break;
   2657             default:
   2658                SET_ABCD(0x00000000, 0x00000000,
   2659                         old_ecx,    0x00000000); break;
   2660          }
   2661          break;
   2662       case 0x0000000c:
   2663          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2664          break;
   2665       case 0x0000000d:
   2666          switch (old_ecx) {
   2667             case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
   2668                                       0x00000100, 0x00000000); break;
   2669             case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
   2670                                       0x00000201, 0x00000000); break;
   2671             default:         SET_ABCD(0x00000000, 0x00000000,
   2672                                       old_ecx,    0x00000000); break;
   2673          }
   2674          break;
   2675       case 0x80000000:
   2676          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2677          break;
   2678       case 0x80000001:
   2679          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   2680          break;
   2681       case 0x80000002:
   2682          SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
   2683          break;
   2684       case 0x80000003:
   2685          SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
   2686          break;
   2687       case 0x80000004:
   2688          SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
   2689          break;
   2690       case 0x80000005:
   2691          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2692          break;
   2693       case 0x80000006:
   2694          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   2695          break;
   2696       case 0x80000007:
   2697          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   2698          break;
   2699       case 0x80000008:
   2700          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2701          break;
   2702       default:
   2703          SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
   2704          break;
   2705    }
   2706 #  undef SET_ABCD
   2707 }
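
        /* Decoding note (illustrative, not part of the original): the
           0x80000002..0x80000004 leaves above return the processor brand
           string as little-endian ASCII, four bytes per register, in the
           order EAX, EBX, ECX, EDX.  A minimal decode sketch, where
           decode_brand_word is a hypothetical helper:

              static void decode_brand_word ( UInt w, HChar* out4 ) {
                 out4[0] = (HChar)(w & 0xFF);           // lowest byte first
                 out4[1] = (HChar)((w >> 8)  & 0xFF);
                 out4[2] = (HChar)((w >> 16) & 0xFF);
                 out4[3] = (HChar)((w >> 24) & 0xFF);
              }

           Applied to 0x65746e49 this gives "Inte"; the 12 words of the
           three leaves together spell out the model name quoted in the
           comment above. */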
   2708 
   2709 
   2710 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
   2711    capable.  Plus (kludge!) it "supports" HTM.
   2712 
   2713    vendor_id       : GenuineIntel
   2714    cpu family      : 6
   2715    model           : 42
   2716    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
   2717    stepping        : 7
   2718    cpu MHz         : 1600.000
   2719    cache size      : 6144 KB
   2720    physical id     : 0
   2721    siblings        : 4
   2722    core id         : 3
   2723    cpu cores       : 4
   2724    apicid          : 6
   2725    initial apicid  : 6
   2726    fpu             : yes
   2727    fpu_exception   : yes
   2728    cpuid level     : 13
   2729    wp              : yes
   2730    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
   2731                      mtrr pge mca cmov pat pse36 clflush dts acpi
   2732                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
   2733                      lm constant_tsc arch_perfmon pebs bts rep_good
   2734                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
   2735                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
   2736                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
   2737                      lahf_lm ida arat epb xsaveopt pln pts dts
   2738                      tpr_shadow vnmi flexpriority ept vpid
   2739 
   2740    bogomips        : 5768.94
   2741    clflush size    : 64
   2742    cache_alignment : 64
   2743    address sizes   : 36 bits physical, 48 bits virtual
   2744    power management:
   2745 */
   2746 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
   2747 {
   2748 #  define SET_ABCD(_a,_b,_c,_d)                \
   2749       do { st->guest_RAX = (ULong)(_a);        \
   2750            st->guest_RBX = (ULong)(_b);        \
   2751            st->guest_RCX = (ULong)(_c);        \
   2752            st->guest_RDX = (ULong)(_d);        \
   2753       } while (0)
   2754 
   2755    UInt old_eax = (UInt)st->guest_RAX;
   2756    UInt old_ecx = (UInt)st->guest_RCX;
   2757 
   2758    switch (old_eax) {
   2759       case 0x00000000:
   2760          SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
   2761          break;
   2762       case 0x00000001:
   2763          SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
   2764          break;
   2765       case 0x00000002:
   2766          SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
   2767          break;
   2768       case 0x00000003:
   2769          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2770          break;
   2771       case 0x00000004:
   2772          switch (old_ecx) {
   2773             case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
   2774                                       0x0000003f, 0x00000000); break;
   2775             case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
   2776                                       0x0000003f, 0x00000000); break;
   2777             case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
   2778                                       0x000001ff, 0x00000000); break;
   2779             case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
   2780                                       0x00001fff, 0x00000006); break;
   2781             default:         SET_ABCD(0x00000000, 0x00000000,
   2782                                       0x00000000, 0x00000000); break;
   2783          }
   2784          break;
   2785       case 0x00000005:
   2786          SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
   2787          break;
   2788       case 0x00000006:
   2789          SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
   2790          break;
   2791       case 0x00000007:
   2792          SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
   2793          break;
   2794       case 0x00000008:
   2795          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2796          break;
   2797       case 0x00000009:
   2798          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2799          break;
   2800       case 0x0000000a:
   2801          SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
   2802          break;
   2803       case 0x0000000b:
   2804          switch (old_ecx) {
   2805             case 0x00000000:
   2806                SET_ABCD(0x00000001, 0x00000001,
   2807                         0x00000100, 0x00000000); break;
   2808             case 0x00000001:
   2809                SET_ABCD(0x00000004, 0x00000004,
   2810                         0x00000201, 0x00000000); break;
   2811             default:
   2812                SET_ABCD(0x00000000, 0x00000000,
   2813                         old_ecx,    0x00000000); break;
   2814          }
   2815          break;
   2816       case 0x0000000c:
   2817          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2818          break;
   2819       case 0x0000000d:
   2820          switch (old_ecx) {
   2821             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
   2822                                       0x00000340, 0x00000000); break;
   2823             case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
   2824                                       0x00000000, 0x00000000); break;
   2825             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
   2826                                       0x00000000, 0x00000000); break;
   2827             default:         SET_ABCD(0x00000000, 0x00000000,
   2828                                       0x00000000, 0x00000000); break;
   2829          }
   2830          break;
   2831       case 0x0000000e:
   2832          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   2833          break;
   2834       case 0x0000000f:
   2835          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   2836          break;
   2837       case 0x80000000:
   2838          SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
   2839          break;
   2840       case 0x80000001:
   2841          SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
   2842          break;
   2843       case 0x80000002:
   2844          SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
   2845          break;
   2846       case 0x80000003:
   2847          SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
   2848          break;
   2849       case 0x80000004:
   2850          SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
   2851          break;
   2852       case 0x80000005:
   2853          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
   2854          break;
   2855       case 0x80000006:
   2856          SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
   2857          break;
   2858       case 0x80000007:
   2859          SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
   2860          break;
   2861       case 0x80000008:
   2862          SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
   2863          break;
   2864       default:
   2865          SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
   2866          break;
   2867    }
   2868 #  undef SET_ABCD
   2869 }
   2870 
   2871 
   2872 ULong amd64g_calculate_RCR ( ULong arg,
   2873                              ULong rot_amt,
   2874                              ULong rflags_in,
   2875                              Long  szIN )
   2876 {
   2877    Bool  wantRflags = toBool(szIN < 0);
   2878    ULong sz         = wantRflags ? (-szIN) : szIN;
   2879    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   2880    ULong cf=0, of=0, tempcf;
   2881 
   2882    switch (sz) {
   2883       case 8:
   2884          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2885          of        = ((arg >> 63) ^ cf) & 1;
   2886          while (tempCOUNT > 0) {
   2887             tempcf = arg & 1;
   2888             arg    = (arg >> 1) | (cf << 63);
   2889             cf     = tempcf;
   2890             tempCOUNT--;
   2891          }
   2892          break;
   2893       case 4:
   2894          while (tempCOUNT >= 33) tempCOUNT -= 33;
   2895          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2896          of        = ((arg >> 31) ^ cf) & 1;
   2897          while (tempCOUNT > 0) {
   2898             tempcf = arg & 1;
   2899             arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
   2900             cf     = tempcf;
   2901             tempCOUNT--;
   2902          }
   2903          break;
   2904       case 2:
   2905          while (tempCOUNT >= 17) tempCOUNT -= 17;
   2906          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2907          of        = ((arg >> 15) ^ cf) & 1;
   2908          while (tempCOUNT > 0) {
   2909             tempcf = arg & 1;
   2910             arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
   2911             cf     = tempcf;
   2912             tempCOUNT--;
   2913          }
   2914          break;
   2915       case 1:
   2916          while (tempCOUNT >= 9) tempCOUNT -= 9;
   2917          cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2918          of        = ((arg >> 7) ^ cf) & 1;
   2919          while (tempCOUNT > 0) {
   2920             tempcf = arg & 1;
   2921             arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
   2922             cf     = tempcf;
   2923             tempCOUNT--;
   2924          }
   2925          break;
   2926       default:
   2927          vpanic("calculate_RCR(amd64g): invalid size");
   2928    }
   2929 
   2930    cf &= 1;
   2931    of &= 1;
   2932    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   2933    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   2934 
   2935    /* caller can ask to have back either the resulting flags or
   2936       resulting value, but not both */
   2937    return wantRflags ? rflags_in : arg;
   2938 }
   2939 
   2940 ULong amd64g_calculate_RCL ( ULong arg,
   2941                              ULong rot_amt,
   2942                              ULong rflags_in,
   2943                              Long  szIN )
   2944 {
   2945    Bool  wantRflags = toBool(szIN < 0);
   2946    ULong sz         = wantRflags ? (-szIN) : szIN;
   2947    ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   2948    ULong cf=0, of=0, tempcf;
   2949 
   2950    switch (sz) {
   2951       case 8:
   2952          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2953          while (tempCOUNT > 0) {
   2954             tempcf = (arg >> 63) & 1;
   2955             arg    = (arg << 1) | (cf & 1);
   2956             cf     = tempcf;
   2957             tempCOUNT--;
   2958          }
   2959          of = ((arg >> 63) ^ cf) & 1;
   2960          break;
   2961       case 4:
   2962          while (tempCOUNT >= 33) tempCOUNT -= 33;
   2963          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2964          while (tempCOUNT > 0) {
   2965             tempcf = (arg >> 31) & 1;
   2966             arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
   2967             cf     = tempcf;
   2968             tempCOUNT--;
   2969          }
   2970          of = ((arg >> 31) ^ cf) & 1;
   2971          break;
   2972       case 2:
   2973          while (tempCOUNT >= 17) tempCOUNT -= 17;
   2974          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2975          while (tempCOUNT > 0) {
   2976             tempcf = (arg >> 15) & 1;
   2977             arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
   2978             cf     = tempcf;
   2979             tempCOUNT--;
   2980          }
   2981          of = ((arg >> 15) ^ cf) & 1;
   2982          break;
   2983       case 1:
   2984          while (tempCOUNT >= 9) tempCOUNT -= 9;
   2985          cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   2986          while (tempCOUNT > 0) {
   2987             tempcf = (arg >> 7) & 1;
   2988             arg    = 0xFFULL & ((arg << 1) | (cf & 1));
   2989             cf     = tempcf;
   2990             tempCOUNT--;
   2991          }
   2992          of = ((arg >> 7) ^ cf) & 1;
   2993          break;
   2994       default:
   2995          vpanic("calculate_RCL(amd64g): invalid size");
   2996    }
   2997 
   2998    cf &= 1;
   2999    of &= 1;
   3000    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   3001    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
   3002 
   3003    return wantRflags ? rflags_in : arg;
   3004 }
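
        /* Worked example (illustration only): an 8-bit RCL by 1 of
           arg = 0x80 with CF = 0 rotates the 9-bit quantity CF:arg.  The
           old top bit (1) moves into CF, the rest shifts left, and the
           old CF (0) becomes bit 0, giving arg = 0x00, CF = 1 and
           OF = (new top bit ^ new CF) = 1.  This is also why the counts
           are reduced mod 9/17/33 above: rotating a (size+1)-bit
           quantity by (size+1) positions is the identity. */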
   3005 
   3006 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
   3007  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
   3008  */
   3009 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
   3010 {
   3011    ULong hi, lo, tmp, A[16];
   3012 
   3013    A[0] = 0;            A[1] = a;
   3014    A[2] = A[1] << 1;    A[3] = A[2] ^ a;
   3015    A[4] = A[2] << 1;    A[5] = A[4] ^ a;
   3016    A[6] = A[3] << 1;    A[7] = A[6] ^ a;
   3017    A[8] = A[4] << 1;    A[9] = A[8] ^ a;
   3018    A[10] = A[5] << 1;   A[11] = A[10] ^ a;
   3019    A[12] = A[6] << 1;   A[13] = A[12] ^ a;
   3020    A[14] = A[7] << 1;   A[15] = A[14] ^ a;
   3021 
   3022    lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
   3023    hi = lo >> 56;
   3024    lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
   3025    hi = (hi << 8) | (lo >> 56);
   3026    lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
   3027    hi = (hi << 8) | (lo >> 56);
   3028    lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
   3029    hi = (hi << 8) | (lo >> 56);
   3030    lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
   3031    hi = (hi << 8) | (lo >> 56);
   3032    lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
   3033    hi = (hi << 8) | (lo >> 56);
   3034    lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
   3035    hi = (hi << 8) | (lo >> 56);
   3036    lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
   3037 
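        /* The window method above computes the A[] entries modulo 2^64,
           so the contributions of a's bits 63..57 to the high half are
           lost.  The seven mask-and-xor steps below add those terms back
           in; m0 is the byte-replicating constant 0x0101010101010101. */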
   3038    ULong m0 = -1;
   3039    m0 /= 255;
   3040    tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
   3041    tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
   3042    tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
   3043    tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
   3044    tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
   3045    tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
   3046    tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
   3047 
   3048    return which ? hi : lo;
   3049 }
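
        /* Illustrative sketch (not part of the original): the low half
           computed above agrees with this direct bit-by-bit carry-less
           multiply, which may be easier to follow:

              static ULong ref_clmul_lo ( ULong a, ULong b ) {
                 ULong lo = 0;
                 UInt  i;
                 for (i = 0; i < 64; i++)
                    if ((b >> i) & 1)
                       lo ^= a << i;   // XOR instead of add: no carries
                 return lo;
              }

           The helper's table of 4-bit multiples just evaluates the same
           XOR-sum four bits of b at a time. */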
   3050 
   3051 
   3052 /* CALLED FROM GENERATED CODE */
   3053 /* DIRTY HELPER (non-referentially-transparent) */
   3054 /* Horrible hack.  On non-amd64 platforms, return 1. */
   3055 ULong amd64g_dirtyhelper_RDTSC ( void )
   3056 {
   3057 #  if defined(__x86_64__)
   3058    UInt  eax, edx;
   3059    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
   3060    return (((ULong)edx) << 32) | ((ULong)eax);
   3061 #  else
   3062    return 1ULL;
   3063 #  endif
   3064 }
   3065 
   3066 /* CALLED FROM GENERATED CODE */
   3067 /* DIRTY HELPER (non-referentially-transparent) */
   3068 /* Horrible hack.  On non-amd64 platforms, return 1. */
   3069 /* This uses a different calling convention from _RDTSC just above
   3070    only because of the difficulty of returning 96 bits from a C
   3071    function -- RDTSC returns only 64 bits and so, on amd64, is
   3072    simple by comparison. */
   3073 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
   3074 {
   3075 #  if defined(__x86_64__)
   3076    UInt eax, ecx, edx;
   3077    __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
   3078    st->guest_RAX = (ULong)eax;
   3079    st->guest_RCX = (ULong)ecx;
   3080    st->guest_RDX = (ULong)edx;
   3081 #  else
   3082    /* Do nothing. */
   3083 #  endif
   3084 }
   3085 
   3086 /* CALLED FROM GENERATED CODE */
   3087 /* DIRTY HELPER (non-referentially-transparent) */
   3088 /* Horrible hack.  On non-amd64 platforms, return 0. */
   3089 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
   3090 {
   3091 #  if defined(__x86_64__)
   3092    ULong r = 0;
   3093    portno &= 0xFFFF;
   3094    switch (sz) {
   3095       case 4:
   3096          __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
   3097                               : "=a" (r) : "Nd" (portno));
   3098          break;
   3099       case 2:
   3100          __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
   3101                               : "=a" (r) : "Nd" (portno));
   3102          break;
   3103       case 1:
   3104          __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
   3105                               : "=a" (r) : "Nd" (portno));
   3106          break;
   3107       default:
   3108          break; /* note: no 64-bit version of the insn exists */
   3109    }
   3110    return r;
   3111 #  else
   3112    return 0;
   3113 #  endif
   3114 }
   3115 
   3116 
   3117 /* CALLED FROM GENERATED CODE */
   3118 /* DIRTY HELPER (non-referentially-transparent) */
   3119 /* Horrible hack.  On non-amd64 platforms, do nothing. */
   3120 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
   3121 {
   3122 #  if defined(__x86_64__)
   3123    portno &= 0xFFFF;
   3124    switch (sz) {
   3125       case 4:
   3126          __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
   3127                               : : "a" (data), "Nd" (portno));
   3128          break;
   3129       case 2:
   3130          __asm__ __volatile__("outw %w0, %w1"
   3131                               : : "a" (data), "Nd" (portno));
   3132          break;
   3133       case 1:
   3134          __asm__ __volatile__("outb %b0, %w1"
   3135                               : : "a" (data), "Nd" (portno));
   3136          break;
   3137       default:
   3138          break; /* note: no 64-bit version of the insn exists */
   3139    }
   3140 #  else
   3141    /* do nothing */
   3142 #  endif
   3143 }
   3144 
   3145 /* CALLED FROM GENERATED CODE */
   3146 /* DIRTY HELPER (non-referentially-transparent) */
   3147 /* Horrible hack.  On non-amd64 platforms, write an all-zeroes result. */
   3148 /* op = 0: call the native SGDT instruction.
   3149    op = 1: call the native SIDT instruction.
   3150 */
   3151 void amd64g_dirtyhelper_SxDT ( void *address, ULong op )
        {
   3152 #  if defined(__x86_64__)
   3153    switch (op) {
   3154       case 0:
   3155          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
   3156          break;
   3157       case 1:
   3158          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
   3159          break;
   3160       default:
   3161          vpanic("amd64g_dirtyhelper_SxDT");
   3162    }
   3163 #  else
   3164    /* Fake it: hand back an all-zeroes descriptor. */
   3165    UChar* p = (UChar*)address;
   3166    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
   3167    p[6] = p[7] = p[8] = p[9] = 0;
   3168 #  endif
   3169 }
   3170 
   3171 /*---------------------------------------------------------------*/
   3172 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
   3173 /*---------------------------------------------------------------*/
   3174 
   3175 static inline UChar abdU8 ( UChar xx, UChar yy ) {
   3176    return toUChar(xx>yy ? xx-yy : yy-xx);
   3177 }
   3178 
   3179 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   3180    return (((ULong)w1) << 32) | ((ULong)w0);
   3181 }
   3182 
   3183 static inline UShort sel16x4_3 ( ULong w64 ) {
   3184    UInt hi32 = toUInt(w64 >> 32);
   3185    return toUShort(hi32 >> 16);
   3186 }
   3187 static inline UShort sel16x4_2 ( ULong w64 ) {
   3188    UInt hi32 = toUInt(w64 >> 32);
   3189    return toUShort(hi32);
   3190 }
   3191 static inline UShort sel16x4_1 ( ULong w64 ) {
   3192    UInt lo32 = toUInt(w64);
   3193    return toUShort(lo32 >> 16);
   3194 }
   3195 static inline UShort sel16x4_0 ( ULong w64 ) {
   3196    UInt lo32 = toUInt(w64);
   3197    return toUShort(lo32);
   3198 }
   3199 
   3200 static inline UChar sel8x8_7 ( ULong w64 ) {
   3201    UInt hi32 = toUInt(w64 >> 32);
   3202    return toUChar(hi32 >> 24);
   3203 }
   3204 static inline UChar sel8x8_6 ( ULong w64 ) {
   3205    UInt hi32 = toUInt(w64 >> 32);
   3206    return toUChar(hi32 >> 16);
   3207 }
   3208 static inline UChar sel8x8_5 ( ULong w64 ) {
   3209    UInt hi32 = toUInt(w64 >> 32);
   3210    return toUChar(hi32 >> 8);
   3211 }
   3212 static inline UChar sel8x8_4 ( ULong w64 ) {
   3213    UInt hi32 = toUInt(w64 >> 32);
   3214    return toUChar(hi32 >> 0);
   3215 }
   3216 static inline UChar sel8x8_3 ( ULong w64 ) {
   3217    UInt lo32 = toUInt(w64);
   3218    return toUChar(lo32 >> 24);
   3219 }
   3220 static inline UChar sel8x8_2 ( ULong w64 ) {
   3221    UInt lo32 = toUInt(w64);
   3222    return toUChar(lo32 >> 16);
   3223 }
   3224 static inline UChar sel8x8_1 ( ULong w64 ) {
   3225    UInt lo32 = toUInt(w64);
   3226    return toUChar(lo32 >> 8);
   3227 }
   3228 static inline UChar sel8x8_0 ( ULong w64 ) {
   3229    UInt lo32 = toUInt(w64);
   3230    return toUChar(lo32 >> 0);
   3231 }
   3232 
   3233 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3234 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
   3235 {
   3236    return
   3237       mk32x2(
   3238          (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
   3239             + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
   3240          (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
   3241             + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
   3242       );
   3243 }
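
        /* Example (illustration only): with xx holding the 16-bit lanes
           [4,3,2,1] (high to low) and yy holding [40,30,20,10], the
           result packs 4*40 + 3*30 = 250 into the high 32 bits and
           2*20 + 1*10 = 50 into the low 32 bits. */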
   3244 
   3245 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3246 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
   3247 {
   3248    UInt t = 0;
   3249    t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   3250    t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   3251    t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   3252    t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   3253    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3254    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3255    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3256    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3257    t &= 0xFFFF;
   3258    return (ULong)t;
   3259 }
   3260 
   3261 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3262 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
   3263 {
   3264    UShort t, min;
   3265    UInt   idx;
   3266    t = sel16x4_0(sLo); if (True)    { min = t; idx = 0; }
   3267    t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
   3268    t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
   3269    t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
   3270    t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
   3271    t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
   3272    t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
   3273    t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
   3274    return ((ULong)(idx << 16)) | ((ULong)min);
   3275 }
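
        /* The packed result matches the low 32 bits of PHMINPOSUW's
           destination: minimum in bits 15:0, index of the winning lane
           in bits 18:16.  Ties go to the lowest index, since only a
           strictly smaller value replaces the running minimum. */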
   3276 
   3277 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3278 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
   3279 {
   3280    UInt  i;
   3281    ULong crc = (b & 0xFFULL) ^ crcIn;
   3282    for (i = 0; i < 8; i++)
   3283       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3284    return crc;
   3285 }
   3286 
   3287 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3288 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
   3289 {
   3290    UInt  i;
   3291    ULong crc = (w & 0xFFFFULL) ^ crcIn;
   3292    for (i = 0; i < 16; i++)
   3293       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3294    return crc;
   3295 }
   3296 
   3297 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3298 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
   3299 {
   3300    UInt i;
   3301    ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   3302    for (i = 0; i < 32; i++)
   3303       crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   3304    return crc;
   3305 }
   3306 
   3307 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3308 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
   3309 {
   3310    ULong crc = amd64g_calc_crc32l(crcIn, q);
   3311    return amd64g_calc_crc32l(crc, q >> 32);
   3312 }
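
        /* Illustrative sketch (not part of the original): chaining the
           byte helper to checksum a buffer, the way a guest-side CRC32C
           loop would.  Note that the pre/post inversion is a software
           convention (e.g. iSCSI), not part of the instruction itself:

              static UInt crc32c_of_buffer ( const UChar* p, ULong n ) {
                 ULong crc = 0xFFFFFFFFULL;              // pre-invert
                 ULong i;
                 for (i = 0; i < n; i++)
                    crc = amd64g_calc_crc32b(crc, p[i]);
                 return (UInt)(crc ^ 0xFFFFFFFFULL);     // post-invert
              }
        */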
   3313 
   3314 
   3315 /* .. helper for next fn .. */
   3316 static inline ULong sad_8x4 ( ULong xx, ULong yy )
   3317 {
   3318    UInt t = 0;
   3319    t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   3320    t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   3321    t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   3322    t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   3323    return (ULong)t;
   3324 }
   3325 
   3326 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3327 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
   3328                             ULong dHi, ULong dLo,
   3329                             ULong imm_and_return_control_bit )
   3330 {
   3331    UInt imm8     = imm_and_return_control_bit & 7;
   3332    Bool calcHi   = (imm_and_return_control_bit >> 7) & 1;
   3333    UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
   3334    UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
   3335    /* For src we only need 32 bits, so get them into the
   3336       lower half of a 64 bit word. */
   3337    ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
   3338    /* For dst we need to get hold of 56 bits (7 bytes) from a total of
   3339       11 bytes.  If calculating the low part of the result, need bytes
   3340       dstOffsL * 4 + (0 .. 6); if calculating the high part,
   3341       dstOffsL * 4 + (4 .. 10). */
   3342    ULong dst;
   3343    /* dstOffsL = 0, Lo  ->  0 .. 6
   3344       dstOffsL = 1, Lo  ->  4 .. 10
   3345       dstOffsL = 0, Hi  ->  4 .. 10
   3346       dstOffsL = 1, Hi  ->  8 .. 14
   3347    */
   3348    if (calcHi && dstOffsL) {
   3349       /* 8 .. 14 */
   3350       dst = dHi & 0x00FFFFFFFFFFFFFFULL;
   3351    }
   3352    else if (!calcHi && !dstOffsL) {
   3353       /* 0 .. 6 */
   3354       dst = dLo & 0x00FFFFFFFFFFFFFFULL;
   3355    }
   3356    else {
   3357       /* 4 .. 10 */
   3358       dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
   3359    }
   3360    ULong r0  = sad_8x4( dst >>  0, src );
   3361    ULong r1  = sad_8x4( dst >>  8, src );
   3362    ULong r2  = sad_8x4( dst >> 16, src );
   3363    ULong r3  = sad_8x4( dst >> 24, src );
   3364    ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
   3365    return res;
   3366 }
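
        /* Example (illustration only): imm8 = 5 decodes as srcOffsL = 1,
           dstOffsL = 1, i.e. the src chunk is bits 63:32 of sLo, and the
           dst window starts at byte 4 (for the low result half) or at
           byte 8 (for the high half). */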
   3367 
   3368 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3369 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
   3370 {
   3371    ULong dst = 0;
   3372    ULong src_bit;
   3373    ULong dst_bit = 1;
   3374    for (src_bit = 1; src_bit; src_bit <<= 1) {
   3375       if (mask & src_bit) {
   3376          if (src_masked & src_bit) dst |= dst_bit;
   3377          dst_bit <<= 1;
   3378       }
   3379    }
   3380    return dst;
   3381 }
   3382 
   3383 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
   3384 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
   3385 {
   3386    ULong dst = 0;
   3387    ULong dst_bit;
   3388    ULong src_bit = 1;
   3389    for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
   3390       if (mask & dst_bit) {
   3391          if (src & src_bit) dst |= dst_bit;
   3392          src_bit <<= 1;
   3393       }
   3394    }
   3395    return dst;
   3396 }
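
        /* Examples (illustration only), with mask = 0xF0F0:
           pext gathers the masked bits together, so for src = 0x1030
           (i.e. 0x1234 & 0xF0F0) the packed result is 0x13; pdep is the
           inverse scatter, so pdep(0x13, 0xF0F0) = 0x1030. */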
   3397 
   3398 /*---------------------------------------------------------------*/
   3399 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
   3400 /*---------------------------------------------------------------*/
   3401 
   3402 static UInt zmask_from_V128 ( V128* arg )
   3403 {
   3404    UInt i, res = 0;
   3405    for (i = 0; i < 16; i++) {
   3406       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
   3407    }
   3408    return res;
   3409 }
   3410 
   3411 static UInt zmask_from_V128_wide ( V128* arg )
   3412 {
   3413    UInt i, res = 0;
   3414    for (i = 0; i < 8; i++) {
   3415       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
   3416    }
   3417    return res;
   3418 }
   3419 
   3420 /* Helps with PCMP{I,E}STR{I,M}.
   3421 
   3422    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (It could in fact
   3423    be a clean helper, were it not that we can't pass 2 x V128 by
   3424    value to a clean helper, nor have one returned.)  Reads guest
   3425    state, writes to guest state for the xSTRM cases, does not
   3426    access memory, and is otherwise a pure function.
   3427 
   3428    opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte,
   3429    so the callee knows which I/E and I/M variant it is dealing with
   3430    and what the specific operation is.  The 4th byte of the opcode
   3431    is in the range 0x60 to 0x63:
   3432        istri  66 0F 3A 63
   3433        istrm  66 0F 3A 62
   3434        estri  66 0F 3A 61
   3435        estrm  66 0F 3A 60
   3436 
   3437    gstOffL and gstOffR are the guest state offsets for the two XMM
   3438    register inputs.  We never have to deal with the memory case since
   3439    that is handled by pre-loading the relevant value into the fake
   3440    XMM16 register.
   3441 
   3442    For ESTRx variants, edxIN and eaxIN hold the values of those two
   3443    registers.
   3444 
   3445    In all cases, the bottom 16 bits of the result contain the new
   3446    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
   3447    result hold the new %ecx value.  For xSTRM variants, the helper
   3448    writes the result directly to the guest XMM0.
   3449 
   3450    Declarable side effects: in all cases, reads guest state at
   3451    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
   3452    guest_XMM0.
   3453 
   3454    Is expected to be called only with opc4_and_imm combinations
   3455    which have actually been validated; asserts otherwise.  The
   3456    front end should ensure we're only called with verified values.
   3457 */
   3458 ULong amd64g_dirtyhelper_PCMPxSTRx (
   3459           VexGuestAMD64State* gst,
   3460           HWord opc4_and_imm,
   3461           HWord gstOffL, HWord gstOffR,
   3462           HWord edxIN, HWord eaxIN
   3463        )
   3464 {
   3465    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
   3466    HWord imm8 = opc4_and_imm & 0xFF;
   3467    HWord isISTRx = opc4 & 2;
   3468    HWord isxSTRM = (opc4 & 1) ^ 1;
   3469    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   3470    HWord wide = (imm8 & 1);
   3471 
   3472    // where the args are
   3473    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3474    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3475 
   3476    /* Create the arg validity masks, either from the vectors
   3477       themselves or from the supplied edx/eax values. */
   3478    // FIXME: this is only right for the 8-bit data cases.
   3479    // At least that is asserted above.
   3480    UInt zmaskL, zmaskR;
   3481 
   3482    // temp spot for the resulting flags and vector.
   3483    V128 resV;
   3484    UInt resOSZACP;
   3485 
   3486    // for checking whether case was handled
   3487    Bool ok = False;
   3488 
   3489    if (wide) {
   3490       if (isISTRx) {
   3491          zmaskL = zmask_from_V128_wide(argL);
   3492          zmaskR = zmask_from_V128_wide(argR);
   3493       } else {
   3494          Int tmp;
   3495          tmp = edxIN & 0xFFFFFFFF;
   3496          if (tmp < -8) tmp = -8;
   3497          if (tmp > 8)  tmp = 8;
   3498          if (tmp < 0)  tmp = -tmp;
   3499          vassert(tmp >= 0 && tmp <= 8);
   3500          zmaskL = (1 << tmp) & 0xFF;
   3501          tmp = eaxIN & 0xFFFFFFFF;
   3502          if (tmp < -8) tmp = -8;
   3503          if (tmp > 8)  tmp = 8;
   3504          if (tmp < 0)  tmp = -tmp;
   3505          vassert(tmp >= 0 && tmp <= 8);
   3506          zmaskR = (1 << tmp) & 0xFF;
   3507       }
   3508       // do the math
   3509       ok = compute_PCMPxSTRx_wide (
   3510               &resV, &resOSZACP, argL, argR,
   3511               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3512            );
   3513    } else {
   3514       if (isISTRx) {
   3515          zmaskL = zmask_from_V128(argL);
   3516          zmaskR = zmask_from_V128(argR);
   3517       } else {
   3518          Int tmp;
   3519          tmp = edxIN & 0xFFFFFFFF;
   3520          if (tmp < -16) tmp = -16;
   3521          if (tmp > 16)  tmp = 16;
   3522          if (tmp < 0)   tmp = -tmp;
   3523          vassert(tmp >= 0 && tmp <= 16);
   3524          zmaskL = (1 << tmp) & 0xFFFF;
   3525          tmp = eaxIN & 0xFFFFFFFF;
   3526          if (tmp < -16) tmp = -16;
   3527          if (tmp > 16)  tmp = 16;
   3528          if (tmp < 0)   tmp = -tmp;
   3529          vassert(tmp >= 0 && tmp <= 16);
   3530          zmaskR = (1 << tmp) & 0xFFFF;
   3531       }
   3532       // do the math
   3533       ok = compute_PCMPxSTRx (
   3534               &resV, &resOSZACP, argL, argR,
   3535               zmaskL, zmaskR, imm8, (Bool)isxSTRM
   3536            );
   3537    }
   3538 
   3539    // front end shouldn't pass us any imm8 variants we can't
   3540    // handle.  Hence:
   3541    vassert(ok);
   3542 
   3543    // So, finally we need to get the results back to the caller.
   3544    // In all cases, the new OSZACP value is in the lowest 16 bits
   3545    // of the return value.
   3546    if (isxSTRM) {
   3547       gst->guest_YMM0[0] = resV.w32[0];
   3548       gst->guest_YMM0[1] = resV.w32[1];
   3549       gst->guest_YMM0[2] = resV.w32[2];
   3550       gst->guest_YMM0[3] = resV.w32[3];
   3551       return resOSZACP & 0x8D5;
   3552    } else {
   3553       UInt newECX = resV.w32[0] & 0xFFFF;
   3554       return (newECX << 16) | (resOSZACP & 0x8D5);
   3555    }
   3556 }
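
        /* Example (illustration only): for PCMPISTRI (66 0F 3A 63 /r)
           with imm8 = 0x4A, the front end passes
           opc4_and_imm = (0x63 << 8) | 0x4A.  Then opc4 = 0x63, so
           isISTRx is nonzero and isxSTRM is zero, and since imm8 bit 0
           is clear the 8-bit (non-wide) path is taken. */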
   3557 
   3558 /*---------------------------------------------------------------*/
   3559 /*--- AES primitives and helpers                              ---*/
   3560 /*---------------------------------------------------------------*/
   3561 /* a 16 x 16 matrix */
   3562 static const UChar sbox[256] = {                   // row nr
   3563    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   3564    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   3565    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   3566    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   3567    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   3568    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   3569    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   3570    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   3571    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   3572    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   3573    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   3574    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   3575    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   3576    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   3577    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   3578    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   3579    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   3580    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   3581    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   3582    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   3583    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   3584    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   3585    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   3586    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   3587    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   3588    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   3589    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   3590    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   3591    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   3592    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   3593    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   3594    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
   3595 };
   3596 static void SubBytes (V128* v)
   3597 {
   3598    V128 r;
   3599    UInt i;
   3600    for (i = 0; i < 16; i++)
   3601       r.w8[i] = sbox[v->w8[i]];
   3602    *v = r;
   3603 }
   3604 
   3605 /* a 16 x 16 matrix */
   3606 static const UChar invsbox[256] = {                // row nr
   3607    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   3608    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   3609    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   3610    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   3611    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   3612    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   3613    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   3614    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   3615    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   3616    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   3617    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   3618    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   3619    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   3620    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   3621    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   3622    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   3623    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   3624    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   3625    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   3626    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   3627    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   3628    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   3629    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   3630    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   3631    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   3632    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   3633    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   3634    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   3635    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   3636    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   3637    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   3638    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
   3639 };
   3640 static void InvSubBytes (V128* v)
   3641 {
   3642    V128 r;
   3643    UInt i;
   3644    for (i = 0; i < 16; i++)
   3645       r.w8[i] = invsbox[v->w8[i]];
   3646    *v = r;
   3647 }
   3648 
   3649 static const UChar ShiftRows_op[16] =
   3650    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
   3651 static void ShiftRows (V128* v)
   3652 {
   3653    V128 r;
   3654    UInt i;
   3655    for (i = 0; i < 16; i++)
   3656       r.w8[i] = v->w8[ShiftRows_op[15-i]];
   3657    *v = r;
   3658 }
   3659 
   3660 static const UChar InvShiftRows_op[16] =
   3661    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
   3662 static void InvShiftRows (V128* v)
   3663 {
   3664    V128 r;
   3665    UInt i;
   3666    for (i = 0; i < 16; i++)
   3667       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   3668    *v = r;
   3669 }
   3670 
   3671 /* Multiplication of the finite field elements of AES.
   3672    See "A Specification for The AES Algorithm Rijndael
   3673         (by Joan Daemen & Vincent Rijmen)"
   3674         Dr. Brian Gladman, v3.1, 3rd March 2001. */
   3675 /* N values so that (hex) xy = 0x03^N.
   3676    0x00 cannot be used.  We put 0xff for this value. */
   3677 /* a 16 x 16 matrix */
   3678 static const UChar Nxy[256] = {                    // row nr
   3679    0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   3680    0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   3681    0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   3682    0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   3683    0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   3684    0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   3685    0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   3686    0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   3687    0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   3688    0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   3689    0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   3690    0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   3691    0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   3692    0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   3693    0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   3694    0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   3695    0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   3696    0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   3697    0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   3698    0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   3699    0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   3700    0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   3701    0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   3702    0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   3703    0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   3704    0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   3705    0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   3706    0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   3707    0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   3708    0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   3709    0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   3710    0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
   3711 };
   3712 
   3713 /* E values so that E = 0x03^xy. */
   3714 static const UChar Exy[256] = {                    // row nr
   3715    0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   3716    0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   3717    0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   3718    0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   3719    0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   3720    0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   3721    0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   3722    0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   3723    0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   3724    0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   3725    0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   3726    0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   3727    0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   3728    0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   3729    0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   3730    0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   3731    0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   3732    0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   3733    0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   3734    0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   3735    0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   3736    0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   3737    0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   3738    0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   3739    0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   3740    0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   3741    0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   3742    0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   3743    0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   3744    0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   3745    0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   3746    0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
   3747 
   3748 static inline UChar ff_mul(UChar u1, UChar u2)
   3749 {
   3750    if ((u1 > 0) && (u2 > 0)) {
   3751       UInt ui = Nxy[u1] + Nxy[u2];
   3752       if (ui >= 255)
   3753          ui = ui - 255;
   3754       return Exy[ui];
   3755    } else {
   3756       return 0;
   3757    };
   3758 }
   3759 
   3760 static void MixColumns (V128* v)
   3761 {
   3762    V128 r;
   3763    Int j;
   3764 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   3765    for (j = 0; j < 4; j++) {
   3766       P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
   3767          ^ P(v,j,2) ^ P(v,j,3);
   3768       P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
   3769          ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
   3770       P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
   3771          ^ ff_mul(0x03, P(v,j,3) );
   3772       P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
   3773          ^ ff_mul( 0x02, P(v,j,3) );
   3774    }
   3775    *v = r;
   3776 #undef P
   3777 }
   3778 
   3779 static void InvMixColumns (V128* v)
   3780 {
   3781    V128 r;
   3782    Int j;
   3783 #define P(x,row,col) (x)->w8[((row)*4+(col))]
   3784    for (j = 0; j < 4; j++) {
   3785       P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
   3786          ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
   3787       P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
   3788          ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
   3789       P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
   3790          ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
   3791       P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
   3792          ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
   3793    }
   3794    *v = r;
   3795 #undef P
   3796 
   3797 }
   3798 
   3799 /* For description, see definition in guest_amd64_defs.h */
   3800 void amd64g_dirtyhelper_AES (
   3801           VexGuestAMD64State* gst,
   3802           HWord opc4, HWord gstOffD,
   3803           HWord gstOffL, HWord gstOffR
   3804        )
   3805 {
   3806    // where the args are
   3807    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   3808    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3809    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3810    V128  r;
   3811 
   3812    switch (opc4) {
   3813       case 0xDC: /* AESENC */
   3814       case 0xDD: /* AESENCLAST */
   3815          r = *argR;
   3816          ShiftRows (&r);
   3817          SubBytes  (&r);
   3818          if (opc4 == 0xDC)
   3819             MixColumns (&r);
   3820          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   3821          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   3822          break;
   3823 
   3824       case 0xDE: /* AESDEC */
   3825       case 0xDF: /* AESDECLAST */
   3826          r = *argR;
   3827          InvShiftRows (&r);
   3828          InvSubBytes (&r);
   3829          if (opc4 == 0xDE)
   3830             InvMixColumns (&r);
   3831          argD->w64[0] = r.w64[0] ^ argL->w64[0];
   3832          argD->w64[1] = r.w64[1] ^ argL->w64[1];
   3833          break;
   3834 
   3835       case 0xDB: /* AESIMC */
   3836          *argD = *argL;
   3837          InvMixColumns (argD);
   3838          break;
   3839       default: vassert(0);
   3840    }
   3841 }
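
        /* Note: FIPS-197 writes the round as SubBytes then ShiftRows,
           whereas the code above applies them in the opposite order.
           The two steps commute -- one substitutes bytes in place, the
           other only permutes byte positions -- so the result is the
           same. */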
   3842 
   3843 static inline UInt RotWord (UInt   w32)
   3844 {
   3845    return ((w32 >> 8) | (w32 << 24));
   3846 }
   3847 
   3848 static inline UInt SubWord (UInt   w32)
   3849 {
   3850    UChar *w8;
   3851    UChar *r8;
   3852    UInt res;
   3853    w8 = (UChar*) &w32;
   3854    r8 = (UChar*) &res;
   3855    r8[0] = sbox[w8[0]];
   3856    r8[1] = sbox[w8[1]];
   3857    r8[2] = sbox[w8[2]];
   3858    r8[3] = sbox[w8[3]];
   3859    return res;
   3860 }
   3861 
   3862 /* For description, see definition in guest_amd64_defs.h */
   3863 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
   3864           VexGuestAMD64State* gst,
   3865           HWord imm8,
   3866           HWord gstOffL, HWord gstOffR
   3867        )
   3868 {
   3869    // where the args are
   3870    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   3871    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   3872 
   3873    // We have to create the result in a temporary in the
   3874    // case where the src and dst regs are the same.  See #341698.
   3875    V128 tmp;
   3876 
   3877    tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   3878    tmp.w32[2] = SubWord (argL->w32[3]);
   3879    tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   3880    tmp.w32[0] = SubWord (argL->w32[1]);
   3881 
   3882    argR->w32[3] = tmp.w32[3];
   3883    argR->w32[2] = tmp.w32[2];
   3884    argR->w32[1] = tmp.w32[1];
   3885    argR->w32[0] = tmp.w32[0];
   3886 }
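
        /* For reference: this follows the SDM's pseudocode for
           AESKEYGENASSIST with RCON = imm8, where X1 and X3 are dwords
           1 and 3 of the source --
              dest[127:96] = RotWord(SubWord(X3)) ^ RCON
              dest[95:64]  = SubWord(X3)
              dest[63:32]  = RotWord(SubWord(X1)) ^ RCON
              dest[31:0]   = SubWord(X1)                              */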
   3887 
   3888 
   3889 
   3890 /*---------------------------------------------------------------*/
   3891 /*--- Helpers for dealing with, and describing,               ---*/
   3892 /*--- guest state as a whole.                                 ---*/
   3893 /*---------------------------------------------------------------*/
   3894 
   3895 /* Initialise the entire amd64 guest state. */
   3896 /* VISIBLE TO LIBVEX CLIENT */
   3897 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
   3898 {
   3899    vex_state->host_EvC_FAILADDR = 0;
   3900    vex_state->host_EvC_COUNTER = 0;
   3901    vex_state->pad0 = 0;
   3902 
   3903    vex_state->guest_RAX = 0;
   3904    vex_state->guest_RCX = 0;
   3905    vex_state->guest_RDX = 0;
   3906    vex_state->guest_RBX = 0;
   3907    vex_state->guest_RSP = 0;
   3908    vex_state->guest_RBP = 0;
   3909    vex_state->guest_RSI = 0;
   3910    vex_state->guest_RDI = 0;
   3911    vex_state->guest_R8  = 0;
   3912    vex_state->guest_R9  = 0;
   3913    vex_state->guest_R10 = 0;
   3914    vex_state->guest_R11 = 0;
   3915    vex_state->guest_R12 = 0;
   3916    vex_state->guest_R13 = 0;
   3917    vex_state->guest_R14 = 0;
   3918    vex_state->guest_R15 = 0;
   3919 
   3920    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   3921    vex_state->guest_CC_DEP1 = 0;
   3922    vex_state->guest_CC_DEP2 = 0;
   3923    vex_state->guest_CC_NDEP = 0;
   3924 
   3925    vex_state->guest_DFLAG   = 1; /* forwards */
   3926    vex_state->guest_IDFLAG  = 0;
   3927    vex_state->guest_ACFLAG  = 0;
   3928 
   3929    /* HACK: represent the offset associated with a constant %fs.
   3930       This assumes that, as is typical on Linux, %fs is only ever
   3931       zero (main thread) or 0x63. */
   3932    vex_state->guest_FS_CONST = 0;
   3933 
   3934    vex_state->guest_RIP = 0;
   3935 
   3936    /* Initialise the simulated FPU */
   3937    amd64g_dirtyhelper_FINIT( vex_state );
   3938 
   3939    /* Initialise the AVX state. */
   3940 #  define AVXZERO(_ymm) \
   3941       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
   3942            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
   3943       } while (0)
   3944    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   3945    AVXZERO(vex_state->guest_YMM0);
   3946    AVXZERO(vex_state->guest_YMM1);
   3947    AVXZERO(vex_state->guest_YMM2);
   3948    AVXZERO(vex_state->guest_YMM3);
   3949    AVXZERO(vex_state->guest_YMM4);
   3950    AVXZERO(vex_state->guest_YMM5);
   3951    AVXZERO(vex_state->guest_YMM6);
   3952    AVXZERO(vex_state->guest_YMM7);
   3953    AVXZERO(vex_state->guest_YMM8);
   3954    AVXZERO(vex_state->guest_YMM9);
   3955    AVXZERO(vex_state->guest_YMM10);
   3956    AVXZERO(vex_state->guest_YMM11);
   3957    AVXZERO(vex_state->guest_YMM12);
   3958    AVXZERO(vex_state->guest_YMM13);
   3959    AVXZERO(vex_state->guest_YMM14);
   3960    AVXZERO(vex_state->guest_YMM15);
   3961    AVXZERO(vex_state->guest_YMM16);
   3962 
   3963 #  undef AVXZERO
   3964 
   3965    vex_state->guest_EMNOTE = EmNote_NONE;
   3966 
   3967    /* These should not ever be either read or written, but we
   3968       initialise them anyway. */
   3969    vex_state->guest_CMSTART = 0;
   3970    vex_state->guest_CMLEN   = 0;
   3971 
   3972    vex_state->guest_NRADDR   = 0;
   3973    vex_state->guest_SC_CLASS = 0;
   3974    vex_state->guest_GS_CONST = 0;
   3975 
   3976    vex_state->guest_IP_AT_SYSCALL = 0;
   3977    vex_state->pad1 = 0;
   3978 }
   3979 
   3980 
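/* Illustrative client usage (not part of VEX): a LibVEX client would
   typically zero the whole state and then set only the registers it
   cares about before running translated code.  The values below are
   hypothetical: */
#if 0
VexGuestAMD64State gst;
LibVEX_GuestAMD64_initialise(&gst);
gst.guest_RIP = 0x400000;    /* hypothetical entry point */
gst.guest_RSP = 0x7fff0000;  /* hypothetical stack top   */
#endif
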
/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest %RSP, %RBP and %RIP
   only.  These are the minimum needed to extract correct stack
   backtraces from amd64 code.

   Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
*/
Bool guest_amd64_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   Int rbp_max = rbp_min + 8 - 1;
   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   Int rsp_max = rsp_min + 8 - 1;
   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   Int rip_max = rip_min + 8 - 1;

   if (maxoff < rsp_min || minoff > rsp_max) {
      /* no overlap with rsp */
      if (pxControl == VexRegUpdSpAtMemAccess)
         return False; // We only need to check the stack pointer.
   } else {
      return True;
   }

   if (maxoff < rbp_min || minoff > rbp_max) {
      /* no overlap with rbp */
   } else {
      return True;
   }

   if (maxoff < rip_min || minoff > rip_max) {
      /* no overlap with rip */
   } else {
      return True;
   }

   return False;
}

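/* Illustrative examples (not part of VEX): in the default unwind-regs
   mode, an update that overlaps guest_RSP forces precise exceptions,
   while one confined to guest_RAX does not: */
#if 0
Int rsp = offsetof(VexGuestAMD64State, guest_RSP);
Int rax = offsetof(VexGuestAMD64State, guest_RAX);
/* Overlaps %rsp: returns True. */
guest_amd64_state_requires_precise_mem_exns
   (rsp, rsp + 7, VexRegUpdUnwindregsAtMemAccess);
/* Touches only %rax: returns False. */
guest_amd64_state_requires_precise_mem_exns
   (rax, rax + 7, VexRegUpdUnwindregsAtMemAccess);
#endif
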

/* Describe a guest-state field as a (byte offset, size-in-bytes)
   pair. */
#define ALWAYSDEFD(field)                             \
    { offsetof(VexGuestAMD64State, field),            \
      (sizeof ((VexGuestAMD64State*)0)->field) }

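/* For example, ALWAYSDEFD(guest_CC_OP) expands to
      { offsetof(VexGuestAMD64State, guest_CC_OP),
        sizeof(((VexGuestAMD64State*)0)->guest_CC_OP) },
   i.e. the field's byte offset paired with its size. */
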
VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             guest_amd64_defs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_CONST),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /* */ ALWAYSDEFD(guest_CS),
                 // /* */ ALWAYSDEFD(guest_DS),
                 // /* */ ALWAYSDEFD(guest_ES),
                 // /* */ ALWAYSDEFD(guest_FS),
                 // /* */ ALWAYSDEFD(guest_GS),
                 // /* */ ALWAYSDEFD(guest_SS),
                 // /* */ ALWAYSDEFD(guest_LDT),
                 // /* */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_CMSTART),
                 /* 13 */ ALWAYSDEFD(guest_CMLEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };

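/* Illustrative sketch (not part of VEX): a tool can consult the layout
   to locate the stack pointer in a guest state without hard-wiring the
   amd64 field name.  The helper name is hypothetical: */
#if 0
static ULong example_read_SP ( const VexGuestAMD64State* gst )
{
   /* offset_SP/sizeof_SP describe an 8-byte field on amd64. */
   return *(const ULong*)( (const UChar*)gst
                           + amd64guest_layout.offset_SP );
}
#endif
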
/*---------------------------------------------------------------*/
/*--- end                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/