
/*---------------------------------------------------------------*/
/*--- begin                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/
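
/* A minimal illustrative sketch (kept out of the build): how a client
   or a test harness could query the flag thunk via the helpers defined
   below.  The operand values are invented for the example; the
   DEP1/DEP2/NDEP roles follow ACTIONS_ADD below. */
#if 0
static ULong example_flags_after_addq ( void )
{
   /* Flags after a 64-bit add of 1 to 0x7FFFFFFFFFFFFFFF (a signed
      overflow): at least OF and SF should be set in the returned
      mask, and CF and ZF clear. */
   return amd64g_calculate_rflags_all( AMD64G_CC_OP_ADDQ,
                                       0x7FFFFFFFFFFFFFFFULL, /* DEP1: first arg */
                                       1ULL,                  /* DEP2: second arg */
                                       0ULL                   /* NDEP: unused here */ );
}
#endif
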
/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}


static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};

/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return x << n;
   else
      return x >> (-n);
}

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits) \
   /* const */ ULong DATA_MASK \
      = __data_bits==8 \
           ? 0xFFULL \
           : (__data_bits==16 \
                ? 0xFFFFULL \
                : (__data_bits==32 \
                     ? 0xFFFFFFFFULL \
                     : 0xFFFFFFFFFFFFFFFFULL)); \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
   /* const */ ULong CC_DEP1 = cc_dep1_formal; \
   /* const */ ULong CC_DEP2 = cc_dep2_formal; \
   /* const */ ULong CC_NDEP = cc_ndep_formal; \
   /* Four bogus assignments, which hopefully gcc can */ \
   /* optimise away, and which stop it complaining about */ \
   /* unused variables. */ \
   SIGN_MASK = SIGN_MASK; \
   DATA_MASK = DATA_MASK; \
   CC_DEP2 = CC_DEP2; \
   CC_NDEP = CC_NDEP;


/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     Long argL, argR, res; \
     argL = CC_DEP1; \
     argR = CC_DEP2; \
     res  = argL + argR; \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
     pf = parity_table[(UChar)res]; \
     af = (res ^ argL ^ argR) & 0x10; \
     zf = ((DATA_UTYPE)res == 0) << 6; \
     sf = lshift(res, 8 - DATA_BITS) & 0x80; \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     Long argL, argR, res; \
     argL = CC_DEP1; \
     argR = CC_DEP2; \
     res  = argL - argR; \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
     pf = parity_table[(UChar)res]; \
     af = (res ^ argL ^ argR) & 0x10; \
     zf = ((DATA_UTYPE)res == 0) << 6; \
     sf = lshift(res, 8 - DATA_BITS) & 0x80; \
     of = lshift((argL ^ argR) & (argL ^ res), \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     Long argL, argR, oldC, res; \
     oldC = CC_NDEP & AMD64G_CC_MASK_C; \
     argL = CC_DEP1; \
     argR = CC_DEP2 ^ oldC; \
     res  = (argL + argR) + oldC; \
     if (oldC) \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
     else \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
     pf = parity_table[(UChar)res]; \
     af = (res ^ argL ^ argR) & 0x10; \
     zf = ((DATA_UTYPE)res == 0) << 6; \
     sf = lshift(res, 8 - DATA_BITS) & 0x80; \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/
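
/* A hand-worked example (illustrative only) of what ACTIONS_ADD
   computes for an 8-bit add: with DEP1 = 0x7F and DEP2 = 0x01 the
   result is 0x80, so
      amd64g_calculate_rflags_all(AMD64G_CC_OP_ADDB, 0x7F, 0x01, 0)
   should yield AMD64G_CC_MASK_A | AMD64G_CC_MASK_S | AMD64G_CC_MASK_O
   (carry out of bit 3, negative result, signed overflow), with CF, PF
   and ZF clear. */
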

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     Long argL, argR, oldC, res; \
     oldC = CC_NDEP & AMD64G_CC_MASK_C; \
     argL = CC_DEP1; \
     argR = CC_DEP2 ^ oldC; \
     res  = (argL - argR) - oldC; \
     if (oldC) \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
     else \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
     pf = parity_table[(UChar)res]; \
     af = (res ^ argL ^ argR) & 0x10; \
     zf = ((DATA_UTYPE)res == 0) << 6; \
     sf = lshift(res, 8 - DATA_BITS) & 0x80; \
     of = lshift((argL ^ argR) & (argL ^ res), \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = 0; \
     pf = parity_table[(UChar)CC_DEP1]; \
     af = 0; \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     of = 0; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     Long argL, argR, res; \
     res  = CC_DEP1; \
     argL = res - 1; \
     argR = 1; \
     cf = CC_NDEP & AMD64G_CC_MASK_C; \
     pf = parity_table[(UChar)res]; \
     af = (res ^ argL ^ argR) & 0x10; \
     zf = ((DATA_UTYPE)res == 0) << 6; \
     sf = lshift(res, 8 - DATA_BITS) & 0x80; \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     Long argL, argR, res; \
     res  = CC_DEP1; \
     argL = res + 1; \
     argR = 1; \
     cf = CC_NDEP & AMD64G_CC_MASK_C; \
     pf = parity_table[(UChar)res]; \
     af = (res ^ argL ^ argR) & 0x10; \
     zf = ((DATA_UTYPE)res == 0) << 6; \
     sf = lshift(res, 8 - DATA_BITS) & 0x80; \
     of = ((res & DATA_MASK) \
           == ((ULong)SIGN_MASK - 1)) << 11; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
     pf = parity_table[(UChar)CC_DEP1]; \
     af = 0; /* undefined */ \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     /* of is defined if shift count == 1 */ \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
          & AMD64G_CC_MASK_O; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = CC_DEP2 & 1; \
     pf = parity_table[(UChar)CC_DEP1]; \
     af = 0; /* undefined */ \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     /* of is defined if shift count == 1 */ \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
          & AMD64G_CC_MASK_O; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long fl \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
          | (AMD64G_CC_MASK_C & CC_DEP1) \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
                                        11-(DATA_BITS-1)) \
                                 ^ lshift(CC_DEP1, 11))); \
     return fl; \
   } \
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long fl \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
                                        11-(DATA_BITS-1)) \
                                 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
     return fl; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
                     DATA_U2TYPE, NARROWto2U) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     DATA_UTYPE hi; \
     DATA_UTYPE lo \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
                     * ((DATA_UTYPE)CC_DEP2) ); \
     DATA_U2TYPE rr \
        = NARROWto2U( \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
     cf = (hi != 0); \
     pf = parity_table[(UChar)lo]; \
     af = 0; /* undefined */ \
     zf = (lo == 0) << 6; \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
     of = cf << 11; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
                     DATA_S2TYPE, NARROWto2S) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     DATA_STYPE hi; \
     DATA_STYPE lo \
        = NARROWtoS( ((DATA_STYPE)CC_DEP1) \
                     * ((DATA_STYPE)CC_DEP2) ); \
     DATA_S2TYPE rr \
        = NARROWto2S( \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
     pf = parity_table[(UChar)lo]; \
     af = 0; /* undefined */ \
     zf = (lo == 0) << 6; \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
     of = cf << 11; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ \
{ \
   PREAMBLE(64); \
   { Long cf, pf, af, zf, sf, of; \
     ULong lo, hi; \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
     cf = (hi != 0); \
     pf = parity_table[(UChar)lo]; \
     af = 0; /* undefined */ \
     zf = (lo == 0) << 6; \
     sf = lshift(lo, 8 - 64) & 0x80; \
     of = cf << 11; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ \
{ \
   PREAMBLE(64); \
   { Long cf, pf, af, zf, sf, of; \
     Long lo, hi; \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
     cf = (hi != (lo >>/*s*/ (64-1))); \
     pf = parity_table[(UChar)lo]; \
     af = 0; /* undefined */ \
     zf = (lo == 0) << 6; \
     sf = lshift(lo, 8 - 64) & 0x80; \
     of = cf << 11; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/
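
/* Illustrative check (kept out of the build) of the 128-bit multiply
   helpers used by ACTIONS_UMULQ/ACTIONS_SMULQ above: multiplying
   0xFFFFFFFFFFFFFFFF by 2 gives hi:lo = 0x1 : 0xFFFFFFFFFFFFFFFE, and
   ACTIONS_UMULQ then sets CF and OF because hi is nonzero.  The
   function name is invented for the example. */
#if 0
static void example_mullU64 ( void )
{
   ULong hi, lo;
   mullU64( 0xFFFFFFFFFFFFFFFFULL, 2ULL, &hi, &lo );
   vassert(hi == 1ULL && lo == 0xFFFFFFFFFFFFFFFEULL);
}
#endif
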

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = 0; \
     pf = 0; \
     af = 0; \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     of = 0; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = ((DATA_UTYPE)CC_DEP2 != 0); \
     pf = 0; \
     af = 0; \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     of = 0; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = ((DATA_UTYPE)CC_DEP2 == 0); \
     pf = 0; \
     af = 0; \
     zf = 0; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     of = 0; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \
{ \
   PREAMBLE(DATA_BITS); \
   { Long cf, pf, af, zf, sf, of; \
     cf = ((DATA_UTYPE)CC_DEP2 == 0); \
     pf = 0; \
     af = 0; \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     of = 0; \
     return cf | pf | af | zf | sf | of; \
   } \
}

/*-------------------------------------------------------------*/
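
/* Illustrative note: as the macro's use of DEP1/DEP2 suggests, a
   BLSI-style thunk carries the result in DEP1 and the original source
   operand in DEP2.  For example, a 32-bit BLSI of 0x12 leaves result
   0x2 (lowest set bit), so
      amd64g_calculate_rflags_all(AMD64G_CC_OP_BLSI32, 0x2, 0x12, 0)
   should give just AMD64G_CC_MASK_C (source nonzero; result nonzero
   and non-negative).  This is an example only, not asserted anywhere. */
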


#if PROFILE_RFLAGS

static Bool initted = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
              " S NS P NP L NL LE NLE\n");
   vex_printf(" -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else
         if (n >= 0) {
            vex_printf(" %3d ", n );
         } else {
            vex_printf(" ");
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB( 8,  UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB( 8,  UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8,  UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC( 8,  UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC( 8,  UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL( 8,  UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR( 8,  UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL( 8,  UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR( 8,  UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL( 8,  UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL( 8,  Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt  );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt  );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt  );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt  );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
//      case AMD64G_CC_OP_SUBL:
//         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
//                   ? AMD64G_CC_MASK_C : 0;
//      case AMD64G_CC_OP_SUBW:
//         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
//                   ? AMD64G_CC_MASK_C : 0;
//      case AMD64G_CC_OP_SUBB:
//         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
//                   ? AMD64G_CC_MASK_C : 0;
//      case AMD64G_CC_OP_INCL:
//      case AMD64G_CC_OP_DECL:
//         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of, sf, zf, cf, pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);
         break;

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));
         break;

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));
         break;

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));
         break;

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}
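
/* Illustrative sketch (kept out of the build): evaluating a condition
   straight from a thunk.  After a 64-bit sub/cmp of equal operands the
   Z condition holds and its negation does not.  The function name is
   invented for the example. */
#if 0
static void example_condition_after_cmpq ( void )
{
   vassert( amd64g_calculate_condition( AMD64CondZ,  AMD64G_CC_OP_SUBQ,
                                        42ULL, 42ULL, 0ULL ) == 1 );
   vassert( amd64g_calculate_condition( AMD64CondNZ, AMD64G_CC_OP_SUBQ,
                                        42ULL, 42ULL, 0ULL ) == 0 );
}
#endif
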

/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}

/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                                /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}


/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static Bool isU64 ( IRExpr* e, ULong n )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == n );
}

IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf(" %s ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf(" ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));

      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /*---------------- SUBL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));

      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));

      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           binop(Iop_Sub32,
                                 unop(Iop_64to32, cc_dep1),
                                 unop(Iop_64to32, cc_dep2)),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /*---------------- SUBW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));

      }

      /*---------------- SUBB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then NS --> (ULong) ~ result[31] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64, cc_dep1, mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
         /* word and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }

      /*---------------- LOGICB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (UInt)result[7] */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
         /* byte and/or/xor, then NS --> (UInt)!result[7] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- INCB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
         /* 8-bit inc, then LE --> sign bit of the arg */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
                            mkU8(7)),
                      mkU64(1));
      }

      /*---------------- INCW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
         /* 16-bit inc, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- DECL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
         /* 16-bit dec, then NZ --> test dst != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- COPY ----------------*/
      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
         jbe" for example. */

      if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
          (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test (C
            or Z == 1). */
         /* COPY, then NBE --> extract C and Z from dep1, and test (C
            or Z == 0). */
         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(
                        Iop_Or64,
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
                     ),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
         /* COPY, then B --> extract C from dep1, and test (C == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
         UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }
      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
         /* COPY, then P --> extract P from dep1, and test (P == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }

      return NULL;
   }

   /* --------- specialising "amd64g_calculate_rflags_c" --------- */

   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
      /* specialise calls to above "calculate_rflags_c" function */
      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      cc_dep2 = args[2];
      cc_ndep = args[3];

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           cc_dep1,
                           cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
         /* cflag after logic is zero */
         return mkU64(0);
      }
      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
         return cc_ndep;
      }

#     if 0
      if (cc_op->tag == Iex_Const) {
         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
      }
#     endif

      return NULL;
   }

#  undef unop
#  undef binop
#  undef mkU64
#  undef mkU32
#  undef mkU8

   return NULL;
}


/*---------------------------------------------------------------*/
/*--- Supporting functions for x87 FPU activities.            ---*/
/*---------------------------------------------------------------*/

static inline Bool host_is_little_endian ( void )
{
   UInt x = 0x76543210;
   UChar* p = (UChar*)(&x);
   return toBool(*p == 0x10);
}

/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
{
   Bool   mantissaIsZero;
   Int    bexp;
   UChar  sign;
   UChar* f64;

   vassert(host_is_little_endian());

   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */

   f64  = (UChar*)(&dbl);
   sign = toUChar( (f64[7] >> 7) & 1 );

   /* First off, if the tag indicates the register was empty,
      return 1,0,sign,1 */
   if (tag == 0) {
      /* vex_printf("Empty\n"); */
      return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
                               | AMD64G_FC_MASK_C0;
   }

   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   bexp &= 0x7FF;

   mantissaIsZero
      = toBool(
           (f64[6] & 0x0F) == 0
           && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
        );

   /* If both exponent and mantissa are zero, the value is zero.
      Return 1,0,sign,0. */
   if (bexp == 0 && mantissaIsZero) {
      /* vex_printf("Zero\n"); */
      return AMD64G_FC_MASK_C3 | 0
             | (sign << AMD64G_FC_SHIFT_C1) | 0;
   }

   /* If exponent is zero but mantissa isn't, it's a denormal.
      Return 1,1,sign,0. */
   if (bexp == 0 && !mantissaIsZero) {
      /* vex_printf("Denormal\n"); */
      return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
             | (sign << AMD64G_FC_SHIFT_C1) | 0;
   }

   /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
      Return 0,1,sign,1. */
   if (bexp == 0x7FF && mantissaIsZero) {
      /* vex_printf("Inf\n"); */
      return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
               | AMD64G_FC_MASK_C0;
   }

   /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
      Return 0,0,sign,1. */
   if (bexp == 0x7FF && !mantissaIsZero) {
      /* vex_printf("NaN\n"); */
      return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
   }

   /* Uh, ok, we give up.  It must be a normal finite number.
      Return 0,1,sign,0.
   */
   /* vex_printf("normal\n"); */
   return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
}
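
/* Illustrative values (kept out of the build) for the FXAM helper: with
   a non-empty tag, an all-zero bit pattern classifies as zero
   (C3=1,C2=0,C1=0,C0=0), and 0x7FF0000000000000 (+infinity) classifies
   as infinity (C3=0,C2=1,C1=0,C0=1).  The function name is invented. */
#if 0
static void example_fxam ( void )
{
   vassert( amd64g_calculate_FXAM( 1, 0x0000000000000000ULL )
            == AMD64G_FC_MASK_C3 );
   vassert( amd64g_calculate_FXAM( 1, 0x7FF0000000000000ULL )
            == (AMD64G_FC_MASK_C2 | AMD64G_FC_MASK_C0) );
}
#endif
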

/* This is used to implement both 'frstor' and 'fldenv'.  The latter
   appears to differ from the former only in that the 8 FP registers
   themselves are not transferred into the guest state. */
static
VexEmNote do_put_x87 ( Bool moveRegs,
                       /*IN*/UChar* x87_state,
                       /*OUT*/VexGuestAMD64State* vex_state )
{
   Int        stno, preg;
   UInt       tag;
   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State* x87     = (Fpu_State*)x87_state;
   UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
   UInt       tagw    = x87->env[FP_ENV_TAG];
   UInt       fpucw   = x87->env[FP_ENV_CTRL];
   UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
   VexEmNote  ew;
   UInt       fpround;
   ULong      pair;

   /* Copy registers and tags */
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      tag = (tagw >> (2*preg)) & 3;
      if (tag == 3) {
         /* register is empty */
         /* hmm, if it's empty, does it still get written?  Probably
            safer to say it does.  If we don't, memcheck could get out
            of sync, in that it thinks all FP registers are defined by
            this helper, but in reality some have not been updated. */
         if (moveRegs)
            vexRegs[preg] = 0; /* IEEE754 64-bit zero */
         vexTags[preg] = 0;
      } else {
         /* register is non-empty */
         if (moveRegs)
            convert_f80le_to_f64le( &x87->reg[10*stno],
                                    (UChar*)&vexRegs[preg] );
         vexTags[preg] = 1;
      }
   }

   /* stack pointer */
   vex_state->guest_FTOP = ftop;

   /* status word */
   vex_state->guest_FC3210 = c3210;

   /* handle the control word, setting FPROUND and detecting any
      emulation warnings. */
   pair    = amd64g_check_fldcw ( (ULong)fpucw );
   fpround = (UInt)pair & 0xFFFFFFFFULL;
   ew      = (VexEmNote)(pair >> 32);

   vex_state->guest_FPROUND = fpround & 3;

   /* emulation warnings --> caller */
   return ew;
}

/* Create an x87 FPU state from the guest state, as close as
   we can approximate it. */
static
void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
                  /*OUT*/UChar* x87_state )
{
   Int        i, stno, preg;
   UInt       tagw;
   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State* x87     = (Fpu_State*)x87_state;
   UInt       ftop    = vex_state->guest_FTOP;
   UInt       c3210   = vex_state->guest_FC3210;

   for (i = 0; i < 14; i++)
      x87->env[i] = 0;

   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   x87->env[FP_ENV_STAT]
      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   x87->env[FP_ENV_CTRL]
      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));

   /* Dump the register stack in ST order. */
   tagw = 0;
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      if (vexTags[preg] == 0) {
         /* register is empty */
         tagw |= (3 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87->reg[10*stno] );
      } else {
         /* register is full. */
         tagw |= (0 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87->reg[10*stno] );
      }
   }
   x87->env[FP_ENV_TAG] = toUShort(tagw);
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
/* NOTE: only handles 32-bit format (no REX.W on the insn) */
void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
{
   /* Derived from values obtained from
      vendor_id  : AuthenticAMD
      cpu family : 15
      model      : 12
      model name : AMD Athlon(tm) 64 Processor 3200+
      stepping   : 0
      cpu MHz    : 2200.000
      cache size : 512 KB
   */
   /* Somewhat roundabout, but at least it's simple. */
   Fpu_State tmp;
   UShort*   addrS = (UShort*)addr;
   UChar*    addrC = (UChar*)addr;
   U128*     xmm   = (U128*)(addr + 160);
   UInt      mxcsr;
   UShort    fp_tags;
   UInt      summary_tags;
   Int       r, stno;
   UShort    *srcS, *dstS;

   do_get_x87( gst, (UChar*)&tmp );
   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );

   /* Now build the proper fxsave image from the x87 image we just
      made. */

   addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */

   /* set addrS[2] in an endian-independent way */
   summary_tags = 0;
   fp_tags = tmp.env[FP_ENV_TAG];
   for (r = 0; r < 8; r++) {
      if ( ((fp_tags >> (2*r)) & 3) != 3 )
         summary_tags |= (1 << r);
   }
   addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   addrC[5]  = 0; /* pad */

   /* FOP: faulting fpu opcode.  From experimentation, the real CPU
      does not write this field. (?!) */
   addrS[3]  = 0; /* BOGUS */

   /* RIP (Last x87 instruction pointer).  From experimentation, the
      real CPU does not write this field. (?!) */
   addrS[4]  = 0; /* BOGUS */
   addrS[5]  = 0; /* BOGUS */
   addrS[6]  = 0; /* BOGUS */
   addrS[7]  = 0; /* BOGUS */

   /* RDP (Last x87 data pointer).  From experimentation, the real CPU
      does not write this field. (?!) */
   addrS[8]  = 0; /* BOGUS */
   addrS[9]  = 0; /* BOGUS */
   addrS[10] = 0; /* BOGUS */
   addrS[11] = 0; /* BOGUS */

   addrS[12] = toUShort(mxcsr); /* MXCSR */
   addrS[13] = toUShort(mxcsr >> 16);

   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
   addrS[15] = 0x0000; /* MXCSR mask (hi16) */

   /* Copy in the FP registers, in ST order. */
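   /* (Each of the eight register slots in the image is 16 bytes wide:
      the 10-byte 80-bit value followed by 6 bytes of padding, which
      the loop below zeroes.) */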
   */
   for (stno = 0; stno < 8; stno++) {
      srcS = (UShort*)(&tmp.reg[10*stno]);
      dstS = (UShort*)(&addrS[16 + 8*stno]);
      dstS[0] = srcS[0];
      dstS[1] = srcS[1];
      dstS[2] = srcS[2];
      dstS[3] = srcS[3];
      dstS[4] = srcS[4];
      dstS[5] = 0;
      dstS[6] = 0;
      dstS[7] = 0;
   }

   /* That's the first 160 bytes of the image done.  Now only %xmm0
      .. %xmm15 remain to be copied.  If the host is big-endian, these
      need to be byte-swapped. */
   vassert(host_is_little_endian());

#  define COPY_U128(_dst,_src)                       \
      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
      while (0)

   COPY_U128( xmm[0],  gst->guest_YMM0 );
   COPY_U128( xmm[1],  gst->guest_YMM1 );
   COPY_U128( xmm[2],  gst->guest_YMM2 );
   COPY_U128( xmm[3],  gst->guest_YMM3 );
   COPY_U128( xmm[4],  gst->guest_YMM4 );
   COPY_U128( xmm[5],  gst->guest_YMM5 );
   COPY_U128( xmm[6],  gst->guest_YMM6 );
   COPY_U128( xmm[7],  gst->guest_YMM7 );
   COPY_U128( xmm[8],  gst->guest_YMM8 );
   COPY_U128( xmm[9],  gst->guest_YMM9 );
   COPY_U128( xmm[10], gst->guest_YMM10 );
   COPY_U128( xmm[11], gst->guest_YMM11 );
   COPY_U128( xmm[12], gst->guest_YMM12 );
   COPY_U128( xmm[13], gst->guest_YMM13 );
   COPY_U128( xmm[14], gst->guest_YMM14 );
   COPY_U128( xmm[15], gst->guest_YMM15 );

#  undef COPY_U128
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest state, reads guest mem) */
VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
{
   Fpu_State tmp;
   VexEmNote warnX87 = EmNote_NONE;
   VexEmNote warnXMM = EmNote_NONE;
   UShort*   addrS   = (UShort*)addr;
   UChar*    addrC   = (UChar*)addr;
   U128*     xmm     = (U128*)(addr + 160);
   UShort    fp_tags;
   Int       r, stno, i;

   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
      to be byte-swapped. */
   vassert(host_is_little_endian());

#  define COPY_U128(_dst,_src)                       \
      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
      while (0)

   COPY_U128( gst->guest_YMM0,  xmm[0] );
   COPY_U128( gst->guest_YMM1,  xmm[1] );
   COPY_U128( gst->guest_YMM2,  xmm[2] );
   COPY_U128( gst->guest_YMM3,  xmm[3] );
   COPY_U128( gst->guest_YMM4,  xmm[4] );
   COPY_U128( gst->guest_YMM5,  xmm[5] );
   COPY_U128( gst->guest_YMM6,  xmm[6] );
   COPY_U128( gst->guest_YMM7,  xmm[7] );
   COPY_U128( gst->guest_YMM8,  xmm[8] );
   COPY_U128( gst->guest_YMM9,  xmm[9] );
   COPY_U128( gst->guest_YMM10, xmm[10] );
   COPY_U128( gst->guest_YMM11, xmm[11] );
   COPY_U128( gst->guest_YMM12, xmm[12] );
   COPY_U128( gst->guest_YMM13, xmm[13] );
   COPY_U128( gst->guest_YMM14, xmm[14] );
   COPY_U128( gst->guest_YMM15, xmm[15] );

#  undef COPY_U128

   /* Copy the x87 registers out of the image, into a temporary
      Fpu_State struct.
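      Only the low 10 bytes of each 16-byte register slot in the image
      are significant; the padding bytes are ignored here.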
   */
   for (i = 0; i < 14; i++) tmp.env[i] = 0;
   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   /* fill in tmp.reg[0..7] */
   for (stno = 0; stno < 8; stno++) {
      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
      dstS[0] = srcS[0];
      dstS[1] = srcS[1];
      dstS[2] = srcS[2];
      dstS[3] = srcS[3];
      dstS[4] = srcS[4];
   }
   /* fill in tmp.env[0..13] */
   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */

   fp_tags = 0;
   for (r = 0; r < 8; r++) {
      if (addrC[4] & (1<<r))
         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
      else
         fp_tags |= (3 << (2*r)); /* EMPTY */
   }
   tmp.env[FP_ENV_TAG] = fp_tags;

   /* Now write 'tmp' into the guest state. */
   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );

   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
     ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );

     warnXMM = (VexEmNote)(w64 >> 32);

     gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
   }

   /* Prefer an X87 emwarn over an XMM one, if both exist. */
   if (warnX87 != EmNote_NONE)
      return warnX87;
   else
      return warnXMM;
}


/* DIRTY HELPER (writes guest state) */
/* Initialise the x87 FPU state as per 'finit'. */
void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
{
   Int i;
   gst->guest_FTOP = 0;
   for (i = 0; i < 8; i++) {
      gst->guest_FPTAG[i] = 0; /* empty */
      gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   }
   gst->guest_FPROUND = (ULong)Irrm_NEAREST;
   gst->guest_FC3210  = 0;
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest memory) */
ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
{
   ULong f64;
   convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
   return f64;
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest memory) */
void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
{
   convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
}


/* CALLED FROM GENERATED CODE */
/* CLEAN HELPER */
/* mxcsr[15:0] contains an SSE native format MXCSR value.
   Extract from it the required SSEROUND value and any resulting
   emulation warning, and return (warn << 32) | sseround value.
*/
ULong amd64g_check_ldmxcsr ( ULong mxcsr )
{
   /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   /* NOTE, encoded exactly as per enum IRRoundingMode. */
   ULong rmode = (mxcsr >> 13) & 3;

   /* Detect any required emulation warnings. */
   VexEmNote ew = EmNote_NONE;

   if ((mxcsr & 0x1F80) != 0x1F80) {
      /* unmasked exceptions! */
      ew = EmWarn_X86_sseExns;
   }
   else
   if (mxcsr & (1<<15)) {
      /* FZ is set */
      ew = EmWarn_X86_fz;
   }
   else
   if (mxcsr & (1<<6)) {
      /* DAZ is set */
      ew = EmWarn_X86_daz;
   }

   return (((ULong)ew) << 32) | ((ULong)rmode);
}


/* CALLED FROM GENERATED CODE */
/* CLEAN HELPER */
/* Given sseround as an IRRoundingMode value, create a suitable SSE
   native format MXCSR value.
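   The result has all exception types masked (bits 7..12 set), FZ and
   DAZ clear, and the rounding mode placed in bits 14:13.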
*/
ULong amd64g_create_mxcsr ( ULong sseround )
{
   sseround &= 3;
   return 0x1F80 | (sseround << 13);
}


/* CLEAN HELPER */
/* fpucw[15:0] contains an x87 native format FPU control word.
   Extract from it the required FPROUND value and any resulting
   emulation warning, and return (warn << 32) | fpround value.
*/
ULong amd64g_check_fldcw ( ULong fpucw )
{
   /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   /* NOTE, encoded exactly as per enum IRRoundingMode. */
   ULong rmode = (fpucw >> 10) & 3;

   /* Detect any required emulation warnings. */
   VexEmNote ew = EmNote_NONE;

   if ((fpucw & 0x3F) != 0x3F) {
      /* unmasked exceptions! */
      ew = EmWarn_X86_x87exns;
   }
   else
   if (((fpucw >> 8) & 3) != 3) {
      /* unsupported precision */
      ew = EmWarn_X86_x87precision;
   }

   return (((ULong)ew) << 32) | ((ULong)rmode);
}


/* CLEAN HELPER */
/* Given fpround as an IRRoundingMode value, create a suitable x87
   native format FPU control word. */
ULong amd64g_create_fpucw ( ULong fpround )
{
   fpround &= 3;
   return 0x037F | (fpround << 10);
}


/* This is used to implement 'fldenv'.
   Reads 28 bytes at x87_state[0 .. 27]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
                                      /*IN*/HWord x87_state)
{
   return do_put_x87( False, (UChar*)x87_state, vex_state );
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
/* Create an x87 FPU env from the guest state, as close as we can
   approximate it.  Writes 28 bytes at x87_state[0..27]. */
void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
                                 /*OUT*/HWord x87_state )
{
   Int        i, stno, preg;
   UInt       tagw;
   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State* x87     = (Fpu_State*)x87_state;
   UInt       ftop    = vex_state->guest_FTOP;
   ULong      c3210   = vex_state->guest_FC3210;

   for (i = 0; i < 14; i++)
      x87->env[i] = 0;

   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   x87->env[FP_ENV_STAT]
      = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
   x87->env[FP_ENV_CTRL]
      = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));

   /* Compute the x87 tag word. */
   tagw = 0;
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      if (vexTags[preg] == 0) {
         /* register is empty */
         tagw |= (3 << (2*preg));
      } else {
         /* register is full. */
         tagw |= (0 << (2*preg));
      }
   }
   x87->env[FP_ENV_TAG] = toUShort(tagw);

   /* We don't dump the x87 registers, tho. */
}


/* This is used to implement 'fnsave'.
   Writes 108 bytes at x87_state[0 .. 107]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
                                 /*OUT*/HWord x87_state)
{
   do_get_x87( vex_state, (UChar*)x87_state );
}


/* This is used to implement 'fnsaves'.
   Writes 94 bytes at x87_state[0 .. 93].
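   ('fnsaves' is the 16-bit-operand-size form of fnsave, so the image
   uses the short 14-byte environment, Fpu_State_16, rather than the
   28-byte one.)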
*/
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
                                  /*OUT*/HWord x87_state)
{
   Int           i, stno, preg;
   UInt          tagw;
   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   UInt          ftop    = vex_state->guest_FTOP;
   UInt          c3210   = vex_state->guest_FC3210;

   for (i = 0; i < 7; i++)
      x87->env[i] = 0;

   x87->env[FPS_ENV_STAT]
      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   x87->env[FPS_ENV_CTRL]
      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));

   /* Dump the register stack in ST order. */
   tagw = 0;
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      if (vexTags[preg] == 0) {
         /* register is empty */
         tagw |= (3 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87->reg[10*stno] );
      } else {
         /* register is full. */
         tagw |= (0 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87->reg[10*stno] );
      }
   }
   x87->env[FPS_ENV_TAG] = toUShort(tagw);
}


/* This is used to implement 'frstor'.
   Reads 108 bytes at x87_state[0 .. 107]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
                                      /*IN*/HWord x87_state)
{
   return do_put_x87( True, (UChar*)x87_state, vex_state );
}


/* This is used to implement 'frstors'.
   Reads 94 bytes at x87_state[0 .. 93]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
                                       /*IN*/HWord x87_state)
{
   Int           stno, preg;
   UInt          tag;
   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
   UInt          tagw    = x87->env[FPS_ENV_TAG];
   UInt          fpucw   = x87->env[FPS_ENV_CTRL];
   UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
   VexEmNote     ew;
   UInt          fpround;
   ULong         pair;

   /* Copy registers and tags */
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      tag = (tagw >> (2*preg)) & 3;
      if (tag == 3) {
         /* register is empty */
         /* hmm, if it's empty, does it still get written?  Probably
            safer to say it does.  If we don't, memcheck could get out
            of sync, in that it thinks all FP registers are defined by
            this helper, but in reality some have not been updated. */
         vexRegs[preg] = 0; /* IEEE754 64-bit zero */
         vexTags[preg] = 0;
      } else {
         /* register is non-empty */
         convert_f80le_to_f64le( &x87->reg[10*stno],
                                 (UChar*)&vexRegs[preg] );
         vexTags[preg] = 1;
      }
   }

   /* stack pointer */
   vex_state->guest_FTOP = ftop;

   /* status word */
   vex_state->guest_FC3210 = c3210;

   /* handle the control word, setting FPROUND and detecting any
      emulation warnings.
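      amd64g_check_fldcw returns these packed as (warning << 32) | fpround.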
*/ 2204 pair = amd64g_check_fldcw ( (ULong)fpucw ); 2205 fpround = (UInt)pair & 0xFFFFFFFFULL; 2206 ew = (VexEmNote)(pair >> 32); 2207 2208 vex_state->guest_FPROUND = fpround & 3; 2209 2210 /* emulation warnings --> caller */ 2211 return ew; 2212 } 2213 2214 2215 /*---------------------------------------------------------------*/ 2216 /*--- Misc integer helpers, including rotates and CPUID. ---*/ 2217 /*---------------------------------------------------------------*/ 2218 2219 /* Claim to be the following CPU, which is probably representative of 2220 the lowliest (earliest) amd64 offerings. It can do neither sse3 2221 nor cx16. 2222 2223 vendor_id : AuthenticAMD 2224 cpu family : 15 2225 model : 5 2226 model name : AMD Opteron (tm) Processor 848 2227 stepping : 10 2228 cpu MHz : 1797.682 2229 cache size : 1024 KB 2230 fpu : yes 2231 fpu_exception : yes 2232 cpuid level : 1 2233 wp : yes 2234 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2235 mtrr pge mca cmov pat pse36 clflush mmx fxsr 2236 sse sse2 syscall nx mmxext lm 3dnowext 3dnow 2237 bogomips : 3600.62 2238 TLB size : 1088 4K pages 2239 clflush size : 64 2240 cache_alignment : 64 2241 address sizes : 40 bits physical, 48 bits virtual 2242 power management: ts fid vid ttp 2243 2244 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact 2245 we don't support them. See #291568. 3dnow is 80000001.EDX.31 2246 and 3dnowext is 80000001.EDX.30. 2247 */ 2248 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st ) 2249 { 2250 # define SET_ABCD(_a,_b,_c,_d) \ 2251 do { st->guest_RAX = (ULong)(_a); \ 2252 st->guest_RBX = (ULong)(_b); \ 2253 st->guest_RCX = (ULong)(_c); \ 2254 st->guest_RDX = (ULong)(_d); \ 2255 } while (0) 2256 2257 switch (0xFFFFFFFF & st->guest_RAX) { 2258 case 0x00000000: 2259 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65); 2260 break; 2261 case 0x00000001: 2262 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff); 2263 break; 2264 case 0x80000000: 2265 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65); 2266 break; 2267 case 0x80000001: 2268 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is 2269 the original it-is-supported value that the h/w provides. 2270 See #291568. */ 2271 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/ 2272 0x21d3fbff); 2273 break; 2274 case 0x80000002: 2275 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428); 2276 break; 2277 case 0x80000003: 2278 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834); 2279 break; 2280 case 0x80000004: 2281 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2282 break; 2283 case 0x80000005: 2284 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140); 2285 break; 2286 case 0x80000006: 2287 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000); 2288 break; 2289 case 0x80000007: 2290 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f); 2291 break; 2292 case 0x80000008: 2293 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000); 2294 break; 2295 default: 2296 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2297 break; 2298 } 2299 # undef SET_ABCD 2300 } 2301 2302 2303 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16 2304 capable. 
2305 2306 vendor_id : GenuineIntel 2307 cpu family : 6 2308 model : 15 2309 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz 2310 stepping : 6 2311 cpu MHz : 2394.000 2312 cache size : 4096 KB 2313 physical id : 0 2314 siblings : 2 2315 core id : 0 2316 cpu cores : 2 2317 fpu : yes 2318 fpu_exception : yes 2319 cpuid level : 10 2320 wp : yes 2321 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2322 mtrr pge mca cmov pat pse36 clflush dts acpi 2323 mmx fxsr sse sse2 ss ht tm syscall nx lm 2324 constant_tsc pni monitor ds_cpl vmx est tm2 2325 cx16 xtpr lahf_lm 2326 bogomips : 4798.78 2327 clflush size : 64 2328 cache_alignment : 64 2329 address sizes : 36 bits physical, 48 bits virtual 2330 power management: 2331 */ 2332 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st ) 2333 { 2334 # define SET_ABCD(_a,_b,_c,_d) \ 2335 do { st->guest_RAX = (ULong)(_a); \ 2336 st->guest_RBX = (ULong)(_b); \ 2337 st->guest_RCX = (ULong)(_c); \ 2338 st->guest_RDX = (ULong)(_d); \ 2339 } while (0) 2340 2341 switch (0xFFFFFFFF & st->guest_RAX) { 2342 case 0x00000000: 2343 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69); 2344 break; 2345 case 0x00000001: 2346 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff); 2347 break; 2348 case 0x00000002: 2349 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049); 2350 break; 2351 case 0x00000003: 2352 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2353 break; 2354 case 0x00000004: { 2355 switch (0xFFFFFFFF & st->guest_RCX) { 2356 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f, 2357 0x0000003f, 0x00000001); break; 2358 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f, 2359 0x0000003f, 0x00000001); break; 2360 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f, 2361 0x00000fff, 0x00000001); break; 2362 default: SET_ABCD(0x00000000, 0x00000000, 2363 0x00000000, 0x00000000); break; 2364 } 2365 break; 2366 } 2367 case 0x00000005: 2368 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020); 2369 break; 2370 case 0x00000006: 2371 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000); 2372 break; 2373 case 0x00000007: 2374 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2375 break; 2376 case 0x00000008: 2377 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000); 2378 break; 2379 case 0x00000009: 2380 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2381 break; 2382 case 0x0000000a: 2383 unhandled_eax_value: 2384 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000); 2385 break; 2386 case 0x80000000: 2387 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); 2388 break; 2389 case 0x80000001: 2390 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800); 2391 break; 2392 case 0x80000002: 2393 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865); 2394 break; 2395 case 0x80000003: 2396 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020); 2397 break; 2398 case 0x80000004: 2399 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847); 2400 break; 2401 case 0x80000005: 2402 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2403 break; 2404 case 0x80000006: 2405 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000); 2406 break; 2407 case 0x80000007: 2408 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2409 break; 2410 case 0x80000008: 2411 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); 2412 break; 2413 default: 2414 goto unhandled_eax_value; 2415 } 2416 # undef SET_ABCD 2417 } 2418 2419 2420 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16 
2421 capable. 2422 2423 vendor_id : GenuineIntel 2424 cpu family : 6 2425 model : 37 2426 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz 2427 stepping : 2 2428 cpu MHz : 3334.000 2429 cache size : 4096 KB 2430 physical id : 0 2431 siblings : 4 2432 core id : 0 2433 cpu cores : 2 2434 apicid : 0 2435 initial apicid : 0 2436 fpu : yes 2437 fpu_exception : yes 2438 cpuid level : 11 2439 wp : yes 2440 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2441 mtrr pge mca cmov pat pse36 clflush dts acpi 2442 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp 2443 lm constant_tsc arch_perfmon pebs bts rep_good 2444 xtopology nonstop_tsc aperfmperf pni pclmulqdq 2445 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 2446 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida 2447 arat tpr_shadow vnmi flexpriority ept vpid 2448 bogomips : 6957.57 2449 clflush size : 64 2450 cache_alignment : 64 2451 address sizes : 36 bits physical, 48 bits virtual 2452 power management: 2453 */ 2454 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st ) 2455 { 2456 # define SET_ABCD(_a,_b,_c,_d) \ 2457 do { st->guest_RAX = (ULong)(_a); \ 2458 st->guest_RBX = (ULong)(_b); \ 2459 st->guest_RCX = (ULong)(_c); \ 2460 st->guest_RDX = (ULong)(_d); \ 2461 } while (0) 2462 2463 UInt old_eax = (UInt)st->guest_RAX; 2464 UInt old_ecx = (UInt)st->guest_RCX; 2465 2466 switch (old_eax) { 2467 case 0x00000000: 2468 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69); 2469 break; 2470 case 0x00000001: 2471 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff); 2472 break; 2473 case 0x00000002: 2474 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c); 2475 break; 2476 case 0x00000003: 2477 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2478 break; 2479 case 0x00000004: 2480 switch (old_ecx) { 2481 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f, 2482 0x0000003f, 0x00000000); break; 2483 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f, 2484 0x0000007f, 0x00000000); break; 2485 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f, 2486 0x000001ff, 0x00000000); break; 2487 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f, 2488 0x00000fff, 0x00000002); break; 2489 default: SET_ABCD(0x00000000, 0x00000000, 2490 0x00000000, 0x00000000); break; 2491 } 2492 break; 2493 case 0x00000005: 2494 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120); 2495 break; 2496 case 0x00000006: 2497 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000); 2498 break; 2499 case 0x00000007: 2500 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2501 break; 2502 case 0x00000008: 2503 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2504 break; 2505 case 0x00000009: 2506 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2507 break; 2508 case 0x0000000a: 2509 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603); 2510 break; 2511 case 0x0000000b: 2512 switch (old_ecx) { 2513 case 0x00000000: 2514 SET_ABCD(0x00000001, 0x00000002, 2515 0x00000100, 0x00000000); break; 2516 case 0x00000001: 2517 SET_ABCD(0x00000004, 0x00000004, 2518 0x00000201, 0x00000000); break; 2519 default: 2520 SET_ABCD(0x00000000, 0x00000000, 2521 old_ecx, 0x00000000); break; 2522 } 2523 break; 2524 case 0x0000000c: 2525 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000); 2526 break; 2527 case 0x0000000d: 2528 switch (old_ecx) { 2529 case 0x00000000: SET_ABCD(0x00000001, 0x00000002, 2530 0x00000100, 0x00000000); break; 2531 case 0x00000001: SET_ABCD(0x00000004, 0x00000004, 2532 0x00000201, 0x00000000); break; 2533 
default: SET_ABCD(0x00000000, 0x00000000, 2534 old_ecx, 0x00000000); break; 2535 } 2536 break; 2537 case 0x80000000: 2538 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); 2539 break; 2540 case 0x80000001: 2541 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800); 2542 break; 2543 case 0x80000002: 2544 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865); 2545 break; 2546 case 0x80000003: 2547 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020); 2548 break; 2549 case 0x80000004: 2550 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847); 2551 break; 2552 case 0x80000005: 2553 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2554 break; 2555 case 0x80000006: 2556 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000); 2557 break; 2558 case 0x80000007: 2559 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100); 2560 break; 2561 case 0x80000008: 2562 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); 2563 break; 2564 default: 2565 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000); 2566 break; 2567 } 2568 # undef SET_ABCD 2569 } 2570 2571 2572 /* Claim to be the following CPU (4 x ...), which is AVX and cx16 2573 capable. Plus (kludge!) it "supports" HTM. 2574 2575 vendor_id : GenuineIntel 2576 cpu family : 6 2577 model : 42 2578 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz 2579 stepping : 7 2580 cpu MHz : 1600.000 2581 cache size : 6144 KB 2582 physical id : 0 2583 siblings : 4 2584 core id : 3 2585 cpu cores : 4 2586 apicid : 6 2587 initial apicid : 6 2588 fpu : yes 2589 fpu_exception : yes 2590 cpuid level : 13 2591 wp : yes 2592 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2593 mtrr pge mca cmov pat pse36 clflush dts acpi 2594 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp 2595 lm constant_tsc arch_perfmon pebs bts rep_good 2596 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq 2597 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 2598 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx 2599 lahf_lm ida arat epb xsaveopt pln pts dts 2600 tpr_shadow vnmi flexpriority ept vpid 2601 2602 bogomips : 5768.94 2603 clflush size : 64 2604 cache_alignment : 64 2605 address sizes : 36 bits physical, 48 bits virtual 2606 power management: 2607 */ 2608 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st ) 2609 { 2610 # define SET_ABCD(_a,_b,_c,_d) \ 2611 do { st->guest_RAX = (ULong)(_a); \ 2612 st->guest_RBX = (ULong)(_b); \ 2613 st->guest_RCX = (ULong)(_c); \ 2614 st->guest_RDX = (ULong)(_d); \ 2615 } while (0) 2616 2617 UInt old_eax = (UInt)st->guest_RAX; 2618 UInt old_ecx = (UInt)st->guest_RCX; 2619 2620 switch (old_eax) { 2621 case 0x00000000: 2622 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69); 2623 break; 2624 case 0x00000001: 2625 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff); 2626 break; 2627 case 0x00000002: 2628 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000); 2629 break; 2630 case 0x00000003: 2631 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2632 break; 2633 case 0x00000004: 2634 switch (old_ecx) { 2635 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f, 2636 0x0000003f, 0x00000000); break; 2637 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f, 2638 0x0000003f, 0x00000000); break; 2639 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f, 2640 0x000001ff, 0x00000000); break; 2641 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f, 2642 0x00001fff, 0x00000006); break; 2643 default: SET_ABCD(0x00000000, 0x00000000, 2644 0x00000000, 0x00000000); break; 2645 } 2646 break; 2647 case 
0x00000005: 2648 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120); 2649 break; 2650 case 0x00000006: 2651 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000); 2652 break; 2653 case 0x00000007: 2654 SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000); 2655 break; 2656 case 0x00000008: 2657 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2658 break; 2659 case 0x00000009: 2660 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2661 break; 2662 case 0x0000000a: 2663 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603); 2664 break; 2665 case 0x0000000b: 2666 switch (old_ecx) { 2667 case 0x00000000: 2668 SET_ABCD(0x00000001, 0x00000001, 2669 0x00000100, 0x00000000); break; 2670 case 0x00000001: 2671 SET_ABCD(0x00000004, 0x00000004, 2672 0x00000201, 0x00000000); break; 2673 default: 2674 SET_ABCD(0x00000000, 0x00000000, 2675 old_ecx, 0x00000000); break; 2676 } 2677 break; 2678 case 0x0000000c: 2679 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2680 break; 2681 case 0x0000000d: 2682 switch (old_ecx) { 2683 case 0x00000000: SET_ABCD(0x00000007, 0x00000340, 2684 0x00000340, 0x00000000); break; 2685 case 0x00000001: SET_ABCD(0x00000001, 0x00000000, 2686 0x00000000, 0x00000000); break; 2687 case 0x00000002: SET_ABCD(0x00000100, 0x00000240, 2688 0x00000000, 0x00000000); break; 2689 default: SET_ABCD(0x00000000, 0x00000000, 2690 0x00000000, 0x00000000); break; 2691 } 2692 break; 2693 case 0x0000000e: 2694 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000); 2695 break; 2696 case 0x0000000f: 2697 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000); 2698 break; 2699 case 0x80000000: 2700 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); 2701 break; 2702 case 0x80000001: 2703 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800); 2704 break; 2705 case 0x80000002: 2706 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c); 2707 break; 2708 case 0x80000003: 2709 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d); 2710 break; 2711 case 0x80000004: 2712 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847); 2713 break; 2714 case 0x80000005: 2715 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2716 break; 2717 case 0x80000006: 2718 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000); 2719 break; 2720 case 0x80000007: 2721 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100); 2722 break; 2723 case 0x80000008: 2724 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); 2725 break; 2726 default: 2727 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000); 2728 break; 2729 } 2730 # undef SET_ABCD 2731 } 2732 2733 2734 ULong amd64g_calculate_RCR ( ULong arg, 2735 ULong rot_amt, 2736 ULong rflags_in, 2737 Long szIN ) 2738 { 2739 Bool wantRflags = toBool(szIN < 0); 2740 ULong sz = wantRflags ? (-szIN) : szIN; 2741 ULong tempCOUNT = rot_amt & (sz == 8 ? 
0x3F : 0x1F); 2742 ULong cf=0, of=0, tempcf; 2743 2744 switch (sz) { 2745 case 8: 2746 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2747 of = ((arg >> 63) ^ cf) & 1; 2748 while (tempCOUNT > 0) { 2749 tempcf = arg & 1; 2750 arg = (arg >> 1) | (cf << 63); 2751 cf = tempcf; 2752 tempCOUNT--; 2753 } 2754 break; 2755 case 4: 2756 while (tempCOUNT >= 33) tempCOUNT -= 33; 2757 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2758 of = ((arg >> 31) ^ cf) & 1; 2759 while (tempCOUNT > 0) { 2760 tempcf = arg & 1; 2761 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31); 2762 cf = tempcf; 2763 tempCOUNT--; 2764 } 2765 break; 2766 case 2: 2767 while (tempCOUNT >= 17) tempCOUNT -= 17; 2768 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2769 of = ((arg >> 15) ^ cf) & 1; 2770 while (tempCOUNT > 0) { 2771 tempcf = arg & 1; 2772 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15); 2773 cf = tempcf; 2774 tempCOUNT--; 2775 } 2776 break; 2777 case 1: 2778 while (tempCOUNT >= 9) tempCOUNT -= 9; 2779 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2780 of = ((arg >> 7) ^ cf) & 1; 2781 while (tempCOUNT > 0) { 2782 tempcf = arg & 1; 2783 arg = ((arg >> 1) & 0x7FULL) | (cf << 7); 2784 cf = tempcf; 2785 tempCOUNT--; 2786 } 2787 break; 2788 default: 2789 vpanic("calculate_RCR(amd64g): invalid size"); 2790 } 2791 2792 cf &= 1; 2793 of &= 1; 2794 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O); 2795 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O); 2796 2797 /* caller can ask to have back either the resulting flags or 2798 resulting value, but not both */ 2799 return wantRflags ? rflags_in : arg; 2800 } 2801 2802 ULong amd64g_calculate_RCL ( ULong arg, 2803 ULong rot_amt, 2804 ULong rflags_in, 2805 Long szIN ) 2806 { 2807 Bool wantRflags = toBool(szIN < 0); 2808 ULong sz = wantRflags ? (-szIN) : szIN; 2809 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F); 2810 ULong cf=0, of=0, tempcf; 2811 2812 switch (sz) { 2813 case 8: 2814 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2815 while (tempCOUNT > 0) { 2816 tempcf = (arg >> 63) & 1; 2817 arg = (arg << 1) | (cf & 1); 2818 cf = tempcf; 2819 tempCOUNT--; 2820 } 2821 of = ((arg >> 63) ^ cf) & 1; 2822 break; 2823 case 4: 2824 while (tempCOUNT >= 33) tempCOUNT -= 33; 2825 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2826 while (tempCOUNT > 0) { 2827 tempcf = (arg >> 31) & 1; 2828 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1)); 2829 cf = tempcf; 2830 tempCOUNT--; 2831 } 2832 of = ((arg >> 31) ^ cf) & 1; 2833 break; 2834 case 2: 2835 while (tempCOUNT >= 17) tempCOUNT -= 17; 2836 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2837 while (tempCOUNT > 0) { 2838 tempcf = (arg >> 15) & 1; 2839 arg = 0xFFFFULL & ((arg << 1) | (cf & 1)); 2840 cf = tempcf; 2841 tempCOUNT--; 2842 } 2843 of = ((arg >> 15) ^ cf) & 1; 2844 break; 2845 case 1: 2846 while (tempCOUNT >= 9) tempCOUNT -= 9; 2847 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2848 while (tempCOUNT > 0) { 2849 tempcf = (arg >> 7) & 1; 2850 arg = 0xFFULL & ((arg << 1) | (cf & 1)); 2851 cf = tempcf; 2852 tempCOUNT--; 2853 } 2854 of = ((arg >> 7) ^ cf) & 1; 2855 break; 2856 default: 2857 vpanic("calculate_RCL(amd64g): invalid size"); 2858 } 2859 2860 cf &= 1; 2861 of &= 1; 2862 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O); 2863 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O); 2864 2865 return wantRflags ? 
rflags_in : arg; 2866 } 2867 2868 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+) 2869 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25 2870 */ 2871 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which) 2872 { 2873 ULong hi, lo, tmp, A[16]; 2874 2875 A[0] = 0; A[1] = a; 2876 A[2] = A[1] << 1; A[3] = A[2] ^ a; 2877 A[4] = A[2] << 1; A[5] = A[4] ^ a; 2878 A[6] = A[3] << 1; A[7] = A[6] ^ a; 2879 A[8] = A[4] << 1; A[9] = A[8] ^ a; 2880 A[10] = A[5] << 1; A[11] = A[10] ^ a; 2881 A[12] = A[6] << 1; A[13] = A[12] ^ a; 2882 A[14] = A[7] << 1; A[15] = A[14] ^ a; 2883 2884 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15]; 2885 hi = lo >> 56; 2886 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15]; 2887 hi = (hi << 8) | (lo >> 56); 2888 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15]; 2889 hi = (hi << 8) | (lo >> 56); 2890 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15]; 2891 hi = (hi << 8) | (lo >> 56); 2892 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15]; 2893 hi = (hi << 8) | (lo >> 56); 2894 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15]; 2895 hi = (hi << 8) | (lo >> 56); 2896 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15]; 2897 hi = (hi << 8) | (lo >> 56); 2898 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15]; 2899 2900 ULong m0 = -1; 2901 m0 /= 255; 2902 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp; 2903 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp; 2904 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp; 2905 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp; 2906 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp; 2907 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp; 2908 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp; 2909 2910 return which ? hi : lo; 2911 } 2912 2913 2914 /* CALLED FROM GENERATED CODE */ 2915 /* DIRTY HELPER (non-referentially-transparent) */ 2916 /* Horrible hack. On non-amd64 platforms, return 1. */ 2917 ULong amd64g_dirtyhelper_RDTSC ( void ) 2918 { 2919 # if defined(__x86_64__) 2920 UInt eax, edx; 2921 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx)); 2922 return (((ULong)edx) << 32) | ((ULong)eax); 2923 # else 2924 return 1ULL; 2925 # endif 2926 } 2927 2928 /* CALLED FROM GENERATED CODE */ 2929 /* DIRTY HELPER (non-referentially-transparent) */ 2930 /* Horrible hack. On non-amd64 platforms, return 1. */ 2931 /* This uses a different calling convention from _RDTSC just above 2932 only because of the difficulty of returning 96 bits from a C 2933 function -- RDTSC returns 64 bits and so is simple by comparison, 2934 on amd64. */ 2935 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st ) 2936 { 2937 # if defined(__x86_64__) 2938 UInt eax, ecx, edx; 2939 __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx)); 2940 st->guest_RAX = (ULong)eax; 2941 st->guest_RCX = (ULong)ecx; 2942 st->guest_RDX = (ULong)edx; 2943 # else 2944 /* Do nothing. */ 2945 # endif 2946 } 2947 2948 /* CALLED FROM GENERATED CODE */ 2949 /* DIRTY HELPER (non-referentially-transparent) */ 2950 /* Horrible hack. On non-amd64 platforms, return 0. 
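   On amd64 hosts the port is read directly with an in{b,w,l}
   instruction of the requested width.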
*/ 2951 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ ) 2952 { 2953 # if defined(__x86_64__) 2954 ULong r = 0; 2955 portno &= 0xFFFF; 2956 switch (sz) { 2957 case 4: 2958 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0" 2959 : "=a" (r) : "Nd" (portno)); 2960 break; 2961 case 2: 2962 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0" 2963 : "=a" (r) : "Nd" (portno)); 2964 break; 2965 case 1: 2966 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0" 2967 : "=a" (r) : "Nd" (portno)); 2968 break; 2969 default: 2970 break; /* note: no 64-bit version of insn exists */ 2971 } 2972 return r; 2973 # else 2974 return 0; 2975 # endif 2976 } 2977 2978 2979 /* CALLED FROM GENERATED CODE */ 2980 /* DIRTY HELPER (non-referentially-transparent) */ 2981 /* Horrible hack. On non-amd64 platforms, do nothing. */ 2982 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ ) 2983 { 2984 # if defined(__x86_64__) 2985 portno &= 0xFFFF; 2986 switch (sz) { 2987 case 4: 2988 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1" 2989 : : "a" (data), "Nd" (portno)); 2990 break; 2991 case 2: 2992 __asm__ __volatile__("outw %w0, %w1" 2993 : : "a" (data), "Nd" (portno)); 2994 break; 2995 case 1: 2996 __asm__ __volatile__("outb %b0, %w1" 2997 : : "a" (data), "Nd" (portno)); 2998 break; 2999 default: 3000 break; /* note: no 64-bit version of insn exists */ 3001 } 3002 # else 3003 /* do nothing */ 3004 # endif 3005 } 3006 3007 /* CALLED FROM GENERATED CODE */ 3008 /* DIRTY HELPER (non-referentially-transparent) */ 3009 /* Horrible hack. On non-amd64 platforms, do nothing. */ 3010 /* op = 0: call the native SGDT instruction. 3011 op = 1: call the native SIDT instruction. 3012 */ 3013 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) { 3014 # if defined(__x86_64__) 3015 switch (op) { 3016 case 0: 3017 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory"); 3018 break; 3019 case 1: 3020 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory"); 3021 break; 3022 default: 3023 vpanic("amd64g_dirtyhelper_SxDT"); 3024 } 3025 # else 3026 /* do nothing */ 3027 UChar* p = (UChar*)address; 3028 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0; 3029 p[6] = p[7] = p[8] = p[9] = 0; 3030 # endif 3031 } 3032 3033 /*---------------------------------------------------------------*/ 3034 /*--- Helpers for MMX/SSE/SSE2. ---*/ 3035 /*---------------------------------------------------------------*/ 3036 3037 static inline UChar abdU8 ( UChar xx, UChar yy ) { 3038 return toUChar(xx>yy ? 
xx-yy : yy-xx); 3039 } 3040 3041 static inline ULong mk32x2 ( UInt w1, UInt w0 ) { 3042 return (((ULong)w1) << 32) | ((ULong)w0); 3043 } 3044 3045 static inline UShort sel16x4_3 ( ULong w64 ) { 3046 UInt hi32 = toUInt(w64 >> 32); 3047 return toUShort(hi32 >> 16); 3048 } 3049 static inline UShort sel16x4_2 ( ULong w64 ) { 3050 UInt hi32 = toUInt(w64 >> 32); 3051 return toUShort(hi32); 3052 } 3053 static inline UShort sel16x4_1 ( ULong w64 ) { 3054 UInt lo32 = toUInt(w64); 3055 return toUShort(lo32 >> 16); 3056 } 3057 static inline UShort sel16x4_0 ( ULong w64 ) { 3058 UInt lo32 = toUInt(w64); 3059 return toUShort(lo32); 3060 } 3061 3062 static inline UChar sel8x8_7 ( ULong w64 ) { 3063 UInt hi32 = toUInt(w64 >> 32); 3064 return toUChar(hi32 >> 24); 3065 } 3066 static inline UChar sel8x8_6 ( ULong w64 ) { 3067 UInt hi32 = toUInt(w64 >> 32); 3068 return toUChar(hi32 >> 16); 3069 } 3070 static inline UChar sel8x8_5 ( ULong w64 ) { 3071 UInt hi32 = toUInt(w64 >> 32); 3072 return toUChar(hi32 >> 8); 3073 } 3074 static inline UChar sel8x8_4 ( ULong w64 ) { 3075 UInt hi32 = toUInt(w64 >> 32); 3076 return toUChar(hi32 >> 0); 3077 } 3078 static inline UChar sel8x8_3 ( ULong w64 ) { 3079 UInt lo32 = toUInt(w64); 3080 return toUChar(lo32 >> 24); 3081 } 3082 static inline UChar sel8x8_2 ( ULong w64 ) { 3083 UInt lo32 = toUInt(w64); 3084 return toUChar(lo32 >> 16); 3085 } 3086 static inline UChar sel8x8_1 ( ULong w64 ) { 3087 UInt lo32 = toUInt(w64); 3088 return toUChar(lo32 >> 8); 3089 } 3090 static inline UChar sel8x8_0 ( ULong w64 ) { 3091 UInt lo32 = toUInt(w64); 3092 return toUChar(lo32 >> 0); 3093 } 3094 3095 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3096 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy ) 3097 { 3098 return 3099 mk32x2( 3100 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy))) 3101 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))), 3102 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy))) 3103 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy))) 3104 ); 3105 } 3106 3107 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3108 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy ) 3109 { 3110 UInt t = 0; 3111 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) ); 3112 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) ); 3113 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) ); 3114 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) ); 3115 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) ); 3116 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) ); 3117 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) ); 3118 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) ); 3119 t &= 0xFFFF; 3120 return (ULong)t; 3121 } 3122 3123 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3124 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi ) 3125 { 3126 UShort t, min; 3127 UInt idx; 3128 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; } 3129 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; } 3130 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; } 3131 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; } 3132 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; } 3133 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; } 3134 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; } 3135 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; } 3136 return ((ULong)(idx << 16)) | ((ULong)min); 3137 } 3138 3139 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3140 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b ) 3141 { 3142 UInt i; 3143 ULong crc = (b & 0xFFULL) ^ crcIn; 3144 for (i = 0; 
i < 8; i++) 3145 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0); 3146 return crc; 3147 } 3148 3149 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3150 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w ) 3151 { 3152 UInt i; 3153 ULong crc = (w & 0xFFFFULL) ^ crcIn; 3154 for (i = 0; i < 16; i++) 3155 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0); 3156 return crc; 3157 } 3158 3159 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3160 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l ) 3161 { 3162 UInt i; 3163 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn; 3164 for (i = 0; i < 32; i++) 3165 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0); 3166 return crc; 3167 } 3168 3169 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3170 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q ) 3171 { 3172 ULong crc = amd64g_calc_crc32l(crcIn, q); 3173 return amd64g_calc_crc32l(crc, q >> 32); 3174 } 3175 3176 3177 /* .. helper for next fn .. */ 3178 static inline ULong sad_8x4 ( ULong xx, ULong yy ) 3179 { 3180 UInt t = 0; 3181 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) ); 3182 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) ); 3183 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) ); 3184 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) ); 3185 return (ULong)t; 3186 } 3187 3188 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3189 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo, 3190 ULong dHi, ULong dLo, 3191 ULong imm_and_return_control_bit ) 3192 { 3193 UInt imm8 = imm_and_return_control_bit & 7; 3194 Bool calcHi = (imm_and_return_control_bit >> 7) & 1; 3195 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */ 3196 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */ 3197 /* For src we only need 32 bits, so get them into the 3198 lower half of a 64 bit word. */ 3199 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1)); 3200 /* For dst we need to get hold of 56 bits (7 bytes) from a total of 3201 11 bytes. If calculating the low part of the result, need bytes 3202 dstOffsL * 4 + (0 .. 6); if calculating the high part, 3203 dstOffsL * 4 + (4 .. 10). */ 3204 ULong dst; 3205 /* dstOffL = 0, Lo -> 0 .. 6 3206 dstOffL = 1, Lo -> 4 .. 10 3207 dstOffL = 0, Hi -> 4 .. 10 3208 dstOffL = 1, Hi -> 8 .. 14 3209 */ 3210 if (calcHi && dstOffsL) { 3211 /* 8 .. 14 */ 3212 dst = dHi & 0x00FFFFFFFFFFFFFFULL; 3213 } 3214 else if (!calcHi && !dstOffsL) { 3215 /* 0 .. 6 */ 3216 dst = dLo & 0x00FFFFFFFFFFFFFFULL; 3217 } 3218 else { 3219 /* 4 .. 
10 */ 3220 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32); 3221 } 3222 ULong r0 = sad_8x4( dst >> 0, src ); 3223 ULong r1 = sad_8x4( dst >> 8, src ); 3224 ULong r2 = sad_8x4( dst >> 16, src ); 3225 ULong r3 = sad_8x4( dst >> 24, src ); 3226 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0; 3227 return res; 3228 } 3229 3230 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3231 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask ) 3232 { 3233 ULong dst = 0; 3234 ULong src_bit; 3235 ULong dst_bit = 1; 3236 for (src_bit = 1; src_bit; src_bit <<= 1) { 3237 if (mask & src_bit) { 3238 if (src_masked & src_bit) dst |= dst_bit; 3239 dst_bit <<= 1; 3240 } 3241 } 3242 return dst; 3243 } 3244 3245 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3246 ULong amd64g_calculate_pdep ( ULong src, ULong mask ) 3247 { 3248 ULong dst = 0; 3249 ULong dst_bit; 3250 ULong src_bit = 1; 3251 for (dst_bit = 1; dst_bit; dst_bit <<= 1) { 3252 if (mask & dst_bit) { 3253 if (src & src_bit) dst |= dst_bit; 3254 src_bit <<= 1; 3255 } 3256 } 3257 return dst; 3258 } 3259 3260 /*---------------------------------------------------------------*/ 3261 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/ 3262 /*---------------------------------------------------------------*/ 3263 3264 static UInt zmask_from_V128 ( V128* arg ) 3265 { 3266 UInt i, res = 0; 3267 for (i = 0; i < 16; i++) { 3268 res |= ((arg->w8[i] == 0) ? 1 : 0) << i; 3269 } 3270 return res; 3271 } 3272 3273 static UInt zmask_from_V128_wide ( V128* arg ) 3274 { 3275 UInt i, res = 0; 3276 for (i = 0; i < 8; i++) { 3277 res |= ((arg->w16[i] == 0) ? 1 : 0) << i; 3278 } 3279 return res; 3280 } 3281 3282 /* Helps with PCMP{I,E}STR{I,M}. 3283 3284 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, 3285 actually it could be a clean helper, but for the fact that we can't 3286 pass by value 2 x V128 to a clean helper, nor have one returned.) 3287 Reads guest state, writes to guest state for the xSTRM cases, no 3288 accesses of memory, is a pure function. 3289 3290 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so 3291 the callee knows which I/E and I/M variant it is dealing with and 3292 what the specific operation is. 4th byte of opcode is in the range 3293 0x60 to 0x63: 3294 istri 66 0F 3A 63 3295 istrm 66 0F 3A 62 3296 estri 66 0F 3A 61 3297 estrm 66 0F 3A 60 3298 3299 gstOffL and gstOffR are the guest state offsets for the two XMM 3300 register inputs. We never have to deal with the memory case since 3301 that is handled by pre-loading the relevant value into the fake 3302 XMM16 register. 3303 3304 For ESTRx variants, edxIN and eaxIN hold the values of those two 3305 registers. 3306 3307 In all cases, the bottom 16 bits of the result contain the new 3308 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the 3309 result hold the new %ecx value. For xSTRM variants, the helper 3310 writes the result directly to the guest XMM0. 3311 3312 Declarable side effects: in all cases, reads guest state at 3313 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes 3314 guest_XMM0. 3315 3316 Is expected to be called with opc_and_imm combinations which have 3317 actually been validated, and will assert if otherwise. The front 3318 end should ensure we're only called with verified values. 
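   (So, for example, a caller of an xSTRI variant would recover the new
   %ecx as result >> 16 and the OSZACP bits as result & 0xFFFF.)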
*/
ULong amd64g_dirtyhelper_PCMPxSTRx ( 
          VexGuestAMD64State* gst,
          HWord opc4_and_imm,
          HWord gstOffL, HWord gstOffR,
          HWord edxIN, HWord eaxIN
       )
{
   HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
   HWord imm8 = opc4_and_imm & 0xFF;
   HWord isISTRx = opc4 & 2;
   HWord isxSTRM = (opc4 & 1) ^ 1;
   vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
   HWord wide = (imm8 & 1);

   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   /* Create the arg validity masks, either from the vectors
      themselves or from the supplied edx/eax values. */
   // FIXME: this is only right for the 8-bit data cases.
   // At least that is asserted above.
   UInt zmaskL, zmaskR;

   // temp spot for the resulting flags and vector.
   V128 resV;
   UInt resOSZACP;

   // for checking whether case was handled
   Bool ok = False;

   if (wide) {
      if (isISTRx) {
         zmaskL = zmask_from_V128_wide(argL);
         zmaskR = zmask_from_V128_wide(argR);
      } else {
         Int tmp;
         tmp = edxIN & 0xFFFFFFFF;
         if (tmp < -8) tmp = -8;
         if (tmp > 8)  tmp = 8;
         if (tmp < 0)  tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 8);
         zmaskL = (1 << tmp) & 0xFF;
         tmp = eaxIN & 0xFFFFFFFF;
         if (tmp < -8) tmp = -8;
         if (tmp > 8)  tmp = 8;
         if (tmp < 0)  tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 8);
         zmaskR = (1 << tmp) & 0xFF;
      }
      // do the math
      ok = compute_PCMPxSTRx_wide ( 
              &resV, &resOSZACP, argL, argR,
              zmaskL, zmaskR, imm8, (Bool)isxSTRM
           );
   } else {
      if (isISTRx) {
         zmaskL = zmask_from_V128(argL);
         zmaskR = zmask_from_V128(argR);
      } else {
         Int tmp;
         tmp = edxIN & 0xFFFFFFFF;
         if (tmp < -16) tmp = -16;
         if (tmp > 16)  tmp = 16;
         if (tmp < 0)   tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 16);
         zmaskL = (1 << tmp) & 0xFFFF;
         tmp = eaxIN & 0xFFFFFFFF;
         if (tmp < -16) tmp = -16;
         if (tmp > 16)  tmp = 16;
         if (tmp < 0)   tmp = -tmp;
         vassert(tmp >= 0 && tmp <= 16);
         zmaskR = (1 << tmp) & 0xFFFF;
      }
      // do the math
      ok = compute_PCMPxSTRx ( 
              &resV, &resOSZACP, argL, argR,
              zmaskL, zmaskR, imm8, (Bool)isxSTRM
           );
   }

   // front end shouldn't pass us any imm8 variants we can't
   // handle.  Hence:
   vassert(ok);

   // So, finally we need to get the results back to the caller.
   // In all cases, the new OSZACP value is the lowest 16 of
   // the return value.
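   // For xSTRI variants, bits 31:16 additionally carry the new %ecx
   // value; for xSTRM variants the vector result goes directly to
   // guest_YMM0.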
3408 if (isxSTRM) { 3409 gst->guest_YMM0[0] = resV.w32[0]; 3410 gst->guest_YMM0[1] = resV.w32[1]; 3411 gst->guest_YMM0[2] = resV.w32[2]; 3412 gst->guest_YMM0[3] = resV.w32[3]; 3413 return resOSZACP & 0x8D5; 3414 } else { 3415 UInt newECX = resV.w32[0] & 0xFFFF; 3416 return (newECX << 16) | (resOSZACP & 0x8D5); 3417 } 3418 } 3419 3420 /*---------------------------------------------------------------*/ 3421 /*--- AES primitives and helpers ---*/ 3422 /*---------------------------------------------------------------*/ 3423 /* a 16 x 16 matrix */ 3424 static const UChar sbox[256] = { // row nr 3425 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1 3426 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 3427 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2 3428 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 3429 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3 3430 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 3431 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4 3432 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 3433 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5 3434 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 3435 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6 3436 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 3437 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7 3438 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 3439 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8 3440 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 3441 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9 3442 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 3443 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10 3444 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 3445 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11 3446 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 3447 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12 3448 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 3449 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13 3450 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 3451 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14 3452 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 3453 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15 3454 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 3455 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16 3456 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 3457 }; 3458 static void SubBytes (V128* v) 3459 { 3460 V128 r; 3461 UInt i; 3462 for (i = 0; i < 16; i++) 3463 r.w8[i] = sbox[v->w8[i]]; 3464 *v = r; 3465 } 3466 3467 /* a 16 x 16 matrix */ 3468 static const UChar invsbox[256] = { // row nr 3469 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1 3470 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 3471 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2 3472 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 3473 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3 3474 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, 3475 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4 3476 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, 3477 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5 3478 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 3479 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6 3480 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, 3481 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7 3482 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, 3483 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8 3484 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 3485 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 
// 9 3486 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, 3487 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10 3488 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, 3489 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11 3490 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 3491 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12 3492 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, 3493 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13 3494 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, 3495 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14 3496 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 3497 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15 3498 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 3499 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16 3500 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d 3501 }; 3502 static void InvSubBytes (V128* v) 3503 { 3504 V128 r; 3505 UInt i; 3506 for (i = 0; i < 16; i++) 3507 r.w8[i] = invsbox[v->w8[i]]; 3508 *v = r; 3509 } 3510 3511 static const UChar ShiftRows_op[16] = 3512 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0}; 3513 static void ShiftRows (V128* v) 3514 { 3515 V128 r; 3516 UInt i; 3517 for (i = 0; i < 16; i++) 3518 r.w8[i] = v->w8[ShiftRows_op[15-i]]; 3519 *v = r; 3520 } 3521 3522 static const UChar InvShiftRows_op[16] = 3523 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0}; 3524 static void InvShiftRows (V128* v) 3525 { 3526 V128 r; 3527 UInt i; 3528 for (i = 0; i < 16; i++) 3529 r.w8[i] = v->w8[InvShiftRows_op[15-i]]; 3530 *v = r; 3531 } 3532 3533 /* Multiplication of the finite fields elements of AES. 3534 See "A Specification for The AES Algorithm Rijndael 3535 (by Joan Daemen & Vincent Rijmen)" 3536 Dr. Brian Gladman, v3.1, 3rd March 2001. */ 3537 /* N values so that (hex) xy = 0x03^N. 3538 0x00 cannot be used. 
We put 0xff for this value.*/ 3539 /* a 16 x 16 matrix */ 3540 static const UChar Nxy[256] = { // row nr 3541 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1 3542 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, 3543 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2 3544 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, 3545 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3 3546 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, 3547 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4 3548 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, 3549 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5 3550 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, 3551 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6 3552 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, 3553 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7 3554 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, 3555 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8 3556 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, 3557 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9 3558 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, 3559 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10 3560 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, 3561 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11 3562 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, 3563 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12 3564 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, 3565 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13 3566 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, 3567 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14 3568 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, 3569 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15 3570 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, 3571 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16 3572 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07 3573 }; 3574 3575 /* E values so that E = 0x03^xy. 
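   Exy below is therefore the antilog table inverse to Nxy:
   Exy[Nxy[x]] == x for any nonzero x.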
static const UChar Exy[256] = {                         // row nr
   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};

static inline UChar ff_mul(UChar u1, UChar u2)
{
   if ((u1 > 0) && (u2 > 0)) {
      UInt ui = Nxy[u1] + Nxy[u2];
      if (ui >= 255)
         ui = ui - 255;
      return Exy[ui];
   } else {
      return 0;
   }
}
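/* Illustrative only (not part of VEX, never compiled): ff_mul above is
   GF(2^8) multiplication done through the log/antilog tables, so the
   worked example from FIPS-197 section 4.2 holds: {57} * {83} = {c1}.
   Concretely, Nxy[0x57] + Nxy[0x83] = 0x62 + 0x50 = 0xb2, and
   Exy[0xb2] = 0xc1. */
#if 0
static void ff_mul_example ( void )
{
   vassert(ff_mul(0x57, 0x83) == 0xc1);
   vassert(ff_mul(0x00, 0xab) == 0);   /* anything times zero is zero */
}
#endif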
static void MixColumns (V128* v)
{
   V128 r;
   Int j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
                  ^ P(v,j,2) ^ P(v,j,3);
      P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
                  ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
                  ^ ff_mul(0x03, P(v,j,3) );
      P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
                  ^ ff_mul( 0x02, P(v,j,3) );
   }
   *v = r;
#undef P
}

static void InvMixColumns (V128* v)
{
   V128 r;
   Int j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
                  ^ ff_mul(0x0d, P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
      P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
                  ^ ff_mul(0x0b, P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
      P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
                  ^ ff_mul(0x0e, P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
      P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
                  ^ ff_mul(0x09, P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
   }
   *v = r;
#undef P
}

/* For description, see definition in guest_amd64_defs.h */
void amd64g_dirtyhelper_AES (
          VexGuestAMD64State* gst,
          HWord opc4, HWord gstOffD,
          HWord gstOffL, HWord gstOffR
       )
{
   // where the args are
   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   V128  r;

   switch (opc4) {
      case 0xDC: /* AESENC */
      case 0xDD: /* AESENCLAST */
         r = *argR;
         ShiftRows (&r);
         SubBytes  (&r);
         if (opc4 == 0xDC)
            MixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDE: /* AESDEC */
      case 0xDF: /* AESDECLAST */
         r = *argR;
         InvShiftRows (&r);
         InvSubBytes (&r);
         if (opc4 == 0xDE)
            InvMixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDB: /* AESIMC */
         *argD = *argL;
         InvMixColumns (argD);
         break;
      default: vassert(0);
   }
}

static inline UInt RotWord (UInt w32)
{
   return ((w32 >> 8) | (w32 << 24));
}

static inline UInt SubWord (UInt w32)
{
   UChar *w8;
   UChar *r8;
   UInt  res;
   w8 = (UChar*) &w32;
   r8 = (UChar*) &res;
   r8[0] = sbox[w8[0]];
   r8[1] = sbox[w8[1]];
   r8[2] = sbox[w8[2]];
   r8[3] = sbox[w8[3]];
   return res;
}

/* For description, see definition in guest_amd64_defs.h */
extern void amd64g_dirtyhelper_AESKEYGENASSIST (
          VexGuestAMD64State* gst,
          HWord imm8,
          HWord gstOffL, HWord gstOffR
       )
{
   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   argR->w32[2] = SubWord (argL->w32[3]);
   argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   argR->w32[0] = SubWord (argL->w32[1]);
}
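/* Illustrative only (not part of VEX, never compiled): one way an
   AES-128 key-expansion step could be built on the helper above.
   'prev' is assumed to hold round key N, and 'kga' the helper's output
   for argL == prev and imm8 equal to the matching round constant; all
   names here are hypothetical.  The point is that kga->w32[3] already
   equals RotWord(SubWord(prev->w32[3])) ^ rcon, which is exactly the
   value the key schedule folds into the first word of the next round
   key. */
#if 0
static void aes128_expand_one_round ( const V128* prev, const V128* kga,
                                      /*OUT*/V128* next )
{
   next->w32[0] = prev->w32[0] ^ kga->w32[3];
   next->w32[1] = prev->w32[1] ^ next->w32[0];
   next->w32[2] = prev->w32[2] ^ next->w32[1];
   next->w32[3] = prev->w32[3] ^ next->w32[2];
}
#endif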
/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing,               ---*/
/*--- guest state as a whole.                                 ---*/
/*---------------------------------------------------------------*/

/* Initialise the entire amd64 guest state. */
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
{
   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER = 0;
   vex_state->pad0 = 0;

   vex_state->guest_RAX = 0;
   vex_state->guest_RCX = 0;
   vex_state->guest_RDX = 0;
   vex_state->guest_RBX = 0;
   vex_state->guest_RSP = 0;
   vex_state->guest_RBP = 0;
   vex_state->guest_RSI = 0;
   vex_state->guest_RDI = 0;
   vex_state->guest_R8  = 0;
   vex_state->guest_R9  = 0;
   vex_state->guest_R10 = 0;
   vex_state->guest_R11 = 0;
   vex_state->guest_R12 = 0;
   vex_state->guest_R13 = 0;
   vex_state->guest_R14 = 0;
   vex_state->guest_R15 = 0;

   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = 0;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;

   vex_state->guest_DFLAG   = 1; /* forwards */
   vex_state->guest_IDFLAG  = 0;
   vex_state->guest_ACFLAG  = 0;

   /* HACK: represent the offset associated with %fs==0.  This
      assumes that %fs is only ever zero. */
   vex_state->guest_FS_ZERO = 0;

   vex_state->guest_RIP = 0;

   /* Initialise the simulated FPU */
   amd64g_dirtyhelper_FINIT( vex_state );

   /* Initialise the AVX state. */
#  define AVXZERO(_ymm) \
      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
      } while (0)
   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   AVXZERO(vex_state->guest_YMM0);
   AVXZERO(vex_state->guest_YMM1);
   AVXZERO(vex_state->guest_YMM2);
   AVXZERO(vex_state->guest_YMM3);
   AVXZERO(vex_state->guest_YMM4);
   AVXZERO(vex_state->guest_YMM5);
   AVXZERO(vex_state->guest_YMM6);
   AVXZERO(vex_state->guest_YMM7);
   AVXZERO(vex_state->guest_YMM8);
   AVXZERO(vex_state->guest_YMM9);
   AVXZERO(vex_state->guest_YMM10);
   AVXZERO(vex_state->guest_YMM11);
   AVXZERO(vex_state->guest_YMM12);
   AVXZERO(vex_state->guest_YMM13);
   AVXZERO(vex_state->guest_YMM14);
   AVXZERO(vex_state->guest_YMM15);
   AVXZERO(vex_state->guest_YMM16);
#  undef AVXZERO

   vex_state->guest_EMNOTE = EmNote_NONE;

   /* These should not ever be either read or written, but we
      initialise them anyway. */
   vex_state->guest_CMSTART = 0;
   vex_state->guest_CMLEN   = 0;

   vex_state->guest_NRADDR   = 0;
   vex_state->guest_SC_CLASS = 0;
   vex_state->guest_GS_0x60  = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->pad1 = 0;
}


/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest %RSP, %RBP and %RIP
   only.  These are the minimum needed to extract correct stack
   backtraces from amd64 code.

   Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
*/
Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
                                                   Int maxoff)
{
   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   Int rbp_max = rbp_min + 8 - 1;
   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   Int rsp_max = rsp_min + 8 - 1;
   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   Int rip_max = rip_min + 8 - 1;

   if (maxoff < rsp_min || minoff > rsp_max) {
      /* no overlap with rsp */
      if (vex_control.iropt_register_updates == VexRegUpdSpAtMemAccess)
         return False; // We only need to check the stack pointer.
   } else {
      return True;
   }

   if (maxoff < rbp_min || minoff > rbp_max) {
      /* no overlap with rbp */
   } else {
      return True;
   }

   if (maxoff < rip_min || minoff > rip_max) {
      /* no overlap with rip */
   } else {
      return True;
   }

   return False;
}
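/* Illustrative only (not part of VEX, never compiled): how the
   predicate above classifies a couple of guest-state writes.  The
   offsets refer to real guest-state fields; the checks themselves are
   just examples. */
#if 0
static void precise_exns_examples ( void )
{
   Int rax = offsetof(VexGuestAMD64State, guest_RAX);
   Int rsp = offsetof(VexGuestAMD64State, guest_RSP);
   /* A write touching only %rax never needs precise exceptions ... */
   vassert(!guest_amd64_state_requires_precise_mem_exns(rax, rax + 7));
   /* ... whereas any write overlapping %rsp always does. */
   vassert(guest_amd64_state_requires_precise_mem_exns(rsp, rsp + 7));
}
#endif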
#define ALWAYSDEFD(field)                             \
    { offsetof(VexGuestAMD64State, field),            \
      (sizeof ((VexGuestAMD64State*)0)->field) }

VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /*    */ ALWAYSDEFD(guest_CS),
                 // /*    */ ALWAYSDEFD(guest_DS),
                 // /*    */ ALWAYSDEFD(guest_ES),
                 // /*    */ ALWAYSDEFD(guest_FS),
                 // /*    */ ALWAYSDEFD(guest_GS),
                 // /*    */ ALWAYSDEFD(guest_SS),
                 // /*    */ ALWAYSDEFD(guest_LDT),
                 // /*    */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_CMSTART),
                 /* 13 */ ALWAYSDEFD(guest_CMLEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };


/*---------------------------------------------------------------*/
/*--- end                                 guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/