/*---------------------------------------------------------------*/
/*--- begin                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emwarn.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-amd64/toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0


/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after imulq/mulq.
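   The two routines below use the usual schoolbook decomposition:
   split each operand into 32-bit halves (u = u1:u0, v = v1:v0), form
   the 32x32->64 partial products, and add them with the appropriate
   32-bit shifts.  Roughly, the full product is
   u0*v0 + ((u0*v1 + u1*v0) << 32) + ((u1*v1) << 64); the low 64 bits
   are simply u*v, and the high 64 bits are accumulated through the
   intermediates w0, w1, w2 and t.  In the signed variant the high
   halves u1 and v1 keep their signs, which folds the necessary sign
   correction into the partial products.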
*/ 78 79 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo ) 80 { 81 ULong u0, v0, w0; 82 Long u1, v1, w1, w2, t; 83 u0 = u & 0xFFFFFFFFULL; 84 u1 = u >> 32; 85 v0 = v & 0xFFFFFFFFULL; 86 v1 = v >> 32; 87 w0 = u0 * v0; 88 t = u1 * v0 + (w0 >> 32); 89 w1 = t & 0xFFFFFFFFULL; 90 w2 = t >> 32; 91 w1 = u0 * v1 + w1; 92 *rHi = u1 * v1 + w2 + (w1 >> 32); 93 *rLo = u * v; 94 } 95 96 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo ) 97 { 98 ULong u0, v0, w0; 99 ULong u1, v1, w1,w2,t; 100 u0 = u & 0xFFFFFFFFULL; 101 u1 = u >> 32; 102 v0 = v & 0xFFFFFFFFULL; 103 v1 = v >> 32; 104 w0 = u0 * v0; 105 t = u1 * v0 + (w0 >> 32); 106 w1 = t & 0xFFFFFFFFULL; 107 w2 = t >> 32; 108 w1 = u0 * v1 + w1; 109 *rHi = u1 * v1 + w2 + (w1 >> 32); 110 *rLo = u * v; 111 } 112 113 114 static const UChar parity_table[256] = { 115 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 116 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 117 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 118 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 119 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 120 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 122 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 123 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 124 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 125 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 126 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 127 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 128 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 130 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 131 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 132 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 134 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 135 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 136 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 138 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 139 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 140 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 141 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 142 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 143 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 144 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, 145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 
0, 146 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 147 }; 148 149 /* generalised left-shifter */ 150 static inline Long lshift ( Long x, Int n ) 151 { 152 if (n >= 0) 153 return x << n; 154 else 155 return x >> (-n); 156 } 157 158 /* identity on ULong */ 159 static inline ULong idULong ( ULong x ) 160 { 161 return x; 162 } 163 164 165 #define PREAMBLE(__data_bits) \ 166 /* const */ ULong DATA_MASK \ 167 = __data_bits==8 \ 168 ? 0xFFULL \ 169 : (__data_bits==16 \ 170 ? 0xFFFFULL \ 171 : (__data_bits==32 \ 172 ? 0xFFFFFFFFULL \ 173 : 0xFFFFFFFFFFFFFFFFULL)); \ 174 /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \ 175 /* const */ ULong CC_DEP1 = cc_dep1_formal; \ 176 /* const */ ULong CC_DEP2 = cc_dep2_formal; \ 177 /* const */ ULong CC_NDEP = cc_ndep_formal; \ 178 /* Four bogus assignments, which hopefully gcc can */ \ 179 /* optimise away, and which stop it complaining about */ \ 180 /* unused variables. */ \ 181 SIGN_MASK = SIGN_MASK; \ 182 DATA_MASK = DATA_MASK; \ 183 CC_DEP2 = CC_DEP2; \ 184 CC_NDEP = CC_NDEP; 185 186 187 /*-------------------------------------------------------------*/ 188 189 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \ 190 { \ 191 PREAMBLE(DATA_BITS); \ 192 { Long cf, pf, af, zf, sf, of; \ 193 Long argL, argR, res; \ 194 argL = CC_DEP1; \ 195 argR = CC_DEP2; \ 196 res = argL + argR; \ 197 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \ 198 pf = parity_table[(UChar)res]; \ 199 af = (res ^ argL ^ argR) & 0x10; \ 200 zf = ((DATA_UTYPE)res == 0) << 6; \ 201 sf = lshift(res, 8 - DATA_BITS) & 0x80; \ 202 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \ 203 12 - DATA_BITS) & AMD64G_CC_MASK_O; \ 204 return cf | pf | af | zf | sf | of; \ 205 } \ 206 } 207 208 /*-------------------------------------------------------------*/ 209 210 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \ 211 { \ 212 PREAMBLE(DATA_BITS); \ 213 { Long cf, pf, af, zf, sf, of; \ 214 Long argL, argR, res; \ 215 argL = CC_DEP1; \ 216 argR = CC_DEP2; \ 217 res = argL - argR; \ 218 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \ 219 pf = parity_table[(UChar)res]; \ 220 af = (res ^ argL ^ argR) & 0x10; \ 221 zf = ((DATA_UTYPE)res == 0) << 6; \ 222 sf = lshift(res, 8 - DATA_BITS) & 0x80; \ 223 of = lshift((argL ^ argR) & (argL ^ res), \ 224 12 - DATA_BITS) & AMD64G_CC_MASK_O; \ 225 return cf | pf | af | zf | sf | of; \ 226 } \ 227 } 228 229 /*-------------------------------------------------------------*/ 230 231 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \ 232 { \ 233 PREAMBLE(DATA_BITS); \ 234 { Long cf, pf, af, zf, sf, of; \ 235 Long argL, argR, oldC, res; \ 236 oldC = CC_NDEP & AMD64G_CC_MASK_C; \ 237 argL = CC_DEP1; \ 238 argR = CC_DEP2 ^ oldC; \ 239 res = (argL + argR) + oldC; \ 240 if (oldC) \ 241 cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \ 242 else \ 243 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \ 244 pf = parity_table[(UChar)res]; \ 245 af = (res ^ argL ^ argR) & 0x10; \ 246 zf = ((DATA_UTYPE)res == 0) << 6; \ 247 sf = lshift(res, 8 - DATA_BITS) & 0x80; \ 248 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \ 249 12 - DATA_BITS) & AMD64G_CC_MASK_O; \ 250 return cf | pf | af | zf | sf | of; \ 251 } \ 252 } 253 254 /*-------------------------------------------------------------*/ 255 256 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \ 257 { \ 258 PREAMBLE(DATA_BITS); \ 259 { Long cf, pf, af, zf, sf, of; \ 260 Long argL, argR, oldC, res; \ 261 oldC = CC_NDEP & AMD64G_CC_MASK_C; \ 262 argL = CC_DEP1; \ 263 argR = CC_DEP2 ^ oldC; \ 264 res = (argL - argR) - oldC; \ 265 if (oldC) \ 
266 cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \ 267 else \ 268 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \ 269 pf = parity_table[(UChar)res]; \ 270 af = (res ^ argL ^ argR) & 0x10; \ 271 zf = ((DATA_UTYPE)res == 0) << 6; \ 272 sf = lshift(res, 8 - DATA_BITS) & 0x80; \ 273 of = lshift((argL ^ argR) & (argL ^ res), \ 274 12 - DATA_BITS) & AMD64G_CC_MASK_O; \ 275 return cf | pf | af | zf | sf | of; \ 276 } \ 277 } 278 279 /*-------------------------------------------------------------*/ 280 281 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \ 282 { \ 283 PREAMBLE(DATA_BITS); \ 284 { Long cf, pf, af, zf, sf, of; \ 285 cf = 0; \ 286 pf = parity_table[(UChar)CC_DEP1]; \ 287 af = 0; \ 288 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ 289 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ 290 of = 0; \ 291 return cf | pf | af | zf | sf | of; \ 292 } \ 293 } 294 295 /*-------------------------------------------------------------*/ 296 297 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \ 298 { \ 299 PREAMBLE(DATA_BITS); \ 300 { Long cf, pf, af, zf, sf, of; \ 301 Long argL, argR, res; \ 302 res = CC_DEP1; \ 303 argL = res - 1; \ 304 argR = 1; \ 305 cf = CC_NDEP & AMD64G_CC_MASK_C; \ 306 pf = parity_table[(UChar)res]; \ 307 af = (res ^ argL ^ argR) & 0x10; \ 308 zf = ((DATA_UTYPE)res == 0) << 6; \ 309 sf = lshift(res, 8 - DATA_BITS) & 0x80; \ 310 of = ((res & DATA_MASK) == SIGN_MASK) << 11; \ 311 return cf | pf | af | zf | sf | of; \ 312 } \ 313 } 314 315 /*-------------------------------------------------------------*/ 316 317 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \ 318 { \ 319 PREAMBLE(DATA_BITS); \ 320 { Long cf, pf, af, zf, sf, of; \ 321 Long argL, argR, res; \ 322 res = CC_DEP1; \ 323 argL = res + 1; \ 324 argR = 1; \ 325 cf = CC_NDEP & AMD64G_CC_MASK_C; \ 326 pf = parity_table[(UChar)res]; \ 327 af = (res ^ argL ^ argR) & 0x10; \ 328 zf = ((DATA_UTYPE)res == 0) << 6; \ 329 sf = lshift(res, 8 - DATA_BITS) & 0x80; \ 330 of = ((res & DATA_MASK) \ 331 == ((ULong)SIGN_MASK - 1)) << 11; \ 332 return cf | pf | af | zf | sf | of; \ 333 } \ 334 } 335 336 /*-------------------------------------------------------------*/ 337 338 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \ 339 { \ 340 PREAMBLE(DATA_BITS); \ 341 { Long cf, pf, af, zf, sf, of; \ 342 cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \ 343 pf = parity_table[(UChar)CC_DEP1]; \ 344 af = 0; /* undefined */ \ 345 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ 346 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ 347 /* of is defined if shift count == 1 */ \ 348 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \ 349 & AMD64G_CC_MASK_O; \ 350 return cf | pf | af | zf | sf | of; \ 351 } \ 352 } 353 354 /*-------------------------------------------------------------*/ 355 356 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \ 357 { \ 358 PREAMBLE(DATA_BITS); \ 359 { Long cf, pf, af, zf, sf, of; \ 360 cf = CC_DEP2 & 1; \ 361 pf = parity_table[(UChar)CC_DEP1]; \ 362 af = 0; /* undefined */ \ 363 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ 364 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ 365 /* of is defined if shift count == 1 */ \ 366 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \ 367 & AMD64G_CC_MASK_O; \ 368 return cf | pf | af | zf | sf | of; \ 369 } \ 370 } 371 372 /*-------------------------------------------------------------*/ 373 374 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). 
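   Unlike the arithmetic cases, rotates leave S, Z, A and P unchanged,
   so the old flags are carried in NDEP and only C and O are
   recomputed from the rotated result in DEP1.  In the expression
   below, lshift(CC_DEP1, 11-(DATA_BITS-1)) moves the result's msb to
   bit position 11 (the OF position) and lshift(CC_DEP1, 11) does the
   same for the lsb, so their xor is exactly msb ^ lsb placed at OF.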
*/ 375 /* DEP1 = result, NDEP = old flags */ 376 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \ 377 { \ 378 PREAMBLE(DATA_BITS); \ 379 { Long fl \ 380 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \ 381 | (AMD64G_CC_MASK_C & CC_DEP1) \ 382 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \ 383 11-(DATA_BITS-1)) \ 384 ^ lshift(CC_DEP1, 11))); \ 385 return fl; \ 386 } \ 387 } 388 389 /*-------------------------------------------------------------*/ 390 391 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */ 392 /* DEP1 = result, NDEP = old flags */ 393 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \ 394 { \ 395 PREAMBLE(DATA_BITS); \ 396 { Long fl \ 397 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \ 398 | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \ 399 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \ 400 11-(DATA_BITS-1)) \ 401 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \ 402 return fl; \ 403 } \ 404 } 405 406 /*-------------------------------------------------------------*/ 407 408 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \ 409 DATA_U2TYPE, NARROWto2U) \ 410 { \ 411 PREAMBLE(DATA_BITS); \ 412 { Long cf, pf, af, zf, sf, of; \ 413 DATA_UTYPE hi; \ 414 DATA_UTYPE lo \ 415 = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \ 416 * ((DATA_UTYPE)CC_DEP2) ); \ 417 DATA_U2TYPE rr \ 418 = NARROWto2U( \ 419 ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \ 420 * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \ 421 hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \ 422 cf = (hi != 0); \ 423 pf = parity_table[(UChar)lo]; \ 424 af = 0; /* undefined */ \ 425 zf = (lo == 0) << 6; \ 426 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \ 427 of = cf << 11; \ 428 return cf | pf | af | zf | sf | of; \ 429 } \ 430 } 431 432 /*-------------------------------------------------------------*/ 433 434 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \ 435 DATA_S2TYPE, NARROWto2S) \ 436 { \ 437 PREAMBLE(DATA_BITS); \ 438 { Long cf, pf, af, zf, sf, of; \ 439 DATA_STYPE hi; \ 440 DATA_STYPE lo \ 441 = NARROWtoS( ((DATA_STYPE)CC_DEP1) \ 442 * ((DATA_STYPE)CC_DEP2) ); \ 443 DATA_S2TYPE rr \ 444 = NARROWto2S( \ 445 ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \ 446 * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \ 447 hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \ 448 cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \ 449 pf = parity_table[(UChar)lo]; \ 450 af = 0; /* undefined */ \ 451 zf = (lo == 0) << 6; \ 452 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \ 453 of = cf << 11; \ 454 return cf | pf | af | zf | sf | of; \ 455 } \ 456 } 457 458 /*-------------------------------------------------------------*/ 459 460 #define ACTIONS_UMULQ \ 461 { \ 462 PREAMBLE(64); \ 463 { Long cf, pf, af, zf, sf, of; \ 464 ULong lo, hi; \ 465 mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \ 466 cf = (hi != 0); \ 467 pf = parity_table[(UChar)lo]; \ 468 af = 0; /* undefined */ \ 469 zf = (lo == 0) << 6; \ 470 sf = lshift(lo, 8 - 64) & 0x80; \ 471 of = cf << 11; \ 472 return cf | pf | af | zf | sf | of; \ 473 } \ 474 } 475 476 /*-------------------------------------------------------------*/ 477 478 #define ACTIONS_SMULQ \ 479 { \ 480 PREAMBLE(64); \ 481 { Long cf, pf, af, zf, sf, of; \ 482 Long lo, hi; \ 483 mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \ 484 cf = (hi != (lo >>/*s*/ (64-1))); \ 485 pf = parity_table[(UChar)lo]; \ 486 af = 0; /* undefined */ \ 487 zf = (lo == 0) << 6; \ 488 sf = lshift(lo, 8 - 64) & 0x80; \ 489 of = cf << 11; \ 490 return cf | pf | af | zf | sf | of; \ 491 } \ 492 } 493 494 495 #if PROFILE_RFLAGS 496 497 static Bool initted = False; 498 499 /* C flag, fast 
route */ 500 static UInt tabc_fast[AMD64G_CC_OP_NUMBER]; 501 /* C flag, slow route */ 502 static UInt tabc_slow[AMD64G_CC_OP_NUMBER]; 503 /* table for calculate_cond */ 504 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16]; 505 /* total entry counts for calc_all, calc_c, calc_cond. */ 506 static UInt n_calc_all = 0; 507 static UInt n_calc_c = 0; 508 static UInt n_calc_cond = 0; 509 510 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond))) 511 512 513 static void showCounts ( void ) 514 { 515 Int op, co; 516 Char ch; 517 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n", 518 n_calc_all, n_calc_cond, n_calc_c); 519 520 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE" 521 " S NS P NP L NL LE NLE\n"); 522 vex_printf(" -----------------------------------------------------" 523 "----------------------------------------\n"); 524 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) { 525 526 ch = ' '; 527 if (op > 0 && (op-1) % 4 == 0) 528 ch = 'B'; 529 if (op > 0 && (op-1) % 4 == 1) 530 ch = 'W'; 531 if (op > 0 && (op-1) % 4 == 2) 532 ch = 'L'; 533 if (op > 0 && (op-1) % 4 == 3) 534 ch = 'Q'; 535 536 vex_printf("%2d%c: ", op, ch); 537 vex_printf("%6u ", tabc_slow[op]); 538 vex_printf("%6u ", tabc_fast[op]); 539 for (co = 0; co < 16; co++) { 540 Int n = tab_cond[op][co]; 541 if (n >= 1000) { 542 vex_printf(" %3dK", n / 1000); 543 } else 544 if (n >= 0) { 545 vex_printf(" %3d ", n ); 546 } else { 547 vex_printf(" "); 548 } 549 } 550 vex_printf("\n"); 551 } 552 vex_printf("\n"); 553 } 554 555 static void initCounts ( void ) 556 { 557 Int op, co; 558 initted = True; 559 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) { 560 tabc_fast[op] = tabc_slow[op] = 0; 561 for (co = 0; co < 16; co++) 562 tab_cond[op][co] = 0; 563 } 564 } 565 566 #endif /* PROFILE_RFLAGS */ 567 568 569 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 570 /* Calculate all the 6 flags from the supplied thunk parameters. 571 Worker function, not directly called from generated code. 
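   Background: the guest state does not hold %rflags directly.  It
   holds a four-word thunk (CC_OP, CC_DEP1, CC_DEP2, CC_NDEP)
   describing the most recent flag-setting operation, and the flags
   are materialised lazily by this function.  As a rough example:
   after a 64-bit add, the front end sets CC_OP = AMD64G_CC_OP_ADDQ
   and CC_DEP1/CC_DEP2 to the two operands, and the ACTIONS_ADD
   expansion below recomputes C, P, A, Z, S and O from those values
   on demand.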
*/ 572 static 573 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op, 574 ULong cc_dep1_formal, 575 ULong cc_dep2_formal, 576 ULong cc_ndep_formal ) 577 { 578 switch (cc_op) { 579 case AMD64G_CC_OP_COPY: 580 return cc_dep1_formal 581 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z 582 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P); 583 584 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar ); 585 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort ); 586 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt ); 587 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong ); 588 589 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar ); 590 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort ); 591 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt ); 592 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong ); 593 594 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar ); 595 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort ); 596 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt ); 597 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong ); 598 599 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar ); 600 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort ); 601 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt ); 602 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong ); 603 604 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar ); 605 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort ); 606 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt ); 607 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong ); 608 609 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar ); 610 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort ); 611 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt ); 612 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong ); 613 614 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar ); 615 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort ); 616 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt ); 617 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong ); 618 619 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar ); 620 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort ); 621 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt ); 622 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong ); 623 624 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar ); 625 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort ); 626 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt ); 627 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong ); 628 629 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar ); 630 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort ); 631 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt ); 632 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong ); 633 634 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar ); 635 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort ); 636 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt ); 637 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong ); 638 639 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar, 640 UShort, toUShort ); 641 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort, 642 UInt, toUInt ); 643 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt, 644 ULong, idULong ); 645 646 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ; 647 648 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar, 649 Short, toUShort ); 650 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort, 651 Int, toUInt ); 652 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt, 653 Long, idULong ); 654 655 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ; 656 657 default: 658 /* shouldn't really make these calls from generated code */ 659 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)" 660 "( %llu, 0x%llx, 0x%llx, 
0x%llx )\n", 661 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal ); 662 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)"); 663 } 664 } 665 666 667 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 668 /* Calculate all the 6 flags from the supplied thunk parameters. */ 669 ULong amd64g_calculate_rflags_all ( ULong cc_op, 670 ULong cc_dep1, 671 ULong cc_dep2, 672 ULong cc_ndep ) 673 { 674 # if PROFILE_RFLAGS 675 if (!initted) initCounts(); 676 n_calc_all++; 677 if (SHOW_COUNTS_NOW) showCounts(); 678 # endif 679 return 680 amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep ); 681 } 682 683 684 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 685 /* Calculate just the carry flag from the supplied thunk parameters. */ 686 ULong amd64g_calculate_rflags_c ( ULong cc_op, 687 ULong cc_dep1, 688 ULong cc_dep2, 689 ULong cc_ndep ) 690 { 691 # if PROFILE_RFLAGS 692 if (!initted) initCounts(); 693 n_calc_c++; 694 tabc_fast[cc_op]++; 695 if (SHOW_COUNTS_NOW) showCounts(); 696 # endif 697 698 /* Fast-case some common ones. */ 699 switch (cc_op) { 700 case AMD64G_CC_OP_COPY: 701 return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1; 702 case AMD64G_CC_OP_LOGICQ: 703 case AMD64G_CC_OP_LOGICL: 704 case AMD64G_CC_OP_LOGICW: 705 case AMD64G_CC_OP_LOGICB: 706 return 0; 707 // case AMD64G_CC_OP_SUBL: 708 // return ((UInt)cc_dep1) < ((UInt)cc_dep2) 709 // ? AMD64G_CC_MASK_C : 0; 710 // case AMD64G_CC_OP_SUBW: 711 // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF)) 712 // ? AMD64G_CC_MASK_C : 0; 713 // case AMD64G_CC_OP_SUBB: 714 // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF)) 715 // ? AMD64G_CC_MASK_C : 0; 716 // case AMD64G_CC_OP_INCL: 717 // case AMD64G_CC_OP_DECL: 718 // return cc_ndep & AMD64G_CC_MASK_C; 719 default: 720 break; 721 } 722 723 # if PROFILE_RFLAGS 724 tabc_fast[cc_op]--; 725 tabc_slow[cc_op]++; 726 # endif 727 728 return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep) 729 & AMD64G_CC_MASK_C; 730 } 731 732 733 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 734 /* returns 1 or 0 */ 735 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond, 736 ULong cc_op, 737 ULong cc_dep1, 738 ULong cc_dep2, 739 ULong cc_ndep ) 740 { 741 ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1, 742 cc_dep2, cc_ndep); 743 ULong of,sf,zf,cf,pf; 744 ULong inv = cond & 1; 745 746 # if PROFILE_RFLAGS 747 if (!initted) initCounts(); 748 tab_cond[cc_op][cond]++; 749 n_calc_cond++; 750 if (SHOW_COUNTS_NOW) showCounts(); 751 # endif 752 753 switch (cond) { 754 case AMD64CondNO: 755 case AMD64CondO: /* OF == 1 */ 756 of = rflags >> AMD64G_CC_SHIFT_O; 757 return 1 & (inv ^ of); 758 759 case AMD64CondNZ: 760 case AMD64CondZ: /* ZF == 1 */ 761 zf = rflags >> AMD64G_CC_SHIFT_Z; 762 return 1 & (inv ^ zf); 763 764 case AMD64CondNB: 765 case AMD64CondB: /* CF == 1 */ 766 cf = rflags >> AMD64G_CC_SHIFT_C; 767 return 1 & (inv ^ cf); 768 break; 769 770 case AMD64CondNBE: 771 case AMD64CondBE: /* (CF or ZF) == 1 */ 772 cf = rflags >> AMD64G_CC_SHIFT_C; 773 zf = rflags >> AMD64G_CC_SHIFT_Z; 774 return 1 & (inv ^ (cf | zf)); 775 break; 776 777 case AMD64CondNS: 778 case AMD64CondS: /* SF == 1 */ 779 sf = rflags >> AMD64G_CC_SHIFT_S; 780 return 1 & (inv ^ sf); 781 782 case AMD64CondNP: 783 case AMD64CondP: /* PF == 1 */ 784 pf = rflags >> AMD64G_CC_SHIFT_P; 785 return 1 & (inv ^ pf); 786 787 case AMD64CondNL: 788 case AMD64CondL: /* (SF xor OF) == 1 */ 789 sf = rflags >> AMD64G_CC_SHIFT_S; 790 of = rflags >> AMD64G_CC_SHIFT_O; 791 return 1 & (inv ^ (sf ^ of)); 792 break; 793 
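      /* Note for the cases in this switch: each condition shares a
         case body with its negation (eg AMD64CondL / AMD64CondNL
         above, AMD64CondLE / AMD64CondNLE below).  The negated form
         has the odd encoding, so 'inv' (== cond & 1) simply flips the
         computed bit. */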
794 case AMD64CondNLE: 795 case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */ 796 sf = rflags >> AMD64G_CC_SHIFT_S; 797 of = rflags >> AMD64G_CC_SHIFT_O; 798 zf = rflags >> AMD64G_CC_SHIFT_Z; 799 return 1 & (inv ^ ((sf ^ of) | zf)); 800 break; 801 802 default: 803 /* shouldn't really make these calls from generated code */ 804 vex_printf("amd64g_calculate_condition" 805 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n", 806 cond, cc_op, cc_dep1, cc_dep2, cc_ndep ); 807 vpanic("amd64g_calculate_condition"); 808 } 809 } 810 811 812 /* VISIBLE TO LIBVEX CLIENT */ 813 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state ) 814 { 815 ULong rflags = amd64g_calculate_rflags_all_WRK( 816 vex_state->guest_CC_OP, 817 vex_state->guest_CC_DEP1, 818 vex_state->guest_CC_DEP2, 819 vex_state->guest_CC_NDEP 820 ); 821 Long dflag = vex_state->guest_DFLAG; 822 vassert(dflag == 1 || dflag == -1); 823 if (dflag == -1) 824 rflags |= (1<<10); 825 if (vex_state->guest_IDFLAG == 1) 826 rflags |= (1<<21); 827 if (vex_state->guest_ACFLAG == 1) 828 rflags |= (1<<18); 829 830 return rflags; 831 } 832 833 /* VISIBLE TO LIBVEX CLIENT */ 834 void 835 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag, 836 /*MOD*/VexGuestAMD64State* vex_state ) 837 { 838 ULong oszacp = amd64g_calculate_rflags_all_WRK( 839 vex_state->guest_CC_OP, 840 vex_state->guest_CC_DEP1, 841 vex_state->guest_CC_DEP2, 842 vex_state->guest_CC_NDEP 843 ); 844 if (new_carry_flag & 1) { 845 oszacp |= AMD64G_CC_MASK_C; 846 } else { 847 oszacp &= ~AMD64G_CC_MASK_C; 848 } 849 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY; 850 vex_state->guest_CC_DEP1 = oszacp; 851 vex_state->guest_CC_DEP2 = 0; 852 vex_state->guest_CC_NDEP = 0; 853 } 854 855 856 /*---------------------------------------------------------------*/ 857 /*--- %rflags translation-time function specialisers. ---*/ 858 /*--- These help iropt specialise calls the above run-time ---*/ 859 /*--- %rflags functions. ---*/ 860 /*---------------------------------------------------------------*/ 861 862 /* Used by the optimiser to try specialisations. Returns an 863 equivalent expression, or NULL if none. 
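   The idea is that iropt hands us a call to one of the flag helpers
   above in which some arguments are already known constants.  Where
   the (cc_op, cond) combination is recognised, we hand back a small
   pure IR expression instead of the helper call.  For example (see
   the SUBQ cases below), a 64-bit sub/cmp followed by a Z test folds
   to 1Uto64(CmpEQ64(cc_dep1, cc_dep2)), which is cheaper and also
   much friendlier to Memcheck than the generic flag evaluation.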
*/ 864 865 static Bool isU64 ( IRExpr* e, ULong n ) 866 { 867 return toBool( e->tag == Iex_Const 868 && e->Iex.Const.con->tag == Ico_U64 869 && e->Iex.Const.con->Ico.U64 == n ); 870 } 871 872 IRExpr* guest_amd64_spechelper ( HChar* function_name, 873 IRExpr** args, 874 IRStmt** precedingStmts, 875 Int n_precedingStmts ) 876 { 877 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1)) 878 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2)) 879 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n)) 880 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n)) 881 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n)) 882 883 Int i, arity = 0; 884 for (i = 0; args[i]; i++) 885 arity++; 886 # if 0 887 vex_printf("spec request:\n"); 888 vex_printf(" %s ", function_name); 889 for (i = 0; i < arity; i++) { 890 vex_printf(" "); 891 ppIRExpr(args[i]); 892 } 893 vex_printf("\n"); 894 # endif 895 896 /* --------- specialising "amd64g_calculate_condition" --------- */ 897 898 if (vex_streq(function_name, "amd64g_calculate_condition")) { 899 /* specialise calls to above "calculate condition" function */ 900 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2; 901 vassert(arity == 5); 902 cond = args[0]; 903 cc_op = args[1]; 904 cc_dep1 = args[2]; 905 cc_dep2 = args[3]; 906 907 /*---------------- ADDQ ----------------*/ 908 909 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) { 910 /* long long add, then Z --> test (dst+src == 0) */ 911 return unop(Iop_1Uto64, 912 binop(Iop_CmpEQ64, 913 binop(Iop_Add64, cc_dep1, cc_dep2), 914 mkU64(0))); 915 } 916 917 /*---------------- SUBQ ----------------*/ 918 919 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) { 920 /* long long sub/cmp, then Z --> test dst==src */ 921 return unop(Iop_1Uto64, 922 binop(Iop_CmpEQ64,cc_dep1,cc_dep2)); 923 } 924 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) { 925 /* long long sub/cmp, then NZ --> test dst!=src */ 926 return unop(Iop_1Uto64, 927 binop(Iop_CmpNE64,cc_dep1,cc_dep2)); 928 } 929 930 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) { 931 /* long long sub/cmp, then L (signed less than) 932 --> test dst <s src */ 933 return unop(Iop_1Uto64, 934 binop(Iop_CmpLT64S, cc_dep1, cc_dep2)); 935 } 936 937 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) { 938 /* long long sub/cmp, then B (unsigned less than) 939 --> test dst <u src */ 940 return unop(Iop_1Uto64, 941 binop(Iop_CmpLT64U, cc_dep1, cc_dep2)); 942 } 943 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) { 944 /* long long sub/cmp, then NB (unsigned greater than or equal) 945 --> test src <=u dst */ 946 /* Note, args are opposite way round from the usual */ 947 return unop(Iop_1Uto64, 948 binop(Iop_CmpLE64U, cc_dep2, cc_dep1)); 949 } 950 951 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) { 952 /* long long sub/cmp, then BE (unsigned less than or equal) 953 --> test dst <=u src */ 954 return unop(Iop_1Uto64, 955 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)); 956 } 957 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) { 958 /* long long sub/cmp, then NBE (unsigned greater than) 959 --> test !(dst <=u src) */ 960 return binop(Iop_Xor64, 961 unop(Iop_1Uto64, 962 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)), 963 mkU64(1)); 964 } 965 966 /*---------------- SUBL ----------------*/ 967 968 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) { 969 /* long sub/cmp, then Z --> test dst==src */ 970 return unop(Iop_1Uto64, 971 binop(Iop_CmpEQ32, 972 unop(Iop_64to32, cc_dep1), 973 
unop(Iop_64to32, cc_dep2))); 974 } 975 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) { 976 /* long sub/cmp, then NZ --> test dst!=src */ 977 return unop(Iop_1Uto64, 978 binop(Iop_CmpNE32, 979 unop(Iop_64to32, cc_dep1), 980 unop(Iop_64to32, cc_dep2))); 981 } 982 983 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) { 984 /* long sub/cmp, then L (signed less than) 985 --> test dst <s src */ 986 return unop(Iop_1Uto64, 987 binop(Iop_CmpLT32S, 988 unop(Iop_64to32, cc_dep1), 989 unop(Iop_64to32, cc_dep2))); 990 } 991 992 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) { 993 /* long sub/cmp, then LE (signed less than or equal) 994 --> test dst <=s src */ 995 return unop(Iop_1Uto64, 996 binop(Iop_CmpLE32S, 997 unop(Iop_64to32, cc_dep1), 998 unop(Iop_64to32, cc_dep2))); 999 1000 } 1001 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) { 1002 /* long sub/cmp, then NLE (signed greater than) 1003 --> test !(dst <=s src) 1004 --> test (dst >s src) 1005 --> test (src <s dst) */ 1006 return unop(Iop_1Uto64, 1007 binop(Iop_CmpLT32S, 1008 unop(Iop_64to32, cc_dep2), 1009 unop(Iop_64to32, cc_dep1))); 1010 1011 } 1012 1013 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) { 1014 /* long sub/cmp, then BE (unsigned less than or equal) 1015 --> test dst <=u src */ 1016 return unop(Iop_1Uto64, 1017 binop(Iop_CmpLE32U, 1018 unop(Iop_64to32, cc_dep1), 1019 unop(Iop_64to32, cc_dep2))); 1020 } 1021 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) { 1022 /* long sub/cmp, then NBE (unsigned greater than) 1023 --> test src <u dst */ 1024 /* Note, args are opposite way round from the usual */ 1025 return unop(Iop_1Uto64, 1026 binop(Iop_CmpLT32U, 1027 unop(Iop_64to32, cc_dep2), 1028 unop(Iop_64to32, cc_dep1))); 1029 } 1030 1031 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) { 1032 /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */ 1033 return unop(Iop_1Uto64, 1034 binop(Iop_CmpLT32S, 1035 binop(Iop_Sub32, 1036 unop(Iop_64to32, cc_dep1), 1037 unop(Iop_64to32, cc_dep2)), 1038 mkU32(0))); 1039 } 1040 1041 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) { 1042 /* long sub/cmp, then B (unsigned less than) 1043 --> test dst <u src */ 1044 return unop(Iop_1Uto64, 1045 binop(Iop_CmpLT32U, 1046 unop(Iop_64to32, cc_dep1), 1047 unop(Iop_64to32, cc_dep2))); 1048 } 1049 1050 /*---------------- SUBW ----------------*/ 1051 1052 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) { 1053 /* word sub/cmp, then Z --> test dst==src */ 1054 return unop(Iop_1Uto64, 1055 binop(Iop_CmpEQ16, 1056 unop(Iop_64to16,cc_dep1), 1057 unop(Iop_64to16,cc_dep2))); 1058 } 1059 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) { 1060 /* word sub/cmp, then NZ --> test dst!=src */ 1061 return unop(Iop_1Uto64, 1062 binop(Iop_CmpNE16, 1063 unop(Iop_64to16,cc_dep1), 1064 unop(Iop_64to16,cc_dep2))); 1065 } 1066 1067 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) { 1068 /* word sub/cmp, then LE (signed less than or equal) 1069 --> test dst <=s src */ 1070 return unop(Iop_1Uto64, 1071 binop(Iop_CmpLE64S, 1072 binop(Iop_Shl64,cc_dep1,mkU8(48)), 1073 binop(Iop_Shl64,cc_dep2,mkU8(48)))); 1074 1075 } 1076 1077 /*---------------- SUBB ----------------*/ 1078 1079 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) { 1080 /* byte sub/cmp, then Z --> test dst==src */ 1081 return unop(Iop_1Uto64, 1082 binop(Iop_CmpEQ8, 1083 unop(Iop_64to8,cc_dep1), 1084 
unop(Iop_64to8,cc_dep2))); 1085 } 1086 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) { 1087 /* byte sub/cmp, then NZ --> test dst!=src */ 1088 return unop(Iop_1Uto64, 1089 binop(Iop_CmpNE8, 1090 unop(Iop_64to8,cc_dep1), 1091 unop(Iop_64to8,cc_dep2))); 1092 } 1093 1094 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) { 1095 /* byte sub/cmp, then BE (unsigned less than or equal) 1096 --> test dst <=u src */ 1097 return unop(Iop_1Uto64, 1098 binop(Iop_CmpLE64U, 1099 binop(Iop_And64, cc_dep1, mkU64(0xFF)), 1100 binop(Iop_And64, cc_dep2, mkU64(0xFF)))); 1101 } 1102 1103 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS) 1104 && isU64(cc_dep2, 0)) { 1105 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0) 1106 --> test dst <s 0 1107 --> (ULong)dst[7] 1108 This is yet another scheme by which gcc figures out if the 1109 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */ 1110 /* Note: isU64(cc_dep2, 0) is correct, even though this is 1111 for an 8-bit comparison, since the args to the helper 1112 function are always U64s. */ 1113 return binop(Iop_And64, 1114 binop(Iop_Shr64,cc_dep1,mkU8(7)), 1115 mkU64(1)); 1116 } 1117 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS) 1118 && isU64(cc_dep2, 0)) { 1119 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0) 1120 --> test !(dst <s 0) 1121 --> (ULong) !dst[7] 1122 */ 1123 return binop(Iop_Xor64, 1124 binop(Iop_And64, 1125 binop(Iop_Shr64,cc_dep1,mkU8(7)), 1126 mkU64(1)), 1127 mkU64(1)); 1128 } 1129 1130 /*---------------- LOGICQ ----------------*/ 1131 1132 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) { 1133 /* long long and/or/xor, then Z --> test dst==0 */ 1134 return unop(Iop_1Uto64, 1135 binop(Iop_CmpEQ64, cc_dep1, mkU64(0))); 1136 } 1137 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) { 1138 /* long long and/or/xor, then NZ --> test dst!=0 */ 1139 return unop(Iop_1Uto64, 1140 binop(Iop_CmpNE64, cc_dep1, mkU64(0))); 1141 } 1142 1143 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) { 1144 /* long long and/or/xor, then L 1145 LOGIC sets SF and ZF according to the 1146 result and makes OF be zero. L computes SF ^ OF, but 1147 OF is zero, so this reduces to SF -- which will be 1 iff 1148 the result is < signed 0. Hence ... 1149 */ 1150 return unop(Iop_1Uto64, 1151 binop(Iop_CmpLT64S, 1152 cc_dep1, 1153 mkU64(0))); 1154 } 1155 1156 /*---------------- LOGICL ----------------*/ 1157 1158 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) { 1159 /* long and/or/xor, then Z --> test dst==0 */ 1160 return unop(Iop_1Uto64, 1161 binop(Iop_CmpEQ32, 1162 unop(Iop_64to32, cc_dep1), 1163 mkU32(0))); 1164 } 1165 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) { 1166 /* long and/or/xor, then NZ --> test dst!=0 */ 1167 return unop(Iop_1Uto64, 1168 binop(Iop_CmpNE32, 1169 unop(Iop_64to32, cc_dep1), 1170 mkU32(0))); 1171 } 1172 1173 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) { 1174 /* long and/or/xor, then LE 1175 This is pretty subtle. LOGIC sets SF and ZF according to the 1176 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but 1177 OF is zero, so this reduces to SF | ZF -- which will be 1 iff 1178 the result is <=signed 0. Hence ... 
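   (SF alone covers the strictly negative results and ZF covers the
   zero result, so SF | ZF is precisely the <=s 0 test performed by
   the CmpLE32S against zero just below.)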
1179 */ 1180 return unop(Iop_1Uto64, 1181 binop(Iop_CmpLE32S, 1182 unop(Iop_64to32, cc_dep1), 1183 mkU32(0))); 1184 } 1185 1186 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) { 1187 /* long and/or/xor, then S --> (ULong)result[31] */ 1188 return binop(Iop_And64, 1189 binop(Iop_Shr64, cc_dep1, mkU8(31)), 1190 mkU64(1)); 1191 } 1192 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) { 1193 /* long and/or/xor, then S --> (ULong) ~ result[31] */ 1194 return binop(Iop_Xor64, 1195 binop(Iop_And64, 1196 binop(Iop_Shr64, cc_dep1, mkU8(31)), 1197 mkU64(1)), 1198 mkU64(1)); 1199 } 1200 1201 /*---------------- LOGICW ----------------*/ 1202 1203 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) { 1204 /* word and/or/xor, then Z --> test dst==0 */ 1205 return unop(Iop_1Uto64, 1206 binop(Iop_CmpEQ64, 1207 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)), 1208 mkU64(0))); 1209 } 1210 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) { 1211 /* word and/or/xor, then NZ --> test dst!=0 */ 1212 return unop(Iop_1Uto64, 1213 binop(Iop_CmpNE64, 1214 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)), 1215 mkU64(0))); 1216 } 1217 1218 /*---------------- LOGICB ----------------*/ 1219 1220 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) { 1221 /* byte and/or/xor, then Z --> test dst==0 */ 1222 return unop(Iop_1Uto64, 1223 binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)), 1224 mkU64(0))); 1225 } 1226 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) { 1227 /* byte and/or/xor, then NZ --> test dst!=0 */ 1228 return unop(Iop_1Uto64, 1229 binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)), 1230 mkU64(0))); 1231 } 1232 1233 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) { 1234 /* this is an idiom gcc sometimes uses to find out if the top 1235 bit of a byte register is set: eg testb %al,%al; js .. 1236 Since it just depends on the top bit of the byte, extract 1237 that bit and explicitly get rid of all the rest. This 1238 helps memcheck avoid false positives in the case where any 1239 of the other bits in the byte are undefined. 
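   Concretely, the expression built just below is (cc_dep1 >> 7) & 1,
   whose definedness depends only on bit 7 of the result.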
*/ 1240 /* byte and/or/xor, then S --> (UInt)result[7] */ 1241 return binop(Iop_And64, 1242 binop(Iop_Shr64,cc_dep1,mkU8(7)), 1243 mkU64(1)); 1244 } 1245 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) { 1246 /* byte and/or/xor, then NS --> (UInt)!result[7] */ 1247 return binop(Iop_Xor64, 1248 binop(Iop_And64, 1249 binop(Iop_Shr64,cc_dep1,mkU8(7)), 1250 mkU64(1)), 1251 mkU64(1)); 1252 } 1253 1254 /*---------------- INCB ----------------*/ 1255 1256 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) { 1257 /* 8-bit inc, then LE --> sign bit of the arg */ 1258 return binop(Iop_And64, 1259 binop(Iop_Shr64, 1260 binop(Iop_Sub64, cc_dep1, mkU64(1)), 1261 mkU8(7)), 1262 mkU64(1)); 1263 } 1264 1265 /*---------------- INCW ----------------*/ 1266 1267 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) { 1268 /* 16-bit inc, then Z --> test dst == 0 */ 1269 return unop(Iop_1Uto64, 1270 binop(Iop_CmpEQ64, 1271 binop(Iop_Shl64,cc_dep1,mkU8(48)), 1272 mkU64(0))); 1273 } 1274 1275 /*---------------- DECL ----------------*/ 1276 1277 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) { 1278 /* dec L, then Z --> test dst == 0 */ 1279 return unop(Iop_1Uto64, 1280 binop(Iop_CmpEQ32, 1281 unop(Iop_64to32, cc_dep1), 1282 mkU32(0))); 1283 } 1284 1285 /*---------------- DECW ----------------*/ 1286 1287 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) { 1288 /* 16-bit dec, then NZ --> test dst != 0 */ 1289 return unop(Iop_1Uto64, 1290 binop(Iop_CmpNE64, 1291 binop(Iop_Shl64,cc_dep1,mkU8(48)), 1292 mkU64(0))); 1293 } 1294 1295 /*---------------- COPY ----------------*/ 1296 /* This can happen, as a result of amd64 FP compares: "comisd ... ; 1297 jbe" for example. */ 1298 1299 if (isU64(cc_op, AMD64G_CC_OP_COPY) && 1300 (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) { 1301 /* COPY, then BE --> extract C and Z from dep1, and test (C 1302 or Z == 1). */ 1303 /* COPY, then NBE --> extract C and Z from dep1, and test (C 1304 or Z == 0). */ 1305 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0; 1306 return 1307 unop( 1308 Iop_1Uto64, 1309 binop( 1310 Iop_CmpEQ64, 1311 binop( 1312 Iop_And64, 1313 binop( 1314 Iop_Or64, 1315 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)), 1316 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)) 1317 ), 1318 mkU64(1) 1319 ), 1320 mkU64(nnn) 1321 ) 1322 ); 1323 } 1324 1325 if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) { 1326 /* COPY, then B --> extract C dep1, and test (C == 1). */ 1327 return 1328 unop( 1329 Iop_1Uto64, 1330 binop( 1331 Iop_CmpNE64, 1332 binop( 1333 Iop_And64, 1334 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)), 1335 mkU64(1) 1336 ), 1337 mkU64(0) 1338 ) 1339 ); 1340 } 1341 1342 if (isU64(cc_op, AMD64G_CC_OP_COPY) 1343 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) { 1344 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */ 1345 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */ 1346 UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0; 1347 return 1348 unop( 1349 Iop_1Uto64, 1350 binop( 1351 Iop_CmpEQ64, 1352 binop( 1353 Iop_And64, 1354 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)), 1355 mkU64(1) 1356 ), 1357 mkU64(nnn) 1358 ) 1359 ); 1360 } 1361 1362 if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) { 1363 /* COPY, then P --> extract P from dep1, and test (P == 1). 
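   Like the BE/NBE case above, this typically arises from FP compares:
   comisd/ucomisd flag an unordered comparison by setting ZF, PF and
   CF, so a following jp/jnp is in effect a test for NaN operands.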
*/ 1364 return 1365 unop( 1366 Iop_1Uto64, 1367 binop( 1368 Iop_CmpNE64, 1369 binop( 1370 Iop_And64, 1371 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)), 1372 mkU64(1) 1373 ), 1374 mkU64(0) 1375 ) 1376 ); 1377 } 1378 1379 return NULL; 1380 } 1381 1382 /* --------- specialising "amd64g_calculate_rflags_c" --------- */ 1383 1384 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) { 1385 /* specialise calls to above "calculate_rflags_c" function */ 1386 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep; 1387 vassert(arity == 4); 1388 cc_op = args[0]; 1389 cc_dep1 = args[1]; 1390 cc_dep2 = args[2]; 1391 cc_ndep = args[3]; 1392 1393 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) { 1394 /* C after sub denotes unsigned less than */ 1395 return unop(Iop_1Uto64, 1396 binop(Iop_CmpLT64U, 1397 cc_dep1, 1398 cc_dep2)); 1399 } 1400 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) { 1401 /* C after sub denotes unsigned less than */ 1402 return unop(Iop_1Uto64, 1403 binop(Iop_CmpLT32U, 1404 unop(Iop_64to32, cc_dep1), 1405 unop(Iop_64to32, cc_dep2))); 1406 } 1407 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) { 1408 /* C after sub denotes unsigned less than */ 1409 return unop(Iop_1Uto64, 1410 binop(Iop_CmpLT64U, 1411 binop(Iop_And64,cc_dep1,mkU64(0xFF)), 1412 binop(Iop_And64,cc_dep2,mkU64(0xFF)))); 1413 } 1414 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) 1415 || isU64(cc_op, AMD64G_CC_OP_LOGICL) 1416 || isU64(cc_op, AMD64G_CC_OP_LOGICW) 1417 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) { 1418 /* cflag after logic is zero */ 1419 return mkU64(0); 1420 } 1421 if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL) 1422 || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) { 1423 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */ 1424 return cc_ndep; 1425 } 1426 1427 # if 0 1428 if (cc_op->tag == Iex_Const) { 1429 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n"); 1430 } 1431 # endif 1432 1433 return NULL; 1434 } 1435 1436 # undef unop 1437 # undef binop 1438 # undef mkU64 1439 # undef mkU32 1440 # undef mkU8 1441 1442 return NULL; 1443 } 1444 1445 1446 /*---------------------------------------------------------------*/ 1447 /*--- Supporting functions for x87 FPU activities. ---*/ 1448 /*---------------------------------------------------------------*/ 1449 1450 static inline Bool host_is_little_endian ( void ) 1451 { 1452 UInt x = 0x76543210; 1453 UChar* p = (UChar*)(&x); 1454 return toBool(*p == 0x10); 1455 } 1456 1457 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */ 1458 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 1459 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl ) 1460 { 1461 Bool mantissaIsZero; 1462 Int bexp; 1463 UChar sign; 1464 UChar* f64; 1465 1466 vassert(host_is_little_endian()); 1467 1468 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */ 1469 1470 f64 = (UChar*)(&dbl); 1471 sign = toUChar( (f64[7] >> 7) & 1 ); 1472 1473 /* First off, if the tag indicates the register was empty, 1474 return 1,0,sign,1 */ 1475 if (tag == 0) { 1476 /* vex_printf("Empty\n"); */ 1477 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1) 1478 | AMD64G_FC_MASK_C0; 1479 } 1480 1481 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F); 1482 bexp &= 0x7FF; 1483 1484 mantissaIsZero 1485 = toBool( 1486 (f64[6] & 0x0F) == 0 1487 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0 1488 ); 1489 1490 /* If both exponent and mantissa are zero, the value is zero. 1491 Return 1,0,sign,0. 
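   (In these comments the quadruple reads C3,C2,C1,C0, matching the
   masks or-ed together in each return expression.)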
*/ 1492 if (bexp == 0 && mantissaIsZero) { 1493 /* vex_printf("Zero\n"); */ 1494 return AMD64G_FC_MASK_C3 | 0 1495 | (sign << AMD64G_FC_SHIFT_C1) | 0; 1496 } 1497 1498 /* If exponent is zero but mantissa isn't, it's a denormal. 1499 Return 1,1,sign,0. */ 1500 if (bexp == 0 && !mantissaIsZero) { 1501 /* vex_printf("Denormal\n"); */ 1502 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2 1503 | (sign << AMD64G_FC_SHIFT_C1) | 0; 1504 } 1505 1506 /* If the exponent is 7FF and the mantissa is zero, this is an infinity. 1507 Return 0,1,sign,1. */ 1508 if (bexp == 0x7FF && mantissaIsZero) { 1509 /* vex_printf("Inf\n"); */ 1510 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) 1511 | AMD64G_FC_MASK_C0; 1512 } 1513 1514 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN. 1515 Return 0,0,sign,1. */ 1516 if (bexp == 0x7FF && !mantissaIsZero) { 1517 /* vex_printf("NaN\n"); */ 1518 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0; 1519 } 1520 1521 /* Uh, ok, we give up. It must be a normal finite number. 1522 Return 0,1,sign,0. 1523 */ 1524 /* vex_printf("normal\n"); */ 1525 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0; 1526 } 1527 1528 1529 /* This is used to implement both 'frstor' and 'fldenv'. The latter 1530 appears to differ from the former only in that the 8 FP registers 1531 themselves are not transferred into the guest state. */ 1532 static 1533 VexEmWarn do_put_x87 ( Bool moveRegs, 1534 /*IN*/UChar* x87_state, 1535 /*OUT*/VexGuestAMD64State* vex_state ) 1536 { 1537 Int stno, preg; 1538 UInt tag; 1539 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]); 1540 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]); 1541 Fpu_State* x87 = (Fpu_State*)x87_state; 1542 UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7; 1543 UInt tagw = x87->env[FP_ENV_TAG]; 1544 UInt fpucw = x87->env[FP_ENV_CTRL]; 1545 UInt c3210 = x87->env[FP_ENV_STAT] & 0x4700; 1546 VexEmWarn ew; 1547 UInt fpround; 1548 ULong pair; 1549 1550 /* Copy registers and tags */ 1551 for (stno = 0; stno < 8; stno++) { 1552 preg = (stno + ftop) & 7; 1553 tag = (tagw >> (2*preg)) & 3; 1554 if (tag == 3) { 1555 /* register is empty */ 1556 /* hmm, if it's empty, does it still get written? Probably 1557 safer to say it does. If we don't, memcheck could get out 1558 of sync, in that it thinks all FP registers are defined by 1559 this helper, but in reality some have not been updated. */ 1560 if (moveRegs) 1561 vexRegs[preg] = 0; /* IEEE754 64-bit zero */ 1562 vexTags[preg] = 0; 1563 } else { 1564 /* register is non-empty */ 1565 if (moveRegs) 1566 convert_f80le_to_f64le( &x87->reg[10*stno], 1567 (UChar*)&vexRegs[preg] ); 1568 vexTags[preg] = 1; 1569 } 1570 } 1571 1572 /* stack pointer */ 1573 vex_state->guest_FTOP = ftop; 1574 1575 /* status word */ 1576 vex_state->guest_FC3210 = c3210; 1577 1578 /* handle the control word, setting FPROUND and detecting any 1579 emulation warnings. */ 1580 pair = amd64g_check_fldcw ( (ULong)fpucw ); 1581 fpround = (UInt)pair & 0xFFFFFFFFULL; 1582 ew = (VexEmWarn)(pair >> 32); 1583 1584 vex_state->guest_FPROUND = fpround & 3; 1585 1586 /* emulation warnings --> caller */ 1587 return ew; 1588 } 1589 1590 1591 /* Create an x87 FPU state from the guest state, as close as 1592 we can approximate it. 
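   The image written is a 14-halfword environment block (control word,
   status word carrying FTOP and C3210, tag word, with the remaining
   words zeroed or set to 0xFFFF) followed by the eight registers,
   converted back to 80-bit extended precision and laid out in ST
   order.  This is the same layout that do_put_x87 above consumes, and
   is what the FNSAVE helper further down dumps to memory.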
*/ 1593 static 1594 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state, 1595 /*OUT*/UChar* x87_state ) 1596 { 1597 Int i, stno, preg; 1598 UInt tagw; 1599 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]); 1600 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]); 1601 Fpu_State* x87 = (Fpu_State*)x87_state; 1602 UInt ftop = vex_state->guest_FTOP; 1603 UInt c3210 = vex_state->guest_FC3210; 1604 1605 for (i = 0; i < 14; i++) 1606 x87->env[i] = 0; 1607 1608 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF; 1609 x87->env[FP_ENV_STAT] 1610 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700)); 1611 x87->env[FP_ENV_CTRL] 1612 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND )); 1613 1614 /* Dump the register stack in ST order. */ 1615 tagw = 0; 1616 for (stno = 0; stno < 8; stno++) { 1617 preg = (stno + ftop) & 7; 1618 if (vexTags[preg] == 0) { 1619 /* register is empty */ 1620 tagw |= (3 << (2*preg)); 1621 convert_f64le_to_f80le( (UChar*)&vexRegs[preg], 1622 &x87->reg[10*stno] ); 1623 } else { 1624 /* register is full. */ 1625 tagw |= (0 << (2*preg)); 1626 convert_f64le_to_f80le( (UChar*)&vexRegs[preg], 1627 &x87->reg[10*stno] ); 1628 } 1629 } 1630 x87->env[FP_ENV_TAG] = toUShort(tagw); 1631 } 1632 1633 1634 /* CALLED FROM GENERATED CODE */ 1635 /* DIRTY HELPER (reads guest state, writes guest mem) */ 1636 /* NOTE: only handles 32-bit format (no REX.W on the insn) */ 1637 void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr ) 1638 { 1639 /* Derived from values obtained from 1640 vendor_id : AuthenticAMD 1641 cpu family : 15 1642 model : 12 1643 model name : AMD Athlon(tm) 64 Processor 3200+ 1644 stepping : 0 1645 cpu MHz : 2200.000 1646 cache size : 512 KB 1647 */ 1648 /* Somewhat roundabout, but at least it's simple. */ 1649 Fpu_State tmp; 1650 UShort* addrS = (UShort*)addr; 1651 UChar* addrC = (UChar*)addr; 1652 U128* xmm = (U128*)(addr + 160); 1653 UInt mxcsr; 1654 UShort fp_tags; 1655 UInt summary_tags; 1656 Int r, stno; 1657 UShort *srcS, *dstS; 1658 1659 do_get_x87( gst, (UChar*)&tmp ); 1660 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND ); 1661 1662 /* Now build the proper fxsave image from the x87 image we just 1663 made. */ 1664 1665 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */ 1666 addrS[1] = tmp.env[FP_ENV_STAT]; /* FCW: fpu status word */ 1667 1668 /* set addrS[2] in an endian-independent way */ 1669 summary_tags = 0; 1670 fp_tags = tmp.env[FP_ENV_TAG]; 1671 for (r = 0; r < 8; r++) { 1672 if ( ((fp_tags >> (2*r)) & 3) != 3 ) 1673 summary_tags |= (1 << r); 1674 } 1675 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */ 1676 addrC[5] = 0; /* pad */ 1677 1678 /* FOP: faulting fpu opcode. From experimentation, the real CPU 1679 does not write this field. (?!) */ 1680 addrS[3] = 0; /* BOGUS */ 1681 1682 /* RIP (Last x87 instruction pointer). From experimentation, the 1683 real CPU does not write this field. (?!) */ 1684 addrS[4] = 0; /* BOGUS */ 1685 addrS[5] = 0; /* BOGUS */ 1686 addrS[6] = 0; /* BOGUS */ 1687 addrS[7] = 0; /* BOGUS */ 1688 1689 /* RDP (Last x87 data pointer). From experimentation, the real CPU 1690 does not write this field. (?!) */ 1691 addrS[8] = 0; /* BOGUS */ 1692 addrS[9] = 0; /* BOGUS */ 1693 addrS[10] = 0; /* BOGUS */ 1694 addrS[11] = 0; /* BOGUS */ 1695 1696 addrS[12] = toUShort(mxcsr); /* MXCSR */ 1697 addrS[13] = toUShort(mxcsr >> 16); 1698 1699 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */ 1700 addrS[15] = 0x0000; /* MXCSR mask (hi16) */ 1701 1702 /* Copy in the FP registers, in ST order. 
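   In the fxsave image each register occupies a 16-byte slot starting
   at byte offset 32; only the first 10 bytes hold the 80-bit value,
   so the remaining three halfwords of each slot are zeroed below.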
*/ 1703 for (stno = 0; stno < 8; stno++) { 1704 srcS = (UShort*)(&tmp.reg[10*stno]); 1705 dstS = (UShort*)(&addrS[16 + 8*stno]); 1706 dstS[0] = srcS[0]; 1707 dstS[1] = srcS[1]; 1708 dstS[2] = srcS[2]; 1709 dstS[3] = srcS[3]; 1710 dstS[4] = srcS[4]; 1711 dstS[5] = 0; 1712 dstS[6] = 0; 1713 dstS[7] = 0; 1714 } 1715 1716 /* That's the first 160 bytes of the image done. Now only %xmm0 1717 .. %xmm15 remain to be copied. If the host is big-endian, these 1718 need to be byte-swapped. */ 1719 vassert(host_is_little_endian()); 1720 1721 # define COPY_U128(_dst,_src) \ 1722 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \ 1723 _dst[2] = _src[2]; _dst[3] = _src[3]; } \ 1724 while (0) 1725 1726 COPY_U128( xmm[0], gst->guest_YMM0 ); 1727 COPY_U128( xmm[1], gst->guest_YMM1 ); 1728 COPY_U128( xmm[2], gst->guest_YMM2 ); 1729 COPY_U128( xmm[3], gst->guest_YMM3 ); 1730 COPY_U128( xmm[4], gst->guest_YMM4 ); 1731 COPY_U128( xmm[5], gst->guest_YMM5 ); 1732 COPY_U128( xmm[6], gst->guest_YMM6 ); 1733 COPY_U128( xmm[7], gst->guest_YMM7 ); 1734 COPY_U128( xmm[8], gst->guest_YMM8 ); 1735 COPY_U128( xmm[9], gst->guest_YMM9 ); 1736 COPY_U128( xmm[10], gst->guest_YMM10 ); 1737 COPY_U128( xmm[11], gst->guest_YMM11 ); 1738 COPY_U128( xmm[12], gst->guest_YMM12 ); 1739 COPY_U128( xmm[13], gst->guest_YMM13 ); 1740 COPY_U128( xmm[14], gst->guest_YMM14 ); 1741 COPY_U128( xmm[15], gst->guest_YMM15 ); 1742 1743 # undef COPY_U128 1744 } 1745 1746 1747 /* CALLED FROM GENERATED CODE */ 1748 /* DIRTY HELPER (writes guest state, reads guest mem) */ 1749 VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr ) 1750 { 1751 Fpu_State tmp; 1752 VexEmWarn warnX87 = EmWarn_NONE; 1753 VexEmWarn warnXMM = EmWarn_NONE; 1754 UShort* addrS = (UShort*)addr; 1755 UChar* addrC = (UChar*)addr; 1756 U128* xmm = (U128*)(addr + 160); 1757 UShort fp_tags; 1758 Int r, stno, i; 1759 1760 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need 1761 to be byte-swapped. */ 1762 vassert(host_is_little_endian()); 1763 1764 # define COPY_U128(_dst,_src) \ 1765 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \ 1766 _dst[2] = _src[2]; _dst[3] = _src[3]; } \ 1767 while (0) 1768 1769 COPY_U128( gst->guest_YMM0, xmm[0] ); 1770 COPY_U128( gst->guest_YMM1, xmm[1] ); 1771 COPY_U128( gst->guest_YMM2, xmm[2] ); 1772 COPY_U128( gst->guest_YMM3, xmm[3] ); 1773 COPY_U128( gst->guest_YMM4, xmm[4] ); 1774 COPY_U128( gst->guest_YMM5, xmm[5] ); 1775 COPY_U128( gst->guest_YMM6, xmm[6] ); 1776 COPY_U128( gst->guest_YMM7, xmm[7] ); 1777 COPY_U128( gst->guest_YMM8, xmm[8] ); 1778 COPY_U128( gst->guest_YMM9, xmm[9] ); 1779 COPY_U128( gst->guest_YMM10, xmm[10] ); 1780 COPY_U128( gst->guest_YMM11, xmm[11] ); 1781 COPY_U128( gst->guest_YMM12, xmm[12] ); 1782 COPY_U128( gst->guest_YMM13, xmm[13] ); 1783 COPY_U128( gst->guest_YMM14, xmm[14] ); 1784 COPY_U128( gst->guest_YMM15, xmm[15] ); 1785 1786 # undef COPY_U128 1787 1788 /* Copy the x87 registers out of the image, into a temporary 1789 Fpu_State struct. 
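   This undoes the layout produced by FXSAVE above: the ten live bytes
   of each 16-byte register slot are pulled out, and the abridged tag
   byte at image offset 4 is expanded back into a full tag word with
   2 bits per register.  Only empty vs valid can be recovered from the
   abridged form, hence the imprecision noted below.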
*/ 1790 for (i = 0; i < 14; i++) tmp.env[i] = 0; 1791 for (i = 0; i < 80; i++) tmp.reg[i] = 0; 1792 /* fill in tmp.reg[0..7] */ 1793 for (stno = 0; stno < 8; stno++) { 1794 UShort* dstS = (UShort*)(&tmp.reg[10*stno]); 1795 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]); 1796 dstS[0] = srcS[0]; 1797 dstS[1] = srcS[1]; 1798 dstS[2] = srcS[2]; 1799 dstS[3] = srcS[3]; 1800 dstS[4] = srcS[4]; 1801 } 1802 /* fill in tmp.env[0..13] */ 1803 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */ 1804 tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */ 1805 1806 fp_tags = 0; 1807 for (r = 0; r < 8; r++) { 1808 if (addrC[4] & (1<<r)) 1809 fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */ 1810 else 1811 fp_tags |= (3 << (2*r)); /* EMPTY */ 1812 } 1813 tmp.env[FP_ENV_TAG] = fp_tags; 1814 1815 /* Now write 'tmp' into the guest state. */ 1816 warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst ); 1817 1818 { UInt w32 = (((UInt)addrS[12]) & 0xFFFF) 1819 | ((((UInt)addrS[13]) & 0xFFFF) << 16); 1820 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 ); 1821 1822 warnXMM = (VexEmWarn)(w64 >> 32); 1823 1824 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL; 1825 } 1826 1827 /* Prefer an X87 emwarn over an XMM one, if both exist. */ 1828 if (warnX87 != EmWarn_NONE) 1829 return warnX87; 1830 else 1831 return warnXMM; 1832 } 1833 1834 1835 /* DIRTY HELPER (writes guest state) */ 1836 /* Initialise the x87 FPU state as per 'finit'. */ 1837 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst ) 1838 { 1839 Int i; 1840 gst->guest_FTOP = 0; 1841 for (i = 0; i < 8; i++) { 1842 gst->guest_FPTAG[i] = 0; /* empty */ 1843 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */ 1844 } 1845 gst->guest_FPROUND = (ULong)Irrm_NEAREST; 1846 gst->guest_FC3210 = 0; 1847 } 1848 1849 1850 /* CALLED FROM GENERATED CODE */ 1851 /* DIRTY HELPER (reads guest memory) */ 1852 ULong amd64g_dirtyhelper_loadF80le ( ULong addrU ) 1853 { 1854 ULong f64; 1855 convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 ); 1856 return f64; 1857 } 1858 1859 /* CALLED FROM GENERATED CODE */ 1860 /* DIRTY HELPER (writes guest memory) */ 1861 void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 ) 1862 { 1863 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) ); 1864 } 1865 1866 1867 /* CALLED FROM GENERATED CODE */ 1868 /* CLEAN HELPER */ 1869 /* mxcsr[15:0] contains an SSE native format MXCSR value. 1870 Extract from it the required SSEROUND value and any resulting 1871 emulation warning, and return (warn << 32) | sseround value. 1872 */ 1873 ULong amd64g_check_ldmxcsr ( ULong mxcsr ) 1874 { 1875 /* Decide on a rounding mode. mxcsr[14:13] holds it. */ 1876 /* NOTE, encoded exactly as per enum IRRoundingMode. */ 1877 ULong rmode = (mxcsr >> 13) & 3; 1878 1879 /* Detect any required emulation warnings. */ 1880 VexEmWarn ew = EmWarn_NONE; 1881 1882 if ((mxcsr & 0x1F80) != 0x1F80) { 1883 /* unmasked exceptions! */ 1884 ew = EmWarn_X86_sseExns; 1885 } 1886 else 1887 if (mxcsr & (1<<15)) { 1888 /* FZ is set */ 1889 ew = EmWarn_X86_fz; 1890 } 1891 else 1892 if (mxcsr & (1<<6)) { 1893 /* DAZ is set */ 1894 ew = EmWarn_X86_daz; 1895 } 1896 1897 return (((ULong)ew) << 32) | ((ULong)rmode); 1898 } 1899 1900 1901 /* CALLED FROM GENERATED CODE */ 1902 /* CLEAN HELPER */ 1903 /* Given sseround as an IRRoundingMode value, create a suitable SSE 1904 native format MXCSR value. 
*/ 1905 ULong amd64g_create_mxcsr ( ULong sseround ) 1906 { 1907 sseround &= 3; 1908 return 0x1F80 | (sseround << 13); 1909 } 1910 1911 1912 /* CLEAN HELPER */ 1913 /* fpucw[15:0] contains a x87 native format FPU control word. 1914 Extract from it the required FPROUND value and any resulting 1915 emulation warning, and return (warn << 32) | fpround value. 1916 */ 1917 ULong amd64g_check_fldcw ( ULong fpucw ) 1918 { 1919 /* Decide on a rounding mode. fpucw[11:10] holds it. */ 1920 /* NOTE, encoded exactly as per enum IRRoundingMode. */ 1921 ULong rmode = (fpucw >> 10) & 3; 1922 1923 /* Detect any required emulation warnings. */ 1924 VexEmWarn ew = EmWarn_NONE; 1925 1926 if ((fpucw & 0x3F) != 0x3F) { 1927 /* unmasked exceptions! */ 1928 ew = EmWarn_X86_x87exns; 1929 } 1930 else 1931 if (((fpucw >> 8) & 3) != 3) { 1932 /* unsupported precision */ 1933 ew = EmWarn_X86_x87precision; 1934 } 1935 1936 return (((ULong)ew) << 32) | ((ULong)rmode); 1937 } 1938 1939 1940 /* CLEAN HELPER */ 1941 /* Given fpround as an IRRoundingMode value, create a suitable x87 1942 native format FPU control word. */ 1943 ULong amd64g_create_fpucw ( ULong fpround ) 1944 { 1945 fpround &= 3; 1946 return 0x037F | (fpround << 10); 1947 } 1948 1949 1950 /* This is used to implement 'fldenv'. 1951 Reads 28 bytes at x87_state[0 .. 27]. */ 1952 /* CALLED FROM GENERATED CODE */ 1953 /* DIRTY HELPER */ 1954 VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state, 1955 /*IN*/HWord x87_state) 1956 { 1957 return do_put_x87( False, (UChar*)x87_state, vex_state ); 1958 } 1959 1960 1961 /* CALLED FROM GENERATED CODE */ 1962 /* DIRTY HELPER */ 1963 /* Create an x87 FPU env from the guest state, as close as we can 1964 approximate it. Writes 28 bytes at x87_state[0..27]. */ 1965 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state, 1966 /*OUT*/HWord x87_state ) 1967 { 1968 Int i, stno, preg; 1969 UInt tagw; 1970 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]); 1971 Fpu_State* x87 = (Fpu_State*)x87_state; 1972 UInt ftop = vex_state->guest_FTOP; 1973 ULong c3210 = vex_state->guest_FC3210; 1974 1975 for (i = 0; i < 14; i++) 1976 x87->env[i] = 0; 1977 1978 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF; 1979 x87->env[FP_ENV_STAT] 1980 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) )); 1981 x87->env[FP_ENV_CTRL] 1982 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) )); 1983 1984 /* Compute the x87 tag word. */ 1985 tagw = 0; 1986 for (stno = 0; stno < 8; stno++) { 1987 preg = (stno + ftop) & 7; 1988 if (vexTags[preg] == 0) { 1989 /* register is empty */ 1990 tagw |= (3 << (2*preg)); 1991 } else { 1992 /* register is full. */ 1993 tagw |= (0 << (2*preg)); 1994 } 1995 } 1996 x87->env[FP_ENV_TAG] = toUShort(tagw); 1997 1998 /* We don't dump the x87 registers, tho. */ 1999 } 2000 2001 2002 /* This is used to implement 'fnsave'. 2003 Writes 108 bytes at x87_state[0 .. 107]. */ 2004 /* CALLED FROM GENERATED CODE */ 2005 /* DIRTY HELPER */ 2006 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state, 2007 /*OUT*/HWord x87_state) 2008 { 2009 do_get_x87( vex_state, (UChar*)x87_state ); 2010 } 2011 2012 2013 /* This is used to implement 'fnsaves'. 2014 Writes 94 bytes at x87_state[0 .. 93]. 
*/ 2015 /* CALLED FROM GENERATED CODE */ 2016 /* DIRTY HELPER */ 2017 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state, 2018 /*OUT*/HWord x87_state) 2019 { 2020 Int i, stno, preg; 2021 UInt tagw; 2022 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]); 2023 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]); 2024 Fpu_State_16* x87 = (Fpu_State_16*)x87_state; 2025 UInt ftop = vex_state->guest_FTOP; 2026 UInt c3210 = vex_state->guest_FC3210; 2027 2028 for (i = 0; i < 7; i++) 2029 x87->env[i] = 0; 2030 2031 x87->env[FPS_ENV_STAT] 2032 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700)); 2033 x87->env[FPS_ENV_CTRL] 2034 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND )); 2035 2036 /* Dump the register stack in ST order. */ 2037 tagw = 0; 2038 for (stno = 0; stno < 8; stno++) { 2039 preg = (stno + ftop) & 7; 2040 if (vexTags[preg] == 0) { 2041 /* register is empty */ 2042 tagw |= (3 << (2*preg)); 2043 convert_f64le_to_f80le( (UChar*)&vexRegs[preg], 2044 &x87->reg[10*stno] ); 2045 } else { 2046 /* register is full. */ 2047 tagw |= (0 << (2*preg)); 2048 convert_f64le_to_f80le( (UChar*)&vexRegs[preg], 2049 &x87->reg[10*stno] ); 2050 } 2051 } 2052 x87->env[FPS_ENV_TAG] = toUShort(tagw); 2053 } 2054 2055 2056 /* This is used to implement 'frstor'. 2057 Reads 108 bytes at x87_state[0 .. 107]. */ 2058 /* CALLED FROM GENERATED CODE */ 2059 /* DIRTY HELPER */ 2060 VexEmWarn amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state, 2061 /*IN*/HWord x87_state) 2062 { 2063 return do_put_x87( True, (UChar*)x87_state, vex_state ); 2064 } 2065 2066 2067 /* This is used to implement 'frstors'. 2068 Reads 94 bytes at x87_state[0 .. 93]. */ 2069 /* CALLED FROM GENERATED CODE */ 2070 /* DIRTY HELPER */ 2071 VexEmWarn amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state, 2072 /*IN*/HWord x87_state) 2073 { 2074 Int stno, preg; 2075 UInt tag; 2076 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]); 2077 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]); 2078 Fpu_State_16* x87 = (Fpu_State_16*)x87_state; 2079 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7; 2080 UInt tagw = x87->env[FPS_ENV_TAG]; 2081 UInt fpucw = x87->env[FPS_ENV_CTRL]; 2082 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700; 2083 VexEmWarn ew; 2084 UInt fpround; 2085 ULong pair; 2086 2087 /* Copy registers and tags */ 2088 for (stno = 0; stno < 8; stno++) { 2089 preg = (stno + ftop) & 7; 2090 tag = (tagw >> (2*preg)) & 3; 2091 if (tag == 3) { 2092 /* register is empty */ 2093 /* hmm, if it's empty, does it still get written? Probably 2094 safer to say it does. If we don't, memcheck could get out 2095 of sync, in that it thinks all FP registers are defined by 2096 this helper, but in reality some have not been updated. */ 2097 vexRegs[preg] = 0; /* IEEE754 64-bit zero */ 2098 vexTags[preg] = 0; 2099 } else { 2100 /* register is non-empty */ 2101 convert_f80le_to_f64le( &x87->reg[10*stno], 2102 (UChar*)&vexRegs[preg] ); 2103 vexTags[preg] = 1; 2104 } 2105 } 2106 2107 /* stack pointer */ 2108 vex_state->guest_FTOP = ftop; 2109 2110 /* status word */ 2111 vex_state->guest_FC3210 = c3210; 2112 2113 /* handle the control word, setting FPROUND and detecting any 2114 emulation warnings. 
*/ 2115 pair = amd64g_check_fldcw ( (ULong)fpucw ); 2116 fpround = (UInt)pair & 0xFFFFFFFFULL; 2117 ew = (VexEmWarn)(pair >> 32); 2118 2119 vex_state->guest_FPROUND = fpround & 3; 2120 2121 /* emulation warnings --> caller */ 2122 return ew; 2123 } 2124 2125 2126 /*---------------------------------------------------------------*/ 2127 /*--- Misc integer helpers, including rotates and CPUID. ---*/ 2128 /*---------------------------------------------------------------*/ 2129 2130 /* Claim to be the following CPU, which is probably representative of 2131 the lowliest (earliest) amd64 offerings. It can do neither sse3 2132 nor cx16. 2133 2134 vendor_id : AuthenticAMD 2135 cpu family : 15 2136 model : 5 2137 model name : AMD Opteron (tm) Processor 848 2138 stepping : 10 2139 cpu MHz : 1797.682 2140 cache size : 1024 KB 2141 fpu : yes 2142 fpu_exception : yes 2143 cpuid level : 1 2144 wp : yes 2145 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2146 mtrr pge mca cmov pat pse36 clflush mmx fxsr 2147 sse sse2 syscall nx mmxext lm 3dnowext 3dnow 2148 bogomips : 3600.62 2149 TLB size : 1088 4K pages 2150 clflush size : 64 2151 cache_alignment : 64 2152 address sizes : 40 bits physical, 48 bits virtual 2153 power management: ts fid vid ttp 2154 2155 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact 2156 we don't support them. See #291568. 3dnow is 80000001.EDX.31 2157 and 3dnowext is 80000001.EDX.30. 2158 */ 2159 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st ) 2160 { 2161 # define SET_ABCD(_a,_b,_c,_d) \ 2162 do { st->guest_RAX = (ULong)(_a); \ 2163 st->guest_RBX = (ULong)(_b); \ 2164 st->guest_RCX = (ULong)(_c); \ 2165 st->guest_RDX = (ULong)(_d); \ 2166 } while (0) 2167 2168 switch (0xFFFFFFFF & st->guest_RAX) { 2169 case 0x00000000: 2170 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65); 2171 break; 2172 case 0x00000001: 2173 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff); 2174 break; 2175 case 0x80000000: 2176 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65); 2177 break; 2178 case 0x80000001: 2179 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is 2180 the original it-is-supported value that the h/w provides. 2181 See #291568. */ 2182 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/ 2183 0x21d3fbff); 2184 break; 2185 case 0x80000002: 2186 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428); 2187 break; 2188 case 0x80000003: 2189 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834); 2190 break; 2191 case 0x80000004: 2192 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2193 break; 2194 case 0x80000005: 2195 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140); 2196 break; 2197 case 0x80000006: 2198 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000); 2199 break; 2200 case 0x80000007: 2201 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f); 2202 break; 2203 case 0x80000008: 2204 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000); 2205 break; 2206 default: 2207 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2208 break; 2209 } 2210 # undef SET_ABCD 2211 } 2212 2213 2214 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16 2215 capable. 
2216 2217 vendor_id : GenuineIntel 2218 cpu family : 6 2219 model : 15 2220 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz 2221 stepping : 6 2222 cpu MHz : 2394.000 2223 cache size : 4096 KB 2224 physical id : 0 2225 siblings : 2 2226 core id : 0 2227 cpu cores : 2 2228 fpu : yes 2229 fpu_exception : yes 2230 cpuid level : 10 2231 wp : yes 2232 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2233 mtrr pge mca cmov pat pse36 clflush dts acpi 2234 mmx fxsr sse sse2 ss ht tm syscall nx lm 2235 constant_tsc pni monitor ds_cpl vmx est tm2 2236 cx16 xtpr lahf_lm 2237 bogomips : 4798.78 2238 clflush size : 64 2239 cache_alignment : 64 2240 address sizes : 36 bits physical, 48 bits virtual 2241 power management: 2242 */ 2243 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st ) 2244 { 2245 # define SET_ABCD(_a,_b,_c,_d) \ 2246 do { st->guest_RAX = (ULong)(_a); \ 2247 st->guest_RBX = (ULong)(_b); \ 2248 st->guest_RCX = (ULong)(_c); \ 2249 st->guest_RDX = (ULong)(_d); \ 2250 } while (0) 2251 2252 switch (0xFFFFFFFF & st->guest_RAX) { 2253 case 0x00000000: 2254 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69); 2255 break; 2256 case 0x00000001: 2257 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff); 2258 break; 2259 case 0x00000002: 2260 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049); 2261 break; 2262 case 0x00000003: 2263 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2264 break; 2265 case 0x00000004: { 2266 switch (0xFFFFFFFF & st->guest_RCX) { 2267 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f, 2268 0x0000003f, 0x00000001); break; 2269 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f, 2270 0x0000003f, 0x00000001); break; 2271 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f, 2272 0x00000fff, 0x00000001); break; 2273 default: SET_ABCD(0x00000000, 0x00000000, 2274 0x00000000, 0x00000000); break; 2275 } 2276 break; 2277 } 2278 case 0x00000005: 2279 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020); 2280 break; 2281 case 0x00000006: 2282 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000); 2283 break; 2284 case 0x00000007: 2285 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2286 break; 2287 case 0x00000008: 2288 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000); 2289 break; 2290 case 0x00000009: 2291 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2292 break; 2293 case 0x0000000a: 2294 unhandled_eax_value: 2295 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000); 2296 break; 2297 case 0x80000000: 2298 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); 2299 break; 2300 case 0x80000001: 2301 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800); 2302 break; 2303 case 0x80000002: 2304 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865); 2305 break; 2306 case 0x80000003: 2307 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020); 2308 break; 2309 case 0x80000004: 2310 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847); 2311 break; 2312 case 0x80000005: 2313 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2314 break; 2315 case 0x80000006: 2316 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000); 2317 break; 2318 case 0x80000007: 2319 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2320 break; 2321 case 0x80000008: 2322 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); 2323 break; 2324 default: 2325 goto unhandled_eax_value; 2326 } 2327 # undef SET_ABCD 2328 } 2329 2330 2331 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16 
2332 capable. 2333 2334 vendor_id : GenuineIntel 2335 cpu family : 6 2336 model : 37 2337 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz 2338 stepping : 2 2339 cpu MHz : 3334.000 2340 cache size : 4096 KB 2341 physical id : 0 2342 siblings : 4 2343 core id : 0 2344 cpu cores : 2 2345 apicid : 0 2346 initial apicid : 0 2347 fpu : yes 2348 fpu_exception : yes 2349 cpuid level : 11 2350 wp : yes 2351 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2352 mtrr pge mca cmov pat pse36 clflush dts acpi 2353 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp 2354 lm constant_tsc arch_perfmon pebs bts rep_good 2355 xtopology nonstop_tsc aperfmperf pni pclmulqdq 2356 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 2357 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida 2358 arat tpr_shadow vnmi flexpriority ept vpid 2359 bogomips : 6957.57 2360 clflush size : 64 2361 cache_alignment : 64 2362 address sizes : 36 bits physical, 48 bits virtual 2363 power management: 2364 */ 2365 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st ) 2366 { 2367 # define SET_ABCD(_a,_b,_c,_d) \ 2368 do { st->guest_RAX = (ULong)(_a); \ 2369 st->guest_RBX = (ULong)(_b); \ 2370 st->guest_RCX = (ULong)(_c); \ 2371 st->guest_RDX = (ULong)(_d); \ 2372 } while (0) 2373 2374 UInt old_eax = (UInt)st->guest_RAX; 2375 UInt old_ecx = (UInt)st->guest_RCX; 2376 2377 switch (old_eax) { 2378 case 0x00000000: 2379 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69); 2380 break; 2381 case 0x00000001: 2382 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff); 2383 break; 2384 case 0x00000002: 2385 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c); 2386 break; 2387 case 0x00000003: 2388 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2389 break; 2390 case 0x00000004: 2391 switch (old_ecx) { 2392 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f, 2393 0x0000003f, 0x00000000); break; 2394 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f, 2395 0x0000007f, 0x00000000); break; 2396 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f, 2397 0x000001ff, 0x00000000); break; 2398 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f, 2399 0x00000fff, 0x00000002); break; 2400 default: SET_ABCD(0x00000000, 0x00000000, 2401 0x00000000, 0x00000000); break; 2402 } 2403 break; 2404 case 0x00000005: 2405 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120); 2406 break; 2407 case 0x00000006: 2408 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000); 2409 break; 2410 case 0x00000007: 2411 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2412 break; 2413 case 0x00000008: 2414 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2415 break; 2416 case 0x00000009: 2417 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2418 break; 2419 case 0x0000000a: 2420 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603); 2421 break; 2422 case 0x0000000b: 2423 switch (old_ecx) { 2424 case 0x00000000: 2425 SET_ABCD(0x00000001, 0x00000002, 2426 0x00000100, 0x00000000); break; 2427 case 0x00000001: 2428 SET_ABCD(0x00000004, 0x00000004, 2429 0x00000201, 0x00000000); break; 2430 default: 2431 SET_ABCD(0x00000000, 0x00000000, 2432 old_ecx, 0x00000000); break; 2433 } 2434 break; 2435 case 0x0000000c: 2436 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000); 2437 break; 2438 case 0x0000000d: 2439 switch (old_ecx) { 2440 case 0x00000000: SET_ABCD(0x00000001, 0x00000002, 2441 0x00000100, 0x00000000); break; 2442 case 0x00000001: SET_ABCD(0x00000004, 0x00000004, 2443 0x00000201, 0x00000000); break; 2444 
default: SET_ABCD(0x00000000, 0x00000000, 2445 old_ecx, 0x00000000); break; 2446 } 2447 break; 2448 case 0x80000000: 2449 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); 2450 break; 2451 case 0x80000001: 2452 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800); 2453 break; 2454 case 0x80000002: 2455 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865); 2456 break; 2457 case 0x80000003: 2458 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020); 2459 break; 2460 case 0x80000004: 2461 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847); 2462 break; 2463 case 0x80000005: 2464 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2465 break; 2466 case 0x80000006: 2467 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000); 2468 break; 2469 case 0x80000007: 2470 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100); 2471 break; 2472 case 0x80000008: 2473 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); 2474 break; 2475 default: 2476 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000); 2477 break; 2478 } 2479 # undef SET_ABCD 2480 } 2481 2482 2483 /* Claim to be the following CPU (4 x ...), which is AVX and cx16 2484 capable. 2485 2486 vendor_id : GenuineIntel 2487 cpu family : 6 2488 model : 42 2489 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz 2490 stepping : 7 2491 cpu MHz : 1600.000 2492 cache size : 6144 KB 2493 physical id : 0 2494 siblings : 4 2495 core id : 3 2496 cpu cores : 4 2497 apicid : 6 2498 initial apicid : 6 2499 fpu : yes 2500 fpu_exception : yes 2501 cpuid level : 13 2502 wp : yes 2503 flags : fpu vme de pse tsc msr pae mce cx8 apic sep 2504 mtrr pge mca cmov pat pse36 clflush dts acpi 2505 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp 2506 lm constant_tsc arch_perfmon pebs bts rep_good 2507 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq 2508 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 2509 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx 2510 lahf_lm ida arat epb xsaveopt pln pts dts 2511 tpr_shadow vnmi flexpriority ept vpid 2512 2513 bogomips : 5768.94 2514 clflush size : 64 2515 cache_alignment : 64 2516 address sizes : 36 bits physical, 48 bits virtual 2517 power management: 2518 */ 2519 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st ) 2520 { 2521 # define SET_ABCD(_a,_b,_c,_d) \ 2522 do { st->guest_RAX = (ULong)(_a); \ 2523 st->guest_RBX = (ULong)(_b); \ 2524 st->guest_RCX = (ULong)(_c); \ 2525 st->guest_RDX = (ULong)(_d); \ 2526 } while (0) 2527 2528 UInt old_eax = (UInt)st->guest_RAX; 2529 UInt old_ecx = (UInt)st->guest_RCX; 2530 2531 switch (old_eax) { 2532 case 0x00000000: 2533 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69); 2534 break; 2535 case 0x00000001: 2536 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff); 2537 break; 2538 case 0x00000002: 2539 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000); 2540 break; 2541 case 0x00000003: 2542 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2543 break; 2544 case 0x00000004: 2545 switch (old_ecx) { 2546 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f, 2547 0x0000003f, 0x00000000); break; 2548 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f, 2549 0x0000003f, 0x00000000); break; 2550 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f, 2551 0x000001ff, 0x00000000); break; 2552 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f, 2553 0x00001fff, 0x00000006); break; 2554 default: SET_ABCD(0x00000000, 0x00000000, 2555 0x00000000, 0x00000000); break; 2556 } 2557 break; 2558 case 0x00000005: 2559 SET_ABCD(0x00000040, 
0x00000040, 0x00000003, 0x00001120); 2560 break; 2561 case 0x00000006: 2562 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000); 2563 break; 2564 case 0x00000007: 2565 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2566 break; 2567 case 0x00000008: 2568 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2569 break; 2570 case 0x00000009: 2571 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2572 break; 2573 case 0x0000000a: 2574 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603); 2575 break; 2576 case 0x0000000b: 2577 switch (old_ecx) { 2578 case 0x00000000: 2579 SET_ABCD(0x00000001, 0x00000001, 2580 0x00000100, 0x00000000); break; 2581 case 0x00000001: 2582 SET_ABCD(0x00000004, 0x00000004, 2583 0x00000201, 0x00000000); break; 2584 default: 2585 SET_ABCD(0x00000000, 0x00000000, 2586 old_ecx, 0x00000000); break; 2587 } 2588 break; 2589 case 0x0000000c: 2590 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2591 break; 2592 case 0x0000000d: 2593 switch (old_ecx) { 2594 case 0x00000000: SET_ABCD(0x00000007, 0x00000340, 2595 0x00000340, 0x00000000); break; 2596 case 0x00000001: SET_ABCD(0x00000001, 0x00000000, 2597 0x00000000, 0x00000000); break; 2598 case 0x00000002: SET_ABCD(0x00000100, 0x00000240, 2599 0x00000000, 0x00000000); break; 2600 default: SET_ABCD(0x00000000, 0x00000000, 2601 0x00000000, 0x00000000); break; 2602 } 2603 break; 2604 case 0x0000000e: 2605 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000); 2606 break; 2607 case 0x0000000f: 2608 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000); 2609 break; 2610 case 0x80000000: 2611 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); 2612 break; 2613 case 0x80000001: 2614 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800); 2615 break; 2616 case 0x80000002: 2617 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c); 2618 break; 2619 case 0x80000003: 2620 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d); 2621 break; 2622 case 0x80000004: 2623 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847); 2624 break; 2625 case 0x80000005: 2626 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 2627 break; 2628 case 0x80000006: 2629 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000); 2630 break; 2631 case 0x80000007: 2632 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100); 2633 break; 2634 case 0x80000008: 2635 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); 2636 break; 2637 default: 2638 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000); 2639 break; 2640 } 2641 # undef SET_ABCD 2642 } 2643 2644 2645 ULong amd64g_calculate_RCR ( ULong arg, 2646 ULong rot_amt, 2647 ULong rflags_in, 2648 Long szIN ) 2649 { 2650 Bool wantRflags = toBool(szIN < 0); 2651 ULong sz = wantRflags ? (-szIN) : szIN; 2652 ULong tempCOUNT = rot_amt & (sz == 8 ? 
0x3F : 0x1F); 2653 ULong cf=0, of=0, tempcf; 2654 2655 switch (sz) { 2656 case 8: 2657 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2658 of = ((arg >> 63) ^ cf) & 1; 2659 while (tempCOUNT > 0) { 2660 tempcf = arg & 1; 2661 arg = (arg >> 1) | (cf << 63); 2662 cf = tempcf; 2663 tempCOUNT--; 2664 } 2665 break; 2666 case 4: 2667 while (tempCOUNT >= 33) tempCOUNT -= 33; 2668 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2669 of = ((arg >> 31) ^ cf) & 1; 2670 while (tempCOUNT > 0) { 2671 tempcf = arg & 1; 2672 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31); 2673 cf = tempcf; 2674 tempCOUNT--; 2675 } 2676 break; 2677 case 2: 2678 while (tempCOUNT >= 17) tempCOUNT -= 17; 2679 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2680 of = ((arg >> 15) ^ cf) & 1; 2681 while (tempCOUNT > 0) { 2682 tempcf = arg & 1; 2683 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15); 2684 cf = tempcf; 2685 tempCOUNT--; 2686 } 2687 break; 2688 case 1: 2689 while (tempCOUNT >= 9) tempCOUNT -= 9; 2690 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2691 of = ((arg >> 7) ^ cf) & 1; 2692 while (tempCOUNT > 0) { 2693 tempcf = arg & 1; 2694 arg = ((arg >> 1) & 0x7FULL) | (cf << 7); 2695 cf = tempcf; 2696 tempCOUNT--; 2697 } 2698 break; 2699 default: 2700 vpanic("calculate_RCR(amd64g): invalid size"); 2701 } 2702 2703 cf &= 1; 2704 of &= 1; 2705 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O); 2706 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O); 2707 2708 /* caller can ask to have back either the resulting flags or 2709 resulting value, but not both */ 2710 return wantRflags ? rflags_in : arg; 2711 } 2712 2713 ULong amd64g_calculate_RCL ( ULong arg, 2714 ULong rot_amt, 2715 ULong rflags_in, 2716 Long szIN ) 2717 { 2718 Bool wantRflags = toBool(szIN < 0); 2719 ULong sz = wantRflags ? (-szIN) : szIN; 2720 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F); 2721 ULong cf=0, of=0, tempcf; 2722 2723 switch (sz) { 2724 case 8: 2725 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2726 while (tempCOUNT > 0) { 2727 tempcf = (arg >> 63) & 1; 2728 arg = (arg << 1) | (cf & 1); 2729 cf = tempcf; 2730 tempCOUNT--; 2731 } 2732 of = ((arg >> 63) ^ cf) & 1; 2733 break; 2734 case 4: 2735 while (tempCOUNT >= 33) tempCOUNT -= 33; 2736 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2737 while (tempCOUNT > 0) { 2738 tempcf = (arg >> 31) & 1; 2739 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1)); 2740 cf = tempcf; 2741 tempCOUNT--; 2742 } 2743 of = ((arg >> 31) ^ cf) & 1; 2744 break; 2745 case 2: 2746 while (tempCOUNT >= 17) tempCOUNT -= 17; 2747 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2748 while (tempCOUNT > 0) { 2749 tempcf = (arg >> 15) & 1; 2750 arg = 0xFFFFULL & ((arg << 1) | (cf & 1)); 2751 cf = tempcf; 2752 tempCOUNT--; 2753 } 2754 of = ((arg >> 15) ^ cf) & 1; 2755 break; 2756 case 1: 2757 while (tempCOUNT >= 9) tempCOUNT -= 9; 2758 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1; 2759 while (tempCOUNT > 0) { 2760 tempcf = (arg >> 7) & 1; 2761 arg = 0xFFULL & ((arg << 1) | (cf & 1)); 2762 cf = tempcf; 2763 tempCOUNT--; 2764 } 2765 of = ((arg >> 7) ^ cf) & 1; 2766 break; 2767 default: 2768 vpanic("calculate_RCL(amd64g): invalid size"); 2769 } 2770 2771 cf &= 1; 2772 of &= 1; 2773 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O); 2774 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O); 2775 2776 return wantRflags ? 
rflags_in : arg; 2777 } 2778 2779 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+) 2780 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25 2781 */ 2782 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which) 2783 { 2784 ULong hi, lo, tmp, A[16]; 2785 2786 A[0] = 0; A[1] = a; 2787 A[2] = A[1] << 1; A[3] = A[2] ^ a; 2788 A[4] = A[2] << 1; A[5] = A[4] ^ a; 2789 A[6] = A[3] << 1; A[7] = A[6] ^ a; 2790 A[8] = A[4] << 1; A[9] = A[8] ^ a; 2791 A[10] = A[5] << 1; A[11] = A[10] ^ a; 2792 A[12] = A[6] << 1; A[13] = A[12] ^ a; 2793 A[14] = A[7] << 1; A[15] = A[14] ^ a; 2794 2795 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15]; 2796 hi = lo >> 56; 2797 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15]; 2798 hi = (hi << 8) | (lo >> 56); 2799 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15]; 2800 hi = (hi << 8) | (lo >> 56); 2801 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15]; 2802 hi = (hi << 8) | (lo >> 56); 2803 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15]; 2804 hi = (hi << 8) | (lo >> 56); 2805 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15]; 2806 hi = (hi << 8) | (lo >> 56); 2807 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15]; 2808 hi = (hi << 8) | (lo >> 56); 2809 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15]; 2810 2811 ULong m0 = -1; 2812 m0 /= 255; 2813 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp; 2814 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp; 2815 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp; 2816 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp; 2817 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp; 2818 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp; 2819 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp; 2820 2821 return which ? hi : lo; 2822 } 2823 2824 2825 /* CALLED FROM GENERATED CODE */ 2826 /* DIRTY HELPER (non-referentially-transparent) */ 2827 /* Horrible hack. On non-amd64 platforms, return 1. */ 2828 ULong amd64g_dirtyhelper_RDTSC ( void ) 2829 { 2830 # if defined(__x86_64__) 2831 UInt eax, edx; 2832 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx)); 2833 return (((ULong)edx) << 32) | ((ULong)eax); 2834 # else 2835 return 1ULL; 2836 # endif 2837 } 2838 2839 2840 /* CALLED FROM GENERATED CODE */ 2841 /* DIRTY HELPER (non-referentially-transparent) */ 2842 /* Horrible hack. On non-amd64 platforms, return 0. */ 2843 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ ) 2844 { 2845 # if defined(__x86_64__) 2846 ULong r = 0; 2847 portno &= 0xFFFF; 2848 switch (sz) { 2849 case 4: 2850 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0" 2851 : "=a" (r) : "Nd" (portno)); 2852 break; 2853 case 2: 2854 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0" 2855 : "=a" (r) : "Nd" (portno)); 2856 break; 2857 case 1: 2858 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0" 2859 : "=a" (r) : "Nd" (portno)); 2860 break; 2861 default: 2862 break; /* note: no 64-bit version of insn exists */ 2863 } 2864 return r; 2865 # else 2866 return 0; 2867 # endif 2868 } 2869 2870 2871 /* CALLED FROM GENERATED CODE */ 2872 /* DIRTY HELPER (non-referentially-transparent) */ 2873 /* Horrible hack. On non-amd64 platforms, do nothing. 
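The data value travels in %al/%ax/%eax via the "a" constraint and the port number either as an immediate or in %dx via the "Nd" constraint, matching the operands the out instruction expects.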
*/ 2874 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ ) 2875 { 2876 # if defined(__x86_64__) 2877 portno &= 0xFFFF; 2878 switch (sz) { 2879 case 4: 2880 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1" 2881 : : "a" (data), "Nd" (portno)); 2882 break; 2883 case 2: 2884 __asm__ __volatile__("outw %w0, %w1" 2885 : : "a" (data), "Nd" (portno)); 2886 break; 2887 case 1: 2888 __asm__ __volatile__("outb %b0, %w1" 2889 : : "a" (data), "Nd" (portno)); 2890 break; 2891 default: 2892 break; /* note: no 64-bit version of insn exists */ 2893 } 2894 # else 2895 /* do nothing */ 2896 # endif 2897 } 2898 2899 /* CALLED FROM GENERATED CODE */ 2900 /* DIRTY HELPER (non-referentially-transparent) */ 2901 /* Horrible hack. On non-amd64 platforms, do nothing. */ 2902 /* op = 0: call the native SGDT instruction. 2903 op = 1: call the native SIDT instruction. 2904 */ 2905 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) { 2906 # if defined(__x86_64__) 2907 switch (op) { 2908 case 0: 2909 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory"); 2910 break; 2911 case 1: 2912 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory"); 2913 break; 2914 default: 2915 vpanic("amd64g_dirtyhelper_SxDT"); 2916 } 2917 # else 2918 /* do nothing */ 2919 UChar* p = (UChar*)address; 2920 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0; 2921 p[6] = p[7] = p[8] = p[9] = 0; 2922 # endif 2923 } 2924 2925 /*---------------------------------------------------------------*/ 2926 /*--- Helpers for MMX/SSE/SSE2. ---*/ 2927 /*---------------------------------------------------------------*/ 2928 2929 static inline UChar abdU8 ( UChar xx, UChar yy ) { 2930 return toUChar(xx>yy ? xx-yy : yy-xx); 2931 } 2932 2933 static inline ULong mk32x2 ( UInt w1, UInt w0 ) { 2934 return (((ULong)w1) << 32) | ((ULong)w0); 2935 } 2936 2937 static inline UShort sel16x4_3 ( ULong w64 ) { 2938 UInt hi32 = toUInt(w64 >> 32); 2939 return toUShort(hi32 >> 16); 2940 } 2941 static inline UShort sel16x4_2 ( ULong w64 ) { 2942 UInt hi32 = toUInt(w64 >> 32); 2943 return toUShort(hi32); 2944 } 2945 static inline UShort sel16x4_1 ( ULong w64 ) { 2946 UInt lo32 = toUInt(w64); 2947 return toUShort(lo32 >> 16); 2948 } 2949 static inline UShort sel16x4_0 ( ULong w64 ) { 2950 UInt lo32 = toUInt(w64); 2951 return toUShort(lo32); 2952 } 2953 2954 static inline UChar sel8x8_7 ( ULong w64 ) { 2955 UInt hi32 = toUInt(w64 >> 32); 2956 return toUChar(hi32 >> 24); 2957 } 2958 static inline UChar sel8x8_6 ( ULong w64 ) { 2959 UInt hi32 = toUInt(w64 >> 32); 2960 return toUChar(hi32 >> 16); 2961 } 2962 static inline UChar sel8x8_5 ( ULong w64 ) { 2963 UInt hi32 = toUInt(w64 >> 32); 2964 return toUChar(hi32 >> 8); 2965 } 2966 static inline UChar sel8x8_4 ( ULong w64 ) { 2967 UInt hi32 = toUInt(w64 >> 32); 2968 return toUChar(hi32 >> 0); 2969 } 2970 static inline UChar sel8x8_3 ( ULong w64 ) { 2971 UInt lo32 = toUInt(w64); 2972 return toUChar(lo32 >> 24); 2973 } 2974 static inline UChar sel8x8_2 ( ULong w64 ) { 2975 UInt lo32 = toUInt(w64); 2976 return toUChar(lo32 >> 16); 2977 } 2978 static inline UChar sel8x8_1 ( ULong w64 ) { 2979 UInt lo32 = toUInt(w64); 2980 return toUChar(lo32 >> 8); 2981 } 2982 static inline UChar sel8x8_0 ( ULong w64 ) { 2983 UInt lo32 = toUInt(w64); 2984 return toUChar(lo32 >> 0); 2985 } 2986 2987 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 2988 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy ) 2989 { 2990 return 2991 mk32x2( 2992 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy))) 2993 + 
(((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))), 2994 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy))) 2995 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy))) 2996 ); 2997 } 2998 2999 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3000 ULong amd64g_calculate_mmx_pmovmskb ( ULong xx ) 3001 { 3002 ULong r = 0; 3003 if (xx & (1ULL << (64-1))) r |= (1<<7); 3004 if (xx & (1ULL << (56-1))) r |= (1<<6); 3005 if (xx & (1ULL << (48-1))) r |= (1<<5); 3006 if (xx & (1ULL << (40-1))) r |= (1<<4); 3007 if (xx & (1ULL << (32-1))) r |= (1<<3); 3008 if (xx & (1ULL << (24-1))) r |= (1<<2); 3009 if (xx & (1ULL << (16-1))) r |= (1<<1); 3010 if (xx & (1ULL << ( 8-1))) r |= (1<<0); 3011 return r; 3012 } 3013 3014 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3015 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy ) 3016 { 3017 UInt t = 0; 3018 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) ); 3019 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) ); 3020 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) ); 3021 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) ); 3022 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) ); 3023 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) ); 3024 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) ); 3025 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) ); 3026 t &= 0xFFFF; 3027 return (ULong)t; 3028 } 3029 3030 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3031 ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ) 3032 { 3033 ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi ); 3034 ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo ); 3035 return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF); 3036 } 3037 3038 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3039 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi ) 3040 { 3041 UShort t, min; 3042 UInt idx; 3043 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; } 3044 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; } 3045 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; } 3046 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; } 3047 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; } 3048 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; } 3049 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; } 3050 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; } 3051 return ((ULong)(idx << 16)) | ((ULong)min); 3052 } 3053 3054 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3055 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b ) 3056 { 3057 UInt i; 3058 ULong crc = (b & 0xFFULL) ^ crcIn; 3059 for (i = 0; i < 8; i++) 3060 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0); 3061 return crc; 3062 } 3063 3064 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3065 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w ) 3066 { 3067 UInt i; 3068 ULong crc = (w & 0xFFFFULL) ^ crcIn; 3069 for (i = 0; i < 16; i++) 3070 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0); 3071 return crc; 3072 } 3073 3074 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3075 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l ) 3076 { 3077 UInt i; 3078 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn; 3079 for (i = 0; i < 32; i++) 3080 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0); 3081 return crc; 3082 } 3083 3084 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3085 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q ) 3086 { 3087 ULong crc = amd64g_calc_crc32l(crcIn, q); 3088 return amd64g_calc_crc32l(crc, q >> 32); 3089 } 3090 3091 3092 /* .. helper for next fn .. 
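sad_8x4 sums the absolute differences of the low four bytes of its two arguments; amd64g_calc_mpsadbw below calls it once per 16-bit lane of the result.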
*/ 3093 static inline ULong sad_8x4 ( ULong xx, ULong yy ) 3094 { 3095 UInt t = 0; 3096 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) ); 3097 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) ); 3098 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) ); 3099 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) ); 3100 return (ULong)t; 3101 } 3102 3103 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 3104 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo, 3105 ULong dHi, ULong dLo, 3106 ULong imm_and_return_control_bit ) 3107 { 3108 UInt imm8 = imm_and_return_control_bit & 7; 3109 Bool calcHi = (imm_and_return_control_bit >> 7) & 1; 3110 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */ 3111 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */ 3112 /* For src we only need 32 bits, so get them into the 3113 lower half of a 64 bit word. */ 3114 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1)); 3115 /* For dst we need to get hold of 56 bits (7 bytes) from a total of 3116 11 bytes. If calculating the low part of the result, need bytes 3117 dstOffsL * 4 + (0 .. 6); if calculating the high part, 3118 dstOffsL * 4 + (4 .. 10). */ 3119 ULong dst; 3120 /* dstOffL = 0, Lo -> 0 .. 6 3121 dstOffL = 1, Lo -> 4 .. 10 3122 dstOffL = 0, Hi -> 4 .. 10 3123 dstOffL = 1, Hi -> 8 .. 14 3124 */ 3125 if (calcHi && dstOffsL) { 3126 /* 8 .. 14 */ 3127 dst = dHi & 0x00FFFFFFFFFFFFFFULL; 3128 } 3129 else if (!calcHi && !dstOffsL) { 3130 /* 0 .. 6 */ 3131 dst = dLo & 0x00FFFFFFFFFFFFFFULL; 3132 } 3133 else { 3134 /* 4 .. 10 */ 3135 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32); 3136 } 3137 ULong r0 = sad_8x4( dst >> 0, src ); 3138 ULong r1 = sad_8x4( dst >> 8, src ); 3139 ULong r2 = sad_8x4( dst >> 16, src ); 3140 ULong r3 = sad_8x4( dst >> 24, src ); 3141 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0; 3142 return res; 3143 } 3144 3145 /*---------------------------------------------------------------*/ 3146 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/ 3147 /*---------------------------------------------------------------*/ 3148 3149 static UInt zmask_from_V128 ( V128* arg ) 3150 { 3151 UInt i, res = 0; 3152 for (i = 0; i < 16; i++) { 3153 res |= ((arg->w8[i] == 0) ? 1 : 0) << i; 3154 } 3155 return res; 3156 } 3157 3158 static UInt zmask_from_V128_wide ( V128* arg ) 3159 { 3160 UInt i, res = 0; 3161 for (i = 0; i < 8; i++) { 3162 res |= ((arg->w16[i] == 0) ? 1 : 0) << i; 3163 } 3164 return res; 3165 } 3166 3167 /* Helps with PCMP{I,E}STR{I,M}. 3168 3169 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, 3170 actually it could be a clean helper, but for the fact that we can't 3171 pass by value 2 x V128 to a clean helper, nor have one returned.) 3172 Reads guest state, writes to guest state for the xSTRM cases, no 3173 accesses of memory, is a pure function. 3174 3175 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so 3176 the callee knows which I/E and I/M variant it is dealing with and 3177 what the specific operation is. 4th byte of opcode is in the range 3178 0x60 to 0x63: 3179 istri 66 0F 3A 63 3180 istrm 66 0F 3A 62 3181 estri 66 0F 3A 61 3182 estrm 66 0F 3A 60 3183 3184 gstOffL and gstOffR are the guest state offsets for the two XMM 3185 register inputs. We never have to deal with the memory case since 3186 that is handled by pre-loading the relevant value into the fake 3187 XMM16 register. 3188 3189 For ESTRx variants, edxIN and eaxIN hold the values of those two 3190 registers. 
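   For those explicit-length cases the length is clamped to the range -16 .. 16 (-8 .. 8 for the 16-bit-element forms), its absolute value n is taken, and a single bit is planted at position n of the validity mask, so the length limit behaves like a terminating zero element.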
3191 3192 In all cases, the bottom 16 bits of the result contain the new 3193 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the 3194 result hold the new %ecx value. For xSTRM variants, the helper 3195 writes the result directly to the guest XMM0. 3196 3197 Declarable side effects: in all cases, reads guest state at 3198 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes 3199 guest_XMM0. 3200 3201 Is expected to be called with opc_and_imm combinations which have 3202 actually been validated, and will assert otherwise. The front 3203 end should ensure we're only called with verified values. 3204 */ 3205 ULong amd64g_dirtyhelper_PCMPxSTRx ( 3206 VexGuestAMD64State* gst, 3207 HWord opc4_and_imm, 3208 HWord gstOffL, HWord gstOffR, 3209 HWord edxIN, HWord eaxIN 3210 ) 3211 { 3212 HWord opc4 = (opc4_and_imm >> 8) & 0xFF; 3213 HWord imm8 = opc4_and_imm & 0xFF; 3214 HWord isISTRx = opc4 & 2; 3215 HWord isxSTRM = (opc4 & 1) ^ 1; 3216 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */ 3217 HWord wide = (imm8 & 1); 3218 3219 // where the args are 3220 V128* argL = (V128*)( ((UChar*)gst) + gstOffL ); 3221 V128* argR = (V128*)( ((UChar*)gst) + gstOffR ); 3222 3223 /* Create the arg validity masks, either from the vectors 3224 themselves or from the supplied edx/eax values. */ 3225 // FIXME: this is only right for the 8-bit data cases. 3226 // At least that is asserted above. 3227 UInt zmaskL, zmaskR; 3228 3229 // temp spot for the resulting flags and vector. 3230 V128 resV; 3231 UInt resOSZACP; 3232 3233 // for checking whether case was handled 3234 Bool ok = False; 3235 3236 if (wide) { 3237 if (isISTRx) { 3238 zmaskL = zmask_from_V128_wide(argL); 3239 zmaskR = zmask_from_V128_wide(argR); 3240 } else { 3241 Int tmp; 3242 tmp = edxIN & 0xFFFFFFFF; 3243 if (tmp < -8) tmp = -8; 3244 if (tmp > 8) tmp = 8; 3245 if (tmp < 0) tmp = -tmp; 3246 vassert(tmp >= 0 && tmp <= 8); 3247 zmaskL = (1 << tmp) & 0xFF; 3248 tmp = eaxIN & 0xFFFFFFFF; 3249 if (tmp < -8) tmp = -8; 3250 if (tmp > 8) tmp = 8; 3251 if (tmp < 0) tmp = -tmp; 3252 vassert(tmp >= 0 && tmp <= 8); 3253 zmaskR = (1 << tmp) & 0xFF; 3254 } 3255 // do the math 3256 ok = compute_PCMPxSTRx_wide ( 3257 &resV, &resOSZACP, argL, argR, 3258 zmaskL, zmaskR, imm8, (Bool)isxSTRM 3259 ); 3260 } else { 3261 if (isISTRx) { 3262 zmaskL = zmask_from_V128(argL); 3263 zmaskR = zmask_from_V128(argR); 3264 } else { 3265 Int tmp; 3266 tmp = edxIN & 0xFFFFFFFF; 3267 if (tmp < -16) tmp = -16; 3268 if (tmp > 16) tmp = 16; 3269 if (tmp < 0) tmp = -tmp; 3270 vassert(tmp >= 0 && tmp <= 16); 3271 zmaskL = (1 << tmp) & 0xFFFF; 3272 tmp = eaxIN & 0xFFFFFFFF; 3273 if (tmp < -16) tmp = -16; 3274 if (tmp > 16) tmp = 16; 3275 if (tmp < 0) tmp = -tmp; 3276 vassert(tmp >= 0 && tmp <= 16); 3277 zmaskR = (1 << tmp) & 0xFFFF; 3278 } 3279 // do the math 3280 ok = compute_PCMPxSTRx ( 3281 &resV, &resOSZACP, argL, argR, 3282 zmaskL, zmaskR, imm8, (Bool)isxSTRM 3283 ); 3284 } 3285 3286 // front end shouldn't pass us any imm8 variants we can't 3287 // handle. Hence: 3288 vassert(ok); 3289 3290 // So, finally we need to get the results back to the caller. 3291 // In all cases, the new OSZACP value is the lowest 16 bits of 3292 // the return value. 
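// (0x8D5 selects exactly the O, S, Z, A, P and C bits, at their
// native rflags positions 11, 7, 6, 4, 2 and 0 respectively.)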
3293 if (isxSTRM) { 3294 gst->guest_YMM0[0] = resV.w32[0]; 3295 gst->guest_YMM0[1] = resV.w32[1]; 3296 gst->guest_YMM0[2] = resV.w32[2]; 3297 gst->guest_YMM0[3] = resV.w32[3]; 3298 return resOSZACP & 0x8D5; 3299 } else { 3300 UInt newECX = resV.w32[0] & 0xFFFF; 3301 return (newECX << 16) | (resOSZACP & 0x8D5); 3302 } 3303 } 3304 3305 /*---------------------------------------------------------------*/ 3306 /*--- AES primitives and helpers ---*/ 3307 /*---------------------------------------------------------------*/ 3308 /* a 16 x 16 matrix */ 3309 static const UChar sbox[256] = { // row nr 3310 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1 3311 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 3312 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2 3313 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 3314 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3 3315 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 3316 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4 3317 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 3318 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5 3319 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 3320 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6 3321 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 3322 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7 3323 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 3324 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8 3325 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 3326 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9 3327 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 3328 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10 3329 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 3330 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11 3331 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 3332 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12 3333 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 3334 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13 3335 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 3336 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14 3337 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 3338 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15 3339 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 3340 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16 3341 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 3342 }; 3343 static void SubBytes (V128* v) 3344 { 3345 V128 r; 3346 UInt i; 3347 for (i = 0; i < 16; i++) 3348 r.w8[i] = sbox[v->w8[i]]; 3349 *v = r; 3350 } 3351 3352 /* a 16 x 16 matrix */ 3353 static const UChar invsbox[256] = { // row nr 3354 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1 3355 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 3356 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2 3357 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 3358 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3 3359 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, 3360 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4 3361 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, 3362 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5 3363 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 3364 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6 3365 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, 3366 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7 3367 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, 3368 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8 3369 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 3370 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 
// 9 3371 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, 3372 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10 3373 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, 3374 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11 3375 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 3376 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12 3377 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, 3378 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13 3379 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, 3380 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14 3381 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 3382 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15 3383 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 3384 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16 3385 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d 3386 }; 3387 static void InvSubBytes (V128* v) 3388 { 3389 V128 r; 3390 UInt i; 3391 for (i = 0; i < 16; i++) 3392 r.w8[i] = invsbox[v->w8[i]]; 3393 *v = r; 3394 } 3395 3396 static const UChar ShiftRows_op[16] = 3397 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0}; 3398 static void ShiftRows (V128* v) 3399 { 3400 V128 r; 3401 UInt i; 3402 for (i = 0; i < 16; i++) 3403 r.w8[i] = v->w8[ShiftRows_op[15-i]]; 3404 *v = r; 3405 } 3406 3407 static const UChar InvShiftRows_op[16] = 3408 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0}; 3409 static void InvShiftRows (V128* v) 3410 { 3411 V128 r; 3412 UInt i; 3413 for (i = 0; i < 16; i++) 3414 r.w8[i] = v->w8[InvShiftRows_op[15-i]]; 3415 *v = r; 3416 } 3417 3418 /* Multiplication of the finite fields elements of AES. 3419 See "A Specification for The AES Algorithm Rijndael 3420 (by Joan Daemen & Vincent Rijmen)" 3421 Dr. Brian Gladman, v3.1, 3rd March 2001. */ 3422 /* N values so that (hex) xy = 0x03^N. 3423 0x00 cannot be used. 
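(Nxy[x] is thus the discrete logarithm of x to base 0x03 in GF(2^8), and the logarithm of zero is undefined.)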
We put 0xff for this value.*/ 3424 /* a 16 x 16 matrix */ 3425 static const UChar Nxy[256] = { // row nr 3426 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1 3427 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, 3428 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2 3429 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, 3430 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3 3431 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, 3432 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4 3433 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, 3434 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5 3435 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, 3436 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6 3437 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, 3438 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7 3439 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, 3440 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8 3441 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, 3442 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9 3443 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, 3444 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10 3445 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, 3446 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11 3447 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, 3448 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12 3449 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, 3450 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13 3451 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, 3452 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14 3453 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, 3454 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15 3455 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, 3456 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16 3457 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07 3458 }; 3459 3460 /* E values so that E = 0x03^xy. 
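Exy is the matching antilog table: Exy[Nxy[x]] == x for nonzero x, so ff_mul below multiplies in GF(2^8) by computing Exy[(Nxy[u1] + Nxy[u2]) mod 255].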
*/ 3461 static const UChar Exy[256] = { // row nr 3462 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1 3463 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35, 3464 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2 3465 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa, 3466 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3 3467 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31, 3468 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4 3469 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd, 3470 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5 3471 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88, 3472 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6 3473 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a, 3474 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7 3475 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3, 3476 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8 3477 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0, 3478 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9 3479 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41, 3480 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10 3481 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75, 3482 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11 3483 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80, 3484 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12 3485 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54, 3486 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13 3487 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca, 3488 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14 3489 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e, 3490 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15 3491 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17, 3492 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16 3493 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01}; 3494 3495 static inline UChar ff_mul(UChar u1, UChar u2) 3496 { 3497 if ((u1 > 0) && (u2 > 0)) { 3498 UInt ui = Nxy[u1] + Nxy[u2]; 3499 if (ui >= 255) 3500 ui = ui - 255; 3501 return Exy[ui]; 3502 } else { 3503 return 0; 3504 }; 3505 } 3506 3507 static void MixColumns (V128* v) 3508 { 3509 V128 r; 3510 Int j; 3511 #define P(x,row,col) (x)->w8[((row)*4+(col))] 3512 for (j = 0; j < 4; j++) { 3513 P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1)) 3514 ^ P(v,j,2) ^ P(v,j,3); 3515 P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) ) 3516 ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3); 3517 P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) ) 3518 ^ ff_mul(0x03, P(v,j,3) ); 3519 P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2) 3520 ^ ff_mul( 0x02, P(v,j,3) ); 3521 } 3522 *v = r; 3523 #undef P 3524 } 3525 3526 static void InvMixColumns (V128* v) 3527 { 3528 V128 r; 3529 Int j; 3530 #define P(x,row,col) (x)->w8[((row)*4+(col))] 3531 for (j = 0; j < 4; j++) { 3532 P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) ) 3533 ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) ); 3534 P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) ) 3535 ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) ); 3536 P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) ) 3537 ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) ); 3538 P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) ) 3539 ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) ); 3540 } 3541 *v = r; 3542 #undef P 3543 3544 } 3545 3546 /* For description, see definition in guest_amd64_defs.h */ 3547 void amd64g_dirtyhelper_AES ( 3548 VexGuestAMD64State* gst, 3549 HWord opc4, HWord gstOffD, 3550 HWord gstOffL, HWord gstOffR 3551 ) 

/* For description, see definition in guest_amd64_defs.h */
void amd64g_dirtyhelper_AES (
          VexGuestAMD64State* gst,
          HWord opc4, HWord gstOffD,
          HWord gstOffL, HWord gstOffR
       )
{
   // where the args are
   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   V128  r;

   switch (opc4) {
      case 0xDC: /* AESENC */
      case 0xDD: /* AESENCLAST */
         r = *argR;
         ShiftRows (&r);
         SubBytes (&r);
         if (opc4 == 0xDC)
            MixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDE: /* AESDEC */
      case 0xDF: /* AESDECLAST */
         r = *argR;
         InvShiftRows (&r);
         InvSubBytes (&r);
         if (opc4 == 0xDE)
            InvMixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDB: /* AESIMC */
         *argD = *argL;
         InvMixColumns (argD);
         break;
      default: vassert(0);
   }
}

static inline UInt RotWord (UInt w32)
{
   return ((w32 >> 8) | (w32 << 24));
}

static inline UInt SubWord (UInt w32)
{
   UChar* w8;
   UChar* r8;
   UInt   res;
   w8 = (UChar*) &w32;
   r8 = (UChar*) &res;
   r8[0] = sbox[w8[0]];
   r8[1] = sbox[w8[1]];
   r8[2] = sbox[w8[2]];
   r8[3] = sbox[w8[3]];
   return res;
}

/* For description, see definition in guest_amd64_defs.h */
extern void amd64g_dirtyhelper_AESKEYGENASSIST (
          VexGuestAMD64State* gst,
          HWord imm8,
          HWord gstOffL, HWord gstOffR
       )
{
   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   V128  r;

   /* Compute the result into a temporary first: the source and
      destination may be the same guest register (argL == argR), and
      writing argR directly would corrupt the later reads of argL. */
   r.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   r.w32[2] = SubWord (argL->w32[3]);
   r.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   r.w32[0] = SubWord (argL->w32[1]);
   *argR = r;
}
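
/* The AES dirty helpers above operate entirely on the guest state:
   toIR-generated code places the operands in guest YMM slots and passes
   their byte offsets plus the low opcode byte.  The block below is a
   hedged, standalone illustration of that calling convention, not
   compiled into VEX; the function name is hypothetical and the choice
   of YMM1/YMM2/YMM3 is arbitrary (real calls are constructed by
   guest_amd64_toIR.c). */
#if 0
static void aesenc_once_example ( VexGuestAMD64State* gst,
                                  const V128* roundkey,
                                  const V128* state_in,
                                  /*OUT*/V128* state_out )
{
   HWord offD = offsetof(VexGuestAMD64State, guest_YMM1);
   HWord offL = offsetof(VexGuestAMD64State, guest_YMM2);
   HWord offR = offsetof(VexGuestAMD64State, guest_YMM3);
   /* Place the round key (L) and the current state (R) in the guest
      state, then ask for one AESENC round (opcode byte 0xDC). */
   *(V128*)( ((UChar*)gst) + offL ) = *roundkey;
   *(V128*)( ((UChar*)gst) + offR ) = *state_in;
   amd64g_dirtyhelper_AES( gst, 0xDC, offD, offL, offR );
   /* The result lands in the destination slot (D). */
   *state_out = *(V128*)( ((UChar*)gst) + offD );
}
#endif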


/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing,               ---*/
/*--- guest state as a whole.                                 ---*/
/*---------------------------------------------------------------*/

/* Initialise the entire amd64 guest state. */
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
{
   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER  = 0;
   vex_state->pad0 = 0;

   vex_state->guest_RAX = 0;
   vex_state->guest_RCX = 0;
   vex_state->guest_RDX = 0;
   vex_state->guest_RBX = 0;
   vex_state->guest_RSP = 0;
   vex_state->guest_RBP = 0;
   vex_state->guest_RSI = 0;
   vex_state->guest_RDI = 0;
   vex_state->guest_R8  = 0;
   vex_state->guest_R9  = 0;
   vex_state->guest_R10 = 0;
   vex_state->guest_R11 = 0;
   vex_state->guest_R12 = 0;
   vex_state->guest_R13 = 0;
   vex_state->guest_R14 = 0;
   vex_state->guest_R15 = 0;

   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = 0;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;

   vex_state->guest_DFLAG  = 1; /* forwards */
   vex_state->guest_IDFLAG = 0;

   /* HACK: represent the offset associated with %fs==0.  This
      assumes that %fs is only ever zero. */
   vex_state->guest_FS_ZERO = 0;

   vex_state->guest_RIP = 0;

   /* Initialise the simulated FPU */
   amd64g_dirtyhelper_FINIT( vex_state );

   /* Initialise the AVX state. */
#  define AVXZERO(_ymm) \
      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
      } while (0)
   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   AVXZERO(vex_state->guest_YMM0);
   AVXZERO(vex_state->guest_YMM1);
   AVXZERO(vex_state->guest_YMM2);
   AVXZERO(vex_state->guest_YMM3);
   AVXZERO(vex_state->guest_YMM4);
   AVXZERO(vex_state->guest_YMM5);
   AVXZERO(vex_state->guest_YMM6);
   AVXZERO(vex_state->guest_YMM7);
   AVXZERO(vex_state->guest_YMM8);
   AVXZERO(vex_state->guest_YMM9);
   AVXZERO(vex_state->guest_YMM10);
   AVXZERO(vex_state->guest_YMM11);
   AVXZERO(vex_state->guest_YMM12);
   AVXZERO(vex_state->guest_YMM13);
   AVXZERO(vex_state->guest_YMM14);
   AVXZERO(vex_state->guest_YMM15);
   AVXZERO(vex_state->guest_YMM16);

#  undef AVXZERO

   vex_state->guest_EMWARN = EmWarn_NONE;

   /* These should never be either read or written, but we
      initialise them anyway. */
   vex_state->guest_TISTART = 0;
   vex_state->guest_TILEN   = 0;

   vex_state->guest_NRADDR   = 0;
   vex_state->guest_SC_CLASS = 0;
   vex_state->guest_GS_0x60  = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->pad1 = 0;
}


/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest %RSP, %RBP and %RIP
   only.  These are the minimum needed to extract correct stack
   backtraces from amd64 code.
*/
Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
                                                   Int maxoff )
{
   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   Int rbp_max = rbp_min + 8 - 1;
   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   Int rsp_max = rsp_min + 8 - 1;
   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   Int rip_max = rip_min + 8 - 1;

   if (maxoff < rbp_min || minoff > rbp_max) {
      /* no overlap with rbp */
   } else {
      return True;
   }

   if (maxoff < rsp_min || minoff > rsp_max) {
      /* no overlap with rsp */
   } else {
      return True;
   }

   if (maxoff < rip_min || minoff > rip_max) {
      /* no overlap with rip */
   } else {
      return True;
   }

   return False;
}
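
/* To make the contract concrete: any offset range that touches %RBP,
   %RSP or %RIP answers True, everything else False.  The block below
   is a hedged, standalone illustration, not compiled into VEX, with a
   hypothetical function name. */
#if 0
#include <assert.h>
static void precise_exns_examples ( void )
{
   Int rax = offsetof(VexGuestAMD64State, guest_RAX);
   Int rsp = offsetof(VexGuestAMD64State, guest_RSP);
   /* A store entirely inside %RAX does not need precise exceptions. */
   assert( !guest_amd64_state_requires_precise_mem_exns(rax, rax + 7) );
   /* Any range overlapping %RSP, even by a single byte, does. */
   assert( guest_amd64_state_requires_precise_mem_exns(rsp + 7, rsp + 7) );
}
#endif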


#define ALWAYSDEFD(field)                            \
    { offsetof(VexGuestAMD64State, field),           \
      (sizeof ((VexGuestAMD64State*)0)->field) }

VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /*    */ ALWAYSDEFD(guest_CS),
                 // /*    */ ALWAYSDEFD(guest_DS),
                 // /*    */ ALWAYSDEFD(guest_ES),
                 // /*    */ ALWAYSDEFD(guest_FS),
                 // /*    */ ALWAYSDEFD(guest_GS),
                 // /*    */ ALWAYSDEFD(guest_SS),
                 // /*    */ ALWAYSDEFD(guest_LDT),
                 // /*    */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMWARN),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_TISTART),
                 /* 13 */ ALWAYSDEFD(guest_TILEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };


/*---------------------------------------------------------------*/
/*--- end                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/