1 2 /*---------------------------------------------------------------*/ 3 /*--- begin guest_generic_x87.c ---*/ 4 /*---------------------------------------------------------------*/ 5 6 /* 7 This file is part of Valgrind, a dynamic binary instrumentation 8 framework. 9 10 Copyright (C) 2004-2012 OpenWorks LLP 11 info (at) open-works.net 12 13 This program is free software; you can redistribute it and/or 14 modify it under the terms of the GNU General Public License as 15 published by the Free Software Foundation; either version 2 of the 16 License, or (at your option) any later version. 17 18 This program is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of 20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 General Public License for more details. 22 23 You should have received a copy of the GNU General Public License 24 along with this program; if not, write to the Free Software 25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 26 02110-1301, USA. 27 28 The GNU General Public License is contained in the file COPYING. 29 30 Neither the names of the U.S. Department of Energy nor the 31 University of California nor the names of its contributors may be 32 used to endorse or promote products derived from this software 33 without prior written permission. 34 */ 35 36 /* This file contains functions for doing some x87-specific 37 operations. Both the amd64 and x86 front ends (guests) indirectly 38 call these functions via guest helper calls. By putting them here, 39 code duplication is avoided. Some of these functions are tricky 40 and hard to verify, so there is much to be said for only having one 41 copy thereof. 42 */ 43 44 #include "libvex_basictypes.h" 45 46 #include "main_util.h" 47 #include "guest_generic_x87.h" 48 49 50 /* 80 and 64-bit floating point formats: 51 52 80-bit: 53 54 S 0 0-------0 zero 55 S 0 0X------X denormals 56 S 1-7FFE 1X------X normals (all normals have leading 1) 57 S 7FFF 10------0 infinity 58 S 7FFF 10X-----X snan 59 S 7FFF 11X-----X qnan 60 61 S is the sign bit. For runs X----X, at least one of the Xs must be 62 nonzero. Exponent is 15 bits, fractional part is 63 bits, and 63 there is an explicitly represented leading 1, and a sign bit, 64 giving 80 in total. 65 66 64-bit avoids the confusion of an explicitly represented leading 1 67 and so is simpler: 68 69 S 0 0------0 zero 70 S 0 X------X denormals 71 S 1-7FE any normals 72 S 7FF 0------0 infinity 73 S 7FF 0X-----X snan 74 S 7FF 1X-----X qnan 75 76 Exponent is 11 bits, fractional part is 52 bits, and there is a 77 sign bit, giving 64 in total. 78 */ 79 80 81 static inline UInt read_bit_array ( UChar* arr, UInt n ) 82 { 83 UChar c = arr[n >> 3]; 84 c >>= (n&7); 85 return c & 1; 86 } 87 88 static inline void write_bit_array ( UChar* arr, UInt n, UInt b ) 89 { 90 UChar c = arr[n >> 3]; 91 c = toUChar( c & ~(1 << (n&7)) ); 92 c = toUChar( c | ((b&1) << (n&7)) ); 93 arr[n >> 3] = c; 94 } 95 96 /* Convert an IEEE754 double (64-bit) into an x87 extended double 97 (80-bit), mimicing the hardware fairly closely. Both numbers are 98 stored little-endian. Limitations, all of which could be fixed, 99 given some level of hassle: 100 101 * Identity of NaNs is not preserved. 102 103 See comments in the code for more details. 104 */ 105 void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 ) 106 { 107 Bool mantissaIsZero; 108 Int bexp, i, j, shift; 109 UChar sign; 110 111 sign = toUChar( (f64[7] >> 7) & 1 ); 112 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F); 113 bexp &= 0x7FF; 114 115 mantissaIsZero = False; 116 if (bexp == 0 || bexp == 0x7FF) { 117 /* We'll need to know whether or not the mantissa (bits 51:0) is 118 all zeroes in order to handle these cases. So figure it 119 out. */ 120 mantissaIsZero 121 = toBool( 122 (f64[6] & 0x0F) == 0 123 && f64[5] == 0 && f64[4] == 0 && f64[3] == 0 124 && f64[2] == 0 && f64[1] == 0 && f64[0] == 0 125 ); 126 } 127 128 /* If the exponent is zero, either we have a zero or a denormal. 129 Produce a zero. This is a hack in that it forces denormals to 130 zero. Could do better. */ 131 if (bexp == 0) { 132 f80[9] = toUChar( sign << 7 ); 133 f80[8] = f80[7] = f80[6] = f80[5] = f80[4] 134 = f80[3] = f80[2] = f80[1] = f80[0] = 0; 135 136 if (mantissaIsZero) 137 /* It really is zero, so that's all we can do. */ 138 return; 139 140 /* There is at least one 1-bit in the mantissa. So it's a 141 potentially denormalised double -- but we can produce a 142 normalised long double. Count the leading zeroes in the 143 mantissa so as to decide how much to bump the exponent down 144 by. Note, this is SLOW. */ 145 shift = 0; 146 for (i = 51; i >= 0; i--) { 147 if (read_bit_array(f64, i)) 148 break; 149 shift++; 150 } 151 152 /* and copy into place as many bits as we can get our hands on. */ 153 j = 63; 154 for (i = 51 - shift; i >= 0; i--) { 155 write_bit_array( f80, j, 156 read_bit_array( f64, i ) ); 157 j--; 158 } 159 160 /* Set the exponent appropriately, and we're done. */ 161 bexp -= shift; 162 bexp += (16383 - 1023); 163 f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) ); 164 f80[8] = toUChar( bexp & 0xFF ); 165 return; 166 } 167 168 /* If the exponent is 7FF, this is either an Infinity, a SNaN or 169 QNaN, as determined by examining bits 51:0, thus: 170 0 ... 0 Inf 171 0X ... X SNaN 172 1X ... X QNaN 173 where at least one of the Xs is not zero. 174 */ 175 if (bexp == 0x7FF) { 176 if (mantissaIsZero) { 177 /* Produce an appropriately signed infinity: 178 S 1--1 (15) 1 0--0 (63) 179 */ 180 f80[9] = toUChar( (sign << 7) | 0x7F ); 181 f80[8] = 0xFF; 182 f80[7] = 0x80; 183 f80[6] = f80[5] = f80[4] = f80[3] 184 = f80[2] = f80[1] = f80[0] = 0; 185 return; 186 } 187 /* So it's either a QNaN or SNaN. Distinguish by considering 188 bit 51. Note, this destroys all the trailing bits 189 (identity?) of the NaN. IEEE754 doesn't require preserving 190 these (it only requires that there be one QNaN value and one 191 SNaN value), but x87 does seem to have some ability to 192 preserve them. Anyway, here, the NaN's identity is 193 destroyed. Could be improved. */ 194 if (f64[6] & 8) { 195 /* QNaN. Make a canonical QNaN: 196 S 1--1 (15) 1 1 0--0 (62) 197 */ 198 f80[9] = toUChar( (sign << 7) | 0x7F ); 199 f80[8] = 0xFF; 200 f80[7] = 0xC0; 201 f80[6] = f80[5] = f80[4] = f80[3] 202 = f80[2] = f80[1] = f80[0] = 0x00; 203 } else { 204 /* SNaN. Make a SNaN: 205 S 1--1 (15) 1 0 1--1 (62) 206 */ 207 f80[9] = toUChar( (sign << 7) | 0x7F ); 208 f80[8] = 0xFF; 209 f80[7] = 0xBF; 210 f80[6] = f80[5] = f80[4] = f80[3] 211 = f80[2] = f80[1] = f80[0] = 0xFF; 212 } 213 return; 214 } 215 216 /* It's not a zero, denormal, infinity or nan. So it must be a 217 normalised number. Rebias the exponent and build the new 218 number. */ 219 bexp += (16383 - 1023); 220 221 f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) ); 222 f80[8] = toUChar( bexp & 0xFF ); 223 f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78) 224 | ((f64[5] >> 5) & 7) ); 225 f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) ); 226 f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) ); 227 f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) ); 228 f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) ); 229 f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) ); 230 f80[1] = toUChar( ((f64[0] << 3) & 0xF8) ); 231 f80[0] = toUChar( 0 ); 232 } 233 234 235 /* Convert an x87 extended double (80-bit) into an IEEE 754 double 236 (64-bit), mimicking the hardware fairly closely. Both numbers are 237 stored little-endian. Limitations, both of which could be fixed, 238 given some level of hassle: 239 240 * Rounding following truncation could be a bit better. 241 242 * Identity of NaNs is not preserved. 243 244 See comments in the code for more details. 245 */ 246 void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 ) 247 { 248 Bool isInf; 249 Int bexp, i, j; 250 UChar sign; 251 252 sign = toUChar((f80[9] >> 7) & 1); 253 bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8]; 254 bexp &= 0x7FFF; 255 256 /* If the exponent is zero, either we have a zero or a denormal. 257 But an extended precision denormal becomes a double precision 258 zero, so in either case, just produce the appropriately signed 259 zero. */ 260 if (bexp == 0) { 261 f64[7] = toUChar(sign << 7); 262 f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0; 263 return; 264 } 265 266 /* If the exponent is 7FFF, this is either an Infinity, a SNaN or 267 QNaN, as determined by examining bits 62:0, thus: 268 10 ... 0 Inf 269 10X ... X SNaN 270 11X ... X QNaN 271 where at least one of the Xs is not zero. 272 */ 273 if (bexp == 0x7FFF) { 274 isInf = toBool( 275 (f80[7] & 0x7F) == 0 276 && f80[6] == 0 && f80[5] == 0 && f80[4] == 0 277 && f80[3] == 0 && f80[2] == 0 && f80[1] == 0 278 && f80[0] == 0 279 ); 280 if (isInf) { 281 if (0 == (f80[7] & 0x80)) 282 goto wierd_NaN; 283 /* Produce an appropriately signed infinity: 284 S 1--1 (11) 0--0 (52) 285 */ 286 f64[7] = toUChar((sign << 7) | 0x7F); 287 f64[6] = 0xF0; 288 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0; 289 return; 290 } 291 /* So it's either a QNaN or SNaN. Distinguish by considering 292 bit 61. Note, this destroys all the trailing bits 293 (identity?) of the NaN. IEEE754 doesn't require preserving 294 these (it only requires that there be one QNaN value and one 295 SNaN value), but x87 does seem to have some ability to 296 preserve them. Anyway, here, the NaN's identity is 297 destroyed. Could be improved. */ 298 if (f80[7] & 0x40) { 299 /* QNaN. Make a canonical QNaN: 300 S 1--1 (11) 1 0--0 (51) 301 */ 302 f64[7] = toUChar((sign << 7) | 0x7F); 303 f64[6] = 0xF8; 304 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0x00; 305 } else { 306 /* SNaN. Make a SNaN: 307 S 1--1 (11) 0 1--1 (51) 308 */ 309 f64[7] = toUChar((sign << 7) | 0x7F); 310 f64[6] = 0xF7; 311 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF; 312 } 313 return; 314 } 315 316 /* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is 317 zero, the x87 FPU appears to consider the number denormalised 318 and converts it to a QNaN. */ 319 if (0 == (f80[7] & 0x80)) { 320 wierd_NaN: 321 /* Strange hardware QNaN: 322 S 1--1 (11) 1 0--0 (51) 323 */ 324 /* On a PIII, these QNaNs always appear with sign==1. I have 325 no idea why. */ 326 f64[7] = (1 /*sign*/ << 7) | 0x7F; 327 f64[6] = 0xF8; 328 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0; 329 return; 330 } 331 332 /* It's not a zero, denormal, infinity or nan. So it must be a 333 normalised number. Rebias the exponent and consider. */ 334 bexp -= (16383 - 1023); 335 if (bexp >= 0x7FF) { 336 /* It's too big for a double. Construct an infinity. */ 337 f64[7] = toUChar((sign << 7) | 0x7F); 338 f64[6] = 0xF0; 339 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0; 340 return; 341 } 342 343 if (bexp <= 0) { 344 /* It's too small for a normalised double. First construct a 345 zero and then see if it can be improved into a denormal. */ 346 f64[7] = toUChar(sign << 7); 347 f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0; 348 349 if (bexp < -52) 350 /* Too small even for a denormal. */ 351 return; 352 353 /* Ok, let's make a denormal. Note, this is SLOW. */ 354 /* Copy bits 63, 62, 61, etc of the src mantissa into the dst, 355 indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */ 356 /* bexp is in range -52 .. 0 inclusive */ 357 for (i = 63; i >= 0; i--) { 358 j = i - 12 + bexp; 359 if (j < 0) break; 360 /* We shouldn't really call vassert from generated code. */ 361 vassert(j >= 0 && j < 52); 362 write_bit_array ( f64, 363 j, 364 read_bit_array ( f80, i ) ); 365 } 366 /* and now we might have to round ... */ 367 if (read_bit_array(f80, 10+1 - bexp) == 1) 368 goto do_rounding; 369 370 return; 371 } 372 373 /* Ok, it's a normalised number which is representable as a double. 374 Copy the exponent and mantissa into place. */ 375 /* 376 for (i = 0; i < 52; i++) 377 write_bit_array ( f64, 378 i, 379 read_bit_array ( f80, i+11 ) ); 380 */ 381 f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) ); 382 f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) ); 383 f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) ); 384 f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) ); 385 f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) ); 386 f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) ); 387 388 f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) ); 389 390 f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) ); 391 392 /* Now consider any rounding that needs to happen as a result of 393 truncating the mantissa. */ 394 if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ { 395 396 /* If the bottom bits of f80 are "100 0000 0000", then the 397 infinitely precise value is deemed to be mid-way between the 398 two closest representable values. Since we're doing 399 round-to-nearest (the default mode), in that case it is the 400 bit immediately above which indicates whether we should round 401 upwards or not -- if 0, we don't. All that is encapsulated 402 in the following simple test. */ 403 if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0) 404 return; 405 406 do_rounding: 407 /* Round upwards. This is a kludge. Once in every 2^24 408 roundings (statistically) the bottom three bytes are all 0xFF 409 and so we don't round at all. Could be improved. */ 410 if (f64[0] != 0xFF) { 411 f64[0]++; 412 } 413 else 414 if (f64[0] == 0xFF && f64[1] != 0xFF) { 415 f64[0] = 0; 416 f64[1]++; 417 } 418 else 419 if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) { 420 f64[0] = 0; 421 f64[1] = 0; 422 f64[2]++; 423 } 424 /* else we don't round, but we should. */ 425 } 426 } 427 428 429 /* CALLED FROM GENERATED CODE: CLEAN HELPER */ 430 /* Extract the signed significand or exponent component as per 431 fxtract. Arg and result are doubles travelling under the guise of 432 ULongs. Returns significand when getExp is zero and exponent 433 otherwise. */ 434 ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp ) 435 { 436 ULong uSig, uExp; 437 /* Long sSig; */ 438 Int sExp, i; 439 UInt sign, expExp; 440 441 /* 442 S 7FF 0------0 infinity 443 S 7FF 0X-----X snan 444 S 7FF 1X-----X qnan 445 */ 446 const ULong posInf = 0x7FF0000000000000ULL; 447 const ULong negInf = 0xFFF0000000000000ULL; 448 const ULong nanMask = 0x7FF0000000000000ULL; 449 const ULong qNan = 0x7FF8000000000000ULL; 450 const ULong posZero = 0x0000000000000000ULL; 451 const ULong negZero = 0x8000000000000000ULL; 452 const ULong bit51 = 1ULL << 51; 453 const ULong bit52 = 1ULL << 52; 454 const ULong sigMask = bit52 - 1; 455 456 /* Mimic Core i5 behaviour for special cases. */ 457 if (arg == posInf) 458 return getExp ? posInf : posInf; 459 if (arg == negInf) 460 return getExp ? posInf : negInf; 461 if ((arg & nanMask) == nanMask) 462 return qNan | (arg & (1ULL << 63)); 463 if (arg == posZero) 464 return getExp ? negInf : posZero; 465 if (arg == negZero) 466 return getExp ? negInf : negZero; 467 468 /* Split into sign, exponent and significand. */ 469 sign = ((UInt)(arg >> 63)) & 1; 470 471 /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */ 472 uSig = arg & sigMask; 473 474 /* Get the exponent. */ 475 sExp = ((Int)(arg >> 52)) & 0x7FF; 476 477 /* Deal with denormals: if the exponent is zero, then the 478 significand cannot possibly be zero (negZero/posZero are handled 479 above). Shift the significand left until bit 51 of it becomes 480 1, and decrease the exponent accordingly. 481 */ 482 if (sExp == 0) { 483 for (i = 0; i < 52; i++) { 484 if (uSig & bit51) 485 break; 486 uSig <<= 1; 487 sExp--; 488 } 489 uSig <<= 1; 490 } else { 491 /* Add the implied leading-1 in the significand. */ 492 uSig |= bit52; 493 } 494 495 /* Roll in the sign. */ 496 /* sSig = uSig; */ 497 /* if (sign) sSig =- sSig; */ 498 499 /* Convert sig into a double. This should be an exact conversion. 500 Then divide by 2^52, which should give a value in the range 1.0 501 to 2.0-epsilon, at least for normalised args. */ 502 /* dSig = (Double)sSig; */ 503 /* dSig /= 67108864.0; */ /* 2^26 */ 504 /* dSig /= 67108864.0; */ /* 2^26 */ 505 uSig &= sigMask; 506 uSig |= 0x3FF0000000000000ULL; 507 if (sign) 508 uSig ^= negZero; 509 510 /* Convert exp into a double. Also an exact conversion. */ 511 /* dExp = (Double)(sExp - 1023); */ 512 sExp -= 1023; 513 if (sExp == 0) { 514 uExp = 0; 515 } else { 516 uExp = sExp < 0 ? -sExp : sExp; 517 expExp = 0x3FF +52; 518 /* 1 <= uExp <= 1074 */ 519 /* Skip first 42 iterations of normalisation loop as we know they 520 will always happen */ 521 uExp <<= 42; 522 expExp -= 42; 523 for (i = 0; i < 52-42; i++) { 524 if (uExp & bit52) 525 break; 526 uExp <<= 1; 527 expExp--; 528 } 529 uExp &= sigMask; 530 uExp |= ((ULong)expExp) << 52; 531 if (sExp < 0) uExp ^= negZero; 532 } 533 534 return getExp ? uExp : uSig; 535 } 536 537 538 539 /*---------------------------------------------------------*/ 540 /*--- SSE4.2 PCMP{E,I}STR{I,M} helpers ---*/ 541 /*---------------------------------------------------------*/ 542 543 /* We need the definitions for OSZACP eflags/rflags offsets. 544 #including guest_{amd64,x86}_defs.h causes chaos, so just copy the 545 required values directly. They are not going to change in the 546 foreseeable future :-) 547 */ 548 549 #define SHIFT_O 11 550 #define SHIFT_S 7 551 #define SHIFT_Z 6 552 #define SHIFT_A 4 553 #define SHIFT_C 0 554 #define SHIFT_P 2 555 556 #define MASK_O (1 << SHIFT_O) 557 #define MASK_S (1 << SHIFT_S) 558 #define MASK_Z (1 << SHIFT_Z) 559 #define MASK_A (1 << SHIFT_A) 560 #define MASK_C (1 << SHIFT_C) 561 #define MASK_P (1 << SHIFT_P) 562 563 564 /* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's 565 Delight. */ 566 static UInt clz32 ( UInt x ) 567 { 568 Int y, m, n; 569 y = -(x >> 16); 570 m = (y >> 16) & 16; 571 n = 16 - m; 572 x = x >> m; 573 y = x - 0x100; 574 m = (y >> 16) & 8; 575 n = n + m; 576 x = x << m; 577 y = x - 0x1000; 578 m = (y >> 16) & 4; 579 n = n + m; 580 x = x << m; 581 y = x - 0x4000; 582 m = (y >> 16) & 2; 583 n = n + m; 584 x = x << m; 585 y = x >> 14; 586 m = y & ~(y >> 1); 587 return n + 2 - m; 588 } 589 590 static UInt ctz32 ( UInt x ) 591 { 592 return 32 - clz32((~x) & (x-1)); 593 } 594 595 /* Convert a 4-bit value to a 32-bit value by cloning each bit 8 596 times. There's surely a better way to do this, but I don't know 597 what it is. */ 598 static UInt bits4_to_bytes4 ( UInt bits4 ) 599 { 600 UInt r = 0; 601 r |= (bits4 & 1) ? 0x000000FF : 0; 602 r |= (bits4 & 2) ? 0x0000FF00 : 0; 603 r |= (bits4 & 4) ? 0x00FF0000 : 0; 604 r |= (bits4 & 8) ? 0xFF000000 : 0; 605 return r; 606 } 607 608 609 /* Convert a 2-bit value to a 32-bit value by cloning each bit 16 610 times. There's surely a better way to do this, but I don't know 611 what it is. */ 612 static UInt bits2_to_bytes4 ( UInt bits2 ) 613 { 614 UInt r = 0; 615 r |= (bits2 & 1) ? 0x0000FFFF : 0; 616 r |= (bits2 & 2) ? 0xFFFF0000 : 0; 617 return r; 618 } 619 620 621 /* Given partial results from a pcmpXstrX operation (intRes1, 622 basically), generate an I- or M-format output value, also the new 623 OSZACP flags. */ 624 static 625 void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV, 626 /*OUT*/UInt* resOSZACP, 627 UInt intRes1, 628 UInt zmaskL, UInt zmaskR, 629 UInt validL, 630 UInt pol, UInt idx, 631 Bool isxSTRM ) 632 { 633 vassert((pol >> 2) == 0); 634 vassert((idx >> 1) == 0); 635 636 UInt intRes2 = 0; 637 switch (pol) { 638 case 0: intRes2 = intRes1; break; // pol + 639 case 1: intRes2 = ~intRes1; break; // pol - 640 case 2: intRes2 = intRes1; break; // pol m+ 641 case 3: intRes2 = intRes1 ^ validL; break; // pol m- 642 } 643 intRes2 &= 0xFFFF; 644 645 if (isxSTRM) { 646 647 // generate M-format output (a bit or byte mask in XMM0) 648 if (idx) { 649 resV->w32[0] = bits4_to_bytes4( (intRes2 >> 0) & 0xF ); 650 resV->w32[1] = bits4_to_bytes4( (intRes2 >> 4) & 0xF ); 651 resV->w32[2] = bits4_to_bytes4( (intRes2 >> 8) & 0xF ); 652 resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF ); 653 } else { 654 resV->w32[0] = intRes2 & 0xFFFF; 655 resV->w32[1] = 0; 656 resV->w32[2] = 0; 657 resV->w32[3] = 0; 658 } 659 660 } else { 661 662 // generate I-format output (an index in ECX) 663 // generate ecx value 664 UInt newECX = 0; 665 if (idx) { 666 // index of ms-1-bit 667 newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); 668 } else { 669 // index of ls-1-bit 670 newECX = intRes2 == 0 ? 16 : ctz32(intRes2); 671 } 672 673 resV->w32[0] = newECX; 674 resV->w32[1] = 0; 675 resV->w32[2] = 0; 676 resV->w32[3] = 0; 677 678 } 679 680 // generate new flags, common to all ISTRI and ISTRM cases 681 *resOSZACP // A, P are zero 682 = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 683 | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 684 | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 685 | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] 686 } 687 688 689 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1, 690 basically), generate an I- or M-format output value, also the new 691 OSZACP flags. */ 692 static 693 void compute_PCMPxSTRx_gen_output_wide (/*OUT*/V128* resV, 694 /*OUT*/UInt* resOSZACP, 695 UInt intRes1, 696 UInt zmaskL, UInt zmaskR, 697 UInt validL, 698 UInt pol, UInt idx, 699 Bool isxSTRM ) 700 { 701 vassert((pol >> 2) == 0); 702 vassert((idx >> 1) == 0); 703 704 UInt intRes2 = 0; 705 switch (pol) { 706 case 0: intRes2 = intRes1; break; // pol + 707 case 1: intRes2 = ~intRes1; break; // pol - 708 case 2: intRes2 = intRes1; break; // pol m+ 709 case 3: intRes2 = intRes1 ^ validL; break; // pol m- 710 } 711 intRes2 &= 0xFF; 712 713 if (isxSTRM) { 714 715 // generate M-format output (a bit or byte mask in XMM0) 716 if (idx) { 717 resV->w32[0] = bits2_to_bytes4( (intRes2 >> 0) & 0x3 ); 718 resV->w32[1] = bits2_to_bytes4( (intRes2 >> 2) & 0x3 ); 719 resV->w32[2] = bits2_to_bytes4( (intRes2 >> 4) & 0x3 ); 720 resV->w32[3] = bits2_to_bytes4( (intRes2 >> 6) & 0x3 ); 721 } else { 722 resV->w32[0] = intRes2 & 0xFF; 723 resV->w32[1] = 0; 724 resV->w32[2] = 0; 725 resV->w32[3] = 0; 726 } 727 728 } else { 729 730 // generate I-format output (an index in ECX) 731 // generate ecx value 732 UInt newECX = 0; 733 if (idx) { 734 // index of ms-1-bit 735 newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2)); 736 } else { 737 // index of ls-1-bit 738 newECX = intRes2 == 0 ? 8 : ctz32(intRes2); 739 } 740 741 resV->w32[0] = newECX; 742 resV->w32[1] = 0; 743 resV->w32[2] = 0; 744 resV->w32[3] = 0; 745 746 } 747 748 // generate new flags, common to all ISTRI and ISTRM cases 749 *resOSZACP // A, P are zero 750 = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 751 | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 752 | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 753 | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] 754 } 755 756 757 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} 758 variants on 8-bit data. 759 760 For xSTRI variants, the new ECX value is placed in the 32 bits 761 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM 762 variants, the result is a 128 bit value and is placed at *resV in 763 the obvious way. 764 765 For all variants, the new OSZACP value is placed at *resOSZACP. 766 767 argLV and argRV are the vector args. The caller must prepare a 768 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this 769 must be 1 for each zero byte of of the respective arg. For ESTRx 770 variants this is derived from the explicit length indication, and 771 must be 0 in all places except at the bit index corresponding to 772 the valid length (0 .. 16). If the valid length is 16 then the 773 mask must be all zeroes. In all cases, bits 31:16 must be zero. 774 775 imm8 is the original immediate from the instruction. isSTRM 776 indicates whether this is a xSTRM or xSTRI variant, which controls 777 how much of *res is written. 778 779 If the given imm8 case can be handled, the return value is True. 780 If not, False is returned, and neither *res not *resOSZACP are 781 altered. 782 */ 783 784 Bool compute_PCMPxSTRx ( /*OUT*/V128* resV, 785 /*OUT*/UInt* resOSZACP, 786 V128* argLV, V128* argRV, 787 UInt zmaskL, UInt zmaskR, 788 UInt imm8, Bool isxSTRM ) 789 { 790 vassert(imm8 < 0x80); 791 vassert((zmaskL >> 16) == 0); 792 vassert((zmaskR >> 16) == 0); 793 794 /* Explicitly reject any imm8 values that haven't been validated, 795 even if they would probably work. Life is too short to have 796 unvalidated cases in the code base. */ 797 switch (imm8) { 798 case 0x00: 799 case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12: 800 case 0x1A: case 0x38: case 0x3A: case 0x44: case 0x4A: 801 case 0x46: 802 break; 803 default: 804 return False; 805 } 806 807 UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format 808 UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn 809 UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity 810 UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask 811 812 /*----------------------------------------*/ 813 /*-- strcmp on byte data --*/ 814 /*----------------------------------------*/ 815 816 if (agg == 2/*equal each, aka strcmp*/ 817 && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { 818 Int i; 819 UChar* argL = (UChar*)argLV; 820 UChar* argR = (UChar*)argRV; 821 UInt boolResII = 0; 822 for (i = 15; i >= 0; i--) { 823 UChar cL = argL[i]; 824 UChar cR = argR[i]; 825 boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); 826 } 827 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 828 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 829 830 // do invalidation, common to all equal-each cases 831 UInt intRes1 832 = (boolResII & validL & validR) // if both valid, use cmpres 833 | (~ (validL | validR)); // if both invalid, force 1 834 // else force 0 835 intRes1 &= 0xFFFF; 836 837 // generate I-format output 838 compute_PCMPxSTRx_gen_output( 839 resV, resOSZACP, 840 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 841 ); 842 843 return True; 844 } 845 846 /*----------------------------------------*/ 847 /*-- set membership on byte data --*/ 848 /*----------------------------------------*/ 849 850 if (agg == 0/*equal any, aka find chars in a set*/ 851 && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { 852 /* argL: the string, argR: charset */ 853 UInt si, ci; 854 UChar* argL = (UChar*)argLV; 855 UChar* argR = (UChar*)argRV; 856 UInt boolRes = 0; 857 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 858 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 859 860 for (si = 0; si < 16; si++) { 861 if ((validL & (1 << si)) == 0) 862 // run off the end of the string. 863 break; 864 UInt m = 0; 865 for (ci = 0; ci < 16; ci++) { 866 if ((validR & (1 << ci)) == 0) break; 867 if (argR[ci] == argL[si]) { m = 1; break; } 868 } 869 boolRes |= (m << si); 870 } 871 872 // boolRes is "pre-invalidated" 873 UInt intRes1 = boolRes & 0xFFFF; 874 875 // generate I-format output 876 compute_PCMPxSTRx_gen_output( 877 resV, resOSZACP, 878 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 879 ); 880 881 return True; 882 } 883 884 /*----------------------------------------*/ 885 /*-- substring search on byte data --*/ 886 /*----------------------------------------*/ 887 888 if (agg == 3/*equal ordered, aka substring search*/ 889 && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { 890 891 /* argL: haystack, argR: needle */ 892 UInt ni, hi; 893 UChar* argL = (UChar*)argLV; 894 UChar* argR = (UChar*)argRV; 895 UInt boolRes = 0; 896 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 897 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 898 for (hi = 0; hi < 16; hi++) { 899 UInt m = 1; 900 for (ni = 0; ni < 16; ni++) { 901 if ((validR & (1 << ni)) == 0) break; 902 UInt i = ni + hi; 903 if (i >= 16) break; 904 if (argL[i] != argR[ni]) { m = 0; break; } 905 } 906 boolRes |= (m << hi); 907 if ((validL & (1 << hi)) == 0) 908 // run off the end of the haystack 909 break; 910 } 911 912 // boolRes is "pre-invalidated" 913 UInt intRes1 = boolRes & 0xFFFF; 914 915 // generate I-format output 916 compute_PCMPxSTRx_gen_output( 917 resV, resOSZACP, 918 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 919 ); 920 921 return True; 922 } 923 924 /*----------------------------------------*/ 925 /*-- ranges, unsigned byte data --*/ 926 /*----------------------------------------*/ 927 928 if (agg == 1/*ranges*/ 929 && fmt == 0/*ub*/) { 930 931 /* argL: string, argR: range-pairs */ 932 UInt ri, si; 933 UChar* argL = (UChar*)argLV; 934 UChar* argR = (UChar*)argRV; 935 UInt boolRes = 0; 936 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 937 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 938 for (si = 0; si < 16; si++) { 939 if ((validL & (1 << si)) == 0) 940 // run off the end of the string 941 break; 942 UInt m = 0; 943 for (ri = 0; ri < 16; ri += 2) { 944 if ((validR & (3 << ri)) != (3 << ri)) break; 945 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { 946 m = 1; break; 947 } 948 } 949 boolRes |= (m << si); 950 } 951 952 // boolRes is "pre-invalidated" 953 UInt intRes1 = boolRes & 0xFFFF; 954 955 // generate I-format output 956 compute_PCMPxSTRx_gen_output( 957 resV, resOSZACP, 958 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 959 ); 960 961 return True; 962 } 963 964 /*----------------------------------------*/ 965 /*-- ranges, signed byte data --*/ 966 /*----------------------------------------*/ 967 968 if (agg == 1/*ranges*/ 969 && fmt == 2/*sb*/) { 970 971 /* argL: string, argR: range-pairs */ 972 UInt ri, si; 973 Char* argL = (Char*)argLV; 974 Char* argR = (Char*)argRV; 975 UInt boolRes = 0; 976 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 977 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 978 for (si = 0; si < 16; si++) { 979 if ((validL & (1 << si)) == 0) 980 // run off the end of the string 981 break; 982 UInt m = 0; 983 for (ri = 0; ri < 16; ri += 2) { 984 if ((validR & (3 << ri)) != (3 << ri)) break; 985 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { 986 m = 1; break; 987 } 988 } 989 boolRes |= (m << si); 990 } 991 992 // boolRes is "pre-invalidated" 993 UInt intRes1 = boolRes & 0xFFFF; 994 995 // generate I-format output 996 compute_PCMPxSTRx_gen_output( 997 resV, resOSZACP, 998 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 999 ); 1000 1001 return True; 1002 } 1003 1004 return False; 1005 } 1006 1007 1008 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} 1009 variants on 16-bit characters. 1010 1011 For xSTRI variants, the new ECX value is placed in the 32 bits 1012 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM 1013 variants, the result is a 128 bit value and is placed at *resV in 1014 the obvious way. 1015 1016 For all variants, the new OSZACP value is placed at *resOSZACP. 1017 1018 argLV and argRV are the vector args. The caller must prepare a 1019 8-bit mask for each, zmaskL and zmaskR. For ISTRx variants this 1020 must be 1 for each zero byte of of the respective arg. For ESTRx 1021 variants this is derived from the explicit length indication, and 1022 must be 0 in all places except at the bit index corresponding to 1023 the valid length (0 .. 8). If the valid length is 8 then the 1024 mask must be all zeroes. In all cases, bits 31:8 must be zero. 1025 1026 imm8 is the original immediate from the instruction. isSTRM 1027 indicates whether this is a xSTRM or xSTRI variant, which controls 1028 how much of *res is written. 1029 1030 If the given imm8 case can be handled, the return value is True. 1031 If not, False is returned, and neither *res not *resOSZACP are 1032 altered. 1033 */ 1034 1035 Bool compute_PCMPxSTRx_wide ( /*OUT*/V128* resV, 1036 /*OUT*/UInt* resOSZACP, 1037 V128* argLV, V128* argRV, 1038 UInt zmaskL, UInt zmaskR, 1039 UInt imm8, Bool isxSTRM ) 1040 { 1041 vassert(imm8 < 0x80); 1042 vassert((zmaskL >> 8) == 0); 1043 vassert((zmaskR >> 8) == 0); 1044 1045 /* Explicitly reject any imm8 values that haven't been validated, 1046 even if they would probably work. Life is too short to have 1047 unvalidated cases in the code base. */ 1048 switch (imm8) { 1049 case 0x01: 1050 case 0x03: case 0x09: case 0x0B: case 0x0D: case 0x13: 1051 case 0x1B: case 0x39: case 0x3B: case 0x45: case 0x4B: 1052 break; 1053 default: 1054 return False; 1055 } 1056 1057 UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format 1058 UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn 1059 UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity 1060 UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask 1061 1062 /*----------------------------------------*/ 1063 /*-- strcmp on wide data --*/ 1064 /*----------------------------------------*/ 1065 1066 if (agg == 2/*equal each, aka strcmp*/ 1067 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) { 1068 Int i; 1069 UShort* argL = (UShort*)argLV; 1070 UShort* argR = (UShort*)argRV; 1071 UInt boolResII = 0; 1072 for (i = 7; i >= 0; i--) { 1073 UShort cL = argL[i]; 1074 UShort cR = argR[i]; 1075 boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); 1076 } 1077 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 1078 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 1079 1080 // do invalidation, common to all equal-each cases 1081 UInt intRes1 1082 = (boolResII & validL & validR) // if both valid, use cmpres 1083 | (~ (validL | validR)); // if both invalid, force 1 1084 // else force 0 1085 intRes1 &= 0xFF; 1086 1087 // generate I-format output 1088 compute_PCMPxSTRx_gen_output_wide( 1089 resV, resOSZACP, 1090 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 1091 ); 1092 1093 return True; 1094 } 1095 1096 /*----------------------------------------*/ 1097 /*-- set membership on wide data --*/ 1098 /*----------------------------------------*/ 1099 1100 if (agg == 0/*equal any, aka find chars in a set*/ 1101 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) { 1102 /* argL: the string, argR: charset */ 1103 UInt si, ci; 1104 UShort* argL = (UShort*)argLV; 1105 UShort* argR = (UShort*)argRV; 1106 UInt boolRes = 0; 1107 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 1108 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 1109 1110 for (si = 0; si < 8; si++) { 1111 if ((validL & (1 << si)) == 0) 1112 // run off the end of the string. 1113 break; 1114 UInt m = 0; 1115 for (ci = 0; ci < 8; ci++) { 1116 if ((validR & (1 << ci)) == 0) break; 1117 if (argR[ci] == argL[si]) { m = 1; break; } 1118 } 1119 boolRes |= (m << si); 1120 } 1121 1122 // boolRes is "pre-invalidated" 1123 UInt intRes1 = boolRes & 0xFF; 1124 1125 // generate I-format output 1126 compute_PCMPxSTRx_gen_output_wide( 1127 resV, resOSZACP, 1128 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 1129 ); 1130 1131 return True; 1132 } 1133 1134 /*----------------------------------------*/ 1135 /*-- substring search on wide data --*/ 1136 /*----------------------------------------*/ 1137 1138 if (agg == 3/*equal ordered, aka substring search*/ 1139 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) { 1140 1141 /* argL: haystack, argR: needle */ 1142 UInt ni, hi; 1143 UShort* argL = (UShort*)argLV; 1144 UShort* argR = (UShort*)argRV; 1145 UInt boolRes = 0; 1146 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 1147 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 1148 for (hi = 0; hi < 8; hi++) { 1149 UInt m = 1; 1150 for (ni = 0; ni < 8; ni++) { 1151 if ((validR & (1 << ni)) == 0) break; 1152 UInt i = ni + hi; 1153 if (i >= 8) break; 1154 if (argL[i] != argR[ni]) { m = 0; break; } 1155 } 1156 boolRes |= (m << hi); 1157 if ((validL & (1 << hi)) == 0) 1158 // run off the end of the haystack 1159 break; 1160 } 1161 1162 // boolRes is "pre-invalidated" 1163 UInt intRes1 = boolRes & 0xFF; 1164 1165 // generate I-format output 1166 compute_PCMPxSTRx_gen_output_wide( 1167 resV, resOSZACP, 1168 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 1169 ); 1170 1171 return True; 1172 } 1173 1174 /*----------------------------------------*/ 1175 /*-- ranges, unsigned wide data --*/ 1176 /*----------------------------------------*/ 1177 1178 if (agg == 1/*ranges*/ 1179 && fmt == 1/*uw*/) { 1180 1181 /* argL: string, argR: range-pairs */ 1182 UInt ri, si; 1183 UShort* argL = (UShort*)argLV; 1184 UShort* argR = (UShort*)argRV; 1185 UInt boolRes = 0; 1186 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) 1187 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) 1188 for (si = 0; si < 8; si++) { 1189 if ((validL & (1 << si)) == 0) 1190 // run off the end of the string 1191 break; 1192 UInt m = 0; 1193 for (ri = 0; ri < 8; ri += 2) { 1194 if ((validR & (3 << ri)) != (3 << ri)) break; 1195 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { 1196 m = 1; break; 1197 } 1198 } 1199 boolRes |= (m << si); 1200 } 1201 1202 // boolRes is "pre-invalidated" 1203 UInt intRes1 = boolRes & 0xFF; 1204 1205 // generate I-format output 1206 compute_PCMPxSTRx_gen_output_wide( 1207 resV, resOSZACP, 1208 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM 1209 ); 1210 1211 return True; 1212 } 1213 1214 return False; 1215 } 1216 1217 1218 /*---------------------------------------------------------------*/ 1219 /*--- end guest_generic_x87.c ---*/ 1220 /*---------------------------------------------------------------*/ 1221