1 /* 2 * QEMU float support 3 * 4 * Derived from SoftFloat. 5 */ 6 7 /*============================================================================ 8 9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 10 Package, Release 2b. 11 12 Written by John R. Hauser. This work was made possible in part by the 13 International Computer Science Institute, located at Suite 600, 1947 Center 14 Street, Berkeley, California 94704. Funding was partially provided by the 15 National Science Foundation under grant MIP-9311980. The original version 16 of this code was written as part of a project to build a fixed-point vector 17 processor in collaboration with the University of California at Berkeley, 18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 20 arithmetic/SoftFloat.html'. 21 22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 30 31 Derivative works are acceptable, even for commercial purposes, so long as 32 (1) the source code for the derivative work includes prominent notice that 33 the work is derivative, and (2) the source code includes prominent notice with 34 these four paragraphs for those parts of this code that are retained. 35 36 =============================================================================*/ 37 38 #include "softfloat.h" 39 40 /*---------------------------------------------------------------------------- 41 | Primitive arithmetic functions, including multi-word arithmetic, and 42 | division and square root approximations. (Can be specialized to target if 43 | desired.) 44 *----------------------------------------------------------------------------*/ 45 #include "softfloat-macros.h" 46 47 /*---------------------------------------------------------------------------- 48 | Functions and definitions to determine: (1) whether tininess for underflow 49 | is detected before or after rounding by default, (2) what (if anything) 50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 52 | are propagated from function inputs to output. These details are target- 53 | specific. 54 *----------------------------------------------------------------------------*/ 55 #include "softfloat-specialize.h" 56 57 void set_float_rounding_mode(int val STATUS_PARAM) 58 { 59 STATUS(float_rounding_mode) = val; 60 } 61 62 void set_float_exception_flags(int val STATUS_PARAM) 63 { 64 STATUS(float_exception_flags) = val; 65 } 66 67 #ifdef FLOATX80 68 void set_floatx80_rounding_precision(int val STATUS_PARAM) 69 { 70 STATUS(floatx80_rounding_precision) = val; 71 } 72 #endif 73 74 /*---------------------------------------------------------------------------- 75 | Returns the fraction bits of the half-precision floating-point value `a'. 76 *----------------------------------------------------------------------------*/ 77 78 INLINE uint32_t extractFloat16Frac(float16 a) 79 { 80 return float16_val(a) & 0x3ff; 81 } 82 83 /*---------------------------------------------------------------------------- 84 | Returns the exponent bits of the half-precision floating-point value `a'. 85 *----------------------------------------------------------------------------*/ 86 87 INLINE int16 extractFloat16Exp(float16 a) 88 { 89 return (float16_val(a) >> 10) & 0x1f; 90 } 91 92 /*---------------------------------------------------------------------------- 93 | Returns the sign bit of the single-precision floating-point value `a'. 94 *----------------------------------------------------------------------------*/ 95 96 INLINE flag extractFloat16Sign(float16 a) 97 { 98 return float16_val(a)>>15; 99 } 100 101 /*---------------------------------------------------------------------------- 102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 103 | and 7, and returns the properly rounded 32-bit integer corresponding to the 104 | input. If `zSign' is 1, the input is negated before being converted to an 105 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 106 | is simply rounded to an integer, with the inexact exception raised if the 107 | input cannot be represented exactly as an integer. However, if the fixed- 108 | point input is too large, the invalid exception is raised and the largest 109 | positive or negative integer is returned. 110 *----------------------------------------------------------------------------*/ 111 112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM) 113 { 114 int8 roundingMode; 115 flag roundNearestEven; 116 int8 roundIncrement, roundBits; 117 int32 z; 118 119 roundingMode = STATUS(float_rounding_mode); 120 roundNearestEven = ( roundingMode == float_round_nearest_even ); 121 roundIncrement = 0x40; 122 if ( ! roundNearestEven ) { 123 if ( roundingMode == float_round_to_zero ) { 124 roundIncrement = 0; 125 } 126 else { 127 roundIncrement = 0x7F; 128 if ( zSign ) { 129 if ( roundingMode == float_round_up ) roundIncrement = 0; 130 } 131 else { 132 if ( roundingMode == float_round_down ) roundIncrement = 0; 133 } 134 } 135 } 136 roundBits = absZ & 0x7F; 137 absZ = ( absZ + roundIncrement )>>7; 138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 139 z = absZ; 140 if ( zSign ) z = - z; 141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 142 float_raise( float_flag_invalid STATUS_VAR); 143 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 144 } 145 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 146 return z; 147 148 } 149 150 /*---------------------------------------------------------------------------- 151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 152 | `absZ1', with binary point between bits 63 and 64 (between the input words), 153 | and returns the properly rounded 64-bit integer corresponding to the input. 154 | If `zSign' is 1, the input is negated before being converted to an integer. 155 | Ordinarily, the fixed-point input is simply rounded to an integer, with 156 | the inexact exception raised if the input cannot be represented exactly as 157 | an integer. However, if the fixed-point input is too large, the invalid 158 | exception is raised and the largest positive or negative integer is 159 | returned. 160 *----------------------------------------------------------------------------*/ 161 162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM) 163 { 164 int8 roundingMode; 165 flag roundNearestEven, increment; 166 int64 z; 167 168 roundingMode = STATUS(float_rounding_mode); 169 roundNearestEven = ( roundingMode == float_round_nearest_even ); 170 increment = ( (int64_t) absZ1 < 0 ); 171 if ( ! roundNearestEven ) { 172 if ( roundingMode == float_round_to_zero ) { 173 increment = 0; 174 } 175 else { 176 if ( zSign ) { 177 increment = ( roundingMode == float_round_down ) && absZ1; 178 } 179 else { 180 increment = ( roundingMode == float_round_up ) && absZ1; 181 } 182 } 183 } 184 if ( increment ) { 185 ++absZ0; 186 if ( absZ0 == 0 ) goto overflow; 187 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 188 } 189 z = absZ0; 190 if ( zSign ) z = - z; 191 if ( z && ( ( z < 0 ) ^ zSign ) ) { 192 overflow: 193 float_raise( float_flag_invalid STATUS_VAR); 194 return 195 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 196 : LIT64( 0x7FFFFFFFFFFFFFFF ); 197 } 198 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 199 return z; 200 201 } 202 203 /*---------------------------------------------------------------------------- 204 | Returns the fraction bits of the single-precision floating-point value `a'. 205 *----------------------------------------------------------------------------*/ 206 207 INLINE uint32_t extractFloat32Frac( float32 a ) 208 { 209 210 return float32_val(a) & 0x007FFFFF; 211 212 } 213 214 /*---------------------------------------------------------------------------- 215 | Returns the exponent bits of the single-precision floating-point value `a'. 216 *----------------------------------------------------------------------------*/ 217 218 INLINE int16 extractFloat32Exp( float32 a ) 219 { 220 221 return ( float32_val(a)>>23 ) & 0xFF; 222 223 } 224 225 /*---------------------------------------------------------------------------- 226 | Returns the sign bit of the single-precision floating-point value `a'. 227 *----------------------------------------------------------------------------*/ 228 229 INLINE flag extractFloat32Sign( float32 a ) 230 { 231 232 return float32_val(a)>>31; 233 234 } 235 236 /*---------------------------------------------------------------------------- 237 | If `a' is denormal and we are in flush-to-zero mode then set the 238 | input-denormal exception and return zero. Otherwise just return the value. 239 *----------------------------------------------------------------------------*/ 240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM) 241 { 242 if (STATUS(flush_inputs_to_zero)) { 243 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 244 float_raise(float_flag_input_denormal STATUS_VAR); 245 return make_float32(float32_val(a) & 0x80000000); 246 } 247 } 248 return a; 249 } 250 251 /*---------------------------------------------------------------------------- 252 | Normalizes the subnormal single-precision floating-point value represented 253 | by the denormalized significand `aSig'. The normalized exponent and 254 | significand are stored at the locations pointed to by `zExpPtr' and 255 | `zSigPtr', respectively. 256 *----------------------------------------------------------------------------*/ 257 258 static void 259 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr ) 260 { 261 int8 shiftCount; 262 263 shiftCount = countLeadingZeros32( aSig ) - 8; 264 *zSigPtr = aSig<<shiftCount; 265 *zExpPtr = 1 - shiftCount; 266 267 } 268 269 /*---------------------------------------------------------------------------- 270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 271 | single-precision floating-point value, returning the result. After being 272 | shifted into the proper positions, the three fields are simply added 273 | together to form the result. This means that any integer portion of `zSig' 274 | will be added into the exponent. Since a properly normalized significand 275 | will have an integer portion equal to 1, the `zExp' input should be 1 less 276 | than the desired result exponent whenever `zSig' is a complete, normalized 277 | significand. 278 *----------------------------------------------------------------------------*/ 279 280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig ) 281 { 282 283 return make_float32( 284 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 285 286 } 287 288 /*---------------------------------------------------------------------------- 289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 290 | and significand `zSig', and returns the proper single-precision floating- 291 | point value corresponding to the abstract input. Ordinarily, the abstract 292 | value is simply rounded and packed into the single-precision format, with 293 | the inexact exception raised if the abstract input cannot be represented 294 | exactly. However, if the abstract value is too large, the overflow and 295 | inexact exceptions are raised and an infinity or maximal finite value is 296 | returned. If the abstract value is too small, the input value is rounded to 297 | a subnormal number, and the underflow and inexact exceptions are raised if 298 | the abstract input cannot be represented exactly as a subnormal single- 299 | precision floating-point number. 300 | The input significand `zSig' has its binary point between bits 30 301 | and 29, which is 7 bits to the left of the usual location. This shifted 302 | significand must be normalized or smaller. If `zSig' is not normalized, 303 | `zExp' must be 0; in that case, the result returned is a subnormal number, 304 | and it must not require rounding. In the usual case that `zSig' is 305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 306 | The handling of underflow and overflow follows the IEC/IEEE Standard for 307 | Binary Floating-Point Arithmetic. 308 *----------------------------------------------------------------------------*/ 309 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM) 311 { 312 int8 roundingMode; 313 flag roundNearestEven; 314 int8 roundIncrement, roundBits; 315 flag isTiny; 316 317 roundingMode = STATUS(float_rounding_mode); 318 roundNearestEven = ( roundingMode == float_round_nearest_even ); 319 roundIncrement = 0x40; 320 if ( ! roundNearestEven ) { 321 if ( roundingMode == float_round_to_zero ) { 322 roundIncrement = 0; 323 } 324 else { 325 roundIncrement = 0x7F; 326 if ( zSign ) { 327 if ( roundingMode == float_round_up ) roundIncrement = 0; 328 } 329 else { 330 if ( roundingMode == float_round_down ) roundIncrement = 0; 331 } 332 } 333 } 334 roundBits = zSig & 0x7F; 335 if ( 0xFD <= (uint16_t) zExp ) { 336 if ( ( 0xFD < zExp ) 337 || ( ( zExp == 0xFD ) 338 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 339 ) { 340 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 341 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 342 } 343 if ( zExp < 0 ) { 344 if (STATUS(flush_to_zero)) { 345 float_raise(float_flag_output_denormal STATUS_VAR); 346 return packFloat32(zSign, 0, 0); 347 } 348 isTiny = 349 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 350 || ( zExp < -1 ) 351 || ( zSig + roundIncrement < 0x80000000 ); 352 shift32RightJamming( zSig, - zExp, &zSig ); 353 zExp = 0; 354 roundBits = zSig & 0x7F; 355 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 356 } 357 } 358 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 359 zSig = ( zSig + roundIncrement )>>7; 360 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 361 if ( zSig == 0 ) zExp = 0; 362 return packFloat32( zSign, zExp, zSig ); 363 364 } 365 366 /*---------------------------------------------------------------------------- 367 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 368 | and significand `zSig', and returns the proper single-precision floating- 369 | point value corresponding to the abstract input. This routine is just like 370 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 371 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 372 | floating-point exponent. 373 *----------------------------------------------------------------------------*/ 374 375 static float32 376 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM) 377 { 378 int8 shiftCount; 379 380 shiftCount = countLeadingZeros32( zSig ) - 1; 381 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 382 383 } 384 385 /*---------------------------------------------------------------------------- 386 | Returns the fraction bits of the double-precision floating-point value `a'. 387 *----------------------------------------------------------------------------*/ 388 389 INLINE uint64_t extractFloat64Frac( float64 a ) 390 { 391 392 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 393 394 } 395 396 /*---------------------------------------------------------------------------- 397 | Returns the exponent bits of the double-precision floating-point value `a'. 398 *----------------------------------------------------------------------------*/ 399 400 INLINE int16 extractFloat64Exp( float64 a ) 401 { 402 403 return ( float64_val(a)>>52 ) & 0x7FF; 404 405 } 406 407 /*---------------------------------------------------------------------------- 408 | Returns the sign bit of the double-precision floating-point value `a'. 409 *----------------------------------------------------------------------------*/ 410 411 INLINE flag extractFloat64Sign( float64 a ) 412 { 413 414 return float64_val(a)>>63; 415 416 } 417 418 /*---------------------------------------------------------------------------- 419 | If `a' is denormal and we are in flush-to-zero mode then set the 420 | input-denormal exception and return zero. Otherwise just return the value. 421 *----------------------------------------------------------------------------*/ 422 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM) 423 { 424 if (STATUS(flush_inputs_to_zero)) { 425 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 426 float_raise(float_flag_input_denormal STATUS_VAR); 427 return make_float64(float64_val(a) & (1ULL << 63)); 428 } 429 } 430 return a; 431 } 432 433 /*---------------------------------------------------------------------------- 434 | Normalizes the subnormal double-precision floating-point value represented 435 | by the denormalized significand `aSig'. The normalized exponent and 436 | significand are stored at the locations pointed to by `zExpPtr' and 437 | `zSigPtr', respectively. 438 *----------------------------------------------------------------------------*/ 439 440 static void 441 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr ) 442 { 443 int8 shiftCount; 444 445 shiftCount = countLeadingZeros64( aSig ) - 11; 446 *zSigPtr = aSig<<shiftCount; 447 *zExpPtr = 1 - shiftCount; 448 449 } 450 451 /*---------------------------------------------------------------------------- 452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 453 | double-precision floating-point value, returning the result. After being 454 | shifted into the proper positions, the three fields are simply added 455 | together to form the result. This means that any integer portion of `zSig' 456 | will be added into the exponent. Since a properly normalized significand 457 | will have an integer portion equal to 1, the `zExp' input should be 1 less 458 | than the desired result exponent whenever `zSig' is a complete, normalized 459 | significand. 460 *----------------------------------------------------------------------------*/ 461 462 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig ) 463 { 464 465 return make_float64( 466 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 467 468 } 469 470 /*---------------------------------------------------------------------------- 471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 472 | and significand `zSig', and returns the proper double-precision floating- 473 | point value corresponding to the abstract input. Ordinarily, the abstract 474 | value is simply rounded and packed into the double-precision format, with 475 | the inexact exception raised if the abstract input cannot be represented 476 | exactly. However, if the abstract value is too large, the overflow and 477 | inexact exceptions are raised and an infinity or maximal finite value is 478 | returned. If the abstract value is too small, the input value is rounded 479 | to a subnormal number, and the underflow and inexact exceptions are raised 480 | if the abstract input cannot be represented exactly as a subnormal double- 481 | precision floating-point number. 482 | The input significand `zSig' has its binary point between bits 62 483 | and 61, which is 10 bits to the left of the usual location. This shifted 484 | significand must be normalized or smaller. If `zSig' is not normalized, 485 | `zExp' must be 0; in that case, the result returned is a subnormal number, 486 | and it must not require rounding. In the usual case that `zSig' is 487 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 488 | The handling of underflow and overflow follows the IEC/IEEE Standard for 489 | Binary Floating-Point Arithmetic. 490 *----------------------------------------------------------------------------*/ 491 492 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM) 493 { 494 int8 roundingMode; 495 flag roundNearestEven; 496 int16 roundIncrement, roundBits; 497 flag isTiny; 498 499 roundingMode = STATUS(float_rounding_mode); 500 roundNearestEven = ( roundingMode == float_round_nearest_even ); 501 roundIncrement = 0x200; 502 if ( ! roundNearestEven ) { 503 if ( roundingMode == float_round_to_zero ) { 504 roundIncrement = 0; 505 } 506 else { 507 roundIncrement = 0x3FF; 508 if ( zSign ) { 509 if ( roundingMode == float_round_up ) roundIncrement = 0; 510 } 511 else { 512 if ( roundingMode == float_round_down ) roundIncrement = 0; 513 } 514 } 515 } 516 roundBits = zSig & 0x3FF; 517 if ( 0x7FD <= (uint16_t) zExp ) { 518 if ( ( 0x7FD < zExp ) 519 || ( ( zExp == 0x7FD ) 520 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 521 ) { 522 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 523 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 524 } 525 if ( zExp < 0 ) { 526 if (STATUS(flush_to_zero)) { 527 float_raise(float_flag_output_denormal STATUS_VAR); 528 return packFloat64(zSign, 0, 0); 529 } 530 isTiny = 531 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 532 || ( zExp < -1 ) 533 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 534 shift64RightJamming( zSig, - zExp, &zSig ); 535 zExp = 0; 536 roundBits = zSig & 0x3FF; 537 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 538 } 539 } 540 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 541 zSig = ( zSig + roundIncrement )>>10; 542 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 543 if ( zSig == 0 ) zExp = 0; 544 return packFloat64( zSign, zExp, zSig ); 545 546 } 547 548 /*---------------------------------------------------------------------------- 549 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 550 | and significand `zSig', and returns the proper double-precision floating- 551 | point value corresponding to the abstract input. This routine is just like 552 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 553 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 554 | floating-point exponent. 555 *----------------------------------------------------------------------------*/ 556 557 static float64 558 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM) 559 { 560 int8 shiftCount; 561 562 shiftCount = countLeadingZeros64( zSig ) - 1; 563 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 564 565 } 566 567 #ifdef FLOATX80 568 569 /*---------------------------------------------------------------------------- 570 | Returns the fraction bits of the extended double-precision floating-point 571 | value `a'. 572 *----------------------------------------------------------------------------*/ 573 574 INLINE uint64_t extractFloatx80Frac( floatx80 a ) 575 { 576 577 return a.low; 578 579 } 580 581 /*---------------------------------------------------------------------------- 582 | Returns the exponent bits of the extended double-precision floating-point 583 | value `a'. 584 *----------------------------------------------------------------------------*/ 585 586 INLINE int32 extractFloatx80Exp( floatx80 a ) 587 { 588 589 return a.high & 0x7FFF; 590 591 } 592 593 /*---------------------------------------------------------------------------- 594 | Returns the sign bit of the extended double-precision floating-point value 595 | `a'. 596 *----------------------------------------------------------------------------*/ 597 598 INLINE flag extractFloatx80Sign( floatx80 a ) 599 { 600 601 return a.high>>15; 602 603 } 604 605 /*---------------------------------------------------------------------------- 606 | Normalizes the subnormal extended double-precision floating-point value 607 | represented by the denormalized significand `aSig'. The normalized exponent 608 | and significand are stored at the locations pointed to by `zExpPtr' and 609 | `zSigPtr', respectively. 610 *----------------------------------------------------------------------------*/ 611 612 static void 613 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr ) 614 { 615 int8 shiftCount; 616 617 shiftCount = countLeadingZeros64( aSig ); 618 *zSigPtr = aSig<<shiftCount; 619 *zExpPtr = 1 - shiftCount; 620 621 } 622 623 /*---------------------------------------------------------------------------- 624 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 625 | extended double-precision floating-point value, returning the result. 626 *----------------------------------------------------------------------------*/ 627 628 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig ) 629 { 630 floatx80 z; 631 632 z.low = zSig; 633 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 634 return z; 635 636 } 637 638 /*---------------------------------------------------------------------------- 639 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 640 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 641 | and returns the proper extended double-precision floating-point value 642 | corresponding to the abstract input. Ordinarily, the abstract value is 643 | rounded and packed into the extended double-precision format, with the 644 | inexact exception raised if the abstract input cannot be represented 645 | exactly. However, if the abstract value is too large, the overflow and 646 | inexact exceptions are raised and an infinity or maximal finite value is 647 | returned. If the abstract value is too small, the input value is rounded to 648 | a subnormal number, and the underflow and inexact exceptions are raised if 649 | the abstract input cannot be represented exactly as a subnormal extended 650 | double-precision floating-point number. 651 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 652 | number of bits as single or double precision, respectively. Otherwise, the 653 | result is rounded to the full precision of the extended double-precision 654 | format. 655 | The input significand must be normalized or smaller. If the input 656 | significand is not normalized, `zExp' must be 0; in that case, the result 657 | returned is a subnormal number, and it must not require rounding. The 658 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 659 | Floating-Point Arithmetic. 660 *----------------------------------------------------------------------------*/ 661 662 static floatx80 663 roundAndPackFloatx80( 664 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 665 STATUS_PARAM) 666 { 667 int8 roundingMode; 668 flag roundNearestEven, increment, isTiny; 669 int64 roundIncrement, roundMask, roundBits; 670 671 roundingMode = STATUS(float_rounding_mode); 672 roundNearestEven = ( roundingMode == float_round_nearest_even ); 673 if ( roundingPrecision == 80 ) goto precision80; 674 if ( roundingPrecision == 64 ) { 675 roundIncrement = LIT64( 0x0000000000000400 ); 676 roundMask = LIT64( 0x00000000000007FF ); 677 } 678 else if ( roundingPrecision == 32 ) { 679 roundIncrement = LIT64( 0x0000008000000000 ); 680 roundMask = LIT64( 0x000000FFFFFFFFFF ); 681 } 682 else { 683 goto precision80; 684 } 685 zSig0 |= ( zSig1 != 0 ); 686 if ( ! roundNearestEven ) { 687 if ( roundingMode == float_round_to_zero ) { 688 roundIncrement = 0; 689 } 690 else { 691 roundIncrement = roundMask; 692 if ( zSign ) { 693 if ( roundingMode == float_round_up ) roundIncrement = 0; 694 } 695 else { 696 if ( roundingMode == float_round_down ) roundIncrement = 0; 697 } 698 } 699 } 700 roundBits = zSig0 & roundMask; 701 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 702 if ( ( 0x7FFE < zExp ) 703 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 704 ) { 705 goto overflow; 706 } 707 if ( zExp <= 0 ) { 708 if (STATUS(flush_to_zero)) { 709 float_raise(float_flag_output_denormal STATUS_VAR); 710 return packFloatx80(zSign, 0, 0); 711 } 712 isTiny = 713 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 714 || ( zExp < 0 ) 715 || ( zSig0 <= zSig0 + roundIncrement ); 716 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 717 zExp = 0; 718 roundBits = zSig0 & roundMask; 719 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 720 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 721 zSig0 += roundIncrement; 722 if ( (int64_t) zSig0 < 0 ) zExp = 1; 723 roundIncrement = roundMask + 1; 724 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 725 roundMask |= roundIncrement; 726 } 727 zSig0 &= ~ roundMask; 728 return packFloatx80( zSign, zExp, zSig0 ); 729 } 730 } 731 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 732 zSig0 += roundIncrement; 733 if ( zSig0 < roundIncrement ) { 734 ++zExp; 735 zSig0 = LIT64( 0x8000000000000000 ); 736 } 737 roundIncrement = roundMask + 1; 738 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 739 roundMask |= roundIncrement; 740 } 741 zSig0 &= ~ roundMask; 742 if ( zSig0 == 0 ) zExp = 0; 743 return packFloatx80( zSign, zExp, zSig0 ); 744 precision80: 745 increment = ( (int64_t) zSig1 < 0 ); 746 if ( ! roundNearestEven ) { 747 if ( roundingMode == float_round_to_zero ) { 748 increment = 0; 749 } 750 else { 751 if ( zSign ) { 752 increment = ( roundingMode == float_round_down ) && zSig1; 753 } 754 else { 755 increment = ( roundingMode == float_round_up ) && zSig1; 756 } 757 } 758 } 759 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 760 if ( ( 0x7FFE < zExp ) 761 || ( ( zExp == 0x7FFE ) 762 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 763 && increment 764 ) 765 ) { 766 roundMask = 0; 767 overflow: 768 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 769 if ( ( roundingMode == float_round_to_zero ) 770 || ( zSign && ( roundingMode == float_round_up ) ) 771 || ( ! zSign && ( roundingMode == float_round_down ) ) 772 ) { 773 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 774 } 775 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 776 } 777 if ( zExp <= 0 ) { 778 isTiny = 779 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 780 || ( zExp < 0 ) 781 || ! increment 782 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 783 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 784 zExp = 0; 785 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 786 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 787 if ( roundNearestEven ) { 788 increment = ( (int64_t) zSig1 < 0 ); 789 } 790 else { 791 if ( zSign ) { 792 increment = ( roundingMode == float_round_down ) && zSig1; 793 } 794 else { 795 increment = ( roundingMode == float_round_up ) && zSig1; 796 } 797 } 798 if ( increment ) { 799 ++zSig0; 800 zSig0 &= 801 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 802 if ( (int64_t) zSig0 < 0 ) zExp = 1; 803 } 804 return packFloatx80( zSign, zExp, zSig0 ); 805 } 806 } 807 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 808 if ( increment ) { 809 ++zSig0; 810 if ( zSig0 == 0 ) { 811 ++zExp; 812 zSig0 = LIT64( 0x8000000000000000 ); 813 } 814 else { 815 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 816 } 817 } 818 else { 819 if ( zSig0 == 0 ) zExp = 0; 820 } 821 return packFloatx80( zSign, zExp, zSig0 ); 822 823 } 824 825 /*---------------------------------------------------------------------------- 826 | Takes an abstract floating-point value having sign `zSign', exponent 827 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 828 | and returns the proper extended double-precision floating-point value 829 | corresponding to the abstract input. This routine is just like 830 | `roundAndPackFloatx80' except that the input significand does not have to be 831 | normalized. 832 *----------------------------------------------------------------------------*/ 833 834 static floatx80 835 normalizeRoundAndPackFloatx80( 836 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 837 STATUS_PARAM) 838 { 839 int8 shiftCount; 840 841 if ( zSig0 == 0 ) { 842 zSig0 = zSig1; 843 zSig1 = 0; 844 zExp -= 64; 845 } 846 shiftCount = countLeadingZeros64( zSig0 ); 847 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 848 zExp -= shiftCount; 849 return 850 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 851 852 } 853 854 #endif 855 856 #ifdef FLOAT128 857 858 /*---------------------------------------------------------------------------- 859 | Returns the least-significant 64 fraction bits of the quadruple-precision 860 | floating-point value `a'. 861 *----------------------------------------------------------------------------*/ 862 863 INLINE uint64_t extractFloat128Frac1( float128 a ) 864 { 865 866 return a.low; 867 868 } 869 870 /*---------------------------------------------------------------------------- 871 | Returns the most-significant 48 fraction bits of the quadruple-precision 872 | floating-point value `a'. 873 *----------------------------------------------------------------------------*/ 874 875 INLINE uint64_t extractFloat128Frac0( float128 a ) 876 { 877 878 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 879 880 } 881 882 /*---------------------------------------------------------------------------- 883 | Returns the exponent bits of the quadruple-precision floating-point value 884 | `a'. 885 *----------------------------------------------------------------------------*/ 886 887 INLINE int32 extractFloat128Exp( float128 a ) 888 { 889 890 return ( a.high>>48 ) & 0x7FFF; 891 892 } 893 894 /*---------------------------------------------------------------------------- 895 | Returns the sign bit of the quadruple-precision floating-point value `a'. 896 *----------------------------------------------------------------------------*/ 897 898 INLINE flag extractFloat128Sign( float128 a ) 899 { 900 901 return a.high>>63; 902 903 } 904 905 /*---------------------------------------------------------------------------- 906 | Normalizes the subnormal quadruple-precision floating-point value 907 | represented by the denormalized significand formed by the concatenation of 908 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 909 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 910 | significand are stored at the location pointed to by `zSig0Ptr', and the 911 | least significant 64 bits of the normalized significand are stored at the 912 | location pointed to by `zSig1Ptr'. 913 *----------------------------------------------------------------------------*/ 914 915 static void 916 normalizeFloat128Subnormal( 917 uint64_t aSig0, 918 uint64_t aSig1, 919 int32 *zExpPtr, 920 uint64_t *zSig0Ptr, 921 uint64_t *zSig1Ptr 922 ) 923 { 924 int8 shiftCount; 925 926 if ( aSig0 == 0 ) { 927 shiftCount = countLeadingZeros64( aSig1 ) - 15; 928 if ( shiftCount < 0 ) { 929 *zSig0Ptr = aSig1>>( - shiftCount ); 930 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 931 } 932 else { 933 *zSig0Ptr = aSig1<<shiftCount; 934 *zSig1Ptr = 0; 935 } 936 *zExpPtr = - shiftCount - 63; 937 } 938 else { 939 shiftCount = countLeadingZeros64( aSig0 ) - 15; 940 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 941 *zExpPtr = 1 - shiftCount; 942 } 943 944 } 945 946 /*---------------------------------------------------------------------------- 947 | Packs the sign `zSign', the exponent `zExp', and the significand formed 948 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 949 | floating-point value, returning the result. After being shifted into the 950 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 951 | added together to form the most significant 32 bits of the result. This 952 | means that any integer portion of `zSig0' will be added into the exponent. 953 | Since a properly normalized significand will have an integer portion equal 954 | to 1, the `zExp' input should be 1 less than the desired result exponent 955 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 956 | significand. 957 *----------------------------------------------------------------------------*/ 958 959 INLINE float128 960 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 ) 961 { 962 float128 z; 963 964 z.low = zSig1; 965 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 966 return z; 967 968 } 969 970 /*---------------------------------------------------------------------------- 971 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 972 | and extended significand formed by the concatenation of `zSig0', `zSig1', 973 | and `zSig2', and returns the proper quadruple-precision floating-point value 974 | corresponding to the abstract input. Ordinarily, the abstract value is 975 | simply rounded and packed into the quadruple-precision format, with the 976 | inexact exception raised if the abstract input cannot be represented 977 | exactly. However, if the abstract value is too large, the overflow and 978 | inexact exceptions are raised and an infinity or maximal finite value is 979 | returned. If the abstract value is too small, the input value is rounded to 980 | a subnormal number, and the underflow and inexact exceptions are raised if 981 | the abstract input cannot be represented exactly as a subnormal quadruple- 982 | precision floating-point number. 983 | The input significand must be normalized or smaller. If the input 984 | significand is not normalized, `zExp' must be 0; in that case, the result 985 | returned is a subnormal number, and it must not require rounding. In the 986 | usual case that the input significand is normalized, `zExp' must be 1 less 987 | than the ``true'' floating-point exponent. The handling of underflow and 988 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 989 *----------------------------------------------------------------------------*/ 990 991 static float128 992 roundAndPackFloat128( 993 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM) 994 { 995 int8 roundingMode; 996 flag roundNearestEven, increment, isTiny; 997 998 roundingMode = STATUS(float_rounding_mode); 999 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1000 increment = ( (int64_t) zSig2 < 0 ); 1001 if ( ! roundNearestEven ) { 1002 if ( roundingMode == float_round_to_zero ) { 1003 increment = 0; 1004 } 1005 else { 1006 if ( zSign ) { 1007 increment = ( roundingMode == float_round_down ) && zSig2; 1008 } 1009 else { 1010 increment = ( roundingMode == float_round_up ) && zSig2; 1011 } 1012 } 1013 } 1014 if ( 0x7FFD <= (uint32_t) zExp ) { 1015 if ( ( 0x7FFD < zExp ) 1016 || ( ( zExp == 0x7FFD ) 1017 && eq128( 1018 LIT64( 0x0001FFFFFFFFFFFF ), 1019 LIT64( 0xFFFFFFFFFFFFFFFF ), 1020 zSig0, 1021 zSig1 1022 ) 1023 && increment 1024 ) 1025 ) { 1026 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 1027 if ( ( roundingMode == float_round_to_zero ) 1028 || ( zSign && ( roundingMode == float_round_up ) ) 1029 || ( ! zSign && ( roundingMode == float_round_down ) ) 1030 ) { 1031 return 1032 packFloat128( 1033 zSign, 1034 0x7FFE, 1035 LIT64( 0x0000FFFFFFFFFFFF ), 1036 LIT64( 0xFFFFFFFFFFFFFFFF ) 1037 ); 1038 } 1039 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1040 } 1041 if ( zExp < 0 ) { 1042 if (STATUS(flush_to_zero)) { 1043 float_raise(float_flag_output_denormal STATUS_VAR); 1044 return packFloat128(zSign, 0, 0, 0); 1045 } 1046 isTiny = 1047 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 1048 || ( zExp < -1 ) 1049 || ! increment 1050 || lt128( 1051 zSig0, 1052 zSig1, 1053 LIT64( 0x0001FFFFFFFFFFFF ), 1054 LIT64( 0xFFFFFFFFFFFFFFFF ) 1055 ); 1056 shift128ExtraRightJamming( 1057 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1058 zExp = 0; 1059 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 1060 if ( roundNearestEven ) { 1061 increment = ( (int64_t) zSig2 < 0 ); 1062 } 1063 else { 1064 if ( zSign ) { 1065 increment = ( roundingMode == float_round_down ) && zSig2; 1066 } 1067 else { 1068 increment = ( roundingMode == float_round_up ) && zSig2; 1069 } 1070 } 1071 } 1072 } 1073 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 1074 if ( increment ) { 1075 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1076 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1077 } 1078 else { 1079 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1080 } 1081 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1082 1083 } 1084 1085 /*---------------------------------------------------------------------------- 1086 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1087 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1088 | returns the proper quadruple-precision floating-point value corresponding 1089 | to the abstract input. This routine is just like `roundAndPackFloat128' 1090 | except that the input significand has fewer bits and does not have to be 1091 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1092 | point exponent. 1093 *----------------------------------------------------------------------------*/ 1094 1095 static float128 1096 normalizeRoundAndPackFloat128( 1097 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM) 1098 { 1099 int8 shiftCount; 1100 uint64_t zSig2; 1101 1102 if ( zSig0 == 0 ) { 1103 zSig0 = zSig1; 1104 zSig1 = 0; 1105 zExp -= 64; 1106 } 1107 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1108 if ( 0 <= shiftCount ) { 1109 zSig2 = 0; 1110 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1111 } 1112 else { 1113 shift128ExtraRightJamming( 1114 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1115 } 1116 zExp -= shiftCount; 1117 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1118 1119 } 1120 1121 #endif 1122 1123 /*---------------------------------------------------------------------------- 1124 | Returns the result of converting the 32-bit two's complement integer `a' 1125 | to the single-precision floating-point format. The conversion is performed 1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1127 *----------------------------------------------------------------------------*/ 1128 1129 float32 int32_to_float32( int32 a STATUS_PARAM ) 1130 { 1131 flag zSign; 1132 1133 if ( a == 0 ) return float32_zero; 1134 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1135 zSign = ( a < 0 ); 1136 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1137 1138 } 1139 1140 /*---------------------------------------------------------------------------- 1141 | Returns the result of converting the 32-bit two's complement integer `a' 1142 | to the double-precision floating-point format. The conversion is performed 1143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1144 *----------------------------------------------------------------------------*/ 1145 1146 float64 int32_to_float64( int32 a STATUS_PARAM ) 1147 { 1148 flag zSign; 1149 uint32 absA; 1150 int8 shiftCount; 1151 uint64_t zSig; 1152 1153 if ( a == 0 ) return float64_zero; 1154 zSign = ( a < 0 ); 1155 absA = zSign ? - a : a; 1156 shiftCount = countLeadingZeros32( absA ) + 21; 1157 zSig = absA; 1158 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1159 1160 } 1161 1162 #ifdef FLOATX80 1163 1164 /*---------------------------------------------------------------------------- 1165 | Returns the result of converting the 32-bit two's complement integer `a' 1166 | to the extended double-precision floating-point format. The conversion 1167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1168 | Arithmetic. 1169 *----------------------------------------------------------------------------*/ 1170 1171 floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) 1172 { 1173 flag zSign; 1174 uint32 absA; 1175 int8 shiftCount; 1176 uint64_t zSig; 1177 1178 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1179 zSign = ( a < 0 ); 1180 absA = zSign ? - a : a; 1181 shiftCount = countLeadingZeros32( absA ) + 32; 1182 zSig = absA; 1183 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1184 1185 } 1186 1187 #endif 1188 1189 #ifdef FLOAT128 1190 1191 /*---------------------------------------------------------------------------- 1192 | Returns the result of converting the 32-bit two's complement integer `a' to 1193 | the quadruple-precision floating-point format. The conversion is performed 1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1195 *----------------------------------------------------------------------------*/ 1196 1197 float128 int32_to_float128( int32 a STATUS_PARAM ) 1198 { 1199 flag zSign; 1200 uint32 absA; 1201 int8 shiftCount; 1202 uint64_t zSig0; 1203 1204 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1205 zSign = ( a < 0 ); 1206 absA = zSign ? - a : a; 1207 shiftCount = countLeadingZeros32( absA ) + 17; 1208 zSig0 = absA; 1209 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1210 1211 } 1212 1213 #endif 1214 1215 /*---------------------------------------------------------------------------- 1216 | Returns the result of converting the 64-bit two's complement integer `a' 1217 | to the single-precision floating-point format. The conversion is performed 1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1219 *----------------------------------------------------------------------------*/ 1220 1221 float32 int64_to_float32( int64 a STATUS_PARAM ) 1222 { 1223 flag zSign; 1224 uint64 absA; 1225 int8 shiftCount; 1226 1227 if ( a == 0 ) return float32_zero; 1228 zSign = ( a < 0 ); 1229 absA = zSign ? - a : a; 1230 shiftCount = countLeadingZeros64( absA ) - 40; 1231 if ( 0 <= shiftCount ) { 1232 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1233 } 1234 else { 1235 shiftCount += 7; 1236 if ( shiftCount < 0 ) { 1237 shift64RightJamming( absA, - shiftCount, &absA ); 1238 } 1239 else { 1240 absA <<= shiftCount; 1241 } 1242 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1243 } 1244 1245 } 1246 1247 float32 uint64_to_float32( uint64 a STATUS_PARAM ) 1248 { 1249 int8 shiftCount; 1250 1251 if ( a == 0 ) return float32_zero; 1252 shiftCount = countLeadingZeros64( a ) - 40; 1253 if ( 0 <= shiftCount ) { 1254 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); 1255 } 1256 else { 1257 shiftCount += 7; 1258 if ( shiftCount < 0 ) { 1259 shift64RightJamming( a, - shiftCount, &a ); 1260 } 1261 else { 1262 a <<= shiftCount; 1263 } 1264 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); 1265 } 1266 } 1267 1268 /*---------------------------------------------------------------------------- 1269 | Returns the result of converting the 64-bit two's complement integer `a' 1270 | to the double-precision floating-point format. The conversion is performed 1271 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1272 *----------------------------------------------------------------------------*/ 1273 1274 float64 int64_to_float64( int64 a STATUS_PARAM ) 1275 { 1276 flag zSign; 1277 1278 if ( a == 0 ) return float64_zero; 1279 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1280 return packFloat64( 1, 0x43E, 0 ); 1281 } 1282 zSign = ( a < 0 ); 1283 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1284 1285 } 1286 1287 float64 uint64_to_float64( uint64 a STATUS_PARAM ) 1288 { 1289 if ( a == 0 ) return float64_zero; 1290 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); 1291 1292 } 1293 1294 #ifdef FLOATX80 1295 1296 /*---------------------------------------------------------------------------- 1297 | Returns the result of converting the 64-bit two's complement integer `a' 1298 | to the extended double-precision floating-point format. The conversion 1299 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1300 | Arithmetic. 1301 *----------------------------------------------------------------------------*/ 1302 1303 floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) 1304 { 1305 flag zSign; 1306 uint64 absA; 1307 int8 shiftCount; 1308 1309 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1310 zSign = ( a < 0 ); 1311 absA = zSign ? - a : a; 1312 shiftCount = countLeadingZeros64( absA ); 1313 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1314 1315 } 1316 1317 #endif 1318 1319 #ifdef FLOAT128 1320 1321 /*---------------------------------------------------------------------------- 1322 | Returns the result of converting the 64-bit two's complement integer `a' to 1323 | the quadruple-precision floating-point format. The conversion is performed 1324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1325 *----------------------------------------------------------------------------*/ 1326 1327 float128 int64_to_float128( int64 a STATUS_PARAM ) 1328 { 1329 flag zSign; 1330 uint64 absA; 1331 int8 shiftCount; 1332 int32 zExp; 1333 uint64_t zSig0, zSig1; 1334 1335 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1336 zSign = ( a < 0 ); 1337 absA = zSign ? - a : a; 1338 shiftCount = countLeadingZeros64( absA ) + 49; 1339 zExp = 0x406E - shiftCount; 1340 if ( 64 <= shiftCount ) { 1341 zSig1 = 0; 1342 zSig0 = absA; 1343 shiftCount -= 64; 1344 } 1345 else { 1346 zSig1 = absA; 1347 zSig0 = 0; 1348 } 1349 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1350 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1351 1352 } 1353 1354 #endif 1355 1356 /*---------------------------------------------------------------------------- 1357 | Returns the result of converting the single-precision floating-point value 1358 | `a' to the 32-bit two's complement integer format. The conversion is 1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1360 | Arithmetic---which means in particular that the conversion is rounded 1361 | according to the current rounding mode. If `a' is a NaN, the largest 1362 | positive integer is returned. Otherwise, if the conversion overflows, the 1363 | largest integer with the same sign as `a' is returned. 1364 *----------------------------------------------------------------------------*/ 1365 1366 int32 float32_to_int32( float32 a STATUS_PARAM ) 1367 { 1368 flag aSign; 1369 int16 aExp, shiftCount; 1370 uint32_t aSig; 1371 uint64_t aSig64; 1372 1373 a = float32_squash_input_denormal(a STATUS_VAR); 1374 aSig = extractFloat32Frac( a ); 1375 aExp = extractFloat32Exp( a ); 1376 aSign = extractFloat32Sign( a ); 1377 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1378 if ( aExp ) aSig |= 0x00800000; 1379 shiftCount = 0xAF - aExp; 1380 aSig64 = aSig; 1381 aSig64 <<= 32; 1382 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1383 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1384 1385 } 1386 1387 /*---------------------------------------------------------------------------- 1388 | Returns the result of converting the single-precision floating-point value 1389 | `a' to the 32-bit two's complement integer format. The conversion is 1390 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1391 | Arithmetic, except that the conversion is always rounded toward zero. 1392 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1393 | the conversion overflows, the largest integer with the same sign as `a' is 1394 | returned. 1395 *----------------------------------------------------------------------------*/ 1396 1397 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1398 { 1399 flag aSign; 1400 int16 aExp, shiftCount; 1401 uint32_t aSig; 1402 int32 z; 1403 a = float32_squash_input_denormal(a STATUS_VAR); 1404 1405 aSig = extractFloat32Frac( a ); 1406 aExp = extractFloat32Exp( a ); 1407 aSign = extractFloat32Sign( a ); 1408 shiftCount = aExp - 0x9E; 1409 if ( 0 <= shiftCount ) { 1410 if ( float32_val(a) != 0xCF000000 ) { 1411 float_raise( float_flag_invalid STATUS_VAR); 1412 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1413 } 1414 return (int32_t) 0x80000000; 1415 } 1416 else if ( aExp <= 0x7E ) { 1417 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1418 return 0; 1419 } 1420 aSig = ( aSig | 0x00800000 )<<8; 1421 z = aSig>>( - shiftCount ); 1422 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1423 STATUS(float_exception_flags) |= float_flag_inexact; 1424 } 1425 if ( aSign ) z = - z; 1426 return z; 1427 1428 } 1429 1430 /*---------------------------------------------------------------------------- 1431 | Returns the result of converting the single-precision floating-point value 1432 | `a' to the 16-bit two's complement integer format. The conversion is 1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1434 | Arithmetic, except that the conversion is always rounded toward zero. 1435 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1436 | the conversion overflows, the largest integer with the same sign as `a' is 1437 | returned. 1438 *----------------------------------------------------------------------------*/ 1439 1440 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM ) 1441 { 1442 flag aSign; 1443 int16 aExp, shiftCount; 1444 uint32_t aSig; 1445 int32 z; 1446 1447 aSig = extractFloat32Frac( a ); 1448 aExp = extractFloat32Exp( a ); 1449 aSign = extractFloat32Sign( a ); 1450 shiftCount = aExp - 0x8E; 1451 if ( 0 <= shiftCount ) { 1452 if ( float32_val(a) != 0xC7000000 ) { 1453 float_raise( float_flag_invalid STATUS_VAR); 1454 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1455 return 0x7FFF; 1456 } 1457 } 1458 return (int32_t) 0xffff8000; 1459 } 1460 else if ( aExp <= 0x7E ) { 1461 if ( aExp | aSig ) { 1462 STATUS(float_exception_flags) |= float_flag_inexact; 1463 } 1464 return 0; 1465 } 1466 shiftCount -= 0x10; 1467 aSig = ( aSig | 0x00800000 )<<8; 1468 z = aSig>>( - shiftCount ); 1469 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1470 STATUS(float_exception_flags) |= float_flag_inexact; 1471 } 1472 if ( aSign ) { 1473 z = - z; 1474 } 1475 return z; 1476 1477 } 1478 1479 /*---------------------------------------------------------------------------- 1480 | Returns the result of converting the single-precision floating-point value 1481 | `a' to the 64-bit two's complement integer format. The conversion is 1482 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1483 | Arithmetic---which means in particular that the conversion is rounded 1484 | according to the current rounding mode. If `a' is a NaN, the largest 1485 | positive integer is returned. Otherwise, if the conversion overflows, the 1486 | largest integer with the same sign as `a' is returned. 1487 *----------------------------------------------------------------------------*/ 1488 1489 int64 float32_to_int64( float32 a STATUS_PARAM ) 1490 { 1491 flag aSign; 1492 int16 aExp, shiftCount; 1493 uint32_t aSig; 1494 uint64_t aSig64, aSigExtra; 1495 a = float32_squash_input_denormal(a STATUS_VAR); 1496 1497 aSig = extractFloat32Frac( a ); 1498 aExp = extractFloat32Exp( a ); 1499 aSign = extractFloat32Sign( a ); 1500 shiftCount = 0xBE - aExp; 1501 if ( shiftCount < 0 ) { 1502 float_raise( float_flag_invalid STATUS_VAR); 1503 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1504 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1505 } 1506 return (int64_t) LIT64( 0x8000000000000000 ); 1507 } 1508 if ( aExp ) aSig |= 0x00800000; 1509 aSig64 = aSig; 1510 aSig64 <<= 40; 1511 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1512 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1513 1514 } 1515 1516 /*---------------------------------------------------------------------------- 1517 | Returns the result of converting the single-precision floating-point value 1518 | `a' to the 64-bit two's complement integer format. The conversion is 1519 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1520 | Arithmetic, except that the conversion is always rounded toward zero. If 1521 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1522 | conversion overflows, the largest integer with the same sign as `a' is 1523 | returned. 1524 *----------------------------------------------------------------------------*/ 1525 1526 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1527 { 1528 flag aSign; 1529 int16 aExp, shiftCount; 1530 uint32_t aSig; 1531 uint64_t aSig64; 1532 int64 z; 1533 a = float32_squash_input_denormal(a STATUS_VAR); 1534 1535 aSig = extractFloat32Frac( a ); 1536 aExp = extractFloat32Exp( a ); 1537 aSign = extractFloat32Sign( a ); 1538 shiftCount = aExp - 0xBE; 1539 if ( 0 <= shiftCount ) { 1540 if ( float32_val(a) != 0xDF000000 ) { 1541 float_raise( float_flag_invalid STATUS_VAR); 1542 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1543 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1544 } 1545 } 1546 return (int64_t) LIT64( 0x8000000000000000 ); 1547 } 1548 else if ( aExp <= 0x7E ) { 1549 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1550 return 0; 1551 } 1552 aSig64 = aSig | 0x00800000; 1553 aSig64 <<= 40; 1554 z = aSig64>>( - shiftCount ); 1555 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1556 STATUS(float_exception_flags) |= float_flag_inexact; 1557 } 1558 if ( aSign ) z = - z; 1559 return z; 1560 1561 } 1562 1563 /*---------------------------------------------------------------------------- 1564 | Returns the result of converting the single-precision floating-point value 1565 | `a' to the double-precision floating-point format. The conversion is 1566 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1567 | Arithmetic. 1568 *----------------------------------------------------------------------------*/ 1569 1570 float64 float32_to_float64( float32 a STATUS_PARAM ) 1571 { 1572 flag aSign; 1573 int16 aExp; 1574 uint32_t aSig; 1575 a = float32_squash_input_denormal(a STATUS_VAR); 1576 1577 aSig = extractFloat32Frac( a ); 1578 aExp = extractFloat32Exp( a ); 1579 aSign = extractFloat32Sign( a ); 1580 if ( aExp == 0xFF ) { 1581 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1582 return packFloat64( aSign, 0x7FF, 0 ); 1583 } 1584 if ( aExp == 0 ) { 1585 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1586 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1587 --aExp; 1588 } 1589 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1590 1591 } 1592 1593 #ifdef FLOATX80 1594 1595 /*---------------------------------------------------------------------------- 1596 | Returns the result of converting the single-precision floating-point value 1597 | `a' to the extended double-precision floating-point format. The conversion 1598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1599 | Arithmetic. 1600 *----------------------------------------------------------------------------*/ 1601 1602 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1603 { 1604 flag aSign; 1605 int16 aExp; 1606 uint32_t aSig; 1607 1608 a = float32_squash_input_denormal(a STATUS_VAR); 1609 aSig = extractFloat32Frac( a ); 1610 aExp = extractFloat32Exp( a ); 1611 aSign = extractFloat32Sign( a ); 1612 if ( aExp == 0xFF ) { 1613 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1614 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1615 } 1616 if ( aExp == 0 ) { 1617 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1618 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1619 } 1620 aSig |= 0x00800000; 1621 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1622 1623 } 1624 1625 #endif 1626 1627 #ifdef FLOAT128 1628 1629 /*---------------------------------------------------------------------------- 1630 | Returns the result of converting the single-precision floating-point value 1631 | `a' to the double-precision floating-point format. The conversion is 1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1633 | Arithmetic. 1634 *----------------------------------------------------------------------------*/ 1635 1636 float128 float32_to_float128( float32 a STATUS_PARAM ) 1637 { 1638 flag aSign; 1639 int16 aExp; 1640 uint32_t aSig; 1641 1642 a = float32_squash_input_denormal(a STATUS_VAR); 1643 aSig = extractFloat32Frac( a ); 1644 aExp = extractFloat32Exp( a ); 1645 aSign = extractFloat32Sign( a ); 1646 if ( aExp == 0xFF ) { 1647 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1648 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1649 } 1650 if ( aExp == 0 ) { 1651 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1652 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1653 --aExp; 1654 } 1655 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1656 1657 } 1658 1659 #endif 1660 1661 /*---------------------------------------------------------------------------- 1662 | Rounds the single-precision floating-point value `a' to an integer, and 1663 | returns the result as a single-precision floating-point value. The 1664 | operation is performed according to the IEC/IEEE Standard for Binary 1665 | Floating-Point Arithmetic. 1666 *----------------------------------------------------------------------------*/ 1667 1668 float32 float32_round_to_int( float32 a STATUS_PARAM) 1669 { 1670 flag aSign; 1671 int16 aExp; 1672 uint32_t lastBitMask, roundBitsMask; 1673 int8 roundingMode; 1674 uint32_t z; 1675 a = float32_squash_input_denormal(a STATUS_VAR); 1676 1677 aExp = extractFloat32Exp( a ); 1678 if ( 0x96 <= aExp ) { 1679 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1680 return propagateFloat32NaN( a, a STATUS_VAR ); 1681 } 1682 return a; 1683 } 1684 if ( aExp <= 0x7E ) { 1685 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1686 STATUS(float_exception_flags) |= float_flag_inexact; 1687 aSign = extractFloat32Sign( a ); 1688 switch ( STATUS(float_rounding_mode) ) { 1689 case float_round_nearest_even: 1690 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1691 return packFloat32( aSign, 0x7F, 0 ); 1692 } 1693 break; 1694 case float_round_down: 1695 return make_float32(aSign ? 0xBF800000 : 0); 1696 case float_round_up: 1697 return make_float32(aSign ? 0x80000000 : 0x3F800000); 1698 } 1699 return packFloat32( aSign, 0, 0 ); 1700 } 1701 lastBitMask = 1; 1702 lastBitMask <<= 0x96 - aExp; 1703 roundBitsMask = lastBitMask - 1; 1704 z = float32_val(a); 1705 roundingMode = STATUS(float_rounding_mode); 1706 if ( roundingMode == float_round_nearest_even ) { 1707 z += lastBitMask>>1; 1708 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1709 } 1710 else if ( roundingMode != float_round_to_zero ) { 1711 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { 1712 z += roundBitsMask; 1713 } 1714 } 1715 z &= ~ roundBitsMask; 1716 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1717 return make_float32(z); 1718 1719 } 1720 1721 /*---------------------------------------------------------------------------- 1722 | Returns the result of adding the absolute values of the single-precision 1723 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1724 | before being returned. `zSign' is ignored if the result is a NaN. 1725 | The addition is performed according to the IEC/IEEE Standard for Binary 1726 | Floating-Point Arithmetic. 1727 *----------------------------------------------------------------------------*/ 1728 1729 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1730 { 1731 int16 aExp, bExp, zExp; 1732 uint32_t aSig, bSig, zSig; 1733 int16 expDiff; 1734 1735 aSig = extractFloat32Frac( a ); 1736 aExp = extractFloat32Exp( a ); 1737 bSig = extractFloat32Frac( b ); 1738 bExp = extractFloat32Exp( b ); 1739 expDiff = aExp - bExp; 1740 aSig <<= 6; 1741 bSig <<= 6; 1742 if ( 0 < expDiff ) { 1743 if ( aExp == 0xFF ) { 1744 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1745 return a; 1746 } 1747 if ( bExp == 0 ) { 1748 --expDiff; 1749 } 1750 else { 1751 bSig |= 0x20000000; 1752 } 1753 shift32RightJamming( bSig, expDiff, &bSig ); 1754 zExp = aExp; 1755 } 1756 else if ( expDiff < 0 ) { 1757 if ( bExp == 0xFF ) { 1758 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1759 return packFloat32( zSign, 0xFF, 0 ); 1760 } 1761 if ( aExp == 0 ) { 1762 ++expDiff; 1763 } 1764 else { 1765 aSig |= 0x20000000; 1766 } 1767 shift32RightJamming( aSig, - expDiff, &aSig ); 1768 zExp = bExp; 1769 } 1770 else { 1771 if ( aExp == 0xFF ) { 1772 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1773 return a; 1774 } 1775 if ( aExp == 0 ) { 1776 if (STATUS(flush_to_zero)) { 1777 if (aSig | bSig) { 1778 float_raise(float_flag_output_denormal STATUS_VAR); 1779 } 1780 return packFloat32(zSign, 0, 0); 1781 } 1782 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1783 } 1784 zSig = 0x40000000 + aSig + bSig; 1785 zExp = aExp; 1786 goto roundAndPack; 1787 } 1788 aSig |= 0x20000000; 1789 zSig = ( aSig + bSig )<<1; 1790 --zExp; 1791 if ( (int32_t) zSig < 0 ) { 1792 zSig = aSig + bSig; 1793 ++zExp; 1794 } 1795 roundAndPack: 1796 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1797 1798 } 1799 1800 /*---------------------------------------------------------------------------- 1801 | Returns the result of subtracting the absolute values of the single- 1802 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1803 | difference is negated before being returned. `zSign' is ignored if the 1804 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1805 | Standard for Binary Floating-Point Arithmetic. 1806 *----------------------------------------------------------------------------*/ 1807 1808 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1809 { 1810 int16 aExp, bExp, zExp; 1811 uint32_t aSig, bSig, zSig; 1812 int16 expDiff; 1813 1814 aSig = extractFloat32Frac( a ); 1815 aExp = extractFloat32Exp( a ); 1816 bSig = extractFloat32Frac( b ); 1817 bExp = extractFloat32Exp( b ); 1818 expDiff = aExp - bExp; 1819 aSig <<= 7; 1820 bSig <<= 7; 1821 if ( 0 < expDiff ) goto aExpBigger; 1822 if ( expDiff < 0 ) goto bExpBigger; 1823 if ( aExp == 0xFF ) { 1824 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1825 float_raise( float_flag_invalid STATUS_VAR); 1826 return float32_default_nan; 1827 } 1828 if ( aExp == 0 ) { 1829 aExp = 1; 1830 bExp = 1; 1831 } 1832 if ( bSig < aSig ) goto aBigger; 1833 if ( aSig < bSig ) goto bBigger; 1834 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1835 bExpBigger: 1836 if ( bExp == 0xFF ) { 1837 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1838 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1839 } 1840 if ( aExp == 0 ) { 1841 ++expDiff; 1842 } 1843 else { 1844 aSig |= 0x40000000; 1845 } 1846 shift32RightJamming( aSig, - expDiff, &aSig ); 1847 bSig |= 0x40000000; 1848 bBigger: 1849 zSig = bSig - aSig; 1850 zExp = bExp; 1851 zSign ^= 1; 1852 goto normalizeRoundAndPack; 1853 aExpBigger: 1854 if ( aExp == 0xFF ) { 1855 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1856 return a; 1857 } 1858 if ( bExp == 0 ) { 1859 --expDiff; 1860 } 1861 else { 1862 bSig |= 0x40000000; 1863 } 1864 shift32RightJamming( bSig, expDiff, &bSig ); 1865 aSig |= 0x40000000; 1866 aBigger: 1867 zSig = aSig - bSig; 1868 zExp = aExp; 1869 normalizeRoundAndPack: 1870 --zExp; 1871 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1872 1873 } 1874 1875 /*---------------------------------------------------------------------------- 1876 | Returns the result of adding the single-precision floating-point values `a' 1877 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1878 | Binary Floating-Point Arithmetic. 1879 *----------------------------------------------------------------------------*/ 1880 1881 float32 float32_add( float32 a, float32 b STATUS_PARAM ) 1882 { 1883 flag aSign, bSign; 1884 a = float32_squash_input_denormal(a STATUS_VAR); 1885 b = float32_squash_input_denormal(b STATUS_VAR); 1886 1887 aSign = extractFloat32Sign( a ); 1888 bSign = extractFloat32Sign( b ); 1889 if ( aSign == bSign ) { 1890 return addFloat32Sigs( a, b, aSign STATUS_VAR); 1891 } 1892 else { 1893 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1894 } 1895 1896 } 1897 1898 /*---------------------------------------------------------------------------- 1899 | Returns the result of subtracting the single-precision floating-point values 1900 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1901 | for Binary Floating-Point Arithmetic. 1902 *----------------------------------------------------------------------------*/ 1903 1904 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 1905 { 1906 flag aSign, bSign; 1907 a = float32_squash_input_denormal(a STATUS_VAR); 1908 b = float32_squash_input_denormal(b STATUS_VAR); 1909 1910 aSign = extractFloat32Sign( a ); 1911 bSign = extractFloat32Sign( b ); 1912 if ( aSign == bSign ) { 1913 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1914 } 1915 else { 1916 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 1917 } 1918 1919 } 1920 1921 /*---------------------------------------------------------------------------- 1922 | Returns the result of multiplying the single-precision floating-point values 1923 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1924 | for Binary Floating-Point Arithmetic. 1925 *----------------------------------------------------------------------------*/ 1926 1927 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 1928 { 1929 flag aSign, bSign, zSign; 1930 int16 aExp, bExp, zExp; 1931 uint32_t aSig, bSig; 1932 uint64_t zSig64; 1933 uint32_t zSig; 1934 1935 a = float32_squash_input_denormal(a STATUS_VAR); 1936 b = float32_squash_input_denormal(b STATUS_VAR); 1937 1938 aSig = extractFloat32Frac( a ); 1939 aExp = extractFloat32Exp( a ); 1940 aSign = extractFloat32Sign( a ); 1941 bSig = extractFloat32Frac( b ); 1942 bExp = extractFloat32Exp( b ); 1943 bSign = extractFloat32Sign( b ); 1944 zSign = aSign ^ bSign; 1945 if ( aExp == 0xFF ) { 1946 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1947 return propagateFloat32NaN( a, b STATUS_VAR ); 1948 } 1949 if ( ( bExp | bSig ) == 0 ) { 1950 float_raise( float_flag_invalid STATUS_VAR); 1951 return float32_default_nan; 1952 } 1953 return packFloat32( zSign, 0xFF, 0 ); 1954 } 1955 if ( bExp == 0xFF ) { 1956 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1957 if ( ( aExp | aSig ) == 0 ) { 1958 float_raise( float_flag_invalid STATUS_VAR); 1959 return float32_default_nan; 1960 } 1961 return packFloat32( zSign, 0xFF, 0 ); 1962 } 1963 if ( aExp == 0 ) { 1964 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1965 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1966 } 1967 if ( bExp == 0 ) { 1968 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1969 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1970 } 1971 zExp = aExp + bExp - 0x7F; 1972 aSig = ( aSig | 0x00800000 )<<7; 1973 bSig = ( bSig | 0x00800000 )<<8; 1974 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 1975 zSig = zSig64; 1976 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 1977 zSig <<= 1; 1978 --zExp; 1979 } 1980 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1981 1982 } 1983 1984 /*---------------------------------------------------------------------------- 1985 | Returns the result of dividing the single-precision floating-point value `a' 1986 | by the corresponding value `b'. The operation is performed according to the 1987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1988 *----------------------------------------------------------------------------*/ 1989 1990 float32 float32_div( float32 a, float32 b STATUS_PARAM ) 1991 { 1992 flag aSign, bSign, zSign; 1993 int16 aExp, bExp, zExp; 1994 uint32_t aSig, bSig, zSig; 1995 a = float32_squash_input_denormal(a STATUS_VAR); 1996 b = float32_squash_input_denormal(b STATUS_VAR); 1997 1998 aSig = extractFloat32Frac( a ); 1999 aExp = extractFloat32Exp( a ); 2000 aSign = extractFloat32Sign( a ); 2001 bSig = extractFloat32Frac( b ); 2002 bExp = extractFloat32Exp( b ); 2003 bSign = extractFloat32Sign( b ); 2004 zSign = aSign ^ bSign; 2005 if ( aExp == 0xFF ) { 2006 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2007 if ( bExp == 0xFF ) { 2008 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2009 float_raise( float_flag_invalid STATUS_VAR); 2010 return float32_default_nan; 2011 } 2012 return packFloat32( zSign, 0xFF, 0 ); 2013 } 2014 if ( bExp == 0xFF ) { 2015 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2016 return packFloat32( zSign, 0, 0 ); 2017 } 2018 if ( bExp == 0 ) { 2019 if ( bSig == 0 ) { 2020 if ( ( aExp | aSig ) == 0 ) { 2021 float_raise( float_flag_invalid STATUS_VAR); 2022 return float32_default_nan; 2023 } 2024 float_raise( float_flag_divbyzero STATUS_VAR); 2025 return packFloat32( zSign, 0xFF, 0 ); 2026 } 2027 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2028 } 2029 if ( aExp == 0 ) { 2030 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2031 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2032 } 2033 zExp = aExp - bExp + 0x7D; 2034 aSig = ( aSig | 0x00800000 )<<7; 2035 bSig = ( bSig | 0x00800000 )<<8; 2036 if ( bSig <= ( aSig + aSig ) ) { 2037 aSig >>= 1; 2038 ++zExp; 2039 } 2040 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2041 if ( ( zSig & 0x3F ) == 0 ) { 2042 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2043 } 2044 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 2045 2046 } 2047 2048 /*---------------------------------------------------------------------------- 2049 | Returns the remainder of the single-precision floating-point value `a' 2050 | with respect to the corresponding value `b'. The operation is performed 2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2052 *----------------------------------------------------------------------------*/ 2053 2054 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 2055 { 2056 flag aSign, zSign; 2057 int16 aExp, bExp, expDiff; 2058 uint32_t aSig, bSig; 2059 uint32_t q; 2060 uint64_t aSig64, bSig64, q64; 2061 uint32_t alternateASig; 2062 int32_t sigMean; 2063 a = float32_squash_input_denormal(a STATUS_VAR); 2064 b = float32_squash_input_denormal(b STATUS_VAR); 2065 2066 aSig = extractFloat32Frac( a ); 2067 aExp = extractFloat32Exp( a ); 2068 aSign = extractFloat32Sign( a ); 2069 bSig = extractFloat32Frac( b ); 2070 bExp = extractFloat32Exp( b ); 2071 if ( aExp == 0xFF ) { 2072 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2073 return propagateFloat32NaN( a, b STATUS_VAR ); 2074 } 2075 float_raise( float_flag_invalid STATUS_VAR); 2076 return float32_default_nan; 2077 } 2078 if ( bExp == 0xFF ) { 2079 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2080 return a; 2081 } 2082 if ( bExp == 0 ) { 2083 if ( bSig == 0 ) { 2084 float_raise( float_flag_invalid STATUS_VAR); 2085 return float32_default_nan; 2086 } 2087 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2088 } 2089 if ( aExp == 0 ) { 2090 if ( aSig == 0 ) return a; 2091 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2092 } 2093 expDiff = aExp - bExp; 2094 aSig |= 0x00800000; 2095 bSig |= 0x00800000; 2096 if ( expDiff < 32 ) { 2097 aSig <<= 8; 2098 bSig <<= 8; 2099 if ( expDiff < 0 ) { 2100 if ( expDiff < -1 ) return a; 2101 aSig >>= 1; 2102 } 2103 q = ( bSig <= aSig ); 2104 if ( q ) aSig -= bSig; 2105 if ( 0 < expDiff ) { 2106 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2107 q >>= 32 - expDiff; 2108 bSig >>= 2; 2109 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2110 } 2111 else { 2112 aSig >>= 2; 2113 bSig >>= 2; 2114 } 2115 } 2116 else { 2117 if ( bSig <= aSig ) aSig -= bSig; 2118 aSig64 = ( (uint64_t) aSig )<<40; 2119 bSig64 = ( (uint64_t) bSig )<<40; 2120 expDiff -= 64; 2121 while ( 0 < expDiff ) { 2122 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2123 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2124 aSig64 = - ( ( bSig * q64 )<<38 ); 2125 expDiff -= 62; 2126 } 2127 expDiff += 64; 2128 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2129 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2130 q = q64>>( 64 - expDiff ); 2131 bSig <<= 6; 2132 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2133 } 2134 do { 2135 alternateASig = aSig; 2136 ++q; 2137 aSig -= bSig; 2138 } while ( 0 <= (int32_t) aSig ); 2139 sigMean = aSig + alternateASig; 2140 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2141 aSig = alternateASig; 2142 } 2143 zSign = ( (int32_t) aSig < 0 ); 2144 if ( zSign ) aSig = - aSig; 2145 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2146 2147 } 2148 2149 /*---------------------------------------------------------------------------- 2150 | Returns the square root of the single-precision floating-point value `a'. 2151 | The operation is performed according to the IEC/IEEE Standard for Binary 2152 | Floating-Point Arithmetic. 2153 *----------------------------------------------------------------------------*/ 2154 2155 float32 float32_sqrt( float32 a STATUS_PARAM ) 2156 { 2157 flag aSign; 2158 int16 aExp, zExp; 2159 uint32_t aSig, zSig; 2160 uint64_t rem, term; 2161 a = float32_squash_input_denormal(a STATUS_VAR); 2162 2163 aSig = extractFloat32Frac( a ); 2164 aExp = extractFloat32Exp( a ); 2165 aSign = extractFloat32Sign( a ); 2166 if ( aExp == 0xFF ) { 2167 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2168 if ( ! aSign ) return a; 2169 float_raise( float_flag_invalid STATUS_VAR); 2170 return float32_default_nan; 2171 } 2172 if ( aSign ) { 2173 if ( ( aExp | aSig ) == 0 ) return a; 2174 float_raise( float_flag_invalid STATUS_VAR); 2175 return float32_default_nan; 2176 } 2177 if ( aExp == 0 ) { 2178 if ( aSig == 0 ) return float32_zero; 2179 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2180 } 2181 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2182 aSig = ( aSig | 0x00800000 )<<8; 2183 zSig = estimateSqrt32( aExp, aSig ) + 2; 2184 if ( ( zSig & 0x7F ) <= 5 ) { 2185 if ( zSig < 2 ) { 2186 zSig = 0x7FFFFFFF; 2187 goto roundAndPack; 2188 } 2189 aSig >>= aExp & 1; 2190 term = ( (uint64_t) zSig ) * zSig; 2191 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2192 while ( (int64_t) rem < 0 ) { 2193 --zSig; 2194 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2195 } 2196 zSig |= ( rem != 0 ); 2197 } 2198 shift32RightJamming( zSig, 1, &zSig ); 2199 roundAndPack: 2200 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2201 2202 } 2203 2204 /*---------------------------------------------------------------------------- 2205 | Returns the binary exponential of the single-precision floating-point value 2206 | `a'. The operation is performed according to the IEC/IEEE Standard for 2207 | Binary Floating-Point Arithmetic. 2208 | 2209 | Uses the following identities: 2210 | 2211 | 1. ------------------------------------------------------------------------- 2212 | x x*ln(2) 2213 | 2 = e 2214 | 2215 | 2. ------------------------------------------------------------------------- 2216 | 2 3 4 5 n 2217 | x x x x x x x 2218 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2219 | 1! 2! 3! 4! 5! n! 2220 *----------------------------------------------------------------------------*/ 2221 2222 static const float64 float32_exp2_coefficients[15] = 2223 { 2224 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2225 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2226 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2227 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2228 const_float64( 0x3f81111111111111ll ), /* 5 */ 2229 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2230 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2231 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2232 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2233 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2234 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2235 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2236 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2237 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2238 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2239 }; 2240 2241 float32 float32_exp2( float32 a STATUS_PARAM ) 2242 { 2243 flag aSign; 2244 int16 aExp; 2245 uint32_t aSig; 2246 float64 r, x, xn; 2247 int i; 2248 a = float32_squash_input_denormal(a STATUS_VAR); 2249 2250 aSig = extractFloat32Frac( a ); 2251 aExp = extractFloat32Exp( a ); 2252 aSign = extractFloat32Sign( a ); 2253 2254 if ( aExp == 0xFF) { 2255 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2256 return (aSign) ? float32_zero : a; 2257 } 2258 if (aExp == 0) { 2259 if (aSig == 0) return float32_one; 2260 } 2261 2262 float_raise( float_flag_inexact STATUS_VAR); 2263 2264 /* ******************************* */ 2265 /* using float64 for approximation */ 2266 /* ******************************* */ 2267 x = float32_to_float64(a STATUS_VAR); 2268 x = float64_mul(x, float64_ln2 STATUS_VAR); 2269 2270 xn = x; 2271 r = float64_one; 2272 for (i = 0 ; i < 15 ; i++) { 2273 float64 f; 2274 2275 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); 2276 r = float64_add(r, f STATUS_VAR); 2277 2278 xn = float64_mul(xn, x STATUS_VAR); 2279 } 2280 2281 return float64_to_float32(r, status); 2282 } 2283 2284 /*---------------------------------------------------------------------------- 2285 | Returns the binary log of the single-precision floating-point value `a'. 2286 | The operation is performed according to the IEC/IEEE Standard for Binary 2287 | Floating-Point Arithmetic. 2288 *----------------------------------------------------------------------------*/ 2289 float32 float32_log2( float32 a STATUS_PARAM ) 2290 { 2291 flag aSign, zSign; 2292 int16 aExp; 2293 uint32_t aSig, zSig, i; 2294 2295 a = float32_squash_input_denormal(a STATUS_VAR); 2296 aSig = extractFloat32Frac( a ); 2297 aExp = extractFloat32Exp( a ); 2298 aSign = extractFloat32Sign( a ); 2299 2300 if ( aExp == 0 ) { 2301 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2302 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2303 } 2304 if ( aSign ) { 2305 float_raise( float_flag_invalid STATUS_VAR); 2306 return float32_default_nan; 2307 } 2308 if ( aExp == 0xFF ) { 2309 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2310 return a; 2311 } 2312 2313 aExp -= 0x7F; 2314 aSig |= 0x00800000; 2315 zSign = aExp < 0; 2316 zSig = aExp << 23; 2317 2318 for (i = 1 << 22; i > 0; i >>= 1) { 2319 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2320 if ( aSig & 0x01000000 ) { 2321 aSig >>= 1; 2322 zSig |= i; 2323 } 2324 } 2325 2326 if ( zSign ) 2327 zSig = -zSig; 2328 2329 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); 2330 } 2331 2332 /*---------------------------------------------------------------------------- 2333 | Returns 1 if the single-precision floating-point value `a' is equal to 2334 | the corresponding value `b', and 0 otherwise. The invalid exception is 2335 | raised if either operand is a NaN. Otherwise, the comparison is performed 2336 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2337 *----------------------------------------------------------------------------*/ 2338 2339 int float32_eq( float32 a, float32 b STATUS_PARAM ) 2340 { 2341 uint32_t av, bv; 2342 a = float32_squash_input_denormal(a STATUS_VAR); 2343 b = float32_squash_input_denormal(b STATUS_VAR); 2344 2345 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2346 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2347 ) { 2348 float_raise( float_flag_invalid STATUS_VAR); 2349 return 0; 2350 } 2351 av = float32_val(a); 2352 bv = float32_val(b); 2353 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2354 } 2355 2356 /*---------------------------------------------------------------------------- 2357 | Returns 1 if the single-precision floating-point value `a' is less than 2358 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2359 | exception is raised if either operand is a NaN. The comparison is performed 2360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2361 *----------------------------------------------------------------------------*/ 2362 2363 int float32_le( float32 a, float32 b STATUS_PARAM ) 2364 { 2365 flag aSign, bSign; 2366 uint32_t av, bv; 2367 a = float32_squash_input_denormal(a STATUS_VAR); 2368 b = float32_squash_input_denormal(b STATUS_VAR); 2369 2370 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2371 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2372 ) { 2373 float_raise( float_flag_invalid STATUS_VAR); 2374 return 0; 2375 } 2376 aSign = extractFloat32Sign( a ); 2377 bSign = extractFloat32Sign( b ); 2378 av = float32_val(a); 2379 bv = float32_val(b); 2380 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2381 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2382 2383 } 2384 2385 /*---------------------------------------------------------------------------- 2386 | Returns 1 if the single-precision floating-point value `a' is less than 2387 | the corresponding value `b', and 0 otherwise. The invalid exception is 2388 | raised if either operand is a NaN. The comparison is performed according 2389 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2390 *----------------------------------------------------------------------------*/ 2391 2392 int float32_lt( float32 a, float32 b STATUS_PARAM ) 2393 { 2394 flag aSign, bSign; 2395 uint32_t av, bv; 2396 a = float32_squash_input_denormal(a STATUS_VAR); 2397 b = float32_squash_input_denormal(b STATUS_VAR); 2398 2399 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2400 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2401 ) { 2402 float_raise( float_flag_invalid STATUS_VAR); 2403 return 0; 2404 } 2405 aSign = extractFloat32Sign( a ); 2406 bSign = extractFloat32Sign( b ); 2407 av = float32_val(a); 2408 bv = float32_val(b); 2409 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2410 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2411 2412 } 2413 2414 /*---------------------------------------------------------------------------- 2415 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2416 | be compared, and 0 otherwise. The invalid exception is raised if either 2417 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2418 | Standard for Binary Floating-Point Arithmetic. 2419 *----------------------------------------------------------------------------*/ 2420 2421 int float32_unordered( float32 a, float32 b STATUS_PARAM ) 2422 { 2423 a = float32_squash_input_denormal(a STATUS_VAR); 2424 b = float32_squash_input_denormal(b STATUS_VAR); 2425 2426 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2427 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2428 ) { 2429 float_raise( float_flag_invalid STATUS_VAR); 2430 return 1; 2431 } 2432 return 0; 2433 } 2434 2435 /*---------------------------------------------------------------------------- 2436 | Returns 1 if the single-precision floating-point value `a' is equal to 2437 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2438 | exception. The comparison is performed according to the IEC/IEEE Standard 2439 | for Binary Floating-Point Arithmetic. 2440 *----------------------------------------------------------------------------*/ 2441 2442 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM ) 2443 { 2444 a = float32_squash_input_denormal(a STATUS_VAR); 2445 b = float32_squash_input_denormal(b STATUS_VAR); 2446 2447 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2448 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2449 ) { 2450 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2451 float_raise( float_flag_invalid STATUS_VAR); 2452 } 2453 return 0; 2454 } 2455 return ( float32_val(a) == float32_val(b) ) || 2456 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2457 } 2458 2459 /*---------------------------------------------------------------------------- 2460 | Returns 1 if the single-precision floating-point value `a' is less than or 2461 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2462 | cause an exception. Otherwise, the comparison is performed according to the 2463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2464 *----------------------------------------------------------------------------*/ 2465 2466 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2467 { 2468 flag aSign, bSign; 2469 uint32_t av, bv; 2470 a = float32_squash_input_denormal(a STATUS_VAR); 2471 b = float32_squash_input_denormal(b STATUS_VAR); 2472 2473 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2474 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2475 ) { 2476 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2477 float_raise( float_flag_invalid STATUS_VAR); 2478 } 2479 return 0; 2480 } 2481 aSign = extractFloat32Sign( a ); 2482 bSign = extractFloat32Sign( b ); 2483 av = float32_val(a); 2484 bv = float32_val(b); 2485 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2486 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2487 2488 } 2489 2490 /*---------------------------------------------------------------------------- 2491 | Returns 1 if the single-precision floating-point value `a' is less than 2492 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2493 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2494 | Standard for Binary Floating-Point Arithmetic. 2495 *----------------------------------------------------------------------------*/ 2496 2497 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2498 { 2499 flag aSign, bSign; 2500 uint32_t av, bv; 2501 a = float32_squash_input_denormal(a STATUS_VAR); 2502 b = float32_squash_input_denormal(b STATUS_VAR); 2503 2504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2506 ) { 2507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2508 float_raise( float_flag_invalid STATUS_VAR); 2509 } 2510 return 0; 2511 } 2512 aSign = extractFloat32Sign( a ); 2513 bSign = extractFloat32Sign( b ); 2514 av = float32_val(a); 2515 bv = float32_val(b); 2516 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2517 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2518 2519 } 2520 2521 /*---------------------------------------------------------------------------- 2522 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2523 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 2524 | comparison is performed according to the IEC/IEEE Standard for Binary 2525 | Floating-Point Arithmetic. 2526 *----------------------------------------------------------------------------*/ 2527 2528 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM ) 2529 { 2530 a = float32_squash_input_denormal(a STATUS_VAR); 2531 b = float32_squash_input_denormal(b STATUS_VAR); 2532 2533 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2534 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2535 ) { 2536 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2537 float_raise( float_flag_invalid STATUS_VAR); 2538 } 2539 return 1; 2540 } 2541 return 0; 2542 } 2543 2544 /*---------------------------------------------------------------------------- 2545 | Returns the result of converting the double-precision floating-point value 2546 | `a' to the 32-bit two's complement integer format. The conversion is 2547 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2548 | Arithmetic---which means in particular that the conversion is rounded 2549 | according to the current rounding mode. If `a' is a NaN, the largest 2550 | positive integer is returned. Otherwise, if the conversion overflows, the 2551 | largest integer with the same sign as `a' is returned. 2552 *----------------------------------------------------------------------------*/ 2553 2554 int32 float64_to_int32( float64 a STATUS_PARAM ) 2555 { 2556 flag aSign; 2557 int16 aExp, shiftCount; 2558 uint64_t aSig; 2559 a = float64_squash_input_denormal(a STATUS_VAR); 2560 2561 aSig = extractFloat64Frac( a ); 2562 aExp = extractFloat64Exp( a ); 2563 aSign = extractFloat64Sign( a ); 2564 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2565 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2566 shiftCount = 0x42C - aExp; 2567 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2568 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2569 2570 } 2571 2572 /*---------------------------------------------------------------------------- 2573 | Returns the result of converting the double-precision floating-point value 2574 | `a' to the 32-bit two's complement integer format. The conversion is 2575 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2576 | Arithmetic, except that the conversion is always rounded toward zero. 2577 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2578 | the conversion overflows, the largest integer with the same sign as `a' is 2579 | returned. 2580 *----------------------------------------------------------------------------*/ 2581 2582 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2583 { 2584 flag aSign; 2585 int16 aExp, shiftCount; 2586 uint64_t aSig, savedASig; 2587 int32 z; 2588 a = float64_squash_input_denormal(a STATUS_VAR); 2589 2590 aSig = extractFloat64Frac( a ); 2591 aExp = extractFloat64Exp( a ); 2592 aSign = extractFloat64Sign( a ); 2593 if ( 0x41E < aExp ) { 2594 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2595 goto invalid; 2596 } 2597 else if ( aExp < 0x3FF ) { 2598 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2599 return 0; 2600 } 2601 aSig |= LIT64( 0x0010000000000000 ); 2602 shiftCount = 0x433 - aExp; 2603 savedASig = aSig; 2604 aSig >>= shiftCount; 2605 z = aSig; 2606 if ( aSign ) z = - z; 2607 if ( ( z < 0 ) ^ aSign ) { 2608 invalid: 2609 float_raise( float_flag_invalid STATUS_VAR); 2610 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2611 } 2612 if ( ( aSig<<shiftCount ) != savedASig ) { 2613 STATUS(float_exception_flags) |= float_flag_inexact; 2614 } 2615 return z; 2616 2617 } 2618 2619 /*---------------------------------------------------------------------------- 2620 | Returns the result of converting the double-precision floating-point value 2621 | `a' to the 16-bit two's complement integer format. The conversion is 2622 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2623 | Arithmetic, except that the conversion is always rounded toward zero. 2624 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2625 | the conversion overflows, the largest integer with the same sign as `a' is 2626 | returned. 2627 *----------------------------------------------------------------------------*/ 2628 2629 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM ) 2630 { 2631 flag aSign; 2632 int16 aExp, shiftCount; 2633 uint64_t aSig, savedASig; 2634 int32 z; 2635 2636 aSig = extractFloat64Frac( a ); 2637 aExp = extractFloat64Exp( a ); 2638 aSign = extractFloat64Sign( a ); 2639 if ( 0x40E < aExp ) { 2640 if ( ( aExp == 0x7FF ) && aSig ) { 2641 aSign = 0; 2642 } 2643 goto invalid; 2644 } 2645 else if ( aExp < 0x3FF ) { 2646 if ( aExp || aSig ) { 2647 STATUS(float_exception_flags) |= float_flag_inexact; 2648 } 2649 return 0; 2650 } 2651 aSig |= LIT64( 0x0010000000000000 ); 2652 shiftCount = 0x433 - aExp; 2653 savedASig = aSig; 2654 aSig >>= shiftCount; 2655 z = aSig; 2656 if ( aSign ) { 2657 z = - z; 2658 } 2659 if ( ( (int16_t)z < 0 ) ^ aSign ) { 2660 invalid: 2661 float_raise( float_flag_invalid STATUS_VAR); 2662 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 2663 } 2664 if ( ( aSig<<shiftCount ) != savedASig ) { 2665 STATUS(float_exception_flags) |= float_flag_inexact; 2666 } 2667 return z; 2668 } 2669 2670 /*---------------------------------------------------------------------------- 2671 | Returns the result of converting the double-precision floating-point value 2672 | `a' to the 64-bit two's complement integer format. The conversion is 2673 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2674 | Arithmetic---which means in particular that the conversion is rounded 2675 | according to the current rounding mode. If `a' is a NaN, the largest 2676 | positive integer is returned. Otherwise, if the conversion overflows, the 2677 | largest integer with the same sign as `a' is returned. 2678 *----------------------------------------------------------------------------*/ 2679 2680 int64 float64_to_int64( float64 a STATUS_PARAM ) 2681 { 2682 flag aSign; 2683 int16 aExp, shiftCount; 2684 uint64_t aSig, aSigExtra; 2685 a = float64_squash_input_denormal(a STATUS_VAR); 2686 2687 aSig = extractFloat64Frac( a ); 2688 aExp = extractFloat64Exp( a ); 2689 aSign = extractFloat64Sign( a ); 2690 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2691 shiftCount = 0x433 - aExp; 2692 if ( shiftCount <= 0 ) { 2693 if ( 0x43E < aExp ) { 2694 float_raise( float_flag_invalid STATUS_VAR); 2695 if ( ! aSign 2696 || ( ( aExp == 0x7FF ) 2697 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2698 ) { 2699 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2700 } 2701 return (int64_t) LIT64( 0x8000000000000000 ); 2702 } 2703 aSigExtra = 0; 2704 aSig <<= - shiftCount; 2705 } 2706 else { 2707 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2708 } 2709 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 2710 2711 } 2712 2713 /*---------------------------------------------------------------------------- 2714 | Returns the result of converting the double-precision floating-point value 2715 | `a' to the 64-bit two's complement integer format. The conversion is 2716 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2717 | Arithmetic, except that the conversion is always rounded toward zero. 2718 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2719 | the conversion overflows, the largest integer with the same sign as `a' is 2720 | returned. 2721 *----------------------------------------------------------------------------*/ 2722 2723 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 2724 { 2725 flag aSign; 2726 int16 aExp, shiftCount; 2727 uint64_t aSig; 2728 int64 z; 2729 a = float64_squash_input_denormal(a STATUS_VAR); 2730 2731 aSig = extractFloat64Frac( a ); 2732 aExp = extractFloat64Exp( a ); 2733 aSign = extractFloat64Sign( a ); 2734 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2735 shiftCount = aExp - 0x433; 2736 if ( 0 <= shiftCount ) { 2737 if ( 0x43E <= aExp ) { 2738 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 2739 float_raise( float_flag_invalid STATUS_VAR); 2740 if ( ! aSign 2741 || ( ( aExp == 0x7FF ) 2742 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2743 ) { 2744 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2745 } 2746 } 2747 return (int64_t) LIT64( 0x8000000000000000 ); 2748 } 2749 z = aSig<<shiftCount; 2750 } 2751 else { 2752 if ( aExp < 0x3FE ) { 2753 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2754 return 0; 2755 } 2756 z = aSig>>( - shiftCount ); 2757 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 2758 STATUS(float_exception_flags) |= float_flag_inexact; 2759 } 2760 } 2761 if ( aSign ) z = - z; 2762 return z; 2763 2764 } 2765 2766 /*---------------------------------------------------------------------------- 2767 | Returns the result of converting the double-precision floating-point value 2768 | `a' to the single-precision floating-point format. The conversion is 2769 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2770 | Arithmetic. 2771 *----------------------------------------------------------------------------*/ 2772 2773 float32 float64_to_float32( float64 a STATUS_PARAM ) 2774 { 2775 flag aSign; 2776 int16 aExp; 2777 uint64_t aSig; 2778 uint32_t zSig; 2779 a = float64_squash_input_denormal(a STATUS_VAR); 2780 2781 aSig = extractFloat64Frac( a ); 2782 aExp = extractFloat64Exp( a ); 2783 aSign = extractFloat64Sign( a ); 2784 if ( aExp == 0x7FF ) { 2785 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2786 return packFloat32( aSign, 0xFF, 0 ); 2787 } 2788 shift64RightJamming( aSig, 22, &aSig ); 2789 zSig = aSig; 2790 if ( aExp || zSig ) { 2791 zSig |= 0x40000000; 2792 aExp -= 0x381; 2793 } 2794 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 2795 2796 } 2797 2798 2799 /*---------------------------------------------------------------------------- 2800 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2801 | half-precision floating-point value, returning the result. After being 2802 | shifted into the proper positions, the three fields are simply added 2803 | together to form the result. This means that any integer portion of `zSig' 2804 | will be added into the exponent. Since a properly normalized significand 2805 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2806 | than the desired result exponent whenever `zSig' is a complete, normalized 2807 | significand. 2808 *----------------------------------------------------------------------------*/ 2809 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig) 2810 { 2811 return make_float16( 2812 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 2813 } 2814 2815 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 2816 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 2817 2818 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM) 2819 { 2820 flag aSign; 2821 int16 aExp; 2822 uint32_t aSig; 2823 2824 aSign = extractFloat16Sign(a); 2825 aExp = extractFloat16Exp(a); 2826 aSig = extractFloat16Frac(a); 2827 2828 if (aExp == 0x1f && ieee) { 2829 if (aSig) { 2830 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR); 2831 } 2832 return packFloat32(aSign, 0xff, aSig << 13); 2833 } 2834 if (aExp == 0) { 2835 int8 shiftCount; 2836 2837 if (aSig == 0) { 2838 return packFloat32(aSign, 0, 0); 2839 } 2840 2841 shiftCount = countLeadingZeros32( aSig ) - 21; 2842 aSig = aSig << shiftCount; 2843 aExp = -shiftCount; 2844 } 2845 return packFloat32( aSign, aExp + 0x70, aSig << 13); 2846 } 2847 2848 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM) 2849 { 2850 flag aSign; 2851 int16 aExp; 2852 uint32_t aSig; 2853 uint32_t mask; 2854 uint32_t increment; 2855 int8 roundingMode; 2856 a = float32_squash_input_denormal(a STATUS_VAR); 2857 2858 aSig = extractFloat32Frac( a ); 2859 aExp = extractFloat32Exp( a ); 2860 aSign = extractFloat32Sign( a ); 2861 if ( aExp == 0xFF ) { 2862 if (aSig) { 2863 /* Input is a NaN */ 2864 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2865 if (!ieee) { 2866 return packFloat16(aSign, 0, 0); 2867 } 2868 return r; 2869 } 2870 /* Infinity */ 2871 if (!ieee) { 2872 float_raise(float_flag_invalid STATUS_VAR); 2873 return packFloat16(aSign, 0x1f, 0x3ff); 2874 } 2875 return packFloat16(aSign, 0x1f, 0); 2876 } 2877 if (aExp == 0 && aSig == 0) { 2878 return packFloat16(aSign, 0, 0); 2879 } 2880 /* Decimal point between bits 22 and 23. */ 2881 aSig |= 0x00800000; 2882 aExp -= 0x7f; 2883 if (aExp < -14) { 2884 mask = 0x00ffffff; 2885 if (aExp >= -24) { 2886 mask >>= 25 + aExp; 2887 } 2888 } else { 2889 mask = 0x00001fff; 2890 } 2891 if (aSig & mask) { 2892 float_raise( float_flag_underflow STATUS_VAR ); 2893 roundingMode = STATUS(float_rounding_mode); 2894 switch (roundingMode) { 2895 case float_round_nearest_even: 2896 increment = (mask + 1) >> 1; 2897 if ((aSig & mask) == increment) { 2898 increment = aSig & (increment << 1); 2899 } 2900 break; 2901 case float_round_up: 2902 increment = aSign ? 0 : mask; 2903 break; 2904 case float_round_down: 2905 increment = aSign ? mask : 0; 2906 break; 2907 default: /* round_to_zero */ 2908 increment = 0; 2909 break; 2910 } 2911 aSig += increment; 2912 if (aSig >= 0x01000000) { 2913 aSig >>= 1; 2914 aExp++; 2915 } 2916 } else if (aExp < -14 2917 && STATUS(float_detect_tininess) == float_tininess_before_rounding) { 2918 float_raise( float_flag_underflow STATUS_VAR); 2919 } 2920 2921 if (ieee) { 2922 if (aExp > 15) { 2923 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 2924 return packFloat16(aSign, 0x1f, 0); 2925 } 2926 } else { 2927 if (aExp > 16) { 2928 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR); 2929 return packFloat16(aSign, 0x1f, 0x3ff); 2930 } 2931 } 2932 if (aExp < -24) { 2933 return packFloat16(aSign, 0, 0); 2934 } 2935 if (aExp < -14) { 2936 aSig >>= -14 - aExp; 2937 aExp = -14; 2938 } 2939 return packFloat16(aSign, aExp + 14, aSig >> 13); 2940 } 2941 2942 #ifdef FLOATX80 2943 2944 /*---------------------------------------------------------------------------- 2945 | Returns the result of converting the double-precision floating-point value 2946 | `a' to the extended double-precision floating-point format. The conversion 2947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2948 | Arithmetic. 2949 *----------------------------------------------------------------------------*/ 2950 2951 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 2952 { 2953 flag aSign; 2954 int16 aExp; 2955 uint64_t aSig; 2956 2957 a = float64_squash_input_denormal(a STATUS_VAR); 2958 aSig = extractFloat64Frac( a ); 2959 aExp = extractFloat64Exp( a ); 2960 aSign = extractFloat64Sign( a ); 2961 if ( aExp == 0x7FF ) { 2962 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2963 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2964 } 2965 if ( aExp == 0 ) { 2966 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2967 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2968 } 2969 return 2970 packFloatx80( 2971 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2972 2973 } 2974 2975 #endif 2976 2977 #ifdef FLOAT128 2978 2979 /*---------------------------------------------------------------------------- 2980 | Returns the result of converting the double-precision floating-point value 2981 | `a' to the quadruple-precision floating-point format. The conversion is 2982 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2983 | Arithmetic. 2984 *----------------------------------------------------------------------------*/ 2985 2986 float128 float64_to_float128( float64 a STATUS_PARAM ) 2987 { 2988 flag aSign; 2989 int16 aExp; 2990 uint64_t aSig, zSig0, zSig1; 2991 2992 a = float64_squash_input_denormal(a STATUS_VAR); 2993 aSig = extractFloat64Frac( a ); 2994 aExp = extractFloat64Exp( a ); 2995 aSign = extractFloat64Sign( a ); 2996 if ( aExp == 0x7FF ) { 2997 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2998 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2999 } 3000 if ( aExp == 0 ) { 3001 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3002 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3003 --aExp; 3004 } 3005 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3006 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3007 3008 } 3009 3010 #endif 3011 3012 /*---------------------------------------------------------------------------- 3013 | Rounds the double-precision floating-point value `a' to an integer, and 3014 | returns the result as a double-precision floating-point value. The 3015 | operation is performed according to the IEC/IEEE Standard for Binary 3016 | Floating-Point Arithmetic. 3017 *----------------------------------------------------------------------------*/ 3018 3019 float64 float64_round_to_int( float64 a STATUS_PARAM ) 3020 { 3021 flag aSign; 3022 int16 aExp; 3023 uint64_t lastBitMask, roundBitsMask; 3024 int8 roundingMode; 3025 uint64_t z; 3026 a = float64_squash_input_denormal(a STATUS_VAR); 3027 3028 aExp = extractFloat64Exp( a ); 3029 if ( 0x433 <= aExp ) { 3030 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3031 return propagateFloat64NaN( a, a STATUS_VAR ); 3032 } 3033 return a; 3034 } 3035 if ( aExp < 0x3FF ) { 3036 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3037 STATUS(float_exception_flags) |= float_flag_inexact; 3038 aSign = extractFloat64Sign( a ); 3039 switch ( STATUS(float_rounding_mode) ) { 3040 case float_round_nearest_even: 3041 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3042 return packFloat64( aSign, 0x3FF, 0 ); 3043 } 3044 break; 3045 case float_round_down: 3046 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3047 case float_round_up: 3048 return make_float64( 3049 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3050 } 3051 return packFloat64( aSign, 0, 0 ); 3052 } 3053 lastBitMask = 1; 3054 lastBitMask <<= 0x433 - aExp; 3055 roundBitsMask = lastBitMask - 1; 3056 z = float64_val(a); 3057 roundingMode = STATUS(float_rounding_mode); 3058 if ( roundingMode == float_round_nearest_even ) { 3059 z += lastBitMask>>1; 3060 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 3061 } 3062 else if ( roundingMode != float_round_to_zero ) { 3063 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { 3064 z += roundBitsMask; 3065 } 3066 } 3067 z &= ~ roundBitsMask; 3068 if ( z != float64_val(a) ) 3069 STATUS(float_exception_flags) |= float_flag_inexact; 3070 return make_float64(z); 3071 3072 } 3073 3074 float64 float64_trunc_to_int( float64 a STATUS_PARAM) 3075 { 3076 int oldmode; 3077 float64 res; 3078 oldmode = STATUS(float_rounding_mode); 3079 STATUS(float_rounding_mode) = float_round_to_zero; 3080 res = float64_round_to_int(a STATUS_VAR); 3081 STATUS(float_rounding_mode) = oldmode; 3082 return res; 3083 } 3084 3085 /*---------------------------------------------------------------------------- 3086 | Returns the result of adding the absolute values of the double-precision 3087 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3088 | before being returned. `zSign' is ignored if the result is a NaN. 3089 | The addition is performed according to the IEC/IEEE Standard for Binary 3090 | Floating-Point Arithmetic. 3091 *----------------------------------------------------------------------------*/ 3092 3093 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3094 { 3095 int16 aExp, bExp, zExp; 3096 uint64_t aSig, bSig, zSig; 3097 int16 expDiff; 3098 3099 aSig = extractFloat64Frac( a ); 3100 aExp = extractFloat64Exp( a ); 3101 bSig = extractFloat64Frac( b ); 3102 bExp = extractFloat64Exp( b ); 3103 expDiff = aExp - bExp; 3104 aSig <<= 9; 3105 bSig <<= 9; 3106 if ( 0 < expDiff ) { 3107 if ( aExp == 0x7FF ) { 3108 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3109 return a; 3110 } 3111 if ( bExp == 0 ) { 3112 --expDiff; 3113 } 3114 else { 3115 bSig |= LIT64( 0x2000000000000000 ); 3116 } 3117 shift64RightJamming( bSig, expDiff, &bSig ); 3118 zExp = aExp; 3119 } 3120 else if ( expDiff < 0 ) { 3121 if ( bExp == 0x7FF ) { 3122 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3123 return packFloat64( zSign, 0x7FF, 0 ); 3124 } 3125 if ( aExp == 0 ) { 3126 ++expDiff; 3127 } 3128 else { 3129 aSig |= LIT64( 0x2000000000000000 ); 3130 } 3131 shift64RightJamming( aSig, - expDiff, &aSig ); 3132 zExp = bExp; 3133 } 3134 else { 3135 if ( aExp == 0x7FF ) { 3136 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3137 return a; 3138 } 3139 if ( aExp == 0 ) { 3140 if (STATUS(flush_to_zero)) { 3141 if (aSig | bSig) { 3142 float_raise(float_flag_output_denormal STATUS_VAR); 3143 } 3144 return packFloat64(zSign, 0, 0); 3145 } 3146 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3147 } 3148 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3149 zExp = aExp; 3150 goto roundAndPack; 3151 } 3152 aSig |= LIT64( 0x2000000000000000 ); 3153 zSig = ( aSig + bSig )<<1; 3154 --zExp; 3155 if ( (int64_t) zSig < 0 ) { 3156 zSig = aSig + bSig; 3157 ++zExp; 3158 } 3159 roundAndPack: 3160 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3161 3162 } 3163 3164 /*---------------------------------------------------------------------------- 3165 | Returns the result of subtracting the absolute values of the double- 3166 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3167 | difference is negated before being returned. `zSign' is ignored if the 3168 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3169 | Standard for Binary Floating-Point Arithmetic. 3170 *----------------------------------------------------------------------------*/ 3171 3172 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3173 { 3174 int16 aExp, bExp, zExp; 3175 uint64_t aSig, bSig, zSig; 3176 int16 expDiff; 3177 3178 aSig = extractFloat64Frac( a ); 3179 aExp = extractFloat64Exp( a ); 3180 bSig = extractFloat64Frac( b ); 3181 bExp = extractFloat64Exp( b ); 3182 expDiff = aExp - bExp; 3183 aSig <<= 10; 3184 bSig <<= 10; 3185 if ( 0 < expDiff ) goto aExpBigger; 3186 if ( expDiff < 0 ) goto bExpBigger; 3187 if ( aExp == 0x7FF ) { 3188 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3189 float_raise( float_flag_invalid STATUS_VAR); 3190 return float64_default_nan; 3191 } 3192 if ( aExp == 0 ) { 3193 aExp = 1; 3194 bExp = 1; 3195 } 3196 if ( bSig < aSig ) goto aBigger; 3197 if ( aSig < bSig ) goto bBigger; 3198 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3199 bExpBigger: 3200 if ( bExp == 0x7FF ) { 3201 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3202 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3203 } 3204 if ( aExp == 0 ) { 3205 ++expDiff; 3206 } 3207 else { 3208 aSig |= LIT64( 0x4000000000000000 ); 3209 } 3210 shift64RightJamming( aSig, - expDiff, &aSig ); 3211 bSig |= LIT64( 0x4000000000000000 ); 3212 bBigger: 3213 zSig = bSig - aSig; 3214 zExp = bExp; 3215 zSign ^= 1; 3216 goto normalizeRoundAndPack; 3217 aExpBigger: 3218 if ( aExp == 0x7FF ) { 3219 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3220 return a; 3221 } 3222 if ( bExp == 0 ) { 3223 --expDiff; 3224 } 3225 else { 3226 bSig |= LIT64( 0x4000000000000000 ); 3227 } 3228 shift64RightJamming( bSig, expDiff, &bSig ); 3229 aSig |= LIT64( 0x4000000000000000 ); 3230 aBigger: 3231 zSig = aSig - bSig; 3232 zExp = aExp; 3233 normalizeRoundAndPack: 3234 --zExp; 3235 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3236 3237 } 3238 3239 /*---------------------------------------------------------------------------- 3240 | Returns the result of adding the double-precision floating-point values `a' 3241 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3242 | Binary Floating-Point Arithmetic. 3243 *----------------------------------------------------------------------------*/ 3244 3245 float64 float64_add( float64 a, float64 b STATUS_PARAM ) 3246 { 3247 flag aSign, bSign; 3248 a = float64_squash_input_denormal(a STATUS_VAR); 3249 b = float64_squash_input_denormal(b STATUS_VAR); 3250 3251 aSign = extractFloat64Sign( a ); 3252 bSign = extractFloat64Sign( b ); 3253 if ( aSign == bSign ) { 3254 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3255 } 3256 else { 3257 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3258 } 3259 3260 } 3261 3262 /*---------------------------------------------------------------------------- 3263 | Returns the result of subtracting the double-precision floating-point values 3264 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3265 | for Binary Floating-Point Arithmetic. 3266 *----------------------------------------------------------------------------*/ 3267 3268 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 3269 { 3270 flag aSign, bSign; 3271 a = float64_squash_input_denormal(a STATUS_VAR); 3272 b = float64_squash_input_denormal(b STATUS_VAR); 3273 3274 aSign = extractFloat64Sign( a ); 3275 bSign = extractFloat64Sign( b ); 3276 if ( aSign == bSign ) { 3277 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3278 } 3279 else { 3280 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3281 } 3282 3283 } 3284 3285 /*---------------------------------------------------------------------------- 3286 | Returns the result of multiplying the double-precision floating-point values 3287 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3288 | for Binary Floating-Point Arithmetic. 3289 *----------------------------------------------------------------------------*/ 3290 3291 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 3292 { 3293 flag aSign, bSign, zSign; 3294 int16 aExp, bExp, zExp; 3295 uint64_t aSig, bSig, zSig0, zSig1; 3296 3297 a = float64_squash_input_denormal(a STATUS_VAR); 3298 b = float64_squash_input_denormal(b STATUS_VAR); 3299 3300 aSig = extractFloat64Frac( a ); 3301 aExp = extractFloat64Exp( a ); 3302 aSign = extractFloat64Sign( a ); 3303 bSig = extractFloat64Frac( b ); 3304 bExp = extractFloat64Exp( b ); 3305 bSign = extractFloat64Sign( b ); 3306 zSign = aSign ^ bSign; 3307 if ( aExp == 0x7FF ) { 3308 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3309 return propagateFloat64NaN( a, b STATUS_VAR ); 3310 } 3311 if ( ( bExp | bSig ) == 0 ) { 3312 float_raise( float_flag_invalid STATUS_VAR); 3313 return float64_default_nan; 3314 } 3315 return packFloat64( zSign, 0x7FF, 0 ); 3316 } 3317 if ( bExp == 0x7FF ) { 3318 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3319 if ( ( aExp | aSig ) == 0 ) { 3320 float_raise( float_flag_invalid STATUS_VAR); 3321 return float64_default_nan; 3322 } 3323 return packFloat64( zSign, 0x7FF, 0 ); 3324 } 3325 if ( aExp == 0 ) { 3326 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3327 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3328 } 3329 if ( bExp == 0 ) { 3330 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 3331 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3332 } 3333 zExp = aExp + bExp - 0x3FF; 3334 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3335 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3336 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3337 zSig0 |= ( zSig1 != 0 ); 3338 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 3339 zSig0 <<= 1; 3340 --zExp; 3341 } 3342 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 3343 3344 } 3345 3346 /*---------------------------------------------------------------------------- 3347 | Returns the result of dividing the double-precision floating-point value `a' 3348 | by the corresponding value `b'. The operation is performed according to 3349 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3350 *----------------------------------------------------------------------------*/ 3351 3352 float64 float64_div( float64 a, float64 b STATUS_PARAM ) 3353 { 3354 flag aSign, bSign, zSign; 3355 int16 aExp, bExp, zExp; 3356 uint64_t aSig, bSig, zSig; 3357 uint64_t rem0, rem1; 3358 uint64_t term0, term1; 3359 a = float64_squash_input_denormal(a STATUS_VAR); 3360 b = float64_squash_input_denormal(b STATUS_VAR); 3361 3362 aSig = extractFloat64Frac( a ); 3363 aExp = extractFloat64Exp( a ); 3364 aSign = extractFloat64Sign( a ); 3365 bSig = extractFloat64Frac( b ); 3366 bExp = extractFloat64Exp( b ); 3367 bSign = extractFloat64Sign( b ); 3368 zSign = aSign ^ bSign; 3369 if ( aExp == 0x7FF ) { 3370 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3371 if ( bExp == 0x7FF ) { 3372 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3373 float_raise( float_flag_invalid STATUS_VAR); 3374 return float64_default_nan; 3375 } 3376 return packFloat64( zSign, 0x7FF, 0 ); 3377 } 3378 if ( bExp == 0x7FF ) { 3379 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3380 return packFloat64( zSign, 0, 0 ); 3381 } 3382 if ( bExp == 0 ) { 3383 if ( bSig == 0 ) { 3384 if ( ( aExp | aSig ) == 0 ) { 3385 float_raise( float_flag_invalid STATUS_VAR); 3386 return float64_default_nan; 3387 } 3388 float_raise( float_flag_divbyzero STATUS_VAR); 3389 return packFloat64( zSign, 0x7FF, 0 ); 3390 } 3391 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3392 } 3393 if ( aExp == 0 ) { 3394 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3395 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3396 } 3397 zExp = aExp - bExp + 0x3FD; 3398 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3399 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3400 if ( bSig <= ( aSig + aSig ) ) { 3401 aSig >>= 1; 3402 ++zExp; 3403 } 3404 zSig = estimateDiv128To64( aSig, 0, bSig ); 3405 if ( ( zSig & 0x1FF ) <= 2 ) { 3406 mul64To128( bSig, zSig, &term0, &term1 ); 3407 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3408 while ( (int64_t) rem0 < 0 ) { 3409 --zSig; 3410 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3411 } 3412 zSig |= ( rem1 != 0 ); 3413 } 3414 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3415 3416 } 3417 3418 /*---------------------------------------------------------------------------- 3419 | Returns the remainder of the double-precision floating-point value `a' 3420 | with respect to the corresponding value `b'. The operation is performed 3421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3422 *----------------------------------------------------------------------------*/ 3423 3424 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 3425 { 3426 flag aSign, zSign; 3427 int16 aExp, bExp, expDiff; 3428 uint64_t aSig, bSig; 3429 uint64_t q, alternateASig; 3430 int64_t sigMean; 3431 3432 a = float64_squash_input_denormal(a STATUS_VAR); 3433 b = float64_squash_input_denormal(b STATUS_VAR); 3434 aSig = extractFloat64Frac( a ); 3435 aExp = extractFloat64Exp( a ); 3436 aSign = extractFloat64Sign( a ); 3437 bSig = extractFloat64Frac( b ); 3438 bExp = extractFloat64Exp( b ); 3439 if ( aExp == 0x7FF ) { 3440 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3441 return propagateFloat64NaN( a, b STATUS_VAR ); 3442 } 3443 float_raise( float_flag_invalid STATUS_VAR); 3444 return float64_default_nan; 3445 } 3446 if ( bExp == 0x7FF ) { 3447 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3448 return a; 3449 } 3450 if ( bExp == 0 ) { 3451 if ( bSig == 0 ) { 3452 float_raise( float_flag_invalid STATUS_VAR); 3453 return float64_default_nan; 3454 } 3455 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3456 } 3457 if ( aExp == 0 ) { 3458 if ( aSig == 0 ) return a; 3459 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3460 } 3461 expDiff = aExp - bExp; 3462 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3463 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3464 if ( expDiff < 0 ) { 3465 if ( expDiff < -1 ) return a; 3466 aSig >>= 1; 3467 } 3468 q = ( bSig <= aSig ); 3469 if ( q ) aSig -= bSig; 3470 expDiff -= 64; 3471 while ( 0 < expDiff ) { 3472 q = estimateDiv128To64( aSig, 0, bSig ); 3473 q = ( 2 < q ) ? q - 2 : 0; 3474 aSig = - ( ( bSig>>2 ) * q ); 3475 expDiff -= 62; 3476 } 3477 expDiff += 64; 3478 if ( 0 < expDiff ) { 3479 q = estimateDiv128To64( aSig, 0, bSig ); 3480 q = ( 2 < q ) ? q - 2 : 0; 3481 q >>= 64 - expDiff; 3482 bSig >>= 2; 3483 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3484 } 3485 else { 3486 aSig >>= 2; 3487 bSig >>= 2; 3488 } 3489 do { 3490 alternateASig = aSig; 3491 ++q; 3492 aSig -= bSig; 3493 } while ( 0 <= (int64_t) aSig ); 3494 sigMean = aSig + alternateASig; 3495 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3496 aSig = alternateASig; 3497 } 3498 zSign = ( (int64_t) aSig < 0 ); 3499 if ( zSign ) aSig = - aSig; 3500 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 3501 3502 } 3503 3504 /*---------------------------------------------------------------------------- 3505 | Returns the square root of the double-precision floating-point value `a'. 3506 | The operation is performed according to the IEC/IEEE Standard for Binary 3507 | Floating-Point Arithmetic. 3508 *----------------------------------------------------------------------------*/ 3509 3510 float64 float64_sqrt( float64 a STATUS_PARAM ) 3511 { 3512 flag aSign; 3513 int16 aExp, zExp; 3514 uint64_t aSig, zSig, doubleZSig; 3515 uint64_t rem0, rem1, term0, term1; 3516 a = float64_squash_input_denormal(a STATUS_VAR); 3517 3518 aSig = extractFloat64Frac( a ); 3519 aExp = extractFloat64Exp( a ); 3520 aSign = extractFloat64Sign( a ); 3521 if ( aExp == 0x7FF ) { 3522 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 3523 if ( ! aSign ) return a; 3524 float_raise( float_flag_invalid STATUS_VAR); 3525 return float64_default_nan; 3526 } 3527 if ( aSign ) { 3528 if ( ( aExp | aSig ) == 0 ) return a; 3529 float_raise( float_flag_invalid STATUS_VAR); 3530 return float64_default_nan; 3531 } 3532 if ( aExp == 0 ) { 3533 if ( aSig == 0 ) return float64_zero; 3534 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3535 } 3536 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3537 aSig |= LIT64( 0x0010000000000000 ); 3538 zSig = estimateSqrt32( aExp, aSig>>21 ); 3539 aSig <<= 9 - ( aExp & 1 ); 3540 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3541 if ( ( zSig & 0x1FF ) <= 5 ) { 3542 doubleZSig = zSig<<1; 3543 mul64To128( zSig, zSig, &term0, &term1 ); 3544 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3545 while ( (int64_t) rem0 < 0 ) { 3546 --zSig; 3547 doubleZSig -= 2; 3548 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3549 } 3550 zSig |= ( ( rem0 | rem1 ) != 0 ); 3551 } 3552 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 3553 3554 } 3555 3556 /*---------------------------------------------------------------------------- 3557 | Returns the binary log of the double-precision floating-point value `a'. 3558 | The operation is performed according to the IEC/IEEE Standard for Binary 3559 | Floating-Point Arithmetic. 3560 *----------------------------------------------------------------------------*/ 3561 float64 float64_log2( float64 a STATUS_PARAM ) 3562 { 3563 flag aSign, zSign; 3564 int16 aExp; 3565 uint64_t aSig, aSig0, aSig1, zSig, i; 3566 a = float64_squash_input_denormal(a STATUS_VAR); 3567 3568 aSig = extractFloat64Frac( a ); 3569 aExp = extractFloat64Exp( a ); 3570 aSign = extractFloat64Sign( a ); 3571 3572 if ( aExp == 0 ) { 3573 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 3574 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3575 } 3576 if ( aSign ) { 3577 float_raise( float_flag_invalid STATUS_VAR); 3578 return float64_default_nan; 3579 } 3580 if ( aExp == 0x7FF ) { 3581 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); 3582 return a; 3583 } 3584 3585 aExp -= 0x3FF; 3586 aSig |= LIT64( 0x0010000000000000 ); 3587 zSign = aExp < 0; 3588 zSig = (uint64_t)aExp << 52; 3589 for (i = 1LL << 51; i > 0; i >>= 1) { 3590 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 3591 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 3592 if ( aSig & LIT64( 0x0020000000000000 ) ) { 3593 aSig >>= 1; 3594 zSig |= i; 3595 } 3596 } 3597 3598 if ( zSign ) 3599 zSig = -zSig; 3600 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); 3601 } 3602 3603 /*---------------------------------------------------------------------------- 3604 | Returns 1 if the double-precision floating-point value `a' is equal to the 3605 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3606 | if either operand is a NaN. Otherwise, the comparison is performed 3607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3608 *----------------------------------------------------------------------------*/ 3609 3610 int float64_eq( float64 a, float64 b STATUS_PARAM ) 3611 { 3612 uint64_t av, bv; 3613 a = float64_squash_input_denormal(a STATUS_VAR); 3614 b = float64_squash_input_denormal(b STATUS_VAR); 3615 3616 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3617 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3618 ) { 3619 float_raise( float_flag_invalid STATUS_VAR); 3620 return 0; 3621 } 3622 av = float64_val(a); 3623 bv = float64_val(b); 3624 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3625 3626 } 3627 3628 /*---------------------------------------------------------------------------- 3629 | Returns 1 if the double-precision floating-point value `a' is less than or 3630 | equal to the corresponding value `b', and 0 otherwise. The invalid 3631 | exception is raised if either operand is a NaN. The comparison is performed 3632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3633 *----------------------------------------------------------------------------*/ 3634 3635 int float64_le( float64 a, float64 b STATUS_PARAM ) 3636 { 3637 flag aSign, bSign; 3638 uint64_t av, bv; 3639 a = float64_squash_input_denormal(a STATUS_VAR); 3640 b = float64_squash_input_denormal(b STATUS_VAR); 3641 3642 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3643 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3644 ) { 3645 float_raise( float_flag_invalid STATUS_VAR); 3646 return 0; 3647 } 3648 aSign = extractFloat64Sign( a ); 3649 bSign = extractFloat64Sign( b ); 3650 av = float64_val(a); 3651 bv = float64_val(b); 3652 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3653 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3654 3655 } 3656 3657 /*---------------------------------------------------------------------------- 3658 | Returns 1 if the double-precision floating-point value `a' is less than 3659 | the corresponding value `b', and 0 otherwise. The invalid exception is 3660 | raised if either operand is a NaN. The comparison is performed according 3661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3662 *----------------------------------------------------------------------------*/ 3663 3664 int float64_lt( float64 a, float64 b STATUS_PARAM ) 3665 { 3666 flag aSign, bSign; 3667 uint64_t av, bv; 3668 3669 a = float64_squash_input_denormal(a STATUS_VAR); 3670 b = float64_squash_input_denormal(b STATUS_VAR); 3671 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3672 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3673 ) { 3674 float_raise( float_flag_invalid STATUS_VAR); 3675 return 0; 3676 } 3677 aSign = extractFloat64Sign( a ); 3678 bSign = extractFloat64Sign( b ); 3679 av = float64_val(a); 3680 bv = float64_val(b); 3681 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 3682 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3683 3684 } 3685 3686 /*---------------------------------------------------------------------------- 3687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 3688 | be compared, and 0 otherwise. The invalid exception is raised if either 3689 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3690 | Standard for Binary Floating-Point Arithmetic. 3691 *----------------------------------------------------------------------------*/ 3692 3693 int float64_unordered( float64 a, float64 b STATUS_PARAM ) 3694 { 3695 a = float64_squash_input_denormal(a STATUS_VAR); 3696 b = float64_squash_input_denormal(b STATUS_VAR); 3697 3698 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3699 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3700 ) { 3701 float_raise( float_flag_invalid STATUS_VAR); 3702 return 1; 3703 } 3704 return 0; 3705 } 3706 3707 /*---------------------------------------------------------------------------- 3708 | Returns 1 if the double-precision floating-point value `a' is equal to the 3709 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3710 | exception.The comparison is performed according to the IEC/IEEE Standard 3711 | for Binary Floating-Point Arithmetic. 3712 *----------------------------------------------------------------------------*/ 3713 3714 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM ) 3715 { 3716 uint64_t av, bv; 3717 a = float64_squash_input_denormal(a STATUS_VAR); 3718 b = float64_squash_input_denormal(b STATUS_VAR); 3719 3720 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3721 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3722 ) { 3723 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3724 float_raise( float_flag_invalid STATUS_VAR); 3725 } 3726 return 0; 3727 } 3728 av = float64_val(a); 3729 bv = float64_val(b); 3730 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3731 3732 } 3733 3734 /*---------------------------------------------------------------------------- 3735 | Returns 1 if the double-precision floating-point value `a' is less than or 3736 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3737 | cause an exception. Otherwise, the comparison is performed according to the 3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3739 *----------------------------------------------------------------------------*/ 3740 3741 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 3742 { 3743 flag aSign, bSign; 3744 uint64_t av, bv; 3745 a = float64_squash_input_denormal(a STATUS_VAR); 3746 b = float64_squash_input_denormal(b STATUS_VAR); 3747 3748 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3749 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3750 ) { 3751 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3752 float_raise( float_flag_invalid STATUS_VAR); 3753 } 3754 return 0; 3755 } 3756 aSign = extractFloat64Sign( a ); 3757 bSign = extractFloat64Sign( b ); 3758 av = float64_val(a); 3759 bv = float64_val(b); 3760 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3761 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3762 3763 } 3764 3765 /*---------------------------------------------------------------------------- 3766 | Returns 1 if the double-precision floating-point value `a' is less than 3767 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3768 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3769 | Standard for Binary Floating-Point Arithmetic. 3770 *----------------------------------------------------------------------------*/ 3771 3772 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 3773 { 3774 flag aSign, bSign; 3775 uint64_t av, bv; 3776 a = float64_squash_input_denormal(a STATUS_VAR); 3777 b = float64_squash_input_denormal(b STATUS_VAR); 3778 3779 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3780 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3781 ) { 3782 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3783 float_raise( float_flag_invalid STATUS_VAR); 3784 } 3785 return 0; 3786 } 3787 aSign = extractFloat64Sign( a ); 3788 bSign = extractFloat64Sign( b ); 3789 av = float64_val(a); 3790 bv = float64_val(b); 3791 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 3792 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3793 3794 } 3795 3796 /*---------------------------------------------------------------------------- 3797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 3798 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3799 | comparison is performed according to the IEC/IEEE Standard for Binary 3800 | Floating-Point Arithmetic. 3801 *----------------------------------------------------------------------------*/ 3802 3803 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM ) 3804 { 3805 a = float64_squash_input_denormal(a STATUS_VAR); 3806 b = float64_squash_input_denormal(b STATUS_VAR); 3807 3808 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3809 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3810 ) { 3811 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3812 float_raise( float_flag_invalid STATUS_VAR); 3813 } 3814 return 1; 3815 } 3816 return 0; 3817 } 3818 3819 #ifdef FLOATX80 3820 3821 /*---------------------------------------------------------------------------- 3822 | Returns the result of converting the extended double-precision floating- 3823 | point value `a' to the 32-bit two's complement integer format. The 3824 | conversion is performed according to the IEC/IEEE Standard for Binary 3825 | Floating-Point Arithmetic---which means in particular that the conversion 3826 | is rounded according to the current rounding mode. If `a' is a NaN, the 3827 | largest positive integer is returned. Otherwise, if the conversion 3828 | overflows, the largest integer with the same sign as `a' is returned. 3829 *----------------------------------------------------------------------------*/ 3830 3831 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 3832 { 3833 flag aSign; 3834 int32 aExp, shiftCount; 3835 uint64_t aSig; 3836 3837 aSig = extractFloatx80Frac( a ); 3838 aExp = extractFloatx80Exp( a ); 3839 aSign = extractFloatx80Sign( a ); 3840 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 3841 shiftCount = 0x4037 - aExp; 3842 if ( shiftCount <= 0 ) shiftCount = 1; 3843 shift64RightJamming( aSig, shiftCount, &aSig ); 3844 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 3845 3846 } 3847 3848 /*---------------------------------------------------------------------------- 3849 | Returns the result of converting the extended double-precision floating- 3850 | point value `a' to the 32-bit two's complement integer format. The 3851 | conversion is performed according to the IEC/IEEE Standard for Binary 3852 | Floating-Point Arithmetic, except that the conversion is always rounded 3853 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3854 | Otherwise, if the conversion overflows, the largest integer with the same 3855 | sign as `a' is returned. 3856 *----------------------------------------------------------------------------*/ 3857 3858 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 3859 { 3860 flag aSign; 3861 int32 aExp, shiftCount; 3862 uint64_t aSig, savedASig; 3863 int32 z; 3864 3865 aSig = extractFloatx80Frac( a ); 3866 aExp = extractFloatx80Exp( a ); 3867 aSign = extractFloatx80Sign( a ); 3868 if ( 0x401E < aExp ) { 3869 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 3870 goto invalid; 3871 } 3872 else if ( aExp < 0x3FFF ) { 3873 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3874 return 0; 3875 } 3876 shiftCount = 0x403E - aExp; 3877 savedASig = aSig; 3878 aSig >>= shiftCount; 3879 z = aSig; 3880 if ( aSign ) z = - z; 3881 if ( ( z < 0 ) ^ aSign ) { 3882 invalid: 3883 float_raise( float_flag_invalid STATUS_VAR); 3884 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3885 } 3886 if ( ( aSig<<shiftCount ) != savedASig ) { 3887 STATUS(float_exception_flags) |= float_flag_inexact; 3888 } 3889 return z; 3890 3891 } 3892 3893 /*---------------------------------------------------------------------------- 3894 | Returns the result of converting the extended double-precision floating- 3895 | point value `a' to the 64-bit two's complement integer format. The 3896 | conversion is performed according to the IEC/IEEE Standard for Binary 3897 | Floating-Point Arithmetic---which means in particular that the conversion 3898 | is rounded according to the current rounding mode. If `a' is a NaN, 3899 | the largest positive integer is returned. Otherwise, if the conversion 3900 | overflows, the largest integer with the same sign as `a' is returned. 3901 *----------------------------------------------------------------------------*/ 3902 3903 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 3904 { 3905 flag aSign; 3906 int32 aExp, shiftCount; 3907 uint64_t aSig, aSigExtra; 3908 3909 aSig = extractFloatx80Frac( a ); 3910 aExp = extractFloatx80Exp( a ); 3911 aSign = extractFloatx80Sign( a ); 3912 shiftCount = 0x403E - aExp; 3913 if ( shiftCount <= 0 ) { 3914 if ( shiftCount ) { 3915 float_raise( float_flag_invalid STATUS_VAR); 3916 if ( ! aSign 3917 || ( ( aExp == 0x7FFF ) 3918 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3919 ) { 3920 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3921 } 3922 return (int64_t) LIT64( 0x8000000000000000 ); 3923 } 3924 aSigExtra = 0; 3925 } 3926 else { 3927 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3928 } 3929 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3930 3931 } 3932 3933 /*---------------------------------------------------------------------------- 3934 | Returns the result of converting the extended double-precision floating- 3935 | point value `a' to the 64-bit two's complement integer format. The 3936 | conversion is performed according to the IEC/IEEE Standard for Binary 3937 | Floating-Point Arithmetic, except that the conversion is always rounded 3938 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3939 | Otherwise, if the conversion overflows, the largest integer with the same 3940 | sign as `a' is returned. 3941 *----------------------------------------------------------------------------*/ 3942 3943 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 3944 { 3945 flag aSign; 3946 int32 aExp, shiftCount; 3947 uint64_t aSig; 3948 int64 z; 3949 3950 aSig = extractFloatx80Frac( a ); 3951 aExp = extractFloatx80Exp( a ); 3952 aSign = extractFloatx80Sign( a ); 3953 shiftCount = aExp - 0x403E; 3954 if ( 0 <= shiftCount ) { 3955 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3956 if ( ( a.high != 0xC03E ) || aSig ) { 3957 float_raise( float_flag_invalid STATUS_VAR); 3958 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3959 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3960 } 3961 } 3962 return (int64_t) LIT64( 0x8000000000000000 ); 3963 } 3964 else if ( aExp < 0x3FFF ) { 3965 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3966 return 0; 3967 } 3968 z = aSig>>( - shiftCount ); 3969 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3970 STATUS(float_exception_flags) |= float_flag_inexact; 3971 } 3972 if ( aSign ) z = - z; 3973 return z; 3974 3975 } 3976 3977 /*---------------------------------------------------------------------------- 3978 | Returns the result of converting the extended double-precision floating- 3979 | point value `a' to the single-precision floating-point format. The 3980 | conversion is performed according to the IEC/IEEE Standard for Binary 3981 | Floating-Point Arithmetic. 3982 *----------------------------------------------------------------------------*/ 3983 3984 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 3985 { 3986 flag aSign; 3987 int32 aExp; 3988 uint64_t aSig; 3989 3990 aSig = extractFloatx80Frac( a ); 3991 aExp = extractFloatx80Exp( a ); 3992 aSign = extractFloatx80Sign( a ); 3993 if ( aExp == 0x7FFF ) { 3994 if ( (uint64_t) ( aSig<<1 ) ) { 3995 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3996 } 3997 return packFloat32( aSign, 0xFF, 0 ); 3998 } 3999 shift64RightJamming( aSig, 33, &aSig ); 4000 if ( aExp || aSig ) aExp -= 0x3F81; 4001 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 4002 4003 } 4004 4005 /*---------------------------------------------------------------------------- 4006 | Returns the result of converting the extended double-precision floating- 4007 | point value `a' to the double-precision floating-point format. The 4008 | conversion is performed according to the IEC/IEEE Standard for Binary 4009 | Floating-Point Arithmetic. 4010 *----------------------------------------------------------------------------*/ 4011 4012 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 4013 { 4014 flag aSign; 4015 int32 aExp; 4016 uint64_t aSig, zSig; 4017 4018 aSig = extractFloatx80Frac( a ); 4019 aExp = extractFloatx80Exp( a ); 4020 aSign = extractFloatx80Sign( a ); 4021 if ( aExp == 0x7FFF ) { 4022 if ( (uint64_t) ( aSig<<1 ) ) { 4023 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4024 } 4025 return packFloat64( aSign, 0x7FF, 0 ); 4026 } 4027 shift64RightJamming( aSig, 1, &zSig ); 4028 if ( aExp || aSig ) aExp -= 0x3C01; 4029 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 4030 4031 } 4032 4033 #ifdef FLOAT128 4034 4035 /*---------------------------------------------------------------------------- 4036 | Returns the result of converting the extended double-precision floating- 4037 | point value `a' to the quadruple-precision floating-point format. The 4038 | conversion is performed according to the IEC/IEEE Standard for Binary 4039 | Floating-Point Arithmetic. 4040 *----------------------------------------------------------------------------*/ 4041 4042 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 4043 { 4044 flag aSign; 4045 int16 aExp; 4046 uint64_t aSig, zSig0, zSig1; 4047 4048 aSig = extractFloatx80Frac( a ); 4049 aExp = extractFloatx80Exp( a ); 4050 aSign = extractFloatx80Sign( a ); 4051 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4052 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4053 } 4054 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4055 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4056 4057 } 4058 4059 #endif 4060 4061 /*---------------------------------------------------------------------------- 4062 | Rounds the extended double-precision floating-point value `a' to an integer, 4063 | and returns the result as an extended quadruple-precision floating-point 4064 | value. The operation is performed according to the IEC/IEEE Standard for 4065 | Binary Floating-Point Arithmetic. 4066 *----------------------------------------------------------------------------*/ 4067 4068 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 4069 { 4070 flag aSign; 4071 int32 aExp; 4072 uint64_t lastBitMask, roundBitsMask; 4073 int8 roundingMode; 4074 floatx80 z; 4075 4076 aExp = extractFloatx80Exp( a ); 4077 if ( 0x403E <= aExp ) { 4078 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4079 return propagateFloatx80NaN( a, a STATUS_VAR ); 4080 } 4081 return a; 4082 } 4083 if ( aExp < 0x3FFF ) { 4084 if ( ( aExp == 0 ) 4085 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4086 return a; 4087 } 4088 STATUS(float_exception_flags) |= float_flag_inexact; 4089 aSign = extractFloatx80Sign( a ); 4090 switch ( STATUS(float_rounding_mode) ) { 4091 case float_round_nearest_even: 4092 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4093 ) { 4094 return 4095 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4096 } 4097 break; 4098 case float_round_down: 4099 return 4100 aSign ? 4101 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4102 : packFloatx80( 0, 0, 0 ); 4103 case float_round_up: 4104 return 4105 aSign ? packFloatx80( 1, 0, 0 ) 4106 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4107 } 4108 return packFloatx80( aSign, 0, 0 ); 4109 } 4110 lastBitMask = 1; 4111 lastBitMask <<= 0x403E - aExp; 4112 roundBitsMask = lastBitMask - 1; 4113 z = a; 4114 roundingMode = STATUS(float_rounding_mode); 4115 if ( roundingMode == float_round_nearest_even ) { 4116 z.low += lastBitMask>>1; 4117 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4118 } 4119 else if ( roundingMode != float_round_to_zero ) { 4120 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 4121 z.low += roundBitsMask; 4122 } 4123 } 4124 z.low &= ~ roundBitsMask; 4125 if ( z.low == 0 ) { 4126 ++z.high; 4127 z.low = LIT64( 0x8000000000000000 ); 4128 } 4129 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 4130 return z; 4131 4132 } 4133 4134 /*---------------------------------------------------------------------------- 4135 | Returns the result of adding the absolute values of the extended double- 4136 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4137 | negated before being returned. `zSign' is ignored if the result is a NaN. 4138 | The addition is performed according to the IEC/IEEE Standard for Binary 4139 | Floating-Point Arithmetic. 4140 *----------------------------------------------------------------------------*/ 4141 4142 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 4143 { 4144 int32 aExp, bExp, zExp; 4145 uint64_t aSig, bSig, zSig0, zSig1; 4146 int32 expDiff; 4147 4148 aSig = extractFloatx80Frac( a ); 4149 aExp = extractFloatx80Exp( a ); 4150 bSig = extractFloatx80Frac( b ); 4151 bExp = extractFloatx80Exp( b ); 4152 expDiff = aExp - bExp; 4153 if ( 0 < expDiff ) { 4154 if ( aExp == 0x7FFF ) { 4155 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4156 return a; 4157 } 4158 if ( bExp == 0 ) --expDiff; 4159 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4160 zExp = aExp; 4161 } 4162 else if ( expDiff < 0 ) { 4163 if ( bExp == 0x7FFF ) { 4164 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4165 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4166 } 4167 if ( aExp == 0 ) ++expDiff; 4168 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4169 zExp = bExp; 4170 } 4171 else { 4172 if ( aExp == 0x7FFF ) { 4173 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4174 return propagateFloatx80NaN( a, b STATUS_VAR ); 4175 } 4176 return a; 4177 } 4178 zSig1 = 0; 4179 zSig0 = aSig + bSig; 4180 if ( aExp == 0 ) { 4181 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4182 goto roundAndPack; 4183 } 4184 zExp = aExp; 4185 goto shiftRight1; 4186 } 4187 zSig0 = aSig + bSig; 4188 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4189 shiftRight1: 4190 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4191 zSig0 |= LIT64( 0x8000000000000000 ); 4192 ++zExp; 4193 roundAndPack: 4194 return 4195 roundAndPackFloatx80( 4196 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4197 4198 } 4199 4200 /*---------------------------------------------------------------------------- 4201 | Returns the result of subtracting the absolute values of the extended 4202 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4203 | difference is negated before being returned. `zSign' is ignored if the 4204 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4205 | Standard for Binary Floating-Point Arithmetic. 4206 *----------------------------------------------------------------------------*/ 4207 4208 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 4209 { 4210 int32 aExp, bExp, zExp; 4211 uint64_t aSig, bSig, zSig0, zSig1; 4212 int32 expDiff; 4213 floatx80 z; 4214 4215 aSig = extractFloatx80Frac( a ); 4216 aExp = extractFloatx80Exp( a ); 4217 bSig = extractFloatx80Frac( b ); 4218 bExp = extractFloatx80Exp( b ); 4219 expDiff = aExp - bExp; 4220 if ( 0 < expDiff ) goto aExpBigger; 4221 if ( expDiff < 0 ) goto bExpBigger; 4222 if ( aExp == 0x7FFF ) { 4223 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4224 return propagateFloatx80NaN( a, b STATUS_VAR ); 4225 } 4226 float_raise( float_flag_invalid STATUS_VAR); 4227 z.low = floatx80_default_nan_low; 4228 z.high = floatx80_default_nan_high; 4229 return z; 4230 } 4231 if ( aExp == 0 ) { 4232 aExp = 1; 4233 bExp = 1; 4234 } 4235 zSig1 = 0; 4236 if ( bSig < aSig ) goto aBigger; 4237 if ( aSig < bSig ) goto bBigger; 4238 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 4239 bExpBigger: 4240 if ( bExp == 0x7FFF ) { 4241 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4242 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4243 } 4244 if ( aExp == 0 ) ++expDiff; 4245 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4246 bBigger: 4247 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4248 zExp = bExp; 4249 zSign ^= 1; 4250 goto normalizeRoundAndPack; 4251 aExpBigger: 4252 if ( aExp == 0x7FFF ) { 4253 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4254 return a; 4255 } 4256 if ( bExp == 0 ) --expDiff; 4257 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4258 aBigger: 4259 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4260 zExp = aExp; 4261 normalizeRoundAndPack: 4262 return 4263 normalizeRoundAndPackFloatx80( 4264 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4265 4266 } 4267 4268 /*---------------------------------------------------------------------------- 4269 | Returns the result of adding the extended double-precision floating-point 4270 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4271 | Standard for Binary Floating-Point Arithmetic. 4272 *----------------------------------------------------------------------------*/ 4273 4274 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 4275 { 4276 flag aSign, bSign; 4277 4278 aSign = extractFloatx80Sign( a ); 4279 bSign = extractFloatx80Sign( b ); 4280 if ( aSign == bSign ) { 4281 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4282 } 4283 else { 4284 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4285 } 4286 4287 } 4288 4289 /*---------------------------------------------------------------------------- 4290 | Returns the result of subtracting the extended double-precision floating- 4291 | point values `a' and `b'. The operation is performed according to the 4292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4293 *----------------------------------------------------------------------------*/ 4294 4295 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 4296 { 4297 flag aSign, bSign; 4298 4299 aSign = extractFloatx80Sign( a ); 4300 bSign = extractFloatx80Sign( b ); 4301 if ( aSign == bSign ) { 4302 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4303 } 4304 else { 4305 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4306 } 4307 4308 } 4309 4310 /*---------------------------------------------------------------------------- 4311 | Returns the result of multiplying the extended double-precision floating- 4312 | point values `a' and `b'. The operation is performed according to the 4313 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4314 *----------------------------------------------------------------------------*/ 4315 4316 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 4317 { 4318 flag aSign, bSign, zSign; 4319 int32 aExp, bExp, zExp; 4320 uint64_t aSig, bSig, zSig0, zSig1; 4321 floatx80 z; 4322 4323 aSig = extractFloatx80Frac( a ); 4324 aExp = extractFloatx80Exp( a ); 4325 aSign = extractFloatx80Sign( a ); 4326 bSig = extractFloatx80Frac( b ); 4327 bExp = extractFloatx80Exp( b ); 4328 bSign = extractFloatx80Sign( b ); 4329 zSign = aSign ^ bSign; 4330 if ( aExp == 0x7FFF ) { 4331 if ( (uint64_t) ( aSig<<1 ) 4332 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 4333 return propagateFloatx80NaN( a, b STATUS_VAR ); 4334 } 4335 if ( ( bExp | bSig ) == 0 ) goto invalid; 4336 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4337 } 4338 if ( bExp == 0x7FFF ) { 4339 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4340 if ( ( aExp | aSig ) == 0 ) { 4341 invalid: 4342 float_raise( float_flag_invalid STATUS_VAR); 4343 z.low = floatx80_default_nan_low; 4344 z.high = floatx80_default_nan_high; 4345 return z; 4346 } 4347 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4348 } 4349 if ( aExp == 0 ) { 4350 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4351 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4352 } 4353 if ( bExp == 0 ) { 4354 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4355 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4356 } 4357 zExp = aExp + bExp - 0x3FFE; 4358 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4359 if ( 0 < (int64_t) zSig0 ) { 4360 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4361 --zExp; 4362 } 4363 return 4364 roundAndPackFloatx80( 4365 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4366 4367 } 4368 4369 /*---------------------------------------------------------------------------- 4370 | Returns the result of dividing the extended double-precision floating-point 4371 | value `a' by the corresponding value `b'. The operation is performed 4372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4373 *----------------------------------------------------------------------------*/ 4374 4375 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 4376 { 4377 flag aSign, bSign, zSign; 4378 int32 aExp, bExp, zExp; 4379 uint64_t aSig, bSig, zSig0, zSig1; 4380 uint64_t rem0, rem1, rem2, term0, term1, term2; 4381 floatx80 z; 4382 4383 aSig = extractFloatx80Frac( a ); 4384 aExp = extractFloatx80Exp( a ); 4385 aSign = extractFloatx80Sign( a ); 4386 bSig = extractFloatx80Frac( b ); 4387 bExp = extractFloatx80Exp( b ); 4388 bSign = extractFloatx80Sign( b ); 4389 zSign = aSign ^ bSign; 4390 if ( aExp == 0x7FFF ) { 4391 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4392 if ( bExp == 0x7FFF ) { 4393 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4394 goto invalid; 4395 } 4396 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4397 } 4398 if ( bExp == 0x7FFF ) { 4399 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4400 return packFloatx80( zSign, 0, 0 ); 4401 } 4402 if ( bExp == 0 ) { 4403 if ( bSig == 0 ) { 4404 if ( ( aExp | aSig ) == 0 ) { 4405 invalid: 4406 float_raise( float_flag_invalid STATUS_VAR); 4407 z.low = floatx80_default_nan_low; 4408 z.high = floatx80_default_nan_high; 4409 return z; 4410 } 4411 float_raise( float_flag_divbyzero STATUS_VAR); 4412 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4413 } 4414 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4415 } 4416 if ( aExp == 0 ) { 4417 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4418 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4419 } 4420 zExp = aExp - bExp + 0x3FFE; 4421 rem1 = 0; 4422 if ( bSig <= aSig ) { 4423 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 4424 ++zExp; 4425 } 4426 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 4427 mul64To128( bSig, zSig0, &term0, &term1 ); 4428 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 4429 while ( (int64_t) rem0 < 0 ) { 4430 --zSig0; 4431 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4432 } 4433 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 4434 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 4435 mul64To128( bSig, zSig1, &term1, &term2 ); 4436 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4437 while ( (int64_t) rem1 < 0 ) { 4438 --zSig1; 4439 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 4440 } 4441 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 4442 } 4443 return 4444 roundAndPackFloatx80( 4445 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4446 4447 } 4448 4449 /*---------------------------------------------------------------------------- 4450 | Returns the remainder of the extended double-precision floating-point value 4451 | `a' with respect to the corresponding value `b'. The operation is performed 4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4453 *----------------------------------------------------------------------------*/ 4454 4455 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 4456 { 4457 flag aSign, zSign; 4458 int32 aExp, bExp, expDiff; 4459 uint64_t aSig0, aSig1, bSig; 4460 uint64_t q, term0, term1, alternateASig0, alternateASig1; 4461 floatx80 z; 4462 4463 aSig0 = extractFloatx80Frac( a ); 4464 aExp = extractFloatx80Exp( a ); 4465 aSign = extractFloatx80Sign( a ); 4466 bSig = extractFloatx80Frac( b ); 4467 bExp = extractFloatx80Exp( b ); 4468 if ( aExp == 0x7FFF ) { 4469 if ( (uint64_t) ( aSig0<<1 ) 4470 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 4471 return propagateFloatx80NaN( a, b STATUS_VAR ); 4472 } 4473 goto invalid; 4474 } 4475 if ( bExp == 0x7FFF ) { 4476 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4477 return a; 4478 } 4479 if ( bExp == 0 ) { 4480 if ( bSig == 0 ) { 4481 invalid: 4482 float_raise( float_flag_invalid STATUS_VAR); 4483 z.low = floatx80_default_nan_low; 4484 z.high = floatx80_default_nan_high; 4485 return z; 4486 } 4487 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4488 } 4489 if ( aExp == 0 ) { 4490 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 4491 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4492 } 4493 bSig |= LIT64( 0x8000000000000000 ); 4494 zSign = aSign; 4495 expDiff = aExp - bExp; 4496 aSig1 = 0; 4497 if ( expDiff < 0 ) { 4498 if ( expDiff < -1 ) return a; 4499 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4500 expDiff = 0; 4501 } 4502 q = ( bSig <= aSig0 ); 4503 if ( q ) aSig0 -= bSig; 4504 expDiff -= 64; 4505 while ( 0 < expDiff ) { 4506 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4507 q = ( 2 < q ) ? q - 2 : 0; 4508 mul64To128( bSig, q, &term0, &term1 ); 4509 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4510 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4511 expDiff -= 62; 4512 } 4513 expDiff += 64; 4514 if ( 0 < expDiff ) { 4515 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4516 q = ( 2 < q ) ? q - 2 : 0; 4517 q >>= 64 - expDiff; 4518 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4519 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4520 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4521 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4522 ++q; 4523 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4524 } 4525 } 4526 else { 4527 term1 = 0; 4528 term0 = bSig; 4529 } 4530 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4531 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4532 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4533 && ( q & 1 ) ) 4534 ) { 4535 aSig0 = alternateASig0; 4536 aSig1 = alternateASig1; 4537 zSign = ! zSign; 4538 } 4539 return 4540 normalizeRoundAndPackFloatx80( 4541 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 4542 4543 } 4544 4545 /*---------------------------------------------------------------------------- 4546 | Returns the square root of the extended double-precision floating-point 4547 | value `a'. The operation is performed according to the IEC/IEEE Standard 4548 | for Binary Floating-Point Arithmetic. 4549 *----------------------------------------------------------------------------*/ 4550 4551 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 4552 { 4553 flag aSign; 4554 int32 aExp, zExp; 4555 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4556 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4557 floatx80 z; 4558 4559 aSig0 = extractFloatx80Frac( a ); 4560 aExp = extractFloatx80Exp( a ); 4561 aSign = extractFloatx80Sign( a ); 4562 if ( aExp == 0x7FFF ) { 4563 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 4564 if ( ! aSign ) return a; 4565 goto invalid; 4566 } 4567 if ( aSign ) { 4568 if ( ( aExp | aSig0 ) == 0 ) return a; 4569 invalid: 4570 float_raise( float_flag_invalid STATUS_VAR); 4571 z.low = floatx80_default_nan_low; 4572 z.high = floatx80_default_nan_high; 4573 return z; 4574 } 4575 if ( aExp == 0 ) { 4576 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4577 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4578 } 4579 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4580 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4581 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4582 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4583 doubleZSig0 = zSig0<<1; 4584 mul64To128( zSig0, zSig0, &term0, &term1 ); 4585 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4586 while ( (int64_t) rem0 < 0 ) { 4587 --zSig0; 4588 doubleZSig0 -= 2; 4589 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4590 } 4591 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4592 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4593 if ( zSig1 == 0 ) zSig1 = 1; 4594 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4595 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4596 mul64To128( zSig1, zSig1, &term2, &term3 ); 4597 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4598 while ( (int64_t) rem1 < 0 ) { 4599 --zSig1; 4600 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4601 term3 |= 1; 4602 term2 |= doubleZSig0; 4603 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4604 } 4605 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4606 } 4607 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4608 zSig0 |= doubleZSig0; 4609 return 4610 roundAndPackFloatx80( 4611 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 4612 4613 } 4614 4615 /*---------------------------------------------------------------------------- 4616 | Returns 1 if the extended double-precision floating-point value `a' is equal 4617 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4618 | raised if either operand is a NaN. Otherwise, the comparison is performed 4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4620 *----------------------------------------------------------------------------*/ 4621 4622 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 4623 { 4624 4625 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4626 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4627 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4628 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4629 ) { 4630 float_raise( float_flag_invalid STATUS_VAR); 4631 return 0; 4632 } 4633 return 4634 ( a.low == b.low ) 4635 && ( ( a.high == b.high ) 4636 || ( ( a.low == 0 ) 4637 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 4638 ); 4639 4640 } 4641 4642 /*---------------------------------------------------------------------------- 4643 | Returns 1 if the extended double-precision floating-point value `a' is 4644 | less than or equal to the corresponding value `b', and 0 otherwise. The 4645 | invalid exception is raised if either operand is a NaN. The comparison is 4646 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4647 | Arithmetic. 4648 *----------------------------------------------------------------------------*/ 4649 4650 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 4651 { 4652 flag aSign, bSign; 4653 4654 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4655 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4656 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4657 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4658 ) { 4659 float_raise( float_flag_invalid STATUS_VAR); 4660 return 0; 4661 } 4662 aSign = extractFloatx80Sign( a ); 4663 bSign = extractFloatx80Sign( b ); 4664 if ( aSign != bSign ) { 4665 return 4666 aSign 4667 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4668 == 0 ); 4669 } 4670 return 4671 aSign ? le128( b.high, b.low, a.high, a.low ) 4672 : le128( a.high, a.low, b.high, b.low ); 4673 4674 } 4675 4676 /*---------------------------------------------------------------------------- 4677 | Returns 1 if the extended double-precision floating-point value `a' is 4678 | less than the corresponding value `b', and 0 otherwise. The invalid 4679 | exception is raised if either operand is a NaN. The comparison is performed 4680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4681 *----------------------------------------------------------------------------*/ 4682 4683 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 4684 { 4685 flag aSign, bSign; 4686 4687 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4688 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4689 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4690 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4691 ) { 4692 float_raise( float_flag_invalid STATUS_VAR); 4693 return 0; 4694 } 4695 aSign = extractFloatx80Sign( a ); 4696 bSign = extractFloatx80Sign( b ); 4697 if ( aSign != bSign ) { 4698 return 4699 aSign 4700 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4701 != 0 ); 4702 } 4703 return 4704 aSign ? lt128( b.high, b.low, a.high, a.low ) 4705 : lt128( a.high, a.low, b.high, b.low ); 4706 4707 } 4708 4709 /*---------------------------------------------------------------------------- 4710 | Returns 1 if the extended double-precision floating-point values `a' and `b' 4711 | cannot be compared, and 0 otherwise. The invalid exception is raised if 4712 | either operand is a NaN. The comparison is performed according to the 4713 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4714 *----------------------------------------------------------------------------*/ 4715 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM ) 4716 { 4717 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4718 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4719 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4720 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4721 ) { 4722 float_raise( float_flag_invalid STATUS_VAR); 4723 return 1; 4724 } 4725 return 0; 4726 } 4727 4728 /*---------------------------------------------------------------------------- 4729 | Returns 1 if the extended double-precision floating-point value `a' is 4730 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4731 | cause an exception. The comparison is performed according to the IEC/IEEE 4732 | Standard for Binary Floating-Point Arithmetic. 4733 *----------------------------------------------------------------------------*/ 4734 4735 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4736 { 4737 4738 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4739 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4740 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4741 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4742 ) { 4743 if ( floatx80_is_signaling_nan( a ) 4744 || floatx80_is_signaling_nan( b ) ) { 4745 float_raise( float_flag_invalid STATUS_VAR); 4746 } 4747 return 0; 4748 } 4749 return 4750 ( a.low == b.low ) 4751 && ( ( a.high == b.high ) 4752 || ( ( a.low == 0 ) 4753 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 4754 ); 4755 4756 } 4757 4758 /*---------------------------------------------------------------------------- 4759 | Returns 1 if the extended double-precision floating-point value `a' is less 4760 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4761 | do not cause an exception. Otherwise, the comparison is performed according 4762 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4763 *----------------------------------------------------------------------------*/ 4764 4765 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4766 { 4767 flag aSign, bSign; 4768 4769 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4770 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4771 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4772 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4773 ) { 4774 if ( floatx80_is_signaling_nan( a ) 4775 || floatx80_is_signaling_nan( b ) ) { 4776 float_raise( float_flag_invalid STATUS_VAR); 4777 } 4778 return 0; 4779 } 4780 aSign = extractFloatx80Sign( a ); 4781 bSign = extractFloatx80Sign( b ); 4782 if ( aSign != bSign ) { 4783 return 4784 aSign 4785 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4786 == 0 ); 4787 } 4788 return 4789 aSign ? le128( b.high, b.low, a.high, a.low ) 4790 : le128( a.high, a.low, b.high, b.low ); 4791 4792 } 4793 4794 /*---------------------------------------------------------------------------- 4795 | Returns 1 if the extended double-precision floating-point value `a' is less 4796 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4797 | an exception. Otherwise, the comparison is performed according to the 4798 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4799 *----------------------------------------------------------------------------*/ 4800 4801 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4802 { 4803 flag aSign, bSign; 4804 4805 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4806 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4807 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4808 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4809 ) { 4810 if ( floatx80_is_signaling_nan( a ) 4811 || floatx80_is_signaling_nan( b ) ) { 4812 float_raise( float_flag_invalid STATUS_VAR); 4813 } 4814 return 0; 4815 } 4816 aSign = extractFloatx80Sign( a ); 4817 bSign = extractFloatx80Sign( b ); 4818 if ( aSign != bSign ) { 4819 return 4820 aSign 4821 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4822 != 0 ); 4823 } 4824 return 4825 aSign ? lt128( b.high, b.low, a.high, a.low ) 4826 : lt128( a.high, a.low, b.high, b.low ); 4827 4828 } 4829 4830 /*---------------------------------------------------------------------------- 4831 | Returns 1 if the extended double-precision floating-point values `a' and `b' 4832 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 4833 | The comparison is performed according to the IEC/IEEE Standard for Binary 4834 | Floating-Point Arithmetic. 4835 *----------------------------------------------------------------------------*/ 4836 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4837 { 4838 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4839 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4840 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4841 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4842 ) { 4843 if ( floatx80_is_signaling_nan( a ) 4844 || floatx80_is_signaling_nan( b ) ) { 4845 float_raise( float_flag_invalid STATUS_VAR); 4846 } 4847 return 1; 4848 } 4849 return 0; 4850 } 4851 4852 #endif 4853 4854 #ifdef FLOAT128 4855 4856 /*---------------------------------------------------------------------------- 4857 | Returns the result of converting the quadruple-precision floating-point 4858 | value `a' to the 32-bit two's complement integer format. The conversion 4859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4860 | Arithmetic---which means in particular that the conversion is rounded 4861 | according to the current rounding mode. If `a' is a NaN, the largest 4862 | positive integer is returned. Otherwise, if the conversion overflows, the 4863 | largest integer with the same sign as `a' is returned. 4864 *----------------------------------------------------------------------------*/ 4865 4866 int32 float128_to_int32( float128 a STATUS_PARAM ) 4867 { 4868 flag aSign; 4869 int32 aExp, shiftCount; 4870 uint64_t aSig0, aSig1; 4871 4872 aSig1 = extractFloat128Frac1( a ); 4873 aSig0 = extractFloat128Frac0( a ); 4874 aExp = extractFloat128Exp( a ); 4875 aSign = extractFloat128Sign( a ); 4876 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4877 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4878 aSig0 |= ( aSig1 != 0 ); 4879 shiftCount = 0x4028 - aExp; 4880 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4881 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 4882 4883 } 4884 4885 /*---------------------------------------------------------------------------- 4886 | Returns the result of converting the quadruple-precision floating-point 4887 | value `a' to the 32-bit two's complement integer format. The conversion 4888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4889 | Arithmetic, except that the conversion is always rounded toward zero. If 4890 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4891 | conversion overflows, the largest integer with the same sign as `a' is 4892 | returned. 4893 *----------------------------------------------------------------------------*/ 4894 4895 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 4896 { 4897 flag aSign; 4898 int32 aExp, shiftCount; 4899 uint64_t aSig0, aSig1, savedASig; 4900 int32 z; 4901 4902 aSig1 = extractFloat128Frac1( a ); 4903 aSig0 = extractFloat128Frac0( a ); 4904 aExp = extractFloat128Exp( a ); 4905 aSign = extractFloat128Sign( a ); 4906 aSig0 |= ( aSig1 != 0 ); 4907 if ( 0x401E < aExp ) { 4908 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4909 goto invalid; 4910 } 4911 else if ( aExp < 0x3FFF ) { 4912 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 4913 return 0; 4914 } 4915 aSig0 |= LIT64( 0x0001000000000000 ); 4916 shiftCount = 0x402F - aExp; 4917 savedASig = aSig0; 4918 aSig0 >>= shiftCount; 4919 z = aSig0; 4920 if ( aSign ) z = - z; 4921 if ( ( z < 0 ) ^ aSign ) { 4922 invalid: 4923 float_raise( float_flag_invalid STATUS_VAR); 4924 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4925 } 4926 if ( ( aSig0<<shiftCount ) != savedASig ) { 4927 STATUS(float_exception_flags) |= float_flag_inexact; 4928 } 4929 return z; 4930 4931 } 4932 4933 /*---------------------------------------------------------------------------- 4934 | Returns the result of converting the quadruple-precision floating-point 4935 | value `a' to the 64-bit two's complement integer format. The conversion 4936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4937 | Arithmetic---which means in particular that the conversion is rounded 4938 | according to the current rounding mode. If `a' is a NaN, the largest 4939 | positive integer is returned. Otherwise, if the conversion overflows, the 4940 | largest integer with the same sign as `a' is returned. 4941 *----------------------------------------------------------------------------*/ 4942 4943 int64 float128_to_int64( float128 a STATUS_PARAM ) 4944 { 4945 flag aSign; 4946 int32 aExp, shiftCount; 4947 uint64_t aSig0, aSig1; 4948 4949 aSig1 = extractFloat128Frac1( a ); 4950 aSig0 = extractFloat128Frac0( a ); 4951 aExp = extractFloat128Exp( a ); 4952 aSign = extractFloat128Sign( a ); 4953 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4954 shiftCount = 0x402F - aExp; 4955 if ( shiftCount <= 0 ) { 4956 if ( 0x403E < aExp ) { 4957 float_raise( float_flag_invalid STATUS_VAR); 4958 if ( ! aSign 4959 || ( ( aExp == 0x7FFF ) 4960 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4961 ) 4962 ) { 4963 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4964 } 4965 return (int64_t) LIT64( 0x8000000000000000 ); 4966 } 4967 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4968 } 4969 else { 4970 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4971 } 4972 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 4973 4974 } 4975 4976 /*---------------------------------------------------------------------------- 4977 | Returns the result of converting the quadruple-precision floating-point 4978 | value `a' to the 64-bit two's complement integer format. The conversion 4979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4980 | Arithmetic, except that the conversion is always rounded toward zero. 4981 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4982 | the conversion overflows, the largest integer with the same sign as `a' is 4983 | returned. 4984 *----------------------------------------------------------------------------*/ 4985 4986 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 4987 { 4988 flag aSign; 4989 int32 aExp, shiftCount; 4990 uint64_t aSig0, aSig1; 4991 int64 z; 4992 4993 aSig1 = extractFloat128Frac1( a ); 4994 aSig0 = extractFloat128Frac0( a ); 4995 aExp = extractFloat128Exp( a ); 4996 aSign = extractFloat128Sign( a ); 4997 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4998 shiftCount = aExp - 0x402F; 4999 if ( 0 < shiftCount ) { 5000 if ( 0x403E <= aExp ) { 5001 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5002 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5003 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5004 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 5005 } 5006 else { 5007 float_raise( float_flag_invalid STATUS_VAR); 5008 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5009 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5010 } 5011 } 5012 return (int64_t) LIT64( 0x8000000000000000 ); 5013 } 5014 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5015 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5016 STATUS(float_exception_flags) |= float_flag_inexact; 5017 } 5018 } 5019 else { 5020 if ( aExp < 0x3FFF ) { 5021 if ( aExp | aSig0 | aSig1 ) { 5022 STATUS(float_exception_flags) |= float_flag_inexact; 5023 } 5024 return 0; 5025 } 5026 z = aSig0>>( - shiftCount ); 5027 if ( aSig1 5028 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5029 STATUS(float_exception_flags) |= float_flag_inexact; 5030 } 5031 } 5032 if ( aSign ) z = - z; 5033 return z; 5034 5035 } 5036 5037 /*---------------------------------------------------------------------------- 5038 | Returns the result of converting the quadruple-precision floating-point 5039 | value `a' to the single-precision floating-point format. The conversion 5040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5041 | Arithmetic. 5042 *----------------------------------------------------------------------------*/ 5043 5044 float32 float128_to_float32( float128 a STATUS_PARAM ) 5045 { 5046 flag aSign; 5047 int32 aExp; 5048 uint64_t aSig0, aSig1; 5049 uint32_t zSig; 5050 5051 aSig1 = extractFloat128Frac1( a ); 5052 aSig0 = extractFloat128Frac0( a ); 5053 aExp = extractFloat128Exp( a ); 5054 aSign = extractFloat128Sign( a ); 5055 if ( aExp == 0x7FFF ) { 5056 if ( aSig0 | aSig1 ) { 5057 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5058 } 5059 return packFloat32( aSign, 0xFF, 0 ); 5060 } 5061 aSig0 |= ( aSig1 != 0 ); 5062 shift64RightJamming( aSig0, 18, &aSig0 ); 5063 zSig = aSig0; 5064 if ( aExp || zSig ) { 5065 zSig |= 0x40000000; 5066 aExp -= 0x3F81; 5067 } 5068 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 5069 5070 } 5071 5072 /*---------------------------------------------------------------------------- 5073 | Returns the result of converting the quadruple-precision floating-point 5074 | value `a' to the double-precision floating-point format. The conversion 5075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5076 | Arithmetic. 5077 *----------------------------------------------------------------------------*/ 5078 5079 float64 float128_to_float64( float128 a STATUS_PARAM ) 5080 { 5081 flag aSign; 5082 int32 aExp; 5083 uint64_t aSig0, aSig1; 5084 5085 aSig1 = extractFloat128Frac1( a ); 5086 aSig0 = extractFloat128Frac0( a ); 5087 aExp = extractFloat128Exp( a ); 5088 aSign = extractFloat128Sign( a ); 5089 if ( aExp == 0x7FFF ) { 5090 if ( aSig0 | aSig1 ) { 5091 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5092 } 5093 return packFloat64( aSign, 0x7FF, 0 ); 5094 } 5095 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5096 aSig0 |= ( aSig1 != 0 ); 5097 if ( aExp || aSig0 ) { 5098 aSig0 |= LIT64( 0x4000000000000000 ); 5099 aExp -= 0x3C01; 5100 } 5101 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 5102 5103 } 5104 5105 #ifdef FLOATX80 5106 5107 /*---------------------------------------------------------------------------- 5108 | Returns the result of converting the quadruple-precision floating-point 5109 | value `a' to the extended double-precision floating-point format. The 5110 | conversion is performed according to the IEC/IEEE Standard for Binary 5111 | Floating-Point Arithmetic. 5112 *----------------------------------------------------------------------------*/ 5113 5114 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 5115 { 5116 flag aSign; 5117 int32 aExp; 5118 uint64_t aSig0, aSig1; 5119 5120 aSig1 = extractFloat128Frac1( a ); 5121 aSig0 = extractFloat128Frac0( a ); 5122 aExp = extractFloat128Exp( a ); 5123 aSign = extractFloat128Sign( a ); 5124 if ( aExp == 0x7FFF ) { 5125 if ( aSig0 | aSig1 ) { 5126 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5127 } 5128 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5129 } 5130 if ( aExp == 0 ) { 5131 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5132 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5133 } 5134 else { 5135 aSig0 |= LIT64( 0x0001000000000000 ); 5136 } 5137 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5138 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 5139 5140 } 5141 5142 #endif 5143 5144 /*---------------------------------------------------------------------------- 5145 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5146 | returns the result as a quadruple-precision floating-point value. The 5147 | operation is performed according to the IEC/IEEE Standard for Binary 5148 | Floating-Point Arithmetic. 5149 *----------------------------------------------------------------------------*/ 5150 5151 float128 float128_round_to_int( float128 a STATUS_PARAM ) 5152 { 5153 flag aSign; 5154 int32 aExp; 5155 uint64_t lastBitMask, roundBitsMask; 5156 int8 roundingMode; 5157 float128 z; 5158 5159 aExp = extractFloat128Exp( a ); 5160 if ( 0x402F <= aExp ) { 5161 if ( 0x406F <= aExp ) { 5162 if ( ( aExp == 0x7FFF ) 5163 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5164 ) { 5165 return propagateFloat128NaN( a, a STATUS_VAR ); 5166 } 5167 return a; 5168 } 5169 lastBitMask = 1; 5170 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5171 roundBitsMask = lastBitMask - 1; 5172 z = a; 5173 roundingMode = STATUS(float_rounding_mode); 5174 if ( roundingMode == float_round_nearest_even ) { 5175 if ( lastBitMask ) { 5176 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5177 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5178 } 5179 else { 5180 if ( (int64_t) z.low < 0 ) { 5181 ++z.high; 5182 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 5183 } 5184 } 5185 } 5186 else if ( roundingMode != float_round_to_zero ) { 5187 if ( extractFloat128Sign( z ) 5188 ^ ( roundingMode == float_round_up ) ) { 5189 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 5190 } 5191 } 5192 z.low &= ~ roundBitsMask; 5193 } 5194 else { 5195 if ( aExp < 0x3FFF ) { 5196 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 5197 STATUS(float_exception_flags) |= float_flag_inexact; 5198 aSign = extractFloat128Sign( a ); 5199 switch ( STATUS(float_rounding_mode) ) { 5200 case float_round_nearest_even: 5201 if ( ( aExp == 0x3FFE ) 5202 && ( extractFloat128Frac0( a ) 5203 | extractFloat128Frac1( a ) ) 5204 ) { 5205 return packFloat128( aSign, 0x3FFF, 0, 0 ); 5206 } 5207 break; 5208 case float_round_down: 5209 return 5210 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 5211 : packFloat128( 0, 0, 0, 0 ); 5212 case float_round_up: 5213 return 5214 aSign ? packFloat128( 1, 0, 0, 0 ) 5215 : packFloat128( 0, 0x3FFF, 0, 0 ); 5216 } 5217 return packFloat128( aSign, 0, 0, 0 ); 5218 } 5219 lastBitMask = 1; 5220 lastBitMask <<= 0x402F - aExp; 5221 roundBitsMask = lastBitMask - 1; 5222 z.low = 0; 5223 z.high = a.high; 5224 roundingMode = STATUS(float_rounding_mode); 5225 if ( roundingMode == float_round_nearest_even ) { 5226 z.high += lastBitMask>>1; 5227 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 5228 z.high &= ~ lastBitMask; 5229 } 5230 } 5231 else if ( roundingMode != float_round_to_zero ) { 5232 if ( extractFloat128Sign( z ) 5233 ^ ( roundingMode == float_round_up ) ) { 5234 z.high |= ( a.low != 0 ); 5235 z.high += roundBitsMask; 5236 } 5237 } 5238 z.high &= ~ roundBitsMask; 5239 } 5240 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 5241 STATUS(float_exception_flags) |= float_flag_inexact; 5242 } 5243 return z; 5244 5245 } 5246 5247 /*---------------------------------------------------------------------------- 5248 | Returns the result of adding the absolute values of the quadruple-precision 5249 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 5250 | before being returned. `zSign' is ignored if the result is a NaN. 5251 | The addition is performed according to the IEC/IEEE Standard for Binary 5252 | Floating-Point Arithmetic. 5253 *----------------------------------------------------------------------------*/ 5254 5255 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5256 { 5257 int32 aExp, bExp, zExp; 5258 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5259 int32 expDiff; 5260 5261 aSig1 = extractFloat128Frac1( a ); 5262 aSig0 = extractFloat128Frac0( a ); 5263 aExp = extractFloat128Exp( a ); 5264 bSig1 = extractFloat128Frac1( b ); 5265 bSig0 = extractFloat128Frac0( b ); 5266 bExp = extractFloat128Exp( b ); 5267 expDiff = aExp - bExp; 5268 if ( 0 < expDiff ) { 5269 if ( aExp == 0x7FFF ) { 5270 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5271 return a; 5272 } 5273 if ( bExp == 0 ) { 5274 --expDiff; 5275 } 5276 else { 5277 bSig0 |= LIT64( 0x0001000000000000 ); 5278 } 5279 shift128ExtraRightJamming( 5280 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 5281 zExp = aExp; 5282 } 5283 else if ( expDiff < 0 ) { 5284 if ( bExp == 0x7FFF ) { 5285 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5286 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5287 } 5288 if ( aExp == 0 ) { 5289 ++expDiff; 5290 } 5291 else { 5292 aSig0 |= LIT64( 0x0001000000000000 ); 5293 } 5294 shift128ExtraRightJamming( 5295 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 5296 zExp = bExp; 5297 } 5298 else { 5299 if ( aExp == 0x7FFF ) { 5300 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5301 return propagateFloat128NaN( a, b STATUS_VAR ); 5302 } 5303 return a; 5304 } 5305 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5306 if ( aExp == 0 ) { 5307 if (STATUS(flush_to_zero)) { 5308 if (zSig0 | zSig1) { 5309 float_raise(float_flag_output_denormal STATUS_VAR); 5310 } 5311 return packFloat128(zSign, 0, 0, 0); 5312 } 5313 return packFloat128( zSign, 0, zSig0, zSig1 ); 5314 } 5315 zSig2 = 0; 5316 zSig0 |= LIT64( 0x0002000000000000 ); 5317 zExp = aExp; 5318 goto shiftRight1; 5319 } 5320 aSig0 |= LIT64( 0x0001000000000000 ); 5321 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5322 --zExp; 5323 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 5324 ++zExp; 5325 shiftRight1: 5326 shift128ExtraRightJamming( 5327 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5328 roundAndPack: 5329 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5330 5331 } 5332 5333 /*---------------------------------------------------------------------------- 5334 | Returns the result of subtracting the absolute values of the quadruple- 5335 | precision floating-point values `a' and `b'. If `zSign' is 1, the 5336 | difference is negated before being returned. `zSign' is ignored if the 5337 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5338 | Standard for Binary Floating-Point Arithmetic. 5339 *----------------------------------------------------------------------------*/ 5340 5341 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5342 { 5343 int32 aExp, bExp, zExp; 5344 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 5345 int32 expDiff; 5346 float128 z; 5347 5348 aSig1 = extractFloat128Frac1( a ); 5349 aSig0 = extractFloat128Frac0( a ); 5350 aExp = extractFloat128Exp( a ); 5351 bSig1 = extractFloat128Frac1( b ); 5352 bSig0 = extractFloat128Frac0( b ); 5353 bExp = extractFloat128Exp( b ); 5354 expDiff = aExp - bExp; 5355 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5356 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 5357 if ( 0 < expDiff ) goto aExpBigger; 5358 if ( expDiff < 0 ) goto bExpBigger; 5359 if ( aExp == 0x7FFF ) { 5360 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5361 return propagateFloat128NaN( a, b STATUS_VAR ); 5362 } 5363 float_raise( float_flag_invalid STATUS_VAR); 5364 z.low = float128_default_nan_low; 5365 z.high = float128_default_nan_high; 5366 return z; 5367 } 5368 if ( aExp == 0 ) { 5369 aExp = 1; 5370 bExp = 1; 5371 } 5372 if ( bSig0 < aSig0 ) goto aBigger; 5373 if ( aSig0 < bSig0 ) goto bBigger; 5374 if ( bSig1 < aSig1 ) goto aBigger; 5375 if ( aSig1 < bSig1 ) goto bBigger; 5376 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 5377 bExpBigger: 5378 if ( bExp == 0x7FFF ) { 5379 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5380 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 5381 } 5382 if ( aExp == 0 ) { 5383 ++expDiff; 5384 } 5385 else { 5386 aSig0 |= LIT64( 0x4000000000000000 ); 5387 } 5388 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5389 bSig0 |= LIT64( 0x4000000000000000 ); 5390 bBigger: 5391 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5392 zExp = bExp; 5393 zSign ^= 1; 5394 goto normalizeRoundAndPack; 5395 aExpBigger: 5396 if ( aExp == 0x7FFF ) { 5397 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5398 return a; 5399 } 5400 if ( bExp == 0 ) { 5401 --expDiff; 5402 } 5403 else { 5404 bSig0 |= LIT64( 0x4000000000000000 ); 5405 } 5406 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 5407 aSig0 |= LIT64( 0x4000000000000000 ); 5408 aBigger: 5409 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5410 zExp = aExp; 5411 normalizeRoundAndPack: 5412 --zExp; 5413 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 5414 5415 } 5416 5417 /*---------------------------------------------------------------------------- 5418 | Returns the result of adding the quadruple-precision floating-point values 5419 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 5420 | for Binary Floating-Point Arithmetic. 5421 *----------------------------------------------------------------------------*/ 5422 5423 float128 float128_add( float128 a, float128 b STATUS_PARAM ) 5424 { 5425 flag aSign, bSign; 5426 5427 aSign = extractFloat128Sign( a ); 5428 bSign = extractFloat128Sign( b ); 5429 if ( aSign == bSign ) { 5430 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5431 } 5432 else { 5433 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5434 } 5435 5436 } 5437 5438 /*---------------------------------------------------------------------------- 5439 | Returns the result of subtracting the quadruple-precision floating-point 5440 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5441 | Standard for Binary Floating-Point Arithmetic. 5442 *----------------------------------------------------------------------------*/ 5443 5444 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 5445 { 5446 flag aSign, bSign; 5447 5448 aSign = extractFloat128Sign( a ); 5449 bSign = extractFloat128Sign( b ); 5450 if ( aSign == bSign ) { 5451 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5452 } 5453 else { 5454 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5455 } 5456 5457 } 5458 5459 /*---------------------------------------------------------------------------- 5460 | Returns the result of multiplying the quadruple-precision floating-point 5461 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5462 | Standard for Binary Floating-Point Arithmetic. 5463 *----------------------------------------------------------------------------*/ 5464 5465 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 5466 { 5467 flag aSign, bSign, zSign; 5468 int32 aExp, bExp, zExp; 5469 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 5470 float128 z; 5471 5472 aSig1 = extractFloat128Frac1( a ); 5473 aSig0 = extractFloat128Frac0( a ); 5474 aExp = extractFloat128Exp( a ); 5475 aSign = extractFloat128Sign( a ); 5476 bSig1 = extractFloat128Frac1( b ); 5477 bSig0 = extractFloat128Frac0( b ); 5478 bExp = extractFloat128Exp( b ); 5479 bSign = extractFloat128Sign( b ); 5480 zSign = aSign ^ bSign; 5481 if ( aExp == 0x7FFF ) { 5482 if ( ( aSig0 | aSig1 ) 5483 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5484 return propagateFloat128NaN( a, b STATUS_VAR ); 5485 } 5486 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 5487 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5488 } 5489 if ( bExp == 0x7FFF ) { 5490 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5491 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5492 invalid: 5493 float_raise( float_flag_invalid STATUS_VAR); 5494 z.low = float128_default_nan_low; 5495 z.high = float128_default_nan_high; 5496 return z; 5497 } 5498 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5499 } 5500 if ( aExp == 0 ) { 5501 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5502 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5503 } 5504 if ( bExp == 0 ) { 5505 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5506 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5507 } 5508 zExp = aExp + bExp - 0x4000; 5509 aSig0 |= LIT64( 0x0001000000000000 ); 5510 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5511 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5512 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5513 zSig2 |= ( zSig3 != 0 ); 5514 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5515 shift128ExtraRightJamming( 5516 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5517 ++zExp; 5518 } 5519 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5520 5521 } 5522 5523 /*---------------------------------------------------------------------------- 5524 | Returns the result of dividing the quadruple-precision floating-point value 5525 | `a' by the corresponding value `b'. The operation is performed according to 5526 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5527 *----------------------------------------------------------------------------*/ 5528 5529 float128 float128_div( float128 a, float128 b STATUS_PARAM ) 5530 { 5531 flag aSign, bSign, zSign; 5532 int32 aExp, bExp, zExp; 5533 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5534 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5535 float128 z; 5536 5537 aSig1 = extractFloat128Frac1( a ); 5538 aSig0 = extractFloat128Frac0( a ); 5539 aExp = extractFloat128Exp( a ); 5540 aSign = extractFloat128Sign( a ); 5541 bSig1 = extractFloat128Frac1( b ); 5542 bSig0 = extractFloat128Frac0( b ); 5543 bExp = extractFloat128Exp( b ); 5544 bSign = extractFloat128Sign( b ); 5545 zSign = aSign ^ bSign; 5546 if ( aExp == 0x7FFF ) { 5547 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5548 if ( bExp == 0x7FFF ) { 5549 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5550 goto invalid; 5551 } 5552 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5553 } 5554 if ( bExp == 0x7FFF ) { 5555 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5556 return packFloat128( zSign, 0, 0, 0 ); 5557 } 5558 if ( bExp == 0 ) { 5559 if ( ( bSig0 | bSig1 ) == 0 ) { 5560 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5561 invalid: 5562 float_raise( float_flag_invalid STATUS_VAR); 5563 z.low = float128_default_nan_low; 5564 z.high = float128_default_nan_high; 5565 return z; 5566 } 5567 float_raise( float_flag_divbyzero STATUS_VAR); 5568 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5569 } 5570 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5571 } 5572 if ( aExp == 0 ) { 5573 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5574 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5575 } 5576 zExp = aExp - bExp + 0x3FFD; 5577 shortShift128Left( 5578 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5579 shortShift128Left( 5580 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5581 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5582 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5583 ++zExp; 5584 } 5585 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5586 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5587 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5588 while ( (int64_t) rem0 < 0 ) { 5589 --zSig0; 5590 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5591 } 5592 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5593 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5594 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5595 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5596 while ( (int64_t) rem1 < 0 ) { 5597 --zSig1; 5598 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5599 } 5600 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5601 } 5602 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5603 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5604 5605 } 5606 5607 /*---------------------------------------------------------------------------- 5608 | Returns the remainder of the quadruple-precision floating-point value `a' 5609 | with respect to the corresponding value `b'. The operation is performed 5610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5611 *----------------------------------------------------------------------------*/ 5612 5613 float128 float128_rem( float128 a, float128 b STATUS_PARAM ) 5614 { 5615 flag aSign, zSign; 5616 int32 aExp, bExp, expDiff; 5617 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5618 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 5619 int64_t sigMean0; 5620 float128 z; 5621 5622 aSig1 = extractFloat128Frac1( a ); 5623 aSig0 = extractFloat128Frac0( a ); 5624 aExp = extractFloat128Exp( a ); 5625 aSign = extractFloat128Sign( a ); 5626 bSig1 = extractFloat128Frac1( b ); 5627 bSig0 = extractFloat128Frac0( b ); 5628 bExp = extractFloat128Exp( b ); 5629 if ( aExp == 0x7FFF ) { 5630 if ( ( aSig0 | aSig1 ) 5631 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5632 return propagateFloat128NaN( a, b STATUS_VAR ); 5633 } 5634 goto invalid; 5635 } 5636 if ( bExp == 0x7FFF ) { 5637 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5638 return a; 5639 } 5640 if ( bExp == 0 ) { 5641 if ( ( bSig0 | bSig1 ) == 0 ) { 5642 invalid: 5643 float_raise( float_flag_invalid STATUS_VAR); 5644 z.low = float128_default_nan_low; 5645 z.high = float128_default_nan_high; 5646 return z; 5647 } 5648 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5649 } 5650 if ( aExp == 0 ) { 5651 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5652 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5653 } 5654 expDiff = aExp - bExp; 5655 if ( expDiff < -1 ) return a; 5656 shortShift128Left( 5657 aSig0 | LIT64( 0x0001000000000000 ), 5658 aSig1, 5659 15 - ( expDiff < 0 ), 5660 &aSig0, 5661 &aSig1 5662 ); 5663 shortShift128Left( 5664 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5665 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5666 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5667 expDiff -= 64; 5668 while ( 0 < expDiff ) { 5669 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5670 q = ( 4 < q ) ? q - 4 : 0; 5671 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5672 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5673 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5674 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5675 expDiff -= 61; 5676 } 5677 if ( -64 < expDiff ) { 5678 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5679 q = ( 4 < q ) ? q - 4 : 0; 5680 q >>= - expDiff; 5681 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5682 expDiff += 52; 5683 if ( expDiff < 0 ) { 5684 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5685 } 5686 else { 5687 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5688 } 5689 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5690 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5691 } 5692 else { 5693 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5694 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5695 } 5696 do { 5697 alternateASig0 = aSig0; 5698 alternateASig1 = aSig1; 5699 ++q; 5700 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5701 } while ( 0 <= (int64_t) aSig0 ); 5702 add128( 5703 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 5704 if ( ( sigMean0 < 0 ) 5705 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5706 aSig0 = alternateASig0; 5707 aSig1 = alternateASig1; 5708 } 5709 zSign = ( (int64_t) aSig0 < 0 ); 5710 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5711 return 5712 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); 5713 5714 } 5715 5716 /*---------------------------------------------------------------------------- 5717 | Returns the square root of the quadruple-precision floating-point value `a'. 5718 | The operation is performed according to the IEC/IEEE Standard for Binary 5719 | Floating-Point Arithmetic. 5720 *----------------------------------------------------------------------------*/ 5721 5722 float128 float128_sqrt( float128 a STATUS_PARAM ) 5723 { 5724 flag aSign; 5725 int32 aExp, zExp; 5726 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5727 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5728 float128 z; 5729 5730 aSig1 = extractFloat128Frac1( a ); 5731 aSig0 = extractFloat128Frac0( a ); 5732 aExp = extractFloat128Exp( a ); 5733 aSign = extractFloat128Sign( a ); 5734 if ( aExp == 0x7FFF ) { 5735 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); 5736 if ( ! aSign ) return a; 5737 goto invalid; 5738 } 5739 if ( aSign ) { 5740 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5741 invalid: 5742 float_raise( float_flag_invalid STATUS_VAR); 5743 z.low = float128_default_nan_low; 5744 z.high = float128_default_nan_high; 5745 return z; 5746 } 5747 if ( aExp == 0 ) { 5748 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5749 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5750 } 5751 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5752 aSig0 |= LIT64( 0x0001000000000000 ); 5753 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5754 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5755 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5756 doubleZSig0 = zSig0<<1; 5757 mul64To128( zSig0, zSig0, &term0, &term1 ); 5758 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5759 while ( (int64_t) rem0 < 0 ) { 5760 --zSig0; 5761 doubleZSig0 -= 2; 5762 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5763 } 5764 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5765 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5766 if ( zSig1 == 0 ) zSig1 = 1; 5767 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5768 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5769 mul64To128( zSig1, zSig1, &term2, &term3 ); 5770 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5771 while ( (int64_t) rem1 < 0 ) { 5772 --zSig1; 5773 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5774 term3 |= 1; 5775 term2 |= doubleZSig0; 5776 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5777 } 5778 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5779 } 5780 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5781 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5782 5783 } 5784 5785 /*---------------------------------------------------------------------------- 5786 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5787 | the corresponding value `b', and 0 otherwise. The invalid exception is 5788 | raised if either operand is a NaN. Otherwise, the comparison is performed 5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5790 *----------------------------------------------------------------------------*/ 5791 5792 int float128_eq( float128 a, float128 b STATUS_PARAM ) 5793 { 5794 5795 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5796 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5797 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5798 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5799 ) { 5800 float_raise( float_flag_invalid STATUS_VAR); 5801 return 0; 5802 } 5803 return 5804 ( a.low == b.low ) 5805 && ( ( a.high == b.high ) 5806 || ( ( a.low == 0 ) 5807 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5808 ); 5809 5810 } 5811 5812 /*---------------------------------------------------------------------------- 5813 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5814 | or equal to the corresponding value `b', and 0 otherwise. The invalid 5815 | exception is raised if either operand is a NaN. The comparison is performed 5816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5817 *----------------------------------------------------------------------------*/ 5818 5819 int float128_le( float128 a, float128 b STATUS_PARAM ) 5820 { 5821 flag aSign, bSign; 5822 5823 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5824 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5825 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5826 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5827 ) { 5828 float_raise( float_flag_invalid STATUS_VAR); 5829 return 0; 5830 } 5831 aSign = extractFloat128Sign( a ); 5832 bSign = extractFloat128Sign( b ); 5833 if ( aSign != bSign ) { 5834 return 5835 aSign 5836 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5837 == 0 ); 5838 } 5839 return 5840 aSign ? le128( b.high, b.low, a.high, a.low ) 5841 : le128( a.high, a.low, b.high, b.low ); 5842 5843 } 5844 5845 /*---------------------------------------------------------------------------- 5846 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5847 | the corresponding value `b', and 0 otherwise. The invalid exception is 5848 | raised if either operand is a NaN. The comparison is performed according 5849 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5850 *----------------------------------------------------------------------------*/ 5851 5852 int float128_lt( float128 a, float128 b STATUS_PARAM ) 5853 { 5854 flag aSign, bSign; 5855 5856 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5857 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5858 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5859 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5860 ) { 5861 float_raise( float_flag_invalid STATUS_VAR); 5862 return 0; 5863 } 5864 aSign = extractFloat128Sign( a ); 5865 bSign = extractFloat128Sign( b ); 5866 if ( aSign != bSign ) { 5867 return 5868 aSign 5869 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5870 != 0 ); 5871 } 5872 return 5873 aSign ? lt128( b.high, b.low, a.high, a.low ) 5874 : lt128( a.high, a.low, b.high, b.low ); 5875 5876 } 5877 5878 /*---------------------------------------------------------------------------- 5879 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 5880 | be compared, and 0 otherwise. The invalid exception is raised if either 5881 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5882 | Standard for Binary Floating-Point Arithmetic. 5883 *----------------------------------------------------------------------------*/ 5884 5885 int float128_unordered( float128 a, float128 b STATUS_PARAM ) 5886 { 5887 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5888 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5889 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5890 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5891 ) { 5892 float_raise( float_flag_invalid STATUS_VAR); 5893 return 1; 5894 } 5895 return 0; 5896 } 5897 5898 /*---------------------------------------------------------------------------- 5899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5900 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5901 | exception. The comparison is performed according to the IEC/IEEE Standard 5902 | for Binary Floating-Point Arithmetic. 5903 *----------------------------------------------------------------------------*/ 5904 5905 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM ) 5906 { 5907 5908 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5909 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5910 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5911 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5912 ) { 5913 if ( float128_is_signaling_nan( a ) 5914 || float128_is_signaling_nan( b ) ) { 5915 float_raise( float_flag_invalid STATUS_VAR); 5916 } 5917 return 0; 5918 } 5919 return 5920 ( a.low == b.low ) 5921 && ( ( a.high == b.high ) 5922 || ( ( a.low == 0 ) 5923 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5924 ); 5925 5926 } 5927 5928 /*---------------------------------------------------------------------------- 5929 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5930 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5931 | cause an exception. Otherwise, the comparison is performed according to the 5932 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5933 *----------------------------------------------------------------------------*/ 5934 5935 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 5936 { 5937 flag aSign, bSign; 5938 5939 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5940 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5941 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5942 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5943 ) { 5944 if ( float128_is_signaling_nan( a ) 5945 || float128_is_signaling_nan( b ) ) { 5946 float_raise( float_flag_invalid STATUS_VAR); 5947 } 5948 return 0; 5949 } 5950 aSign = extractFloat128Sign( a ); 5951 bSign = extractFloat128Sign( b ); 5952 if ( aSign != bSign ) { 5953 return 5954 aSign 5955 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5956 == 0 ); 5957 } 5958 return 5959 aSign ? le128( b.high, b.low, a.high, a.low ) 5960 : le128( a.high, a.low, b.high, b.low ); 5961 5962 } 5963 5964 /*---------------------------------------------------------------------------- 5965 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5966 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5967 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5968 | Standard for Binary Floating-Point Arithmetic. 5969 *----------------------------------------------------------------------------*/ 5970 5971 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 5972 { 5973 flag aSign, bSign; 5974 5975 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5976 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5977 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5978 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5979 ) { 5980 if ( float128_is_signaling_nan( a ) 5981 || float128_is_signaling_nan( b ) ) { 5982 float_raise( float_flag_invalid STATUS_VAR); 5983 } 5984 return 0; 5985 } 5986 aSign = extractFloat128Sign( a ); 5987 bSign = extractFloat128Sign( b ); 5988 if ( aSign != bSign ) { 5989 return 5990 aSign 5991 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5992 != 0 ); 5993 } 5994 return 5995 aSign ? lt128( b.high, b.low, a.high, a.low ) 5996 : lt128( a.high, a.low, b.high, b.low ); 5997 5998 } 5999 6000 /*---------------------------------------------------------------------------- 6001 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6002 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6003 | comparison is performed according to the IEC/IEEE Standard for Binary 6004 | Floating-Point Arithmetic. 6005 *----------------------------------------------------------------------------*/ 6006 6007 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM ) 6008 { 6009 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6010 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6011 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6012 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6013 ) { 6014 if ( float128_is_signaling_nan( a ) 6015 || float128_is_signaling_nan( b ) ) { 6016 float_raise( float_flag_invalid STATUS_VAR); 6017 } 6018 return 1; 6019 } 6020 return 0; 6021 } 6022 6023 #endif 6024 6025 /* misc functions */ 6026 float32 uint32_to_float32( unsigned int a STATUS_PARAM ) 6027 { 6028 return int64_to_float32(a STATUS_VAR); 6029 } 6030 6031 float64 uint32_to_float64( unsigned int a STATUS_PARAM ) 6032 { 6033 return int64_to_float64(a STATUS_VAR); 6034 } 6035 6036 unsigned int float32_to_uint32( float32 a STATUS_PARAM ) 6037 { 6038 int64_t v; 6039 unsigned int res; 6040 6041 v = float32_to_int64(a STATUS_VAR); 6042 if (v < 0) { 6043 res = 0; 6044 float_raise( float_flag_invalid STATUS_VAR); 6045 } else if (v > 0xffffffff) { 6046 res = 0xffffffff; 6047 float_raise( float_flag_invalid STATUS_VAR); 6048 } else { 6049 res = v; 6050 } 6051 return res; 6052 } 6053 6054 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 6055 { 6056 int64_t v; 6057 unsigned int res; 6058 6059 v = float32_to_int64_round_to_zero(a STATUS_VAR); 6060 if (v < 0) { 6061 res = 0; 6062 float_raise( float_flag_invalid STATUS_VAR); 6063 } else if (v > 0xffffffff) { 6064 res = 0xffffffff; 6065 float_raise( float_flag_invalid STATUS_VAR); 6066 } else { 6067 res = v; 6068 } 6069 return res; 6070 } 6071 6072 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM ) 6073 { 6074 int64_t v; 6075 unsigned int res; 6076 6077 v = float32_to_int64_round_to_zero(a STATUS_VAR); 6078 if (v < 0) { 6079 res = 0; 6080 float_raise( float_flag_invalid STATUS_VAR); 6081 } else if (v > 0xffff) { 6082 res = 0xffff; 6083 float_raise( float_flag_invalid STATUS_VAR); 6084 } else { 6085 res = v; 6086 } 6087 return res; 6088 } 6089 6090 unsigned int float64_to_uint32( float64 a STATUS_PARAM ) 6091 { 6092 int64_t v; 6093 unsigned int res; 6094 6095 v = float64_to_int64(a STATUS_VAR); 6096 if (v < 0) { 6097 res = 0; 6098 float_raise( float_flag_invalid STATUS_VAR); 6099 } else if (v > 0xffffffff) { 6100 res = 0xffffffff; 6101 float_raise( float_flag_invalid STATUS_VAR); 6102 } else { 6103 res = v; 6104 } 6105 return res; 6106 } 6107 6108 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 6109 { 6110 int64_t v; 6111 unsigned int res; 6112 6113 v = float64_to_int64_round_to_zero(a STATUS_VAR); 6114 if (v < 0) { 6115 res = 0; 6116 float_raise( float_flag_invalid STATUS_VAR); 6117 } else if (v > 0xffffffff) { 6118 res = 0xffffffff; 6119 float_raise( float_flag_invalid STATUS_VAR); 6120 } else { 6121 res = v; 6122 } 6123 return res; 6124 } 6125 6126 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM ) 6127 { 6128 int64_t v; 6129 unsigned int res; 6130 6131 v = float64_to_int64_round_to_zero(a STATUS_VAR); 6132 if (v < 0) { 6133 res = 0; 6134 float_raise( float_flag_invalid STATUS_VAR); 6135 } else if (v > 0xffff) { 6136 res = 0xffff; 6137 float_raise( float_flag_invalid STATUS_VAR); 6138 } else { 6139 res = v; 6140 } 6141 return res; 6142 } 6143 6144 /* FIXME: This looks broken. */ 6145 uint64_t float64_to_uint64 (float64 a STATUS_PARAM) 6146 { 6147 int64_t v; 6148 6149 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 6150 v += float64_val(a); 6151 v = float64_to_int64(make_float64(v) STATUS_VAR); 6152 6153 return v - INT64_MIN; 6154 } 6155 6156 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 6157 { 6158 int64_t v; 6159 6160 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 6161 v += float64_val(a); 6162 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); 6163 6164 return v - INT64_MIN; 6165 } 6166 6167 #define COMPARE(s, nan_exp) \ 6168 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ 6169 int is_quiet STATUS_PARAM ) \ 6170 { \ 6171 flag aSign, bSign; \ 6172 uint ## s ## _t av, bv; \ 6173 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 6174 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 6175 \ 6176 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 6177 extractFloat ## s ## Frac( a ) ) || \ 6178 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 6179 extractFloat ## s ## Frac( b ) )) { \ 6180 if (!is_quiet || \ 6181 float ## s ## _is_signaling_nan( a ) || \ 6182 float ## s ## _is_signaling_nan( b ) ) { \ 6183 float_raise( float_flag_invalid STATUS_VAR); \ 6184 } \ 6185 return float_relation_unordered; \ 6186 } \ 6187 aSign = extractFloat ## s ## Sign( a ); \ 6188 bSign = extractFloat ## s ## Sign( b ); \ 6189 av = float ## s ## _val(a); \ 6190 bv = float ## s ## _val(b); \ 6191 if ( aSign != bSign ) { \ 6192 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 6193 /* zero case */ \ 6194 return float_relation_equal; \ 6195 } else { \ 6196 return 1 - (2 * aSign); \ 6197 } \ 6198 } else { \ 6199 if (av == bv) { \ 6200 return float_relation_equal; \ 6201 } else { \ 6202 return 1 - 2 * (aSign ^ ( av < bv )); \ 6203 } \ 6204 } \ 6205 } \ 6206 \ 6207 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 6208 { \ 6209 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 6210 } \ 6211 \ 6212 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 6213 { \ 6214 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 6215 } 6216 6217 COMPARE(32, 0xff) 6218 COMPARE(64, 0x7ff) 6219 6220 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b, 6221 int is_quiet STATUS_PARAM ) 6222 { 6223 flag aSign, bSign; 6224 6225 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6226 ( extractFloatx80Frac( a )<<1 ) ) || 6227 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6228 ( extractFloatx80Frac( b )<<1 ) )) { 6229 if (!is_quiet || 6230 floatx80_is_signaling_nan( a ) || 6231 floatx80_is_signaling_nan( b ) ) { 6232 float_raise( float_flag_invalid STATUS_VAR); 6233 } 6234 return float_relation_unordered; 6235 } 6236 aSign = extractFloatx80Sign( a ); 6237 bSign = extractFloatx80Sign( b ); 6238 if ( aSign != bSign ) { 6239 6240 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6241 ( ( a.low | b.low ) == 0 ) ) { 6242 /* zero case */ 6243 return float_relation_equal; 6244 } else { 6245 return 1 - (2 * aSign); 6246 } 6247 } else { 6248 if (a.low == b.low && a.high == b.high) { 6249 return float_relation_equal; 6250 } else { 6251 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6252 } 6253 } 6254 } 6255 6256 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM ) 6257 { 6258 return floatx80_compare_internal(a, b, 0 STATUS_VAR); 6259 } 6260 6261 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 6262 { 6263 return floatx80_compare_internal(a, b, 1 STATUS_VAR); 6264 } 6265 6266 INLINE int float128_compare_internal( float128 a, float128 b, 6267 int is_quiet STATUS_PARAM ) 6268 { 6269 flag aSign, bSign; 6270 6271 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6272 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6273 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6274 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6275 if (!is_quiet || 6276 float128_is_signaling_nan( a ) || 6277 float128_is_signaling_nan( b ) ) { 6278 float_raise( float_flag_invalid STATUS_VAR); 6279 } 6280 return float_relation_unordered; 6281 } 6282 aSign = extractFloat128Sign( a ); 6283 bSign = extractFloat128Sign( b ); 6284 if ( aSign != bSign ) { 6285 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6286 /* zero case */ 6287 return float_relation_equal; 6288 } else { 6289 return 1 - (2 * aSign); 6290 } 6291 } else { 6292 if (a.low == b.low && a.high == b.high) { 6293 return float_relation_equal; 6294 } else { 6295 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6296 } 6297 } 6298 } 6299 6300 int float128_compare( float128 a, float128 b STATUS_PARAM ) 6301 { 6302 return float128_compare_internal(a, b, 0 STATUS_VAR); 6303 } 6304 6305 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 6306 { 6307 return float128_compare_internal(a, b, 1 STATUS_VAR); 6308 } 6309 6310 /* min() and max() functions. These can't be implemented as 6311 * 'compare and pick one input' because that would mishandle 6312 * NaNs and +0 vs -0. 6313 */ 6314 #define MINMAX(s, nan_exp) \ 6315 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 6316 int ismin STATUS_PARAM ) \ 6317 { \ 6318 flag aSign, bSign; \ 6319 uint ## s ## _t av, bv; \ 6320 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 6321 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 6322 if (float ## s ## _is_any_nan(a) || \ 6323 float ## s ## _is_any_nan(b)) { \ 6324 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \ 6325 } \ 6326 aSign = extractFloat ## s ## Sign(a); \ 6327 bSign = extractFloat ## s ## Sign(b); \ 6328 av = float ## s ## _val(a); \ 6329 bv = float ## s ## _val(b); \ 6330 if (aSign != bSign) { \ 6331 if (ismin) { \ 6332 return aSign ? a : b; \ 6333 } else { \ 6334 return aSign ? b : a; \ 6335 } \ 6336 } else { \ 6337 if (ismin) { \ 6338 return (aSign ^ (av < bv)) ? a : b; \ 6339 } else { \ 6340 return (aSign ^ (av < bv)) ? b : a; \ 6341 } \ 6342 } \ 6343 } \ 6344 \ 6345 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \ 6346 { \ 6347 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \ 6348 } \ 6349 \ 6350 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \ 6351 { \ 6352 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \ 6353 } 6354 6355 MINMAX(32, 0xff) 6356 MINMAX(64, 0x7ff) 6357 6358 6359 /* Multiply A by 2 raised to the power N. */ 6360 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 6361 { 6362 flag aSign; 6363 int16_t aExp; 6364 uint32_t aSig; 6365 6366 a = float32_squash_input_denormal(a STATUS_VAR); 6367 aSig = extractFloat32Frac( a ); 6368 aExp = extractFloat32Exp( a ); 6369 aSign = extractFloat32Sign( a ); 6370 6371 if ( aExp == 0xFF ) { 6372 if ( aSig ) { 6373 return propagateFloat32NaN( a, a STATUS_VAR ); 6374 } 6375 return a; 6376 } 6377 if ( aExp != 0 ) 6378 aSig |= 0x00800000; 6379 else if ( aSig == 0 ) 6380 return a; 6381 6382 if (n > 0x200) { 6383 n = 0x200; 6384 } else if (n < -0x200) { 6385 n = -0x200; 6386 } 6387 6388 aExp += n - 1; 6389 aSig <<= 7; 6390 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 6391 } 6392 6393 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 6394 { 6395 flag aSign; 6396 int16_t aExp; 6397 uint64_t aSig; 6398 6399 a = float64_squash_input_denormal(a STATUS_VAR); 6400 aSig = extractFloat64Frac( a ); 6401 aExp = extractFloat64Exp( a ); 6402 aSign = extractFloat64Sign( a ); 6403 6404 if ( aExp == 0x7FF ) { 6405 if ( aSig ) { 6406 return propagateFloat64NaN( a, a STATUS_VAR ); 6407 } 6408 return a; 6409 } 6410 if ( aExp != 0 ) 6411 aSig |= LIT64( 0x0010000000000000 ); 6412 else if ( aSig == 0 ) 6413 return a; 6414 6415 if (n > 0x1000) { 6416 n = 0x1000; 6417 } else if (n < -0x1000) { 6418 n = -0x1000; 6419 } 6420 6421 aExp += n - 1; 6422 aSig <<= 10; 6423 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 6424 } 6425 6426 #ifdef FLOATX80 6427 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 6428 { 6429 flag aSign; 6430 int32_t aExp; 6431 uint64_t aSig; 6432 6433 aSig = extractFloatx80Frac( a ); 6434 aExp = extractFloatx80Exp( a ); 6435 aSign = extractFloatx80Sign( a ); 6436 6437 if ( aExp == 0x7FFF ) { 6438 if ( aSig<<1 ) { 6439 return propagateFloatx80NaN( a, a STATUS_VAR ); 6440 } 6441 return a; 6442 } 6443 6444 if (aExp == 0 && aSig == 0) 6445 return a; 6446 6447 if (n > 0x10000) { 6448 n = 0x10000; 6449 } else if (n < -0x10000) { 6450 n = -0x10000; 6451 } 6452 6453 aExp += n; 6454 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), 6455 aSign, aExp, aSig, 0 STATUS_VAR ); 6456 } 6457 #endif 6458 6459 #ifdef FLOAT128 6460 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 6461 { 6462 flag aSign; 6463 int32_t aExp; 6464 uint64_t aSig0, aSig1; 6465 6466 aSig1 = extractFloat128Frac1( a ); 6467 aSig0 = extractFloat128Frac0( a ); 6468 aExp = extractFloat128Exp( a ); 6469 aSign = extractFloat128Sign( a ); 6470 if ( aExp == 0x7FFF ) { 6471 if ( aSig0 | aSig1 ) { 6472 return propagateFloat128NaN( a, a STATUS_VAR ); 6473 } 6474 return a; 6475 } 6476 if ( aExp != 0 ) 6477 aSig0 |= LIT64( 0x0001000000000000 ); 6478 else if ( aSig0 == 0 && aSig1 == 0 ) 6479 return a; 6480 6481 if (n > 0x10000) { 6482 n = 0x10000; 6483 } else if (n < -0x10000) { 6484 n = -0x10000; 6485 } 6486 6487 aExp += n - 1; 6488 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 6489 STATUS_VAR ); 6490 6491 } 6492 #endif 6493