1 2 /*============================================================================ 3 4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 5 Package, Release 2b. 6 7 Written by John R. Hauser. This work was made possible in part by the 8 International Computer Science Institute, located at Suite 600, 1947 Center 9 Street, Berkeley, California 94704. Funding was partially provided by the 10 National Science Foundation under grant MIP-9311980. The original version 11 of this code was written as part of a project to build a fixed-point vector 12 processor in collaboration with the University of California at Berkeley, 13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 15 arithmetic/SoftFloat.html'. 16 17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 25 26 Derivative works are acceptable, even for commercial purposes, so long as 27 (1) the source code for the derivative work includes prominent notice that 28 the work is derivative, and (2) the source code includes prominent notice with 29 these four paragraphs for those parts of this code that are retained. 30 31 =============================================================================*/ 32 33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also 34 be flushed to zero. */ 35 #include "softfloat.h" 36 37 /*---------------------------------------------------------------------------- 38 | Primitive arithmetic functions, including multi-word arithmetic, and 39 | division and square root approximations. (Can be specialized to target if 40 | desired.) 41 *----------------------------------------------------------------------------*/ 42 #include "softfloat-macros.h" 43 44 /*---------------------------------------------------------------------------- 45 | Functions and definitions to determine: (1) whether tininess for underflow 46 | is detected before or after rounding by default, (2) what (if anything) 47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 49 | are propagated from function inputs to output. These details are target- 50 | specific. 51 *----------------------------------------------------------------------------*/ 52 #include "softfloat-specialize.h" 53 54 void set_float_rounding_mode(int val STATUS_PARAM) 55 { 56 STATUS(float_rounding_mode) = val; 57 } 58 59 void set_float_exception_flags(int val STATUS_PARAM) 60 { 61 STATUS(float_exception_flags) = val; 62 } 63 64 #ifdef FLOATX80 65 void set_floatx80_rounding_precision(int val STATUS_PARAM) 66 { 67 STATUS(floatx80_rounding_precision) = val; 68 } 69 #endif 70 71 /*---------------------------------------------------------------------------- 72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 73 | and 7, and returns the properly rounded 32-bit integer corresponding to the 74 | input. If `zSign' is 1, the input is negated before being converted to an 75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 76 | is simply rounded to an integer, with the inexact exception raised if the 77 | input cannot be represented exactly as an integer. However, if the fixed- 78 | point input is too large, the invalid exception is raised and the largest 79 | positive or negative integer is returned. 80 *----------------------------------------------------------------------------*/ 81 82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM) 83 { 84 int8 roundingMode; 85 flag roundNearestEven; 86 int8 roundIncrement, roundBits; 87 int32 z; 88 89 roundingMode = STATUS(float_rounding_mode); 90 roundNearestEven = ( roundingMode == float_round_nearest_even ); 91 roundIncrement = 0x40; 92 if ( ! roundNearestEven ) { 93 if ( roundingMode == float_round_to_zero ) { 94 roundIncrement = 0; 95 } 96 else { 97 roundIncrement = 0x7F; 98 if ( zSign ) { 99 if ( roundingMode == float_round_up ) roundIncrement = 0; 100 } 101 else { 102 if ( roundingMode == float_round_down ) roundIncrement = 0; 103 } 104 } 105 } 106 roundBits = absZ & 0x7F; 107 absZ = ( absZ + roundIncrement )>>7; 108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 109 z = absZ; 110 if ( zSign ) z = - z; 111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 112 float_raise( float_flag_invalid STATUS_VAR); 113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 114 } 115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 116 return z; 117 118 } 119 120 /*---------------------------------------------------------------------------- 121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 122 | `absZ1', with binary point between bits 63 and 64 (between the input words), 123 | and returns the properly rounded 64-bit integer corresponding to the input. 124 | If `zSign' is 1, the input is negated before being converted to an integer. 125 | Ordinarily, the fixed-point input is simply rounded to an integer, with 126 | the inexact exception raised if the input cannot be represented exactly as 127 | an integer. However, if the fixed-point input is too large, the invalid 128 | exception is raised and the largest positive or negative integer is 129 | returned. 130 *----------------------------------------------------------------------------*/ 131 132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM) 133 { 134 int8 roundingMode; 135 flag roundNearestEven, increment; 136 int64 z; 137 138 roundingMode = STATUS(float_rounding_mode); 139 roundNearestEven = ( roundingMode == float_round_nearest_even ); 140 increment = ( (sbits64) absZ1 < 0 ); 141 if ( ! roundNearestEven ) { 142 if ( roundingMode == float_round_to_zero ) { 143 increment = 0; 144 } 145 else { 146 if ( zSign ) { 147 increment = ( roundingMode == float_round_down ) && absZ1; 148 } 149 else { 150 increment = ( roundingMode == float_round_up ) && absZ1; 151 } 152 } 153 } 154 if ( increment ) { 155 ++absZ0; 156 if ( absZ0 == 0 ) goto overflow; 157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 158 } 159 z = absZ0; 160 if ( zSign ) z = - z; 161 if ( z && ( ( z < 0 ) ^ zSign ) ) { 162 overflow: 163 float_raise( float_flag_invalid STATUS_VAR); 164 return 165 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 166 : LIT64( 0x7FFFFFFFFFFFFFFF ); 167 } 168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 169 return z; 170 171 } 172 173 /*---------------------------------------------------------------------------- 174 | Returns the fraction bits of the single-precision floating-point value `a'. 175 *----------------------------------------------------------------------------*/ 176 177 INLINE bits32 extractFloat32Frac( float32 a ) 178 { 179 180 return float32_val(a) & 0x007FFFFF; 181 182 } 183 184 /*---------------------------------------------------------------------------- 185 | Returns the exponent bits of the single-precision floating-point value `a'. 186 *----------------------------------------------------------------------------*/ 187 188 INLINE int16 extractFloat32Exp( float32 a ) 189 { 190 191 return ( float32_val(a)>>23 ) & 0xFF; 192 193 } 194 195 /*---------------------------------------------------------------------------- 196 | Returns the sign bit of the single-precision floating-point value `a'. 197 *----------------------------------------------------------------------------*/ 198 199 INLINE flag extractFloat32Sign( float32 a ) 200 { 201 202 return float32_val(a)>>31; 203 204 } 205 206 /*---------------------------------------------------------------------------- 207 | Normalizes the subnormal single-precision floating-point value represented 208 | by the denormalized significand `aSig'. The normalized exponent and 209 | significand are stored at the locations pointed to by `zExpPtr' and 210 | `zSigPtr', respectively. 211 *----------------------------------------------------------------------------*/ 212 213 static void 214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 215 { 216 int8 shiftCount; 217 218 shiftCount = countLeadingZeros32( aSig ) - 8; 219 *zSigPtr = aSig<<shiftCount; 220 *zExpPtr = 1 - shiftCount; 221 222 } 223 224 /*---------------------------------------------------------------------------- 225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 226 | single-precision floating-point value, returning the result. After being 227 | shifted into the proper positions, the three fields are simply added 228 | together to form the result. This means that any integer portion of `zSig' 229 | will be added into the exponent. Since a properly normalized significand 230 | will have an integer portion equal to 1, the `zExp' input should be 1 less 231 | than the desired result exponent whenever `zSig' is a complete, normalized 232 | significand. 233 *----------------------------------------------------------------------------*/ 234 235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 236 { 237 238 return make_float32( 239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig); 240 241 } 242 243 /*---------------------------------------------------------------------------- 244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 245 | and significand `zSig', and returns the proper single-precision floating- 246 | point value corresponding to the abstract input. Ordinarily, the abstract 247 | value is simply rounded and packed into the single-precision format, with 248 | the inexact exception raised if the abstract input cannot be represented 249 | exactly. However, if the abstract value is too large, the overflow and 250 | inexact exceptions are raised and an infinity or maximal finite value is 251 | returned. If the abstract value is too small, the input value is rounded to 252 | a subnormal number, and the underflow and inexact exceptions are raised if 253 | the abstract input cannot be represented exactly as a subnormal single- 254 | precision floating-point number. 255 | The input significand `zSig' has its binary point between bits 30 256 | and 29, which is 7 bits to the left of the usual location. This shifted 257 | significand must be normalized or smaller. If `zSig' is not normalized, 258 | `zExp' must be 0; in that case, the result returned is a subnormal number, 259 | and it must not require rounding. In the usual case that `zSig' is 260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 261 | The handling of underflow and overflow follows the IEC/IEEE Standard for 262 | Binary Floating-Point Arithmetic. 263 *----------------------------------------------------------------------------*/ 264 265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 266 { 267 int8 roundingMode; 268 flag roundNearestEven; 269 int8 roundIncrement, roundBits; 270 flag isTiny; 271 272 roundingMode = STATUS(float_rounding_mode); 273 roundNearestEven = ( roundingMode == float_round_nearest_even ); 274 roundIncrement = 0x40; 275 if ( ! roundNearestEven ) { 276 if ( roundingMode == float_round_to_zero ) { 277 roundIncrement = 0; 278 } 279 else { 280 roundIncrement = 0x7F; 281 if ( zSign ) { 282 if ( roundingMode == float_round_up ) roundIncrement = 0; 283 } 284 else { 285 if ( roundingMode == float_round_down ) roundIncrement = 0; 286 } 287 } 288 } 289 roundBits = zSig & 0x7F; 290 if ( 0xFD <= (bits16) zExp ) { 291 if ( ( 0xFD < zExp ) 292 || ( ( zExp == 0xFD ) 293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 294 ) { 295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 297 } 298 if ( zExp < 0 ) { 299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); 300 isTiny = 301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 302 || ( zExp < -1 ) 303 || ( zSig + roundIncrement < 0x80000000 ); 304 shift32RightJamming( zSig, - zExp, &zSig ); 305 zExp = 0; 306 roundBits = zSig & 0x7F; 307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 308 } 309 } 310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 311 zSig = ( zSig + roundIncrement )>>7; 312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 313 if ( zSig == 0 ) zExp = 0; 314 return packFloat32( zSign, zExp, zSig ); 315 316 } 317 318 /*---------------------------------------------------------------------------- 319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 320 | and significand `zSig', and returns the proper single-precision floating- 321 | point value corresponding to the abstract input. This routine is just like 322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 324 | floating-point exponent. 325 *----------------------------------------------------------------------------*/ 326 327 static float32 328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 329 { 330 int8 shiftCount; 331 332 shiftCount = countLeadingZeros32( zSig ) - 1; 333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 334 335 } 336 337 /*---------------------------------------------------------------------------- 338 | Returns the fraction bits of the double-precision floating-point value `a'. 339 *----------------------------------------------------------------------------*/ 340 341 INLINE bits64 extractFloat64Frac( float64 a ) 342 { 343 344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 345 346 } 347 348 /*---------------------------------------------------------------------------- 349 | Returns the exponent bits of the double-precision floating-point value `a'. 350 *----------------------------------------------------------------------------*/ 351 352 INLINE int16 extractFloat64Exp( float64 a ) 353 { 354 355 return ( float64_val(a)>>52 ) & 0x7FF; 356 357 } 358 359 /*---------------------------------------------------------------------------- 360 | Returns the sign bit of the double-precision floating-point value `a'. 361 *----------------------------------------------------------------------------*/ 362 363 INLINE flag extractFloat64Sign( float64 a ) 364 { 365 366 return float64_val(a)>>63; 367 368 } 369 370 /*---------------------------------------------------------------------------- 371 | Normalizes the subnormal double-precision floating-point value represented 372 | by the denormalized significand `aSig'. The normalized exponent and 373 | significand are stored at the locations pointed to by `zExpPtr' and 374 | `zSigPtr', respectively. 375 *----------------------------------------------------------------------------*/ 376 377 static void 378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 379 { 380 int8 shiftCount; 381 382 shiftCount = countLeadingZeros64( aSig ) - 11; 383 *zSigPtr = aSig<<shiftCount; 384 *zExpPtr = 1 - shiftCount; 385 386 } 387 388 /*---------------------------------------------------------------------------- 389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 390 | double-precision floating-point value, returning the result. After being 391 | shifted into the proper positions, the three fields are simply added 392 | together to form the result. This means that any integer portion of `zSig' 393 | will be added into the exponent. Since a properly normalized significand 394 | will have an integer portion equal to 1, the `zExp' input should be 1 less 395 | than the desired result exponent whenever `zSig' is a complete, normalized 396 | significand. 397 *----------------------------------------------------------------------------*/ 398 399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 400 { 401 402 return make_float64( 403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig); 404 405 } 406 407 /*---------------------------------------------------------------------------- 408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 409 | and significand `zSig', and returns the proper double-precision floating- 410 | point value corresponding to the abstract input. Ordinarily, the abstract 411 | value is simply rounded and packed into the double-precision format, with 412 | the inexact exception raised if the abstract input cannot be represented 413 | exactly. However, if the abstract value is too large, the overflow and 414 | inexact exceptions are raised and an infinity or maximal finite value is 415 | returned. If the abstract value is too small, the input value is rounded 416 | to a subnormal number, and the underflow and inexact exceptions are raised 417 | if the abstract input cannot be represented exactly as a subnormal double- 418 | precision floating-point number. 419 | The input significand `zSig' has its binary point between bits 62 420 | and 61, which is 10 bits to the left of the usual location. This shifted 421 | significand must be normalized or smaller. If `zSig' is not normalized, 422 | `zExp' must be 0; in that case, the result returned is a subnormal number, 423 | and it must not require rounding. In the usual case that `zSig' is 424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 425 | The handling of underflow and overflow follows the IEC/IEEE Standard for 426 | Binary Floating-Point Arithmetic. 427 *----------------------------------------------------------------------------*/ 428 429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 430 { 431 int8 roundingMode; 432 flag roundNearestEven; 433 int16 roundIncrement, roundBits; 434 flag isTiny; 435 436 roundingMode = STATUS(float_rounding_mode); 437 roundNearestEven = ( roundingMode == float_round_nearest_even ); 438 roundIncrement = 0x200; 439 if ( ! roundNearestEven ) { 440 if ( roundingMode == float_round_to_zero ) { 441 roundIncrement = 0; 442 } 443 else { 444 roundIncrement = 0x3FF; 445 if ( zSign ) { 446 if ( roundingMode == float_round_up ) roundIncrement = 0; 447 } 448 else { 449 if ( roundingMode == float_round_down ) roundIncrement = 0; 450 } 451 } 452 } 453 roundBits = zSig & 0x3FF; 454 if ( 0x7FD <= (bits16) zExp ) { 455 if ( ( 0x7FD < zExp ) 456 || ( ( zExp == 0x7FD ) 457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 458 ) { 459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 461 } 462 if ( zExp < 0 ) { 463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); 464 isTiny = 465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 466 || ( zExp < -1 ) 467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 468 shift64RightJamming( zSig, - zExp, &zSig ); 469 zExp = 0; 470 roundBits = zSig & 0x3FF; 471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 472 } 473 } 474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 475 zSig = ( zSig + roundIncrement )>>10; 476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 477 if ( zSig == 0 ) zExp = 0; 478 return packFloat64( zSign, zExp, zSig ); 479 480 } 481 482 /*---------------------------------------------------------------------------- 483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 484 | and significand `zSig', and returns the proper double-precision floating- 485 | point value corresponding to the abstract input. This routine is just like 486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 488 | floating-point exponent. 489 *----------------------------------------------------------------------------*/ 490 491 static float64 492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 493 { 494 int8 shiftCount; 495 496 shiftCount = countLeadingZeros64( zSig ) - 1; 497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 498 499 } 500 501 #ifdef FLOATX80 502 503 /*---------------------------------------------------------------------------- 504 | Returns the fraction bits of the extended double-precision floating-point 505 | value `a'. 506 *----------------------------------------------------------------------------*/ 507 508 INLINE bits64 extractFloatx80Frac( floatx80 a ) 509 { 510 511 return a.low; 512 513 } 514 515 /*---------------------------------------------------------------------------- 516 | Returns the exponent bits of the extended double-precision floating-point 517 | value `a'. 518 *----------------------------------------------------------------------------*/ 519 520 INLINE int32 extractFloatx80Exp( floatx80 a ) 521 { 522 523 return a.high & 0x7FFF; 524 525 } 526 527 /*---------------------------------------------------------------------------- 528 | Returns the sign bit of the extended double-precision floating-point value 529 | `a'. 530 *----------------------------------------------------------------------------*/ 531 532 INLINE flag extractFloatx80Sign( floatx80 a ) 533 { 534 535 return a.high>>15; 536 537 } 538 539 /*---------------------------------------------------------------------------- 540 | Normalizes the subnormal extended double-precision floating-point value 541 | represented by the denormalized significand `aSig'. The normalized exponent 542 | and significand are stored at the locations pointed to by `zExpPtr' and 543 | `zSigPtr', respectively. 544 *----------------------------------------------------------------------------*/ 545 546 static void 547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 548 { 549 int8 shiftCount; 550 551 shiftCount = countLeadingZeros64( aSig ); 552 *zSigPtr = aSig<<shiftCount; 553 *zExpPtr = 1 - shiftCount; 554 555 } 556 557 /*---------------------------------------------------------------------------- 558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 559 | extended double-precision floating-point value, returning the result. 560 *----------------------------------------------------------------------------*/ 561 562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 563 { 564 floatx80 z; 565 566 z.low = zSig; 567 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 568 return z; 569 570 } 571 572 /*---------------------------------------------------------------------------- 573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 574 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 575 | and returns the proper extended double-precision floating-point value 576 | corresponding to the abstract input. Ordinarily, the abstract value is 577 | rounded and packed into the extended double-precision format, with the 578 | inexact exception raised if the abstract input cannot be represented 579 | exactly. However, if the abstract value is too large, the overflow and 580 | inexact exceptions are raised and an infinity or maximal finite value is 581 | returned. If the abstract value is too small, the input value is rounded to 582 | a subnormal number, and the underflow and inexact exceptions are raised if 583 | the abstract input cannot be represented exactly as a subnormal extended 584 | double-precision floating-point number. 585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 586 | number of bits as single or double precision, respectively. Otherwise, the 587 | result is rounded to the full precision of the extended double-precision 588 | format. 589 | The input significand must be normalized or smaller. If the input 590 | significand is not normalized, `zExp' must be 0; in that case, the result 591 | returned is a subnormal number, and it must not require rounding. The 592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 593 | Floating-Point Arithmetic. 594 *----------------------------------------------------------------------------*/ 595 596 static floatx80 597 roundAndPackFloatx80( 598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 599 STATUS_PARAM) 600 { 601 int8 roundingMode; 602 flag roundNearestEven, increment, isTiny; 603 int64 roundIncrement, roundMask, roundBits; 604 605 roundingMode = STATUS(float_rounding_mode); 606 roundNearestEven = ( roundingMode == float_round_nearest_even ); 607 if ( roundingPrecision == 80 ) goto precision80; 608 if ( roundingPrecision == 64 ) { 609 roundIncrement = LIT64( 0x0000000000000400 ); 610 roundMask = LIT64( 0x00000000000007FF ); 611 } 612 else if ( roundingPrecision == 32 ) { 613 roundIncrement = LIT64( 0x0000008000000000 ); 614 roundMask = LIT64( 0x000000FFFFFFFFFF ); 615 } 616 else { 617 goto precision80; 618 } 619 zSig0 |= ( zSig1 != 0 ); 620 if ( ! roundNearestEven ) { 621 if ( roundingMode == float_round_to_zero ) { 622 roundIncrement = 0; 623 } 624 else { 625 roundIncrement = roundMask; 626 if ( zSign ) { 627 if ( roundingMode == float_round_up ) roundIncrement = 0; 628 } 629 else { 630 if ( roundingMode == float_round_down ) roundIncrement = 0; 631 } 632 } 633 } 634 roundBits = zSig0 & roundMask; 635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 636 if ( ( 0x7FFE < zExp ) 637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 638 ) { 639 goto overflow; 640 } 641 if ( zExp <= 0 ) { 642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 ); 643 isTiny = 644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 645 || ( zExp < 0 ) 646 || ( zSig0 <= zSig0 + roundIncrement ); 647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 648 zExp = 0; 649 roundBits = zSig0 & roundMask; 650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 652 zSig0 += roundIncrement; 653 if ( (sbits64) zSig0 < 0 ) zExp = 1; 654 roundIncrement = roundMask + 1; 655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 656 roundMask |= roundIncrement; 657 } 658 zSig0 &= ~ roundMask; 659 return packFloatx80( zSign, zExp, zSig0 ); 660 } 661 } 662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 663 zSig0 += roundIncrement; 664 if ( zSig0 < roundIncrement ) { 665 ++zExp; 666 zSig0 = LIT64( 0x8000000000000000 ); 667 } 668 roundIncrement = roundMask + 1; 669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 670 roundMask |= roundIncrement; 671 } 672 zSig0 &= ~ roundMask; 673 if ( zSig0 == 0 ) zExp = 0; 674 return packFloatx80( zSign, zExp, zSig0 ); 675 precision80: 676 increment = ( (sbits64) zSig1 < 0 ); 677 if ( ! roundNearestEven ) { 678 if ( roundingMode == float_round_to_zero ) { 679 increment = 0; 680 } 681 else { 682 if ( zSign ) { 683 increment = ( roundingMode == float_round_down ) && zSig1; 684 } 685 else { 686 increment = ( roundingMode == float_round_up ) && zSig1; 687 } 688 } 689 } 690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 691 if ( ( 0x7FFE < zExp ) 692 || ( ( zExp == 0x7FFE ) 693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 694 && increment 695 ) 696 ) { 697 roundMask = 0; 698 overflow: 699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 700 if ( ( roundingMode == float_round_to_zero ) 701 || ( zSign && ( roundingMode == float_round_up ) ) 702 || ( ! zSign && ( roundingMode == float_round_down ) ) 703 ) { 704 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 705 } 706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 707 } 708 if ( zExp <= 0 ) { 709 isTiny = 710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 711 || ( zExp < 0 ) 712 || ! increment 713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 715 zExp = 0; 716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 718 if ( roundNearestEven ) { 719 increment = ( (sbits64) zSig1 < 0 ); 720 } 721 else { 722 if ( zSign ) { 723 increment = ( roundingMode == float_round_down ) && zSig1; 724 } 725 else { 726 increment = ( roundingMode == float_round_up ) && zSig1; 727 } 728 } 729 if ( increment ) { 730 ++zSig0; 731 zSig0 &= 732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 733 if ( (sbits64) zSig0 < 0 ) zExp = 1; 734 } 735 return packFloatx80( zSign, zExp, zSig0 ); 736 } 737 } 738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 739 if ( increment ) { 740 ++zSig0; 741 if ( zSig0 == 0 ) { 742 ++zExp; 743 zSig0 = LIT64( 0x8000000000000000 ); 744 } 745 else { 746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 747 } 748 } 749 else { 750 if ( zSig0 == 0 ) zExp = 0; 751 } 752 return packFloatx80( zSign, zExp, zSig0 ); 753 754 } 755 756 /*---------------------------------------------------------------------------- 757 | Takes an abstract floating-point value having sign `zSign', exponent 758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 759 | and returns the proper extended double-precision floating-point value 760 | corresponding to the abstract input. This routine is just like 761 | `roundAndPackFloatx80' except that the input significand does not have to be 762 | normalized. 763 *----------------------------------------------------------------------------*/ 764 765 static floatx80 766 normalizeRoundAndPackFloatx80( 767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 768 STATUS_PARAM) 769 { 770 int8 shiftCount; 771 772 if ( zSig0 == 0 ) { 773 zSig0 = zSig1; 774 zSig1 = 0; 775 zExp -= 64; 776 } 777 shiftCount = countLeadingZeros64( zSig0 ); 778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 779 zExp -= shiftCount; 780 return 781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 782 783 } 784 785 #endif 786 787 #ifdef FLOAT128 788 789 /*---------------------------------------------------------------------------- 790 | Returns the least-significant 64 fraction bits of the quadruple-precision 791 | floating-point value `a'. 792 *----------------------------------------------------------------------------*/ 793 794 INLINE bits64 extractFloat128Frac1( float128 a ) 795 { 796 797 return a.low; 798 799 } 800 801 /*---------------------------------------------------------------------------- 802 | Returns the most-significant 48 fraction bits of the quadruple-precision 803 | floating-point value `a'. 804 *----------------------------------------------------------------------------*/ 805 806 INLINE bits64 extractFloat128Frac0( float128 a ) 807 { 808 809 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 810 811 } 812 813 /*---------------------------------------------------------------------------- 814 | Returns the exponent bits of the quadruple-precision floating-point value 815 | `a'. 816 *----------------------------------------------------------------------------*/ 817 818 INLINE int32 extractFloat128Exp( float128 a ) 819 { 820 821 return ( a.high>>48 ) & 0x7FFF; 822 823 } 824 825 /*---------------------------------------------------------------------------- 826 | Returns the sign bit of the quadruple-precision floating-point value `a'. 827 *----------------------------------------------------------------------------*/ 828 829 INLINE flag extractFloat128Sign( float128 a ) 830 { 831 832 return a.high>>63; 833 834 } 835 836 /*---------------------------------------------------------------------------- 837 | Normalizes the subnormal quadruple-precision floating-point value 838 | represented by the denormalized significand formed by the concatenation of 839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 841 | significand are stored at the location pointed to by `zSig0Ptr', and the 842 | least significant 64 bits of the normalized significand are stored at the 843 | location pointed to by `zSig1Ptr'. 844 *----------------------------------------------------------------------------*/ 845 846 static void 847 normalizeFloat128Subnormal( 848 bits64 aSig0, 849 bits64 aSig1, 850 int32 *zExpPtr, 851 bits64 *zSig0Ptr, 852 bits64 *zSig1Ptr 853 ) 854 { 855 int8 shiftCount; 856 857 if ( aSig0 == 0 ) { 858 shiftCount = countLeadingZeros64( aSig1 ) - 15; 859 if ( shiftCount < 0 ) { 860 *zSig0Ptr = aSig1>>( - shiftCount ); 861 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 862 } 863 else { 864 *zSig0Ptr = aSig1<<shiftCount; 865 *zSig1Ptr = 0; 866 } 867 *zExpPtr = - shiftCount - 63; 868 } 869 else { 870 shiftCount = countLeadingZeros64( aSig0 ) - 15; 871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 872 *zExpPtr = 1 - shiftCount; 873 } 874 875 } 876 877 /*---------------------------------------------------------------------------- 878 | Packs the sign `zSign', the exponent `zExp', and the significand formed 879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 880 | floating-point value, returning the result. After being shifted into the 881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 882 | added together to form the most significant 32 bits of the result. This 883 | means that any integer portion of `zSig0' will be added into the exponent. 884 | Since a properly normalized significand will have an integer portion equal 885 | to 1, the `zExp' input should be 1 less than the desired result exponent 886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 887 | significand. 888 *----------------------------------------------------------------------------*/ 889 890 INLINE float128 891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 892 { 893 float128 z; 894 895 z.low = zSig1; 896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 897 return z; 898 899 } 900 901 /*---------------------------------------------------------------------------- 902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 903 | and extended significand formed by the concatenation of `zSig0', `zSig1', 904 | and `zSig2', and returns the proper quadruple-precision floating-point value 905 | corresponding to the abstract input. Ordinarily, the abstract value is 906 | simply rounded and packed into the quadruple-precision format, with the 907 | inexact exception raised if the abstract input cannot be represented 908 | exactly. However, if the abstract value is too large, the overflow and 909 | inexact exceptions are raised and an infinity or maximal finite value is 910 | returned. If the abstract value is too small, the input value is rounded to 911 | a subnormal number, and the underflow and inexact exceptions are raised if 912 | the abstract input cannot be represented exactly as a subnormal quadruple- 913 | precision floating-point number. 914 | The input significand must be normalized or smaller. If the input 915 | significand is not normalized, `zExp' must be 0; in that case, the result 916 | returned is a subnormal number, and it must not require rounding. In the 917 | usual case that the input significand is normalized, `zExp' must be 1 less 918 | than the ``true'' floating-point exponent. The handling of underflow and 919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 920 *----------------------------------------------------------------------------*/ 921 922 static float128 923 roundAndPackFloat128( 924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM) 925 { 926 int8 roundingMode; 927 flag roundNearestEven, increment, isTiny; 928 929 roundingMode = STATUS(float_rounding_mode); 930 roundNearestEven = ( roundingMode == float_round_nearest_even ); 931 increment = ( (sbits64) zSig2 < 0 ); 932 if ( ! roundNearestEven ) { 933 if ( roundingMode == float_round_to_zero ) { 934 increment = 0; 935 } 936 else { 937 if ( zSign ) { 938 increment = ( roundingMode == float_round_down ) && zSig2; 939 } 940 else { 941 increment = ( roundingMode == float_round_up ) && zSig2; 942 } 943 } 944 } 945 if ( 0x7FFD <= (bits32) zExp ) { 946 if ( ( 0x7FFD < zExp ) 947 || ( ( zExp == 0x7FFD ) 948 && eq128( 949 LIT64( 0x0001FFFFFFFFFFFF ), 950 LIT64( 0xFFFFFFFFFFFFFFFF ), 951 zSig0, 952 zSig1 953 ) 954 && increment 955 ) 956 ) { 957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 958 if ( ( roundingMode == float_round_to_zero ) 959 || ( zSign && ( roundingMode == float_round_up ) ) 960 || ( ! zSign && ( roundingMode == float_round_down ) ) 961 ) { 962 return 963 packFloat128( 964 zSign, 965 0x7FFE, 966 LIT64( 0x0000FFFFFFFFFFFF ), 967 LIT64( 0xFFFFFFFFFFFFFFFF ) 968 ); 969 } 970 return packFloat128( zSign, 0x7FFF, 0, 0 ); 971 } 972 if ( zExp < 0 ) { 973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); 974 isTiny = 975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 976 || ( zExp < -1 ) 977 || ! increment 978 || lt128( 979 zSig0, 980 zSig1, 981 LIT64( 0x0001FFFFFFFFFFFF ), 982 LIT64( 0xFFFFFFFFFFFFFFFF ) 983 ); 984 shift128ExtraRightJamming( 985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 986 zExp = 0; 987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 988 if ( roundNearestEven ) { 989 increment = ( (sbits64) zSig2 < 0 ); 990 } 991 else { 992 if ( zSign ) { 993 increment = ( roundingMode == float_round_down ) && zSig2; 994 } 995 else { 996 increment = ( roundingMode == float_round_up ) && zSig2; 997 } 998 } 999 } 1000 } 1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 1002 if ( increment ) { 1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1005 } 1006 else { 1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1008 } 1009 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1010 1011 } 1012 1013 /*---------------------------------------------------------------------------- 1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1016 | returns the proper quadruple-precision floating-point value corresponding 1017 | to the abstract input. This routine is just like `roundAndPackFloat128' 1018 | except that the input significand has fewer bits and does not have to be 1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1020 | point exponent. 1021 *----------------------------------------------------------------------------*/ 1022 1023 static float128 1024 normalizeRoundAndPackFloat128( 1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM) 1026 { 1027 int8 shiftCount; 1028 bits64 zSig2; 1029 1030 if ( zSig0 == 0 ) { 1031 zSig0 = zSig1; 1032 zSig1 = 0; 1033 zExp -= 64; 1034 } 1035 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1036 if ( 0 <= shiftCount ) { 1037 zSig2 = 0; 1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1039 } 1040 else { 1041 shift128ExtraRightJamming( 1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1043 } 1044 zExp -= shiftCount; 1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1046 1047 } 1048 1049 #endif 1050 1051 /*---------------------------------------------------------------------------- 1052 | Returns the result of converting the 32-bit two's complement integer `a' 1053 | to the single-precision floating-point format. The conversion is performed 1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1055 *----------------------------------------------------------------------------*/ 1056 1057 float32 int32_to_float32( int32 a STATUS_PARAM ) 1058 { 1059 flag zSign; 1060 1061 if ( a == 0 ) return float32_zero; 1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1063 zSign = ( a < 0 ); 1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1065 1066 } 1067 1068 /*---------------------------------------------------------------------------- 1069 | Returns the result of converting the 32-bit two's complement integer `a' 1070 | to the double-precision floating-point format. The conversion is performed 1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1072 *----------------------------------------------------------------------------*/ 1073 1074 float64 int32_to_float64( int32 a STATUS_PARAM ) 1075 { 1076 flag zSign; 1077 uint32 absA; 1078 int8 shiftCount; 1079 bits64 zSig; 1080 1081 if ( a == 0 ) return float64_zero; 1082 zSign = ( a < 0 ); 1083 absA = zSign ? - a : a; 1084 shiftCount = countLeadingZeros32( absA ) + 21; 1085 zSig = absA; 1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1087 1088 } 1089 1090 #ifdef FLOATX80 1091 1092 /*---------------------------------------------------------------------------- 1093 | Returns the result of converting the 32-bit two's complement integer `a' 1094 | to the extended double-precision floating-point format. The conversion 1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1096 | Arithmetic. 1097 *----------------------------------------------------------------------------*/ 1098 1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) 1100 { 1101 flag zSign; 1102 uint32 absA; 1103 int8 shiftCount; 1104 bits64 zSig; 1105 1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1107 zSign = ( a < 0 ); 1108 absA = zSign ? - a : a; 1109 shiftCount = countLeadingZeros32( absA ) + 32; 1110 zSig = absA; 1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1112 1113 } 1114 1115 #endif 1116 1117 #ifdef FLOAT128 1118 1119 /*---------------------------------------------------------------------------- 1120 | Returns the result of converting the 32-bit two's complement integer `a' to 1121 | the quadruple-precision floating-point format. The conversion is performed 1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1123 *----------------------------------------------------------------------------*/ 1124 1125 float128 int32_to_float128( int32 a STATUS_PARAM ) 1126 { 1127 flag zSign; 1128 uint32 absA; 1129 int8 shiftCount; 1130 bits64 zSig0; 1131 1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1133 zSign = ( a < 0 ); 1134 absA = zSign ? - a : a; 1135 shiftCount = countLeadingZeros32( absA ) + 17; 1136 zSig0 = absA; 1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1138 1139 } 1140 1141 #endif 1142 1143 /*---------------------------------------------------------------------------- 1144 | Returns the result of converting the 64-bit two's complement integer `a' 1145 | to the single-precision floating-point format. The conversion is performed 1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1147 *----------------------------------------------------------------------------*/ 1148 1149 float32 int64_to_float32( int64 a STATUS_PARAM ) 1150 { 1151 flag zSign; 1152 uint64 absA; 1153 int8 shiftCount; 1154 1155 if ( a == 0 ) return float32_zero; 1156 zSign = ( a < 0 ); 1157 absA = zSign ? - a : a; 1158 shiftCount = countLeadingZeros64( absA ) - 40; 1159 if ( 0 <= shiftCount ) { 1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1161 } 1162 else { 1163 shiftCount += 7; 1164 if ( shiftCount < 0 ) { 1165 shift64RightJamming( absA, - shiftCount, &absA ); 1166 } 1167 else { 1168 absA <<= shiftCount; 1169 } 1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1171 } 1172 1173 } 1174 1175 float32 uint64_to_float32( uint64 a STATUS_PARAM ) 1176 { 1177 int8 shiftCount; 1178 1179 if ( a == 0 ) return float32_zero; 1180 shiftCount = countLeadingZeros64( a ) - 40; 1181 if ( 0 <= shiftCount ) { 1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); 1183 } 1184 else { 1185 shiftCount += 7; 1186 if ( shiftCount < 0 ) { 1187 shift64RightJamming( a, - shiftCount, &a ); 1188 } 1189 else { 1190 a <<= shiftCount; 1191 } 1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); 1193 } 1194 } 1195 1196 /*---------------------------------------------------------------------------- 1197 | Returns the result of converting the 64-bit two's complement integer `a' 1198 | to the double-precision floating-point format. The conversion is performed 1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1200 *----------------------------------------------------------------------------*/ 1201 1202 float64 int64_to_float64( int64 a STATUS_PARAM ) 1203 { 1204 flag zSign; 1205 1206 if ( a == 0 ) return float64_zero; 1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1208 return packFloat64( 1, 0x43E, 0 ); 1209 } 1210 zSign = ( a < 0 ); 1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1212 1213 } 1214 1215 float64 uint64_to_float64( uint64 a STATUS_PARAM ) 1216 { 1217 if ( a == 0 ) return float64_zero; 1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); 1219 1220 } 1221 1222 #ifdef FLOATX80 1223 1224 /*---------------------------------------------------------------------------- 1225 | Returns the result of converting the 64-bit two's complement integer `a' 1226 | to the extended double-precision floating-point format. The conversion 1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1228 | Arithmetic. 1229 *----------------------------------------------------------------------------*/ 1230 1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) 1232 { 1233 flag zSign; 1234 uint64 absA; 1235 int8 shiftCount; 1236 1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1238 zSign = ( a < 0 ); 1239 absA = zSign ? - a : a; 1240 shiftCount = countLeadingZeros64( absA ); 1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1242 1243 } 1244 1245 #endif 1246 1247 #ifdef FLOAT128 1248 1249 /*---------------------------------------------------------------------------- 1250 | Returns the result of converting the 64-bit two's complement integer `a' to 1251 | the quadruple-precision floating-point format. The conversion is performed 1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1253 *----------------------------------------------------------------------------*/ 1254 1255 float128 int64_to_float128( int64 a STATUS_PARAM ) 1256 { 1257 flag zSign; 1258 uint64 absA; 1259 int8 shiftCount; 1260 int32 zExp; 1261 bits64 zSig0, zSig1; 1262 1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1264 zSign = ( a < 0 ); 1265 absA = zSign ? - a : a; 1266 shiftCount = countLeadingZeros64( absA ) + 49; 1267 zExp = 0x406E - shiftCount; 1268 if ( 64 <= shiftCount ) { 1269 zSig1 = 0; 1270 zSig0 = absA; 1271 shiftCount -= 64; 1272 } 1273 else { 1274 zSig1 = absA; 1275 zSig0 = 0; 1276 } 1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1278 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1279 1280 } 1281 1282 #endif 1283 1284 /*---------------------------------------------------------------------------- 1285 | Returns the result of converting the single-precision floating-point value 1286 | `a' to the 32-bit two's complement integer format. The conversion is 1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1288 | Arithmetic---which means in particular that the conversion is rounded 1289 | according to the current rounding mode. If `a' is a NaN, the largest 1290 | positive integer is returned. Otherwise, if the conversion overflows, the 1291 | largest integer with the same sign as `a' is returned. 1292 *----------------------------------------------------------------------------*/ 1293 1294 int32 float32_to_int32( float32 a STATUS_PARAM ) 1295 { 1296 flag aSign; 1297 int16 aExp, shiftCount; 1298 bits32 aSig; 1299 bits64 aSig64; 1300 1301 aSig = extractFloat32Frac( a ); 1302 aExp = extractFloat32Exp( a ); 1303 aSign = extractFloat32Sign( a ); 1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1305 if ( aExp ) aSig |= 0x00800000; 1306 shiftCount = 0xAF - aExp; 1307 aSig64 = aSig; 1308 aSig64 <<= 32; 1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1311 1312 } 1313 1314 /*---------------------------------------------------------------------------- 1315 | Returns the result of converting the single-precision floating-point value 1316 | `a' to the 32-bit two's complement integer format. The conversion is 1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1318 | Arithmetic, except that the conversion is always rounded toward zero. 1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1320 | the conversion overflows, the largest integer with the same sign as `a' is 1321 | returned. 1322 *----------------------------------------------------------------------------*/ 1323 1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1325 { 1326 flag aSign; 1327 int16 aExp, shiftCount; 1328 bits32 aSig; 1329 int32 z; 1330 1331 aSig = extractFloat32Frac( a ); 1332 aExp = extractFloat32Exp( a ); 1333 aSign = extractFloat32Sign( a ); 1334 shiftCount = aExp - 0x9E; 1335 if ( 0 <= shiftCount ) { 1336 if ( float32_val(a) != 0xCF000000 ) { 1337 float_raise( float_flag_invalid STATUS_VAR); 1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1339 } 1340 return (sbits32) 0x80000000; 1341 } 1342 else if ( aExp <= 0x7E ) { 1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1344 return 0; 1345 } 1346 aSig = ( aSig | 0x00800000 )<<8; 1347 z = aSig>>( - shiftCount ); 1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1349 STATUS(float_exception_flags) |= float_flag_inexact; 1350 } 1351 if ( aSign ) z = - z; 1352 return z; 1353 1354 } 1355 1356 /*---------------------------------------------------------------------------- 1357 | Returns the result of converting the single-precision floating-point value 1358 | `a' to the 64-bit two's complement integer format. The conversion is 1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1360 | Arithmetic---which means in particular that the conversion is rounded 1361 | according to the current rounding mode. If `a' is a NaN, the largest 1362 | positive integer is returned. Otherwise, if the conversion overflows, the 1363 | largest integer with the same sign as `a' is returned. 1364 *----------------------------------------------------------------------------*/ 1365 1366 int64 float32_to_int64( float32 a STATUS_PARAM ) 1367 { 1368 flag aSign; 1369 int16 aExp, shiftCount; 1370 bits32 aSig; 1371 bits64 aSig64, aSigExtra; 1372 1373 aSig = extractFloat32Frac( a ); 1374 aExp = extractFloat32Exp( a ); 1375 aSign = extractFloat32Sign( a ); 1376 shiftCount = 0xBE - aExp; 1377 if ( shiftCount < 0 ) { 1378 float_raise( float_flag_invalid STATUS_VAR); 1379 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1380 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1381 } 1382 return (sbits64) LIT64( 0x8000000000000000 ); 1383 } 1384 if ( aExp ) aSig |= 0x00800000; 1385 aSig64 = aSig; 1386 aSig64 <<= 40; 1387 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1388 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1389 1390 } 1391 1392 /*---------------------------------------------------------------------------- 1393 | Returns the result of converting the single-precision floating-point value 1394 | `a' to the 64-bit two's complement integer format. The conversion is 1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1396 | Arithmetic, except that the conversion is always rounded toward zero. If 1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1398 | conversion overflows, the largest integer with the same sign as `a' is 1399 | returned. 1400 *----------------------------------------------------------------------------*/ 1401 1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1403 { 1404 flag aSign; 1405 int16 aExp, shiftCount; 1406 bits32 aSig; 1407 bits64 aSig64; 1408 int64 z; 1409 1410 aSig = extractFloat32Frac( a ); 1411 aExp = extractFloat32Exp( a ); 1412 aSign = extractFloat32Sign( a ); 1413 shiftCount = aExp - 0xBE; 1414 if ( 0 <= shiftCount ) { 1415 if ( float32_val(a) != 0xDF000000 ) { 1416 float_raise( float_flag_invalid STATUS_VAR); 1417 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1418 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1419 } 1420 } 1421 return (sbits64) LIT64( 0x8000000000000000 ); 1422 } 1423 else if ( aExp <= 0x7E ) { 1424 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1425 return 0; 1426 } 1427 aSig64 = aSig | 0x00800000; 1428 aSig64 <<= 40; 1429 z = aSig64>>( - shiftCount ); 1430 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1431 STATUS(float_exception_flags) |= float_flag_inexact; 1432 } 1433 if ( aSign ) z = - z; 1434 return z; 1435 1436 } 1437 1438 /*---------------------------------------------------------------------------- 1439 | Returns the result of converting the single-precision floating-point value 1440 | `a' to the double-precision floating-point format. The conversion is 1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1442 | Arithmetic. 1443 *----------------------------------------------------------------------------*/ 1444 1445 float64 float32_to_float64( float32 a STATUS_PARAM ) 1446 { 1447 flag aSign; 1448 int16 aExp; 1449 bits32 aSig; 1450 1451 aSig = extractFloat32Frac( a ); 1452 aExp = extractFloat32Exp( a ); 1453 aSign = extractFloat32Sign( a ); 1454 if ( aExp == 0xFF ) { 1455 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR )); 1456 return packFloat64( aSign, 0x7FF, 0 ); 1457 } 1458 if ( aExp == 0 ) { 1459 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1460 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1461 --aExp; 1462 } 1463 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1464 1465 } 1466 1467 #ifdef FLOATX80 1468 1469 /*---------------------------------------------------------------------------- 1470 | Returns the result of converting the single-precision floating-point value 1471 | `a' to the extended double-precision floating-point format. The conversion 1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1473 | Arithmetic. 1474 *----------------------------------------------------------------------------*/ 1475 1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1477 { 1478 flag aSign; 1479 int16 aExp; 1480 bits32 aSig; 1481 1482 aSig = extractFloat32Frac( a ); 1483 aExp = extractFloat32Exp( a ); 1484 aSign = extractFloat32Sign( a ); 1485 if ( aExp == 0xFF ) { 1486 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) ); 1487 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1488 } 1489 if ( aExp == 0 ) { 1490 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1491 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1492 } 1493 aSig |= 0x00800000; 1494 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1495 1496 } 1497 1498 #endif 1499 1500 #ifdef FLOAT128 1501 1502 /*---------------------------------------------------------------------------- 1503 | Returns the result of converting the single-precision floating-point value 1504 | `a' to the double-precision floating-point format. The conversion is 1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1506 | Arithmetic. 1507 *----------------------------------------------------------------------------*/ 1508 1509 float128 float32_to_float128( float32 a STATUS_PARAM ) 1510 { 1511 flag aSign; 1512 int16 aExp; 1513 bits32 aSig; 1514 1515 aSig = extractFloat32Frac( a ); 1516 aExp = extractFloat32Exp( a ); 1517 aSign = extractFloat32Sign( a ); 1518 if ( aExp == 0xFF ) { 1519 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) ); 1520 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1521 } 1522 if ( aExp == 0 ) { 1523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1524 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1525 --aExp; 1526 } 1527 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1528 1529 } 1530 1531 #endif 1532 1533 /*---------------------------------------------------------------------------- 1534 | Rounds the single-precision floating-point value `a' to an integer, and 1535 | returns the result as a single-precision floating-point value. The 1536 | operation is performed according to the IEC/IEEE Standard for Binary 1537 | Floating-Point Arithmetic. 1538 *----------------------------------------------------------------------------*/ 1539 1540 float32 float32_round_to_int( float32 a STATUS_PARAM) 1541 { 1542 flag aSign; 1543 int16 aExp; 1544 bits32 lastBitMask, roundBitsMask; 1545 int8 roundingMode; 1546 bits32 z; 1547 1548 aExp = extractFloat32Exp( a ); 1549 if ( 0x96 <= aExp ) { 1550 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1551 return propagateFloat32NaN( a, a STATUS_VAR ); 1552 } 1553 return a; 1554 } 1555 if ( aExp <= 0x7E ) { 1556 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a; 1557 STATUS(float_exception_flags) |= float_flag_inexact; 1558 aSign = extractFloat32Sign( a ); 1559 switch ( STATUS(float_rounding_mode) ) { 1560 case float_round_nearest_even: 1561 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1562 return packFloat32( aSign, 0x7F, 0 ); 1563 } 1564 break; 1565 case float_round_down: 1566 return make_float32(aSign ? 0xBF800000 : 0); 1567 case float_round_up: 1568 return make_float32(aSign ? 0x80000000 : 0x3F800000); 1569 } 1570 return packFloat32( aSign, 0, 0 ); 1571 } 1572 lastBitMask = 1; 1573 lastBitMask <<= 0x96 - aExp; 1574 roundBitsMask = lastBitMask - 1; 1575 z = float32_val(a); 1576 roundingMode = STATUS(float_rounding_mode); 1577 if ( roundingMode == float_round_nearest_even ) { 1578 z += lastBitMask>>1; 1579 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1580 } 1581 else if ( roundingMode != float_round_to_zero ) { 1582 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { 1583 z += roundBitsMask; 1584 } 1585 } 1586 z &= ~ roundBitsMask; 1587 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1588 return make_float32(z); 1589 1590 } 1591 1592 /*---------------------------------------------------------------------------- 1593 | Returns the result of adding the absolute values of the single-precision 1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1595 | before being returned. `zSign' is ignored if the result is a NaN. 1596 | The addition is performed according to the IEC/IEEE Standard for Binary 1597 | Floating-Point Arithmetic. 1598 *----------------------------------------------------------------------------*/ 1599 1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1601 { 1602 int16 aExp, bExp, zExp; 1603 bits32 aSig, bSig, zSig; 1604 int16 expDiff; 1605 1606 aSig = extractFloat32Frac( a ); 1607 aExp = extractFloat32Exp( a ); 1608 bSig = extractFloat32Frac( b ); 1609 bExp = extractFloat32Exp( b ); 1610 expDiff = aExp - bExp; 1611 aSig <<= 6; 1612 bSig <<= 6; 1613 if ( 0 < expDiff ) { 1614 if ( aExp == 0xFF ) { 1615 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1616 return a; 1617 } 1618 if ( bExp == 0 ) { 1619 --expDiff; 1620 } 1621 else { 1622 bSig |= 0x20000000; 1623 } 1624 shift32RightJamming( bSig, expDiff, &bSig ); 1625 zExp = aExp; 1626 } 1627 else if ( expDiff < 0 ) { 1628 if ( bExp == 0xFF ) { 1629 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1630 return packFloat32( zSign, 0xFF, 0 ); 1631 } 1632 if ( aExp == 0 ) { 1633 ++expDiff; 1634 } 1635 else { 1636 aSig |= 0x20000000; 1637 } 1638 shift32RightJamming( aSig, - expDiff, &aSig ); 1639 zExp = bExp; 1640 } 1641 else { 1642 if ( aExp == 0xFF ) { 1643 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1644 return a; 1645 } 1646 if ( aExp == 0 ) { 1647 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); 1648 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1649 } 1650 zSig = 0x40000000 + aSig + bSig; 1651 zExp = aExp; 1652 goto roundAndPack; 1653 } 1654 aSig |= 0x20000000; 1655 zSig = ( aSig + bSig )<<1; 1656 --zExp; 1657 if ( (sbits32) zSig < 0 ) { 1658 zSig = aSig + bSig; 1659 ++zExp; 1660 } 1661 roundAndPack: 1662 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1663 1664 } 1665 1666 /*---------------------------------------------------------------------------- 1667 | Returns the result of subtracting the absolute values of the single- 1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1669 | difference is negated before being returned. `zSign' is ignored if the 1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1671 | Standard for Binary Floating-Point Arithmetic. 1672 *----------------------------------------------------------------------------*/ 1673 1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1675 { 1676 int16 aExp, bExp, zExp; 1677 bits32 aSig, bSig, zSig; 1678 int16 expDiff; 1679 1680 aSig = extractFloat32Frac( a ); 1681 aExp = extractFloat32Exp( a ); 1682 bSig = extractFloat32Frac( b ); 1683 bExp = extractFloat32Exp( b ); 1684 expDiff = aExp - bExp; 1685 aSig <<= 7; 1686 bSig <<= 7; 1687 if ( 0 < expDiff ) goto aExpBigger; 1688 if ( expDiff < 0 ) goto bExpBigger; 1689 if ( aExp == 0xFF ) { 1690 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1691 float_raise( float_flag_invalid STATUS_VAR); 1692 return float32_default_nan; 1693 } 1694 if ( aExp == 0 ) { 1695 aExp = 1; 1696 bExp = 1; 1697 } 1698 if ( bSig < aSig ) goto aBigger; 1699 if ( aSig < bSig ) goto bBigger; 1700 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1701 bExpBigger: 1702 if ( bExp == 0xFF ) { 1703 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1704 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1705 } 1706 if ( aExp == 0 ) { 1707 ++expDiff; 1708 } 1709 else { 1710 aSig |= 0x40000000; 1711 } 1712 shift32RightJamming( aSig, - expDiff, &aSig ); 1713 bSig |= 0x40000000; 1714 bBigger: 1715 zSig = bSig - aSig; 1716 zExp = bExp; 1717 zSign ^= 1; 1718 goto normalizeRoundAndPack; 1719 aExpBigger: 1720 if ( aExp == 0xFF ) { 1721 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1722 return a; 1723 } 1724 if ( bExp == 0 ) { 1725 --expDiff; 1726 } 1727 else { 1728 bSig |= 0x40000000; 1729 } 1730 shift32RightJamming( bSig, expDiff, &bSig ); 1731 aSig |= 0x40000000; 1732 aBigger: 1733 zSig = aSig - bSig; 1734 zExp = aExp; 1735 normalizeRoundAndPack: 1736 --zExp; 1737 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1738 1739 } 1740 1741 /*---------------------------------------------------------------------------- 1742 | Returns the result of adding the single-precision floating-point values `a' 1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1744 | Binary Floating-Point Arithmetic. 1745 *----------------------------------------------------------------------------*/ 1746 1747 float32 float32_add( float32 a, float32 b STATUS_PARAM ) 1748 { 1749 flag aSign, bSign; 1750 1751 aSign = extractFloat32Sign( a ); 1752 bSign = extractFloat32Sign( b ); 1753 if ( aSign == bSign ) { 1754 return addFloat32Sigs( a, b, aSign STATUS_VAR); 1755 } 1756 else { 1757 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1758 } 1759 1760 } 1761 1762 /*---------------------------------------------------------------------------- 1763 | Returns the result of subtracting the single-precision floating-point values 1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1765 | for Binary Floating-Point Arithmetic. 1766 *----------------------------------------------------------------------------*/ 1767 1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 1769 { 1770 flag aSign, bSign; 1771 1772 aSign = extractFloat32Sign( a ); 1773 bSign = extractFloat32Sign( b ); 1774 if ( aSign == bSign ) { 1775 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1776 } 1777 else { 1778 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 1779 } 1780 1781 } 1782 1783 /*---------------------------------------------------------------------------- 1784 | Returns the result of multiplying the single-precision floating-point values 1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1786 | for Binary Floating-Point Arithmetic. 1787 *----------------------------------------------------------------------------*/ 1788 1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 1790 { 1791 flag aSign, bSign, zSign; 1792 int16 aExp, bExp, zExp; 1793 bits32 aSig, bSig; 1794 bits64 zSig64; 1795 bits32 zSig; 1796 1797 aSig = extractFloat32Frac( a ); 1798 aExp = extractFloat32Exp( a ); 1799 aSign = extractFloat32Sign( a ); 1800 bSig = extractFloat32Frac( b ); 1801 bExp = extractFloat32Exp( b ); 1802 bSign = extractFloat32Sign( b ); 1803 zSign = aSign ^ bSign; 1804 if ( aExp == 0xFF ) { 1805 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1806 return propagateFloat32NaN( a, b STATUS_VAR ); 1807 } 1808 if ( ( bExp | bSig ) == 0 ) { 1809 float_raise( float_flag_invalid STATUS_VAR); 1810 return float32_default_nan; 1811 } 1812 return packFloat32( zSign, 0xFF, 0 ); 1813 } 1814 if ( bExp == 0xFF ) { 1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1816 if ( ( aExp | aSig ) == 0 ) { 1817 float_raise( float_flag_invalid STATUS_VAR); 1818 return float32_default_nan; 1819 } 1820 return packFloat32( zSign, 0xFF, 0 ); 1821 } 1822 if ( aExp == 0 ) { 1823 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1824 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1825 } 1826 if ( bExp == 0 ) { 1827 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1828 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1829 } 1830 zExp = aExp + bExp - 0x7F; 1831 aSig = ( aSig | 0x00800000 )<<7; 1832 bSig = ( bSig | 0x00800000 )<<8; 1833 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1834 zSig = zSig64; 1835 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1836 zSig <<= 1; 1837 --zExp; 1838 } 1839 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1840 1841 } 1842 1843 /*---------------------------------------------------------------------------- 1844 | Returns the result of dividing the single-precision floating-point value `a' 1845 | by the corresponding value `b'. The operation is performed according to the 1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1847 *----------------------------------------------------------------------------*/ 1848 1849 float32 float32_div( float32 a, float32 b STATUS_PARAM ) 1850 { 1851 flag aSign, bSign, zSign; 1852 int16 aExp, bExp, zExp; 1853 bits32 aSig, bSig, zSig; 1854 1855 aSig = extractFloat32Frac( a ); 1856 aExp = extractFloat32Exp( a ); 1857 aSign = extractFloat32Sign( a ); 1858 bSig = extractFloat32Frac( b ); 1859 bExp = extractFloat32Exp( b ); 1860 bSign = extractFloat32Sign( b ); 1861 zSign = aSign ^ bSign; 1862 if ( aExp == 0xFF ) { 1863 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1864 if ( bExp == 0xFF ) { 1865 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1866 float_raise( float_flag_invalid STATUS_VAR); 1867 return float32_default_nan; 1868 } 1869 return packFloat32( zSign, 0xFF, 0 ); 1870 } 1871 if ( bExp == 0xFF ) { 1872 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1873 return packFloat32( zSign, 0, 0 ); 1874 } 1875 if ( bExp == 0 ) { 1876 if ( bSig == 0 ) { 1877 if ( ( aExp | aSig ) == 0 ) { 1878 float_raise( float_flag_invalid STATUS_VAR); 1879 return float32_default_nan; 1880 } 1881 float_raise( float_flag_divbyzero STATUS_VAR); 1882 return packFloat32( zSign, 0xFF, 0 ); 1883 } 1884 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1885 } 1886 if ( aExp == 0 ) { 1887 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1888 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1889 } 1890 zExp = aExp - bExp + 0x7D; 1891 aSig = ( aSig | 0x00800000 )<<7; 1892 bSig = ( bSig | 0x00800000 )<<8; 1893 if ( bSig <= ( aSig + aSig ) ) { 1894 aSig >>= 1; 1895 ++zExp; 1896 } 1897 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1898 if ( ( zSig & 0x3F ) == 0 ) { 1899 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1900 } 1901 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1902 1903 } 1904 1905 /*---------------------------------------------------------------------------- 1906 | Returns the remainder of the single-precision floating-point value `a' 1907 | with respect to the corresponding value `b'. The operation is performed 1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1909 *----------------------------------------------------------------------------*/ 1910 1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 1912 { 1913 flag aSign, bSign, zSign; 1914 int16 aExp, bExp, expDiff; 1915 bits32 aSig, bSig; 1916 bits32 q; 1917 bits64 aSig64, bSig64, q64; 1918 bits32 alternateASig; 1919 sbits32 sigMean; 1920 1921 aSig = extractFloat32Frac( a ); 1922 aExp = extractFloat32Exp( a ); 1923 aSign = extractFloat32Sign( a ); 1924 bSig = extractFloat32Frac( b ); 1925 bExp = extractFloat32Exp( b ); 1926 bSign = extractFloat32Sign( b ); 1927 if ( aExp == 0xFF ) { 1928 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1929 return propagateFloat32NaN( a, b STATUS_VAR ); 1930 } 1931 float_raise( float_flag_invalid STATUS_VAR); 1932 return float32_default_nan; 1933 } 1934 if ( bExp == 0xFF ) { 1935 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1936 return a; 1937 } 1938 if ( bExp == 0 ) { 1939 if ( bSig == 0 ) { 1940 float_raise( float_flag_invalid STATUS_VAR); 1941 return float32_default_nan; 1942 } 1943 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1944 } 1945 if ( aExp == 0 ) { 1946 if ( aSig == 0 ) return a; 1947 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1948 } 1949 expDiff = aExp - bExp; 1950 aSig |= 0x00800000; 1951 bSig |= 0x00800000; 1952 if ( expDiff < 32 ) { 1953 aSig <<= 8; 1954 bSig <<= 8; 1955 if ( expDiff < 0 ) { 1956 if ( expDiff < -1 ) return a; 1957 aSig >>= 1; 1958 } 1959 q = ( bSig <= aSig ); 1960 if ( q ) aSig -= bSig; 1961 if ( 0 < expDiff ) { 1962 q = ( ( (bits64) aSig )<<32 ) / bSig; 1963 q >>= 32 - expDiff; 1964 bSig >>= 2; 1965 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1966 } 1967 else { 1968 aSig >>= 2; 1969 bSig >>= 2; 1970 } 1971 } 1972 else { 1973 if ( bSig <= aSig ) aSig -= bSig; 1974 aSig64 = ( (bits64) aSig )<<40; 1975 bSig64 = ( (bits64) bSig )<<40; 1976 expDiff -= 64; 1977 while ( 0 < expDiff ) { 1978 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1979 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1980 aSig64 = - ( ( bSig * q64 )<<38 ); 1981 expDiff -= 62; 1982 } 1983 expDiff += 64; 1984 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1985 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1986 q = q64>>( 64 - expDiff ); 1987 bSig <<= 6; 1988 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 1989 } 1990 do { 1991 alternateASig = aSig; 1992 ++q; 1993 aSig -= bSig; 1994 } while ( 0 <= (sbits32) aSig ); 1995 sigMean = aSig + alternateASig; 1996 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1997 aSig = alternateASig; 1998 } 1999 zSign = ( (sbits32) aSig < 0 ); 2000 if ( zSign ) aSig = - aSig; 2001 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2002 2003 } 2004 2005 /*---------------------------------------------------------------------------- 2006 | Returns the square root of the single-precision floating-point value `a'. 2007 | The operation is performed according to the IEC/IEEE Standard for Binary 2008 | Floating-Point Arithmetic. 2009 *----------------------------------------------------------------------------*/ 2010 2011 float32 float32_sqrt( float32 a STATUS_PARAM ) 2012 { 2013 flag aSign; 2014 int16 aExp, zExp; 2015 bits32 aSig, zSig; 2016 bits64 rem, term; 2017 2018 aSig = extractFloat32Frac( a ); 2019 aExp = extractFloat32Exp( a ); 2020 aSign = extractFloat32Sign( a ); 2021 if ( aExp == 0xFF ) { 2022 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2023 if ( ! aSign ) return a; 2024 float_raise( float_flag_invalid STATUS_VAR); 2025 return float32_default_nan; 2026 } 2027 if ( aSign ) { 2028 if ( ( aExp | aSig ) == 0 ) return a; 2029 float_raise( float_flag_invalid STATUS_VAR); 2030 return float32_default_nan; 2031 } 2032 if ( aExp == 0 ) { 2033 if ( aSig == 0 ) return float32_zero; 2034 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2035 } 2036 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2037 aSig = ( aSig | 0x00800000 )<<8; 2038 zSig = estimateSqrt32( aExp, aSig ) + 2; 2039 if ( ( zSig & 0x7F ) <= 5 ) { 2040 if ( zSig < 2 ) { 2041 zSig = 0x7FFFFFFF; 2042 goto roundAndPack; 2043 } 2044 aSig >>= aExp & 1; 2045 term = ( (bits64) zSig ) * zSig; 2046 rem = ( ( (bits64) aSig )<<32 ) - term; 2047 while ( (sbits64) rem < 0 ) { 2048 --zSig; 2049 rem += ( ( (bits64) zSig )<<1 ) | 1; 2050 } 2051 zSig |= ( rem != 0 ); 2052 } 2053 shift32RightJamming( zSig, 1, &zSig ); 2054 roundAndPack: 2055 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2056 2057 } 2058 2059 /*---------------------------------------------------------------------------- 2060 | Returns the binary log of the single-precision floating-point value `a'. 2061 | The operation is performed according to the IEC/IEEE Standard for Binary 2062 | Floating-Point Arithmetic. 2063 *----------------------------------------------------------------------------*/ 2064 float32 float32_log2( float32 a STATUS_PARAM ) 2065 { 2066 flag aSign, zSign; 2067 int16 aExp; 2068 bits32 aSig, zSig, i; 2069 2070 aSig = extractFloat32Frac( a ); 2071 aExp = extractFloat32Exp( a ); 2072 aSign = extractFloat32Sign( a ); 2073 2074 if ( aExp == 0 ) { 2075 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2076 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2077 } 2078 if ( aSign ) { 2079 float_raise( float_flag_invalid STATUS_VAR); 2080 return float32_default_nan; 2081 } 2082 if ( aExp == 0xFF ) { 2083 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2084 return a; 2085 } 2086 2087 aExp -= 0x7F; 2088 aSig |= 0x00800000; 2089 zSign = aExp < 0; 2090 zSig = aExp << 23; 2091 2092 for (i = 1 << 22; i > 0; i >>= 1) { 2093 aSig = ( (bits64)aSig * aSig ) >> 23; 2094 if ( aSig & 0x01000000 ) { 2095 aSig >>= 1; 2096 zSig |= i; 2097 } 2098 } 2099 2100 if ( zSign ) 2101 zSig = -zSig; 2102 2103 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); 2104 } 2105 2106 /*---------------------------------------------------------------------------- 2107 | Returns 1 if the single-precision floating-point value `a' is equal to 2108 | the corresponding value `b', and 0 otherwise. The comparison is performed 2109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2110 *----------------------------------------------------------------------------*/ 2111 2112 int float32_eq( float32 a, float32 b STATUS_PARAM ) 2113 { 2114 2115 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2116 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2117 ) { 2118 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2119 float_raise( float_flag_invalid STATUS_VAR); 2120 } 2121 return 0; 2122 } 2123 return ( float32_val(a) == float32_val(b) ) || 2124 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2125 2126 } 2127 2128 /*---------------------------------------------------------------------------- 2129 | Returns 1 if the single-precision floating-point value `a' is less than 2130 | or equal to the corresponding value `b', and 0 otherwise. The comparison 2131 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2132 | Arithmetic. 2133 *----------------------------------------------------------------------------*/ 2134 2135 int float32_le( float32 a, float32 b STATUS_PARAM ) 2136 { 2137 flag aSign, bSign; 2138 bits32 av, bv; 2139 2140 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2141 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2142 ) { 2143 float_raise( float_flag_invalid STATUS_VAR); 2144 return 0; 2145 } 2146 aSign = extractFloat32Sign( a ); 2147 bSign = extractFloat32Sign( b ); 2148 av = float32_val(a); 2149 bv = float32_val(b); 2150 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2151 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2152 2153 } 2154 2155 /*---------------------------------------------------------------------------- 2156 | Returns 1 if the single-precision floating-point value `a' is less than 2157 | the corresponding value `b', and 0 otherwise. The comparison is performed 2158 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2159 *----------------------------------------------------------------------------*/ 2160 2161 int float32_lt( float32 a, float32 b STATUS_PARAM ) 2162 { 2163 flag aSign, bSign; 2164 bits32 av, bv; 2165 2166 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2167 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2168 ) { 2169 float_raise( float_flag_invalid STATUS_VAR); 2170 return 0; 2171 } 2172 aSign = extractFloat32Sign( a ); 2173 bSign = extractFloat32Sign( b ); 2174 av = float32_val(a); 2175 bv = float32_val(b); 2176 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2177 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2178 2179 } 2180 2181 /*---------------------------------------------------------------------------- 2182 | Returns 1 if the single-precision floating-point value `a' is equal to 2183 | the corresponding value `b', and 0 otherwise. The invalid exception is 2184 | raised if either operand is a NaN. Otherwise, the comparison is performed 2185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2186 *----------------------------------------------------------------------------*/ 2187 2188 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM ) 2189 { 2190 bits32 av, bv; 2191 2192 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2193 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2194 ) { 2195 float_raise( float_flag_invalid STATUS_VAR); 2196 return 0; 2197 } 2198 av = float32_val(a); 2199 bv = float32_val(b); 2200 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2201 2202 } 2203 2204 /*---------------------------------------------------------------------------- 2205 | Returns 1 if the single-precision floating-point value `a' is less than or 2206 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2207 | cause an exception. Otherwise, the comparison is performed according to the 2208 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2209 *----------------------------------------------------------------------------*/ 2210 2211 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2212 { 2213 flag aSign, bSign; 2214 bits32 av, bv; 2215 2216 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2217 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2218 ) { 2219 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2220 float_raise( float_flag_invalid STATUS_VAR); 2221 } 2222 return 0; 2223 } 2224 aSign = extractFloat32Sign( a ); 2225 bSign = extractFloat32Sign( b ); 2226 av = float32_val(a); 2227 bv = float32_val(b); 2228 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2229 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2230 2231 } 2232 2233 /*---------------------------------------------------------------------------- 2234 | Returns 1 if the single-precision floating-point value `a' is less than 2235 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2236 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2237 | Standard for Binary Floating-Point Arithmetic. 2238 *----------------------------------------------------------------------------*/ 2239 2240 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2241 { 2242 flag aSign, bSign; 2243 bits32 av, bv; 2244 2245 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2246 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2247 ) { 2248 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2249 float_raise( float_flag_invalid STATUS_VAR); 2250 } 2251 return 0; 2252 } 2253 aSign = extractFloat32Sign( a ); 2254 bSign = extractFloat32Sign( b ); 2255 av = float32_val(a); 2256 bv = float32_val(b); 2257 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2258 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2259 2260 } 2261 2262 /*---------------------------------------------------------------------------- 2263 | Returns the result of converting the double-precision floating-point value 2264 | `a' to the 32-bit two's complement integer format. The conversion is 2265 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2266 | Arithmetic---which means in particular that the conversion is rounded 2267 | according to the current rounding mode. If `a' is a NaN, the largest 2268 | positive integer is returned. Otherwise, if the conversion overflows, the 2269 | largest integer with the same sign as `a' is returned. 2270 *----------------------------------------------------------------------------*/ 2271 2272 int32 float64_to_int32( float64 a STATUS_PARAM ) 2273 { 2274 flag aSign; 2275 int16 aExp, shiftCount; 2276 bits64 aSig; 2277 2278 aSig = extractFloat64Frac( a ); 2279 aExp = extractFloat64Exp( a ); 2280 aSign = extractFloat64Sign( a ); 2281 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2282 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2283 shiftCount = 0x42C - aExp; 2284 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2285 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2286 2287 } 2288 2289 /*---------------------------------------------------------------------------- 2290 | Returns the result of converting the double-precision floating-point value 2291 | `a' to the 32-bit two's complement integer format. The conversion is 2292 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2293 | Arithmetic, except that the conversion is always rounded toward zero. 2294 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2295 | the conversion overflows, the largest integer with the same sign as `a' is 2296 | returned. 2297 *----------------------------------------------------------------------------*/ 2298 2299 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2300 { 2301 flag aSign; 2302 int16 aExp, shiftCount; 2303 bits64 aSig, savedASig; 2304 int32 z; 2305 2306 aSig = extractFloat64Frac( a ); 2307 aExp = extractFloat64Exp( a ); 2308 aSign = extractFloat64Sign( a ); 2309 if ( 0x41E < aExp ) { 2310 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2311 goto invalid; 2312 } 2313 else if ( aExp < 0x3FF ) { 2314 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2315 return 0; 2316 } 2317 aSig |= LIT64( 0x0010000000000000 ); 2318 shiftCount = 0x433 - aExp; 2319 savedASig = aSig; 2320 aSig >>= shiftCount; 2321 z = aSig; 2322 if ( aSign ) z = - z; 2323 if ( ( z < 0 ) ^ aSign ) { 2324 invalid: 2325 float_raise( float_flag_invalid STATUS_VAR); 2326 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2327 } 2328 if ( ( aSig<<shiftCount ) != savedASig ) { 2329 STATUS(float_exception_flags) |= float_flag_inexact; 2330 } 2331 return z; 2332 2333 } 2334 2335 /*---------------------------------------------------------------------------- 2336 | Returns the result of converting the double-precision floating-point value 2337 | `a' to the 64-bit two's complement integer format. The conversion is 2338 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2339 | Arithmetic---which means in particular that the conversion is rounded 2340 | according to the current rounding mode. If `a' is a NaN, the largest 2341 | positive integer is returned. Otherwise, if the conversion overflows, the 2342 | largest integer with the same sign as `a' is returned. 2343 *----------------------------------------------------------------------------*/ 2344 2345 int64 float64_to_int64( float64 a STATUS_PARAM ) 2346 { 2347 flag aSign; 2348 int16 aExp, shiftCount; 2349 bits64 aSig, aSigExtra; 2350 2351 aSig = extractFloat64Frac( a ); 2352 aExp = extractFloat64Exp( a ); 2353 aSign = extractFloat64Sign( a ); 2354 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2355 shiftCount = 0x433 - aExp; 2356 if ( shiftCount <= 0 ) { 2357 if ( 0x43E < aExp ) { 2358 float_raise( float_flag_invalid STATUS_VAR); 2359 if ( ! aSign 2360 || ( ( aExp == 0x7FF ) 2361 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2362 ) { 2363 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2364 } 2365 return (sbits64) LIT64( 0x8000000000000000 ); 2366 } 2367 aSigExtra = 0; 2368 aSig <<= - shiftCount; 2369 } 2370 else { 2371 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2372 } 2373 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 2374 2375 } 2376 2377 /*---------------------------------------------------------------------------- 2378 | Returns the result of converting the double-precision floating-point value 2379 | `a' to the 64-bit two's complement integer format. The conversion is 2380 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2381 | Arithmetic, except that the conversion is always rounded toward zero. 2382 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2383 | the conversion overflows, the largest integer with the same sign as `a' is 2384 | returned. 2385 *----------------------------------------------------------------------------*/ 2386 2387 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 2388 { 2389 flag aSign; 2390 int16 aExp, shiftCount; 2391 bits64 aSig; 2392 int64 z; 2393 2394 aSig = extractFloat64Frac( a ); 2395 aExp = extractFloat64Exp( a ); 2396 aSign = extractFloat64Sign( a ); 2397 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2398 shiftCount = aExp - 0x433; 2399 if ( 0 <= shiftCount ) { 2400 if ( 0x43E <= aExp ) { 2401 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 2402 float_raise( float_flag_invalid STATUS_VAR); 2403 if ( ! aSign 2404 || ( ( aExp == 0x7FF ) 2405 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2406 ) { 2407 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2408 } 2409 } 2410 return (sbits64) LIT64( 0x8000000000000000 ); 2411 } 2412 z = aSig<<shiftCount; 2413 } 2414 else { 2415 if ( aExp < 0x3FE ) { 2416 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2417 return 0; 2418 } 2419 z = aSig>>( - shiftCount ); 2420 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2421 STATUS(float_exception_flags) |= float_flag_inexact; 2422 } 2423 } 2424 if ( aSign ) z = - z; 2425 return z; 2426 2427 } 2428 2429 /*---------------------------------------------------------------------------- 2430 | Returns the result of converting the double-precision floating-point value 2431 | `a' to the single-precision floating-point format. The conversion is 2432 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2433 | Arithmetic. 2434 *----------------------------------------------------------------------------*/ 2435 2436 float32 float64_to_float32( float64 a STATUS_PARAM ) 2437 { 2438 flag aSign; 2439 int16 aExp; 2440 bits64 aSig; 2441 bits32 zSig; 2442 2443 aSig = extractFloat64Frac( a ); 2444 aExp = extractFloat64Exp( a ); 2445 aSign = extractFloat64Sign( a ); 2446 if ( aExp == 0x7FF ) { 2447 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) ); 2448 return packFloat32( aSign, 0xFF, 0 ); 2449 } 2450 shift64RightJamming( aSig, 22, &aSig ); 2451 zSig = aSig; 2452 if ( aExp || zSig ) { 2453 zSig |= 0x40000000; 2454 aExp -= 0x381; 2455 } 2456 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 2457 2458 } 2459 2460 #ifdef FLOATX80 2461 2462 /*---------------------------------------------------------------------------- 2463 | Returns the result of converting the double-precision floating-point value 2464 | `a' to the extended double-precision floating-point format. The conversion 2465 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2466 | Arithmetic. 2467 *----------------------------------------------------------------------------*/ 2468 2469 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 2470 { 2471 flag aSign; 2472 int16 aExp; 2473 bits64 aSig; 2474 2475 aSig = extractFloat64Frac( a ); 2476 aExp = extractFloat64Exp( a ); 2477 aSign = extractFloat64Sign( a ); 2478 if ( aExp == 0x7FF ) { 2479 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) ); 2480 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2481 } 2482 if ( aExp == 0 ) { 2483 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2484 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2485 } 2486 return 2487 packFloatx80( 2488 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2489 2490 } 2491 2492 #endif 2493 2494 #ifdef FLOAT128 2495 2496 /*---------------------------------------------------------------------------- 2497 | Returns the result of converting the double-precision floating-point value 2498 | `a' to the quadruple-precision floating-point format. The conversion is 2499 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2500 | Arithmetic. 2501 *----------------------------------------------------------------------------*/ 2502 2503 float128 float64_to_float128( float64 a STATUS_PARAM ) 2504 { 2505 flag aSign; 2506 int16 aExp; 2507 bits64 aSig, zSig0, zSig1; 2508 2509 aSig = extractFloat64Frac( a ); 2510 aExp = extractFloat64Exp( a ); 2511 aSign = extractFloat64Sign( a ); 2512 if ( aExp == 0x7FF ) { 2513 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) ); 2514 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2515 } 2516 if ( aExp == 0 ) { 2517 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2518 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2519 --aExp; 2520 } 2521 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2522 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2523 2524 } 2525 2526 #endif 2527 2528 /*---------------------------------------------------------------------------- 2529 | Rounds the double-precision floating-point value `a' to an integer, and 2530 | returns the result as a double-precision floating-point value. The 2531 | operation is performed according to the IEC/IEEE Standard for Binary 2532 | Floating-Point Arithmetic. 2533 *----------------------------------------------------------------------------*/ 2534 2535 float64 float64_round_to_int( float64 a STATUS_PARAM ) 2536 { 2537 flag aSign; 2538 int16 aExp; 2539 bits64 lastBitMask, roundBitsMask; 2540 int8 roundingMode; 2541 bits64 z; 2542 2543 aExp = extractFloat64Exp( a ); 2544 if ( 0x433 <= aExp ) { 2545 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2546 return propagateFloat64NaN( a, a STATUS_VAR ); 2547 } 2548 return a; 2549 } 2550 if ( aExp < 0x3FF ) { 2551 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a; 2552 STATUS(float_exception_flags) |= float_flag_inexact; 2553 aSign = extractFloat64Sign( a ); 2554 switch ( STATUS(float_rounding_mode) ) { 2555 case float_round_nearest_even: 2556 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2557 return packFloat64( aSign, 0x3FF, 0 ); 2558 } 2559 break; 2560 case float_round_down: 2561 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 2562 case float_round_up: 2563 return make_float64( 2564 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 2565 } 2566 return packFloat64( aSign, 0, 0 ); 2567 } 2568 lastBitMask = 1; 2569 lastBitMask <<= 0x433 - aExp; 2570 roundBitsMask = lastBitMask - 1; 2571 z = float64_val(a); 2572 roundingMode = STATUS(float_rounding_mode); 2573 if ( roundingMode == float_round_nearest_even ) { 2574 z += lastBitMask>>1; 2575 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2576 } 2577 else if ( roundingMode != float_round_to_zero ) { 2578 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { 2579 z += roundBitsMask; 2580 } 2581 } 2582 z &= ~ roundBitsMask; 2583 if ( z != float64_val(a) ) 2584 STATUS(float_exception_flags) |= float_flag_inexact; 2585 return make_float64(z); 2586 2587 } 2588 2589 float64 float64_trunc_to_int( float64 a STATUS_PARAM) 2590 { 2591 int oldmode; 2592 float64 res; 2593 oldmode = STATUS(float_rounding_mode); 2594 STATUS(float_rounding_mode) = float_round_to_zero; 2595 res = float64_round_to_int(a STATUS_VAR); 2596 STATUS(float_rounding_mode) = oldmode; 2597 return res; 2598 } 2599 2600 /*---------------------------------------------------------------------------- 2601 | Returns the result of adding the absolute values of the double-precision 2602 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2603 | before being returned. `zSign' is ignored if the result is a NaN. 2604 | The addition is performed according to the IEC/IEEE Standard for Binary 2605 | Floating-Point Arithmetic. 2606 *----------------------------------------------------------------------------*/ 2607 2608 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 2609 { 2610 int16 aExp, bExp, zExp; 2611 bits64 aSig, bSig, zSig; 2612 int16 expDiff; 2613 2614 aSig = extractFloat64Frac( a ); 2615 aExp = extractFloat64Exp( a ); 2616 bSig = extractFloat64Frac( b ); 2617 bExp = extractFloat64Exp( b ); 2618 expDiff = aExp - bExp; 2619 aSig <<= 9; 2620 bSig <<= 9; 2621 if ( 0 < expDiff ) { 2622 if ( aExp == 0x7FF ) { 2623 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2624 return a; 2625 } 2626 if ( bExp == 0 ) { 2627 --expDiff; 2628 } 2629 else { 2630 bSig |= LIT64( 0x2000000000000000 ); 2631 } 2632 shift64RightJamming( bSig, expDiff, &bSig ); 2633 zExp = aExp; 2634 } 2635 else if ( expDiff < 0 ) { 2636 if ( bExp == 0x7FF ) { 2637 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2638 return packFloat64( zSign, 0x7FF, 0 ); 2639 } 2640 if ( aExp == 0 ) { 2641 ++expDiff; 2642 } 2643 else { 2644 aSig |= LIT64( 0x2000000000000000 ); 2645 } 2646 shift64RightJamming( aSig, - expDiff, &aSig ); 2647 zExp = bExp; 2648 } 2649 else { 2650 if ( aExp == 0x7FF ) { 2651 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2652 return a; 2653 } 2654 if ( aExp == 0 ) { 2655 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); 2656 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2657 } 2658 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2659 zExp = aExp; 2660 goto roundAndPack; 2661 } 2662 aSig |= LIT64( 0x2000000000000000 ); 2663 zSig = ( aSig + bSig )<<1; 2664 --zExp; 2665 if ( (sbits64) zSig < 0 ) { 2666 zSig = aSig + bSig; 2667 ++zExp; 2668 } 2669 roundAndPack: 2670 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2671 2672 } 2673 2674 /*---------------------------------------------------------------------------- 2675 | Returns the result of subtracting the absolute values of the double- 2676 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2677 | difference is negated before being returned. `zSign' is ignored if the 2678 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2679 | Standard for Binary Floating-Point Arithmetic. 2680 *----------------------------------------------------------------------------*/ 2681 2682 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 2683 { 2684 int16 aExp, bExp, zExp; 2685 bits64 aSig, bSig, zSig; 2686 int16 expDiff; 2687 2688 aSig = extractFloat64Frac( a ); 2689 aExp = extractFloat64Exp( a ); 2690 bSig = extractFloat64Frac( b ); 2691 bExp = extractFloat64Exp( b ); 2692 expDiff = aExp - bExp; 2693 aSig <<= 10; 2694 bSig <<= 10; 2695 if ( 0 < expDiff ) goto aExpBigger; 2696 if ( expDiff < 0 ) goto bExpBigger; 2697 if ( aExp == 0x7FF ) { 2698 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2699 float_raise( float_flag_invalid STATUS_VAR); 2700 return float64_default_nan; 2701 } 2702 if ( aExp == 0 ) { 2703 aExp = 1; 2704 bExp = 1; 2705 } 2706 if ( bSig < aSig ) goto aBigger; 2707 if ( aSig < bSig ) goto bBigger; 2708 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 2709 bExpBigger: 2710 if ( bExp == 0x7FF ) { 2711 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2712 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2713 } 2714 if ( aExp == 0 ) { 2715 ++expDiff; 2716 } 2717 else { 2718 aSig |= LIT64( 0x4000000000000000 ); 2719 } 2720 shift64RightJamming( aSig, - expDiff, &aSig ); 2721 bSig |= LIT64( 0x4000000000000000 ); 2722 bBigger: 2723 zSig = bSig - aSig; 2724 zExp = bExp; 2725 zSign ^= 1; 2726 goto normalizeRoundAndPack; 2727 aExpBigger: 2728 if ( aExp == 0x7FF ) { 2729 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2730 return a; 2731 } 2732 if ( bExp == 0 ) { 2733 --expDiff; 2734 } 2735 else { 2736 bSig |= LIT64( 0x4000000000000000 ); 2737 } 2738 shift64RightJamming( bSig, expDiff, &bSig ); 2739 aSig |= LIT64( 0x4000000000000000 ); 2740 aBigger: 2741 zSig = aSig - bSig; 2742 zExp = aExp; 2743 normalizeRoundAndPack: 2744 --zExp; 2745 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2746 2747 } 2748 2749 /*---------------------------------------------------------------------------- 2750 | Returns the result of adding the double-precision floating-point values `a' 2751 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2752 | Binary Floating-Point Arithmetic. 2753 *----------------------------------------------------------------------------*/ 2754 2755 float64 float64_add( float64 a, float64 b STATUS_PARAM ) 2756 { 2757 flag aSign, bSign; 2758 2759 aSign = extractFloat64Sign( a ); 2760 bSign = extractFloat64Sign( b ); 2761 if ( aSign == bSign ) { 2762 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 2763 } 2764 else { 2765 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 2766 } 2767 2768 } 2769 2770 /*---------------------------------------------------------------------------- 2771 | Returns the result of subtracting the double-precision floating-point values 2772 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2773 | for Binary Floating-Point Arithmetic. 2774 *----------------------------------------------------------------------------*/ 2775 2776 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 2777 { 2778 flag aSign, bSign; 2779 2780 aSign = extractFloat64Sign( a ); 2781 bSign = extractFloat64Sign( b ); 2782 if ( aSign == bSign ) { 2783 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 2784 } 2785 else { 2786 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 2787 } 2788 2789 } 2790 2791 /*---------------------------------------------------------------------------- 2792 | Returns the result of multiplying the double-precision floating-point values 2793 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2794 | for Binary Floating-Point Arithmetic. 2795 *----------------------------------------------------------------------------*/ 2796 2797 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 2798 { 2799 flag aSign, bSign, zSign; 2800 int16 aExp, bExp, zExp; 2801 bits64 aSig, bSig, zSig0, zSig1; 2802 2803 aSig = extractFloat64Frac( a ); 2804 aExp = extractFloat64Exp( a ); 2805 aSign = extractFloat64Sign( a ); 2806 bSig = extractFloat64Frac( b ); 2807 bExp = extractFloat64Exp( b ); 2808 bSign = extractFloat64Sign( b ); 2809 zSign = aSign ^ bSign; 2810 if ( aExp == 0x7FF ) { 2811 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2812 return propagateFloat64NaN( a, b STATUS_VAR ); 2813 } 2814 if ( ( bExp | bSig ) == 0 ) { 2815 float_raise( float_flag_invalid STATUS_VAR); 2816 return float64_default_nan; 2817 } 2818 return packFloat64( zSign, 0x7FF, 0 ); 2819 } 2820 if ( bExp == 0x7FF ) { 2821 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2822 if ( ( aExp | aSig ) == 0 ) { 2823 float_raise( float_flag_invalid STATUS_VAR); 2824 return float64_default_nan; 2825 } 2826 return packFloat64( zSign, 0x7FF, 0 ); 2827 } 2828 if ( aExp == 0 ) { 2829 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2830 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2831 } 2832 if ( bExp == 0 ) { 2833 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2834 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2835 } 2836 zExp = aExp + bExp - 0x3FF; 2837 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2838 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2839 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2840 zSig0 |= ( zSig1 != 0 ); 2841 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2842 zSig0 <<= 1; 2843 --zExp; 2844 } 2845 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 2846 2847 } 2848 2849 /*---------------------------------------------------------------------------- 2850 | Returns the result of dividing the double-precision floating-point value `a' 2851 | by the corresponding value `b'. The operation is performed according to 2852 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2853 *----------------------------------------------------------------------------*/ 2854 2855 float64 float64_div( float64 a, float64 b STATUS_PARAM ) 2856 { 2857 flag aSign, bSign, zSign; 2858 int16 aExp, bExp, zExp; 2859 bits64 aSig, bSig, zSig; 2860 bits64 rem0, rem1; 2861 bits64 term0, term1; 2862 2863 aSig = extractFloat64Frac( a ); 2864 aExp = extractFloat64Exp( a ); 2865 aSign = extractFloat64Sign( a ); 2866 bSig = extractFloat64Frac( b ); 2867 bExp = extractFloat64Exp( b ); 2868 bSign = extractFloat64Sign( b ); 2869 zSign = aSign ^ bSign; 2870 if ( aExp == 0x7FF ) { 2871 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2872 if ( bExp == 0x7FF ) { 2873 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2874 float_raise( float_flag_invalid STATUS_VAR); 2875 return float64_default_nan; 2876 } 2877 return packFloat64( zSign, 0x7FF, 0 ); 2878 } 2879 if ( bExp == 0x7FF ) { 2880 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2881 return packFloat64( zSign, 0, 0 ); 2882 } 2883 if ( bExp == 0 ) { 2884 if ( bSig == 0 ) { 2885 if ( ( aExp | aSig ) == 0 ) { 2886 float_raise( float_flag_invalid STATUS_VAR); 2887 return float64_default_nan; 2888 } 2889 float_raise( float_flag_divbyzero STATUS_VAR); 2890 return packFloat64( zSign, 0x7FF, 0 ); 2891 } 2892 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2893 } 2894 if ( aExp == 0 ) { 2895 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2896 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2897 } 2898 zExp = aExp - bExp + 0x3FD; 2899 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2900 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2901 if ( bSig <= ( aSig + aSig ) ) { 2902 aSig >>= 1; 2903 ++zExp; 2904 } 2905 zSig = estimateDiv128To64( aSig, 0, bSig ); 2906 if ( ( zSig & 0x1FF ) <= 2 ) { 2907 mul64To128( bSig, zSig, &term0, &term1 ); 2908 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2909 while ( (sbits64) rem0 < 0 ) { 2910 --zSig; 2911 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2912 } 2913 zSig |= ( rem1 != 0 ); 2914 } 2915 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2916 2917 } 2918 2919 /*---------------------------------------------------------------------------- 2920 | Returns the remainder of the double-precision floating-point value `a' 2921 | with respect to the corresponding value `b'. The operation is performed 2922 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2923 *----------------------------------------------------------------------------*/ 2924 2925 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 2926 { 2927 flag aSign, bSign, zSign; 2928 int16 aExp, bExp, expDiff; 2929 bits64 aSig, bSig; 2930 bits64 q, alternateASig; 2931 sbits64 sigMean; 2932 2933 aSig = extractFloat64Frac( a ); 2934 aExp = extractFloat64Exp( a ); 2935 aSign = extractFloat64Sign( a ); 2936 bSig = extractFloat64Frac( b ); 2937 bExp = extractFloat64Exp( b ); 2938 bSign = extractFloat64Sign( b ); 2939 if ( aExp == 0x7FF ) { 2940 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2941 return propagateFloat64NaN( a, b STATUS_VAR ); 2942 } 2943 float_raise( float_flag_invalid STATUS_VAR); 2944 return float64_default_nan; 2945 } 2946 if ( bExp == 0x7FF ) { 2947 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2948 return a; 2949 } 2950 if ( bExp == 0 ) { 2951 if ( bSig == 0 ) { 2952 float_raise( float_flag_invalid STATUS_VAR); 2953 return float64_default_nan; 2954 } 2955 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2956 } 2957 if ( aExp == 0 ) { 2958 if ( aSig == 0 ) return a; 2959 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2960 } 2961 expDiff = aExp - bExp; 2962 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2963 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2964 if ( expDiff < 0 ) { 2965 if ( expDiff < -1 ) return a; 2966 aSig >>= 1; 2967 } 2968 q = ( bSig <= aSig ); 2969 if ( q ) aSig -= bSig; 2970 expDiff -= 64; 2971 while ( 0 < expDiff ) { 2972 q = estimateDiv128To64( aSig, 0, bSig ); 2973 q = ( 2 < q ) ? q - 2 : 0; 2974 aSig = - ( ( bSig>>2 ) * q ); 2975 expDiff -= 62; 2976 } 2977 expDiff += 64; 2978 if ( 0 < expDiff ) { 2979 q = estimateDiv128To64( aSig, 0, bSig ); 2980 q = ( 2 < q ) ? q - 2 : 0; 2981 q >>= 64 - expDiff; 2982 bSig >>= 2; 2983 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2984 } 2985 else { 2986 aSig >>= 2; 2987 bSig >>= 2; 2988 } 2989 do { 2990 alternateASig = aSig; 2991 ++q; 2992 aSig -= bSig; 2993 } while ( 0 <= (sbits64) aSig ); 2994 sigMean = aSig + alternateASig; 2995 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2996 aSig = alternateASig; 2997 } 2998 zSign = ( (sbits64) aSig < 0 ); 2999 if ( zSign ) aSig = - aSig; 3000 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 3001 3002 } 3003 3004 /*---------------------------------------------------------------------------- 3005 | Returns the square root of the double-precision floating-point value `a'. 3006 | The operation is performed according to the IEC/IEEE Standard for Binary 3007 | Floating-Point Arithmetic. 3008 *----------------------------------------------------------------------------*/ 3009 3010 float64 float64_sqrt( float64 a STATUS_PARAM ) 3011 { 3012 flag aSign; 3013 int16 aExp, zExp; 3014 bits64 aSig, zSig, doubleZSig; 3015 bits64 rem0, rem1, term0, term1; 3016 3017 aSig = extractFloat64Frac( a ); 3018 aExp = extractFloat64Exp( a ); 3019 aSign = extractFloat64Sign( a ); 3020 if ( aExp == 0x7FF ) { 3021 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 3022 if ( ! aSign ) return a; 3023 float_raise( float_flag_invalid STATUS_VAR); 3024 return float64_default_nan; 3025 } 3026 if ( aSign ) { 3027 if ( ( aExp | aSig ) == 0 ) return a; 3028 float_raise( float_flag_invalid STATUS_VAR); 3029 return float64_default_nan; 3030 } 3031 if ( aExp == 0 ) { 3032 if ( aSig == 0 ) return float64_zero; 3033 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3034 } 3035 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3036 aSig |= LIT64( 0x0010000000000000 ); 3037 zSig = estimateSqrt32( aExp, aSig>>21 ); 3038 aSig <<= 9 - ( aExp & 1 ); 3039 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3040 if ( ( zSig & 0x1FF ) <= 5 ) { 3041 doubleZSig = zSig<<1; 3042 mul64To128( zSig, zSig, &term0, &term1 ); 3043 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3044 while ( (sbits64) rem0 < 0 ) { 3045 --zSig; 3046 doubleZSig -= 2; 3047 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3048 } 3049 zSig |= ( ( rem0 | rem1 ) != 0 ); 3050 } 3051 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 3052 3053 } 3054 3055 /*---------------------------------------------------------------------------- 3056 | Returns the binary log of the double-precision floating-point value `a'. 3057 | The operation is performed according to the IEC/IEEE Standard for Binary 3058 | Floating-Point Arithmetic. 3059 *----------------------------------------------------------------------------*/ 3060 float64 float64_log2( float64 a STATUS_PARAM ) 3061 { 3062 flag aSign, zSign; 3063 int16 aExp; 3064 bits64 aSig, aSig0, aSig1, zSig, i; 3065 3066 aSig = extractFloat64Frac( a ); 3067 aExp = extractFloat64Exp( a ); 3068 aSign = extractFloat64Sign( a ); 3069 3070 if ( aExp == 0 ) { 3071 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 3072 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3073 } 3074 if ( aSign ) { 3075 float_raise( float_flag_invalid STATUS_VAR); 3076 return float64_default_nan; 3077 } 3078 if ( aExp == 0x7FF ) { 3079 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); 3080 return a; 3081 } 3082 3083 aExp -= 0x3FF; 3084 aSig |= LIT64( 0x0010000000000000 ); 3085 zSign = aExp < 0; 3086 zSig = (bits64)aExp << 52; 3087 for (i = 1LL << 51; i > 0; i >>= 1) { 3088 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 3089 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 3090 if ( aSig & LIT64( 0x0020000000000000 ) ) { 3091 aSig >>= 1; 3092 zSig |= i; 3093 } 3094 } 3095 3096 if ( zSign ) 3097 zSig = -zSig; 3098 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); 3099 } 3100 3101 /*---------------------------------------------------------------------------- 3102 | Returns 1 if the double-precision floating-point value `a' is equal to the 3103 | corresponding value `b', and 0 otherwise. The comparison is performed 3104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3105 *----------------------------------------------------------------------------*/ 3106 3107 int float64_eq( float64 a, float64 b STATUS_PARAM ) 3108 { 3109 bits64 av, bv; 3110 3111 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3112 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3113 ) { 3114 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3115 float_raise( float_flag_invalid STATUS_VAR); 3116 } 3117 return 0; 3118 } 3119 av = float64_val(a); 3120 bv = float64_val(b); 3121 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3122 3123 } 3124 3125 /*---------------------------------------------------------------------------- 3126 | Returns 1 if the double-precision floating-point value `a' is less than or 3127 | equal to the corresponding value `b', and 0 otherwise. The comparison is 3128 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3129 | Arithmetic. 3130 *----------------------------------------------------------------------------*/ 3131 3132 int float64_le( float64 a, float64 b STATUS_PARAM ) 3133 { 3134 flag aSign, bSign; 3135 bits64 av, bv; 3136 3137 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3138 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3139 ) { 3140 float_raise( float_flag_invalid STATUS_VAR); 3141 return 0; 3142 } 3143 aSign = extractFloat64Sign( a ); 3144 bSign = extractFloat64Sign( b ); 3145 av = float64_val(a); 3146 bv = float64_val(b); 3147 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3148 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3149 3150 } 3151 3152 /*---------------------------------------------------------------------------- 3153 | Returns 1 if the double-precision floating-point value `a' is less than 3154 | the corresponding value `b', and 0 otherwise. The comparison is performed 3155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3156 *----------------------------------------------------------------------------*/ 3157 3158 int float64_lt( float64 a, float64 b STATUS_PARAM ) 3159 { 3160 flag aSign, bSign; 3161 bits64 av, bv; 3162 3163 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3164 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3165 ) { 3166 float_raise( float_flag_invalid STATUS_VAR); 3167 return 0; 3168 } 3169 aSign = extractFloat64Sign( a ); 3170 bSign = extractFloat64Sign( b ); 3171 av = float64_val(a); 3172 bv = float64_val(b); 3173 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3174 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3175 3176 } 3177 3178 /*---------------------------------------------------------------------------- 3179 | Returns 1 if the double-precision floating-point value `a' is equal to the 3180 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3181 | if either operand is a NaN. Otherwise, the comparison is performed 3182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3183 *----------------------------------------------------------------------------*/ 3184 3185 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM ) 3186 { 3187 bits64 av, bv; 3188 3189 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3190 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3191 ) { 3192 float_raise( float_flag_invalid STATUS_VAR); 3193 return 0; 3194 } 3195 av = float64_val(a); 3196 bv = float64_val(b); 3197 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3198 3199 } 3200 3201 /*---------------------------------------------------------------------------- 3202 | Returns 1 if the double-precision floating-point value `a' is less than or 3203 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3204 | cause an exception. Otherwise, the comparison is performed according to the 3205 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3206 *----------------------------------------------------------------------------*/ 3207 3208 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 3209 { 3210 flag aSign, bSign; 3211 bits64 av, bv; 3212 3213 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3214 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3215 ) { 3216 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3217 float_raise( float_flag_invalid STATUS_VAR); 3218 } 3219 return 0; 3220 } 3221 aSign = extractFloat64Sign( a ); 3222 bSign = extractFloat64Sign( b ); 3223 av = float64_val(a); 3224 bv = float64_val(b); 3225 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3226 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3227 3228 } 3229 3230 /*---------------------------------------------------------------------------- 3231 | Returns 1 if the double-precision floating-point value `a' is less than 3232 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3233 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3234 | Standard for Binary Floating-Point Arithmetic. 3235 *----------------------------------------------------------------------------*/ 3236 3237 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 3238 { 3239 flag aSign, bSign; 3240 bits64 av, bv; 3241 3242 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3243 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3244 ) { 3245 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3246 float_raise( float_flag_invalid STATUS_VAR); 3247 } 3248 return 0; 3249 } 3250 aSign = extractFloat64Sign( a ); 3251 bSign = extractFloat64Sign( b ); 3252 av = float64_val(a); 3253 bv = float64_val(b); 3254 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3255 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3256 3257 } 3258 3259 #ifdef FLOATX80 3260 3261 /*---------------------------------------------------------------------------- 3262 | Returns the result of converting the extended double-precision floating- 3263 | point value `a' to the 32-bit two's complement integer format. The 3264 | conversion is performed according to the IEC/IEEE Standard for Binary 3265 | Floating-Point Arithmetic---which means in particular that the conversion 3266 | is rounded according to the current rounding mode. If `a' is a NaN, the 3267 | largest positive integer is returned. Otherwise, if the conversion 3268 | overflows, the largest integer with the same sign as `a' is returned. 3269 *----------------------------------------------------------------------------*/ 3270 3271 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 3272 { 3273 flag aSign; 3274 int32 aExp, shiftCount; 3275 bits64 aSig; 3276 3277 aSig = extractFloatx80Frac( a ); 3278 aExp = extractFloatx80Exp( a ); 3279 aSign = extractFloatx80Sign( a ); 3280 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3281 shiftCount = 0x4037 - aExp; 3282 if ( shiftCount <= 0 ) shiftCount = 1; 3283 shift64RightJamming( aSig, shiftCount, &aSig ); 3284 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 3285 3286 } 3287 3288 /*---------------------------------------------------------------------------- 3289 | Returns the result of converting the extended double-precision floating- 3290 | point value `a' to the 32-bit two's complement integer format. The 3291 | conversion is performed according to the IEC/IEEE Standard for Binary 3292 | Floating-Point Arithmetic, except that the conversion is always rounded 3293 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3294 | Otherwise, if the conversion overflows, the largest integer with the same 3295 | sign as `a' is returned. 3296 *----------------------------------------------------------------------------*/ 3297 3298 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 3299 { 3300 flag aSign; 3301 int32 aExp, shiftCount; 3302 bits64 aSig, savedASig; 3303 int32 z; 3304 3305 aSig = extractFloatx80Frac( a ); 3306 aExp = extractFloatx80Exp( a ); 3307 aSign = extractFloatx80Sign( a ); 3308 if ( 0x401E < aExp ) { 3309 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3310 goto invalid; 3311 } 3312 else if ( aExp < 0x3FFF ) { 3313 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3314 return 0; 3315 } 3316 shiftCount = 0x403E - aExp; 3317 savedASig = aSig; 3318 aSig >>= shiftCount; 3319 z = aSig; 3320 if ( aSign ) z = - z; 3321 if ( ( z < 0 ) ^ aSign ) { 3322 invalid: 3323 float_raise( float_flag_invalid STATUS_VAR); 3324 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3325 } 3326 if ( ( aSig<<shiftCount ) != savedASig ) { 3327 STATUS(float_exception_flags) |= float_flag_inexact; 3328 } 3329 return z; 3330 3331 } 3332 3333 /*---------------------------------------------------------------------------- 3334 | Returns the result of converting the extended double-precision floating- 3335 | point value `a' to the 64-bit two's complement integer format. The 3336 | conversion is performed according to the IEC/IEEE Standard for Binary 3337 | Floating-Point Arithmetic---which means in particular that the conversion 3338 | is rounded according to the current rounding mode. If `a' is a NaN, 3339 | the largest positive integer is returned. Otherwise, if the conversion 3340 | overflows, the largest integer with the same sign as `a' is returned. 3341 *----------------------------------------------------------------------------*/ 3342 3343 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 3344 { 3345 flag aSign; 3346 int32 aExp, shiftCount; 3347 bits64 aSig, aSigExtra; 3348 3349 aSig = extractFloatx80Frac( a ); 3350 aExp = extractFloatx80Exp( a ); 3351 aSign = extractFloatx80Sign( a ); 3352 shiftCount = 0x403E - aExp; 3353 if ( shiftCount <= 0 ) { 3354 if ( shiftCount ) { 3355 float_raise( float_flag_invalid STATUS_VAR); 3356 if ( ! aSign 3357 || ( ( aExp == 0x7FFF ) 3358 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3359 ) { 3360 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3361 } 3362 return (sbits64) LIT64( 0x8000000000000000 ); 3363 } 3364 aSigExtra = 0; 3365 } 3366 else { 3367 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3368 } 3369 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3370 3371 } 3372 3373 /*---------------------------------------------------------------------------- 3374 | Returns the result of converting the extended double-precision floating- 3375 | point value `a' to the 64-bit two's complement integer format. The 3376 | conversion is performed according to the IEC/IEEE Standard for Binary 3377 | Floating-Point Arithmetic, except that the conversion is always rounded 3378 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3379 | Otherwise, if the conversion overflows, the largest integer with the same 3380 | sign as `a' is returned. 3381 *----------------------------------------------------------------------------*/ 3382 3383 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 3384 { 3385 flag aSign; 3386 int32 aExp, shiftCount; 3387 bits64 aSig; 3388 int64 z; 3389 3390 aSig = extractFloatx80Frac( a ); 3391 aExp = extractFloatx80Exp( a ); 3392 aSign = extractFloatx80Sign( a ); 3393 shiftCount = aExp - 0x403E; 3394 if ( 0 <= shiftCount ) { 3395 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3396 if ( ( a.high != 0xC03E ) || aSig ) { 3397 float_raise( float_flag_invalid STATUS_VAR); 3398 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3399 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3400 } 3401 } 3402 return (sbits64) LIT64( 0x8000000000000000 ); 3403 } 3404 else if ( aExp < 0x3FFF ) { 3405 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3406 return 0; 3407 } 3408 z = aSig>>( - shiftCount ); 3409 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3410 STATUS(float_exception_flags) |= float_flag_inexact; 3411 } 3412 if ( aSign ) z = - z; 3413 return z; 3414 3415 } 3416 3417 /*---------------------------------------------------------------------------- 3418 | Returns the result of converting the extended double-precision floating- 3419 | point value `a' to the single-precision floating-point format. The 3420 | conversion is performed according to the IEC/IEEE Standard for Binary 3421 | Floating-Point Arithmetic. 3422 *----------------------------------------------------------------------------*/ 3423 3424 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 3425 { 3426 flag aSign; 3427 int32 aExp; 3428 bits64 aSig; 3429 3430 aSig = extractFloatx80Frac( a ); 3431 aExp = extractFloatx80Exp( a ); 3432 aSign = extractFloatx80Sign( a ); 3433 if ( aExp == 0x7FFF ) { 3434 if ( (bits64) ( aSig<<1 ) ) { 3435 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) ); 3436 } 3437 return packFloat32( aSign, 0xFF, 0 ); 3438 } 3439 shift64RightJamming( aSig, 33, &aSig ); 3440 if ( aExp || aSig ) aExp -= 0x3F81; 3441 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 3442 3443 } 3444 3445 /*---------------------------------------------------------------------------- 3446 | Returns the result of converting the extended double-precision floating- 3447 | point value `a' to the double-precision floating-point format. The 3448 | conversion is performed according to the IEC/IEEE Standard for Binary 3449 | Floating-Point Arithmetic. 3450 *----------------------------------------------------------------------------*/ 3451 3452 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 3453 { 3454 flag aSign; 3455 int32 aExp; 3456 bits64 aSig, zSig; 3457 3458 aSig = extractFloatx80Frac( a ); 3459 aExp = extractFloatx80Exp( a ); 3460 aSign = extractFloatx80Sign( a ); 3461 if ( aExp == 0x7FFF ) { 3462 if ( (bits64) ( aSig<<1 ) ) { 3463 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) ); 3464 } 3465 return packFloat64( aSign, 0x7FF, 0 ); 3466 } 3467 shift64RightJamming( aSig, 1, &zSig ); 3468 if ( aExp || aSig ) aExp -= 0x3C01; 3469 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 3470 3471 } 3472 3473 #ifdef FLOAT128 3474 3475 /*---------------------------------------------------------------------------- 3476 | Returns the result of converting the extended double-precision floating- 3477 | point value `a' to the quadruple-precision floating-point format. The 3478 | conversion is performed according to the IEC/IEEE Standard for Binary 3479 | Floating-Point Arithmetic. 3480 *----------------------------------------------------------------------------*/ 3481 3482 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 3483 { 3484 flag aSign; 3485 int16 aExp; 3486 bits64 aSig, zSig0, zSig1; 3487 3488 aSig = extractFloatx80Frac( a ); 3489 aExp = extractFloatx80Exp( a ); 3490 aSign = extractFloatx80Sign( a ); 3491 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3492 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) ); 3493 } 3494 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3495 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3496 3497 } 3498 3499 #endif 3500 3501 /*---------------------------------------------------------------------------- 3502 | Rounds the extended double-precision floating-point value `a' to an integer, 3503 | and returns the result as an extended quadruple-precision floating-point 3504 | value. The operation is performed according to the IEC/IEEE Standard for 3505 | Binary Floating-Point Arithmetic. 3506 *----------------------------------------------------------------------------*/ 3507 3508 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 3509 { 3510 flag aSign; 3511 int32 aExp; 3512 bits64 lastBitMask, roundBitsMask; 3513 int8 roundingMode; 3514 floatx80 z; 3515 3516 aExp = extractFloatx80Exp( a ); 3517 if ( 0x403E <= aExp ) { 3518 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3519 return propagateFloatx80NaN( a, a STATUS_VAR ); 3520 } 3521 return a; 3522 } 3523 if ( aExp < 0x3FFF ) { 3524 if ( ( aExp == 0 ) 3525 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3526 return a; 3527 } 3528 STATUS(float_exception_flags) |= float_flag_inexact; 3529 aSign = extractFloatx80Sign( a ); 3530 switch ( STATUS(float_rounding_mode) ) { 3531 case float_round_nearest_even: 3532 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3533 ) { 3534 return 3535 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3536 } 3537 break; 3538 case float_round_down: 3539 return 3540 aSign ? 3541 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3542 : packFloatx80( 0, 0, 0 ); 3543 case float_round_up: 3544 return 3545 aSign ? packFloatx80( 1, 0, 0 ) 3546 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3547 } 3548 return packFloatx80( aSign, 0, 0 ); 3549 } 3550 lastBitMask = 1; 3551 lastBitMask <<= 0x403E - aExp; 3552 roundBitsMask = lastBitMask - 1; 3553 z = a; 3554 roundingMode = STATUS(float_rounding_mode); 3555 if ( roundingMode == float_round_nearest_even ) { 3556 z.low += lastBitMask>>1; 3557 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3558 } 3559 else if ( roundingMode != float_round_to_zero ) { 3560 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3561 z.low += roundBitsMask; 3562 } 3563 } 3564 z.low &= ~ roundBitsMask; 3565 if ( z.low == 0 ) { 3566 ++z.high; 3567 z.low = LIT64( 0x8000000000000000 ); 3568 } 3569 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 3570 return z; 3571 3572 } 3573 3574 /*---------------------------------------------------------------------------- 3575 | Returns the result of adding the absolute values of the extended double- 3576 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3577 | negated before being returned. `zSign' is ignored if the result is a NaN. 3578 | The addition is performed according to the IEC/IEEE Standard for Binary 3579 | Floating-Point Arithmetic. 3580 *----------------------------------------------------------------------------*/ 3581 3582 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 3583 { 3584 int32 aExp, bExp, zExp; 3585 bits64 aSig, bSig, zSig0, zSig1; 3586 int32 expDiff; 3587 3588 aSig = extractFloatx80Frac( a ); 3589 aExp = extractFloatx80Exp( a ); 3590 bSig = extractFloatx80Frac( b ); 3591 bExp = extractFloatx80Exp( b ); 3592 expDiff = aExp - bExp; 3593 if ( 0 < expDiff ) { 3594 if ( aExp == 0x7FFF ) { 3595 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3596 return a; 3597 } 3598 if ( bExp == 0 ) --expDiff; 3599 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3600 zExp = aExp; 3601 } 3602 else if ( expDiff < 0 ) { 3603 if ( bExp == 0x7FFF ) { 3604 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3605 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3606 } 3607 if ( aExp == 0 ) ++expDiff; 3608 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3609 zExp = bExp; 3610 } 3611 else { 3612 if ( aExp == 0x7FFF ) { 3613 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3614 return propagateFloatx80NaN( a, b STATUS_VAR ); 3615 } 3616 return a; 3617 } 3618 zSig1 = 0; 3619 zSig0 = aSig + bSig; 3620 if ( aExp == 0 ) { 3621 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3622 goto roundAndPack; 3623 } 3624 zExp = aExp; 3625 goto shiftRight1; 3626 } 3627 zSig0 = aSig + bSig; 3628 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3629 shiftRight1: 3630 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3631 zSig0 |= LIT64( 0x8000000000000000 ); 3632 ++zExp; 3633 roundAndPack: 3634 return 3635 roundAndPackFloatx80( 3636 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3637 3638 } 3639 3640 /*---------------------------------------------------------------------------- 3641 | Returns the result of subtracting the absolute values of the extended 3642 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3643 | difference is negated before being returned. `zSign' is ignored if the 3644 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3645 | Standard for Binary Floating-Point Arithmetic. 3646 *----------------------------------------------------------------------------*/ 3647 3648 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 3649 { 3650 int32 aExp, bExp, zExp; 3651 bits64 aSig, bSig, zSig0, zSig1; 3652 int32 expDiff; 3653 floatx80 z; 3654 3655 aSig = extractFloatx80Frac( a ); 3656 aExp = extractFloatx80Exp( a ); 3657 bSig = extractFloatx80Frac( b ); 3658 bExp = extractFloatx80Exp( b ); 3659 expDiff = aExp - bExp; 3660 if ( 0 < expDiff ) goto aExpBigger; 3661 if ( expDiff < 0 ) goto bExpBigger; 3662 if ( aExp == 0x7FFF ) { 3663 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3664 return propagateFloatx80NaN( a, b STATUS_VAR ); 3665 } 3666 float_raise( float_flag_invalid STATUS_VAR); 3667 z.low = floatx80_default_nan_low; 3668 z.high = floatx80_default_nan_high; 3669 return z; 3670 } 3671 if ( aExp == 0 ) { 3672 aExp = 1; 3673 bExp = 1; 3674 } 3675 zSig1 = 0; 3676 if ( bSig < aSig ) goto aBigger; 3677 if ( aSig < bSig ) goto bBigger; 3678 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3679 bExpBigger: 3680 if ( bExp == 0x7FFF ) { 3681 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3682 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3683 } 3684 if ( aExp == 0 ) ++expDiff; 3685 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3686 bBigger: 3687 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3688 zExp = bExp; 3689 zSign ^= 1; 3690 goto normalizeRoundAndPack; 3691 aExpBigger: 3692 if ( aExp == 0x7FFF ) { 3693 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3694 return a; 3695 } 3696 if ( bExp == 0 ) --expDiff; 3697 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3698 aBigger: 3699 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3700 zExp = aExp; 3701 normalizeRoundAndPack: 3702 return 3703 normalizeRoundAndPackFloatx80( 3704 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3705 3706 } 3707 3708 /*---------------------------------------------------------------------------- 3709 | Returns the result of adding the extended double-precision floating-point 3710 | values `a' and `b'. The operation is performed according to the IEC/IEEE 3711 | Standard for Binary Floating-Point Arithmetic. 3712 *----------------------------------------------------------------------------*/ 3713 3714 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 3715 { 3716 flag aSign, bSign; 3717 3718 aSign = extractFloatx80Sign( a ); 3719 bSign = extractFloatx80Sign( b ); 3720 if ( aSign == bSign ) { 3721 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 3722 } 3723 else { 3724 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 3725 } 3726 3727 } 3728 3729 /*---------------------------------------------------------------------------- 3730 | Returns the result of subtracting the extended double-precision floating- 3731 | point values `a' and `b'. The operation is performed according to the 3732 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3733 *----------------------------------------------------------------------------*/ 3734 3735 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 3736 { 3737 flag aSign, bSign; 3738 3739 aSign = extractFloatx80Sign( a ); 3740 bSign = extractFloatx80Sign( b ); 3741 if ( aSign == bSign ) { 3742 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 3743 } 3744 else { 3745 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 3746 } 3747 3748 } 3749 3750 /*---------------------------------------------------------------------------- 3751 | Returns the result of multiplying the extended double-precision floating- 3752 | point values `a' and `b'. The operation is performed according to the 3753 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3754 *----------------------------------------------------------------------------*/ 3755 3756 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 3757 { 3758 flag aSign, bSign, zSign; 3759 int32 aExp, bExp, zExp; 3760 bits64 aSig, bSig, zSig0, zSig1; 3761 floatx80 z; 3762 3763 aSig = extractFloatx80Frac( a ); 3764 aExp = extractFloatx80Exp( a ); 3765 aSign = extractFloatx80Sign( a ); 3766 bSig = extractFloatx80Frac( b ); 3767 bExp = extractFloatx80Exp( b ); 3768 bSign = extractFloatx80Sign( b ); 3769 zSign = aSign ^ bSign; 3770 if ( aExp == 0x7FFF ) { 3771 if ( (bits64) ( aSig<<1 ) 3772 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3773 return propagateFloatx80NaN( a, b STATUS_VAR ); 3774 } 3775 if ( ( bExp | bSig ) == 0 ) goto invalid; 3776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3777 } 3778 if ( bExp == 0x7FFF ) { 3779 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3780 if ( ( aExp | aSig ) == 0 ) { 3781 invalid: 3782 float_raise( float_flag_invalid STATUS_VAR); 3783 z.low = floatx80_default_nan_low; 3784 z.high = floatx80_default_nan_high; 3785 return z; 3786 } 3787 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3788 } 3789 if ( aExp == 0 ) { 3790 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3791 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3792 } 3793 if ( bExp == 0 ) { 3794 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3795 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3796 } 3797 zExp = aExp + bExp - 0x3FFE; 3798 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3799 if ( 0 < (sbits64) zSig0 ) { 3800 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3801 --zExp; 3802 } 3803 return 3804 roundAndPackFloatx80( 3805 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3806 3807 } 3808 3809 /*---------------------------------------------------------------------------- 3810 | Returns the result of dividing the extended double-precision floating-point 3811 | value `a' by the corresponding value `b'. The operation is performed 3812 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3813 *----------------------------------------------------------------------------*/ 3814 3815 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 3816 { 3817 flag aSign, bSign, zSign; 3818 int32 aExp, bExp, zExp; 3819 bits64 aSig, bSig, zSig0, zSig1; 3820 bits64 rem0, rem1, rem2, term0, term1, term2; 3821 floatx80 z; 3822 3823 aSig = extractFloatx80Frac( a ); 3824 aExp = extractFloatx80Exp( a ); 3825 aSign = extractFloatx80Sign( a ); 3826 bSig = extractFloatx80Frac( b ); 3827 bExp = extractFloatx80Exp( b ); 3828 bSign = extractFloatx80Sign( b ); 3829 zSign = aSign ^ bSign; 3830 if ( aExp == 0x7FFF ) { 3831 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3832 if ( bExp == 0x7FFF ) { 3833 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3834 goto invalid; 3835 } 3836 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3837 } 3838 if ( bExp == 0x7FFF ) { 3839 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3840 return packFloatx80( zSign, 0, 0 ); 3841 } 3842 if ( bExp == 0 ) { 3843 if ( bSig == 0 ) { 3844 if ( ( aExp | aSig ) == 0 ) { 3845 invalid: 3846 float_raise( float_flag_invalid STATUS_VAR); 3847 z.low = floatx80_default_nan_low; 3848 z.high = floatx80_default_nan_high; 3849 return z; 3850 } 3851 float_raise( float_flag_divbyzero STATUS_VAR); 3852 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3853 } 3854 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3855 } 3856 if ( aExp == 0 ) { 3857 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3858 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3859 } 3860 zExp = aExp - bExp + 0x3FFE; 3861 rem1 = 0; 3862 if ( bSig <= aSig ) { 3863 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3864 ++zExp; 3865 } 3866 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3867 mul64To128( bSig, zSig0, &term0, &term1 ); 3868 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3869 while ( (sbits64) rem0 < 0 ) { 3870 --zSig0; 3871 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3872 } 3873 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3874 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3875 mul64To128( bSig, zSig1, &term1, &term2 ); 3876 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3877 while ( (sbits64) rem1 < 0 ) { 3878 --zSig1; 3879 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3880 } 3881 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3882 } 3883 return 3884 roundAndPackFloatx80( 3885 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3886 3887 } 3888 3889 /*---------------------------------------------------------------------------- 3890 | Returns the remainder of the extended double-precision floating-point value 3891 | `a' with respect to the corresponding value `b'. The operation is performed 3892 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3893 *----------------------------------------------------------------------------*/ 3894 3895 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 3896 { 3897 flag aSign, bSign, zSign; 3898 int32 aExp, bExp, expDiff; 3899 bits64 aSig0, aSig1, bSig; 3900 bits64 q, term0, term1, alternateASig0, alternateASig1; 3901 floatx80 z; 3902 3903 aSig0 = extractFloatx80Frac( a ); 3904 aExp = extractFloatx80Exp( a ); 3905 aSign = extractFloatx80Sign( a ); 3906 bSig = extractFloatx80Frac( b ); 3907 bExp = extractFloatx80Exp( b ); 3908 bSign = extractFloatx80Sign( b ); 3909 if ( aExp == 0x7FFF ) { 3910 if ( (bits64) ( aSig0<<1 ) 3911 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3912 return propagateFloatx80NaN( a, b STATUS_VAR ); 3913 } 3914 goto invalid; 3915 } 3916 if ( bExp == 0x7FFF ) { 3917 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3918 return a; 3919 } 3920 if ( bExp == 0 ) { 3921 if ( bSig == 0 ) { 3922 invalid: 3923 float_raise( float_flag_invalid STATUS_VAR); 3924 z.low = floatx80_default_nan_low; 3925 z.high = floatx80_default_nan_high; 3926 return z; 3927 } 3928 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3929 } 3930 if ( aExp == 0 ) { 3931 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3932 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3933 } 3934 bSig |= LIT64( 0x8000000000000000 ); 3935 zSign = aSign; 3936 expDiff = aExp - bExp; 3937 aSig1 = 0; 3938 if ( expDiff < 0 ) { 3939 if ( expDiff < -1 ) return a; 3940 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3941 expDiff = 0; 3942 } 3943 q = ( bSig <= aSig0 ); 3944 if ( q ) aSig0 -= bSig; 3945 expDiff -= 64; 3946 while ( 0 < expDiff ) { 3947 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3948 q = ( 2 < q ) ? q - 2 : 0; 3949 mul64To128( bSig, q, &term0, &term1 ); 3950 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3951 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3952 expDiff -= 62; 3953 } 3954 expDiff += 64; 3955 if ( 0 < expDiff ) { 3956 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3957 q = ( 2 < q ) ? q - 2 : 0; 3958 q >>= 64 - expDiff; 3959 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3960 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3961 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3962 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3963 ++q; 3964 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3965 } 3966 } 3967 else { 3968 term1 = 0; 3969 term0 = bSig; 3970 } 3971 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3972 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3973 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3974 && ( q & 1 ) ) 3975 ) { 3976 aSig0 = alternateASig0; 3977 aSig1 = alternateASig1; 3978 zSign = ! zSign; 3979 } 3980 return 3981 normalizeRoundAndPackFloatx80( 3982 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 3983 3984 } 3985 3986 /*---------------------------------------------------------------------------- 3987 | Returns the square root of the extended double-precision floating-point 3988 | value `a'. The operation is performed according to the IEC/IEEE Standard 3989 | for Binary Floating-Point Arithmetic. 3990 *----------------------------------------------------------------------------*/ 3991 3992 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 3993 { 3994 flag aSign; 3995 int32 aExp, zExp; 3996 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3997 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3998 floatx80 z; 3999 4000 aSig0 = extractFloatx80Frac( a ); 4001 aExp = extractFloatx80Exp( a ); 4002 aSign = extractFloatx80Sign( a ); 4003 if ( aExp == 0x7FFF ) { 4004 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 4005 if ( ! aSign ) return a; 4006 goto invalid; 4007 } 4008 if ( aSign ) { 4009 if ( ( aExp | aSig0 ) == 0 ) return a; 4010 invalid: 4011 float_raise( float_flag_invalid STATUS_VAR); 4012 z.low = floatx80_default_nan_low; 4013 z.high = floatx80_default_nan_high; 4014 return z; 4015 } 4016 if ( aExp == 0 ) { 4017 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4018 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4019 } 4020 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4021 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4022 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4023 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4024 doubleZSig0 = zSig0<<1; 4025 mul64To128( zSig0, zSig0, &term0, &term1 ); 4026 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4027 while ( (sbits64) rem0 < 0 ) { 4028 --zSig0; 4029 doubleZSig0 -= 2; 4030 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4031 } 4032 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4033 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4034 if ( zSig1 == 0 ) zSig1 = 1; 4035 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4036 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4037 mul64To128( zSig1, zSig1, &term2, &term3 ); 4038 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4039 while ( (sbits64) rem1 < 0 ) { 4040 --zSig1; 4041 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4042 term3 |= 1; 4043 term2 |= doubleZSig0; 4044 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4045 } 4046 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4047 } 4048 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4049 zSig0 |= doubleZSig0; 4050 return 4051 roundAndPackFloatx80( 4052 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 4053 4054 } 4055 4056 /*---------------------------------------------------------------------------- 4057 | Returns 1 if the extended double-precision floating-point value `a' is 4058 | equal to the corresponding value `b', and 0 otherwise. The comparison is 4059 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4060 | Arithmetic. 4061 *----------------------------------------------------------------------------*/ 4062 4063 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 4064 { 4065 4066 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4067 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4068 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4069 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4070 ) { 4071 if ( floatx80_is_signaling_nan( a ) 4072 || floatx80_is_signaling_nan( b ) ) { 4073 float_raise( float_flag_invalid STATUS_VAR); 4074 } 4075 return 0; 4076 } 4077 return 4078 ( a.low == b.low ) 4079 && ( ( a.high == b.high ) 4080 || ( ( a.low == 0 ) 4081 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4082 ); 4083 4084 } 4085 4086 /*---------------------------------------------------------------------------- 4087 | Returns 1 if the extended double-precision floating-point value `a' is 4088 | less than or equal to the corresponding value `b', and 0 otherwise. The 4089 | comparison is performed according to the IEC/IEEE Standard for Binary 4090 | Floating-Point Arithmetic. 4091 *----------------------------------------------------------------------------*/ 4092 4093 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 4094 { 4095 flag aSign, bSign; 4096 4097 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4098 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4099 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4100 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4101 ) { 4102 float_raise( float_flag_invalid STATUS_VAR); 4103 return 0; 4104 } 4105 aSign = extractFloatx80Sign( a ); 4106 bSign = extractFloatx80Sign( b ); 4107 if ( aSign != bSign ) { 4108 return 4109 aSign 4110 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4111 == 0 ); 4112 } 4113 return 4114 aSign ? le128( b.high, b.low, a.high, a.low ) 4115 : le128( a.high, a.low, b.high, b.low ); 4116 4117 } 4118 4119 /*---------------------------------------------------------------------------- 4120 | Returns 1 if the extended double-precision floating-point value `a' is 4121 | less than the corresponding value `b', and 0 otherwise. The comparison 4122 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4123 | Arithmetic. 4124 *----------------------------------------------------------------------------*/ 4125 4126 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 4127 { 4128 flag aSign, bSign; 4129 4130 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4131 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4132 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4133 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4134 ) { 4135 float_raise( float_flag_invalid STATUS_VAR); 4136 return 0; 4137 } 4138 aSign = extractFloatx80Sign( a ); 4139 bSign = extractFloatx80Sign( b ); 4140 if ( aSign != bSign ) { 4141 return 4142 aSign 4143 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4144 != 0 ); 4145 } 4146 return 4147 aSign ? lt128( b.high, b.low, a.high, a.low ) 4148 : lt128( a.high, a.low, b.high, b.low ); 4149 4150 } 4151 4152 /*---------------------------------------------------------------------------- 4153 | Returns 1 if the extended double-precision floating-point value `a' is equal 4154 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4155 | raised if either operand is a NaN. Otherwise, the comparison is performed 4156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4157 *----------------------------------------------------------------------------*/ 4158 4159 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM ) 4160 { 4161 4162 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4163 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4164 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4165 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4166 ) { 4167 float_raise( float_flag_invalid STATUS_VAR); 4168 return 0; 4169 } 4170 return 4171 ( a.low == b.low ) 4172 && ( ( a.high == b.high ) 4173 || ( ( a.low == 0 ) 4174 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4175 ); 4176 4177 } 4178 4179 /*---------------------------------------------------------------------------- 4180 | Returns 1 if the extended double-precision floating-point value `a' is less 4181 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4182 | do not cause an exception. Otherwise, the comparison is performed according 4183 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4184 *----------------------------------------------------------------------------*/ 4185 4186 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4187 { 4188 flag aSign, bSign; 4189 4190 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4191 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4192 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4193 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4194 ) { 4195 if ( floatx80_is_signaling_nan( a ) 4196 || floatx80_is_signaling_nan( b ) ) { 4197 float_raise( float_flag_invalid STATUS_VAR); 4198 } 4199 return 0; 4200 } 4201 aSign = extractFloatx80Sign( a ); 4202 bSign = extractFloatx80Sign( b ); 4203 if ( aSign != bSign ) { 4204 return 4205 aSign 4206 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4207 == 0 ); 4208 } 4209 return 4210 aSign ? le128( b.high, b.low, a.high, a.low ) 4211 : le128( a.high, a.low, b.high, b.low ); 4212 4213 } 4214 4215 /*---------------------------------------------------------------------------- 4216 | Returns 1 if the extended double-precision floating-point value `a' is less 4217 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4218 | an exception. Otherwise, the comparison is performed according to the 4219 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4220 *----------------------------------------------------------------------------*/ 4221 4222 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4223 { 4224 flag aSign, bSign; 4225 4226 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4227 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4228 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4229 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4230 ) { 4231 if ( floatx80_is_signaling_nan( a ) 4232 || floatx80_is_signaling_nan( b ) ) { 4233 float_raise( float_flag_invalid STATUS_VAR); 4234 } 4235 return 0; 4236 } 4237 aSign = extractFloatx80Sign( a ); 4238 bSign = extractFloatx80Sign( b ); 4239 if ( aSign != bSign ) { 4240 return 4241 aSign 4242 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4243 != 0 ); 4244 } 4245 return 4246 aSign ? lt128( b.high, b.low, a.high, a.low ) 4247 : lt128( a.high, a.low, b.high, b.low ); 4248 4249 } 4250 4251 #endif 4252 4253 #ifdef FLOAT128 4254 4255 /*---------------------------------------------------------------------------- 4256 | Returns the result of converting the quadruple-precision floating-point 4257 | value `a' to the 32-bit two's complement integer format. The conversion 4258 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4259 | Arithmetic---which means in particular that the conversion is rounded 4260 | according to the current rounding mode. If `a' is a NaN, the largest 4261 | positive integer is returned. Otherwise, if the conversion overflows, the 4262 | largest integer with the same sign as `a' is returned. 4263 *----------------------------------------------------------------------------*/ 4264 4265 int32 float128_to_int32( float128 a STATUS_PARAM ) 4266 { 4267 flag aSign; 4268 int32 aExp, shiftCount; 4269 bits64 aSig0, aSig1; 4270 4271 aSig1 = extractFloat128Frac1( a ); 4272 aSig0 = extractFloat128Frac0( a ); 4273 aExp = extractFloat128Exp( a ); 4274 aSign = extractFloat128Sign( a ); 4275 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4276 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4277 aSig0 |= ( aSig1 != 0 ); 4278 shiftCount = 0x4028 - aExp; 4279 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4280 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 4281 4282 } 4283 4284 /*---------------------------------------------------------------------------- 4285 | Returns the result of converting the quadruple-precision floating-point 4286 | value `a' to the 32-bit two's complement integer format. The conversion 4287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4288 | Arithmetic, except that the conversion is always rounded toward zero. If 4289 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4290 | conversion overflows, the largest integer with the same sign as `a' is 4291 | returned. 4292 *----------------------------------------------------------------------------*/ 4293 4294 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 4295 { 4296 flag aSign; 4297 int32 aExp, shiftCount; 4298 bits64 aSig0, aSig1, savedASig; 4299 int32 z; 4300 4301 aSig1 = extractFloat128Frac1( a ); 4302 aSig0 = extractFloat128Frac0( a ); 4303 aExp = extractFloat128Exp( a ); 4304 aSign = extractFloat128Sign( a ); 4305 aSig0 |= ( aSig1 != 0 ); 4306 if ( 0x401E < aExp ) { 4307 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4308 goto invalid; 4309 } 4310 else if ( aExp < 0x3FFF ) { 4311 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 4312 return 0; 4313 } 4314 aSig0 |= LIT64( 0x0001000000000000 ); 4315 shiftCount = 0x402F - aExp; 4316 savedASig = aSig0; 4317 aSig0 >>= shiftCount; 4318 z = aSig0; 4319 if ( aSign ) z = - z; 4320 if ( ( z < 0 ) ^ aSign ) { 4321 invalid: 4322 float_raise( float_flag_invalid STATUS_VAR); 4323 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4324 } 4325 if ( ( aSig0<<shiftCount ) != savedASig ) { 4326 STATUS(float_exception_flags) |= float_flag_inexact; 4327 } 4328 return z; 4329 4330 } 4331 4332 /*---------------------------------------------------------------------------- 4333 | Returns the result of converting the quadruple-precision floating-point 4334 | value `a' to the 64-bit two's complement integer format. The conversion 4335 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4336 | Arithmetic---which means in particular that the conversion is rounded 4337 | according to the current rounding mode. If `a' is a NaN, the largest 4338 | positive integer is returned. Otherwise, if the conversion overflows, the 4339 | largest integer with the same sign as `a' is returned. 4340 *----------------------------------------------------------------------------*/ 4341 4342 int64 float128_to_int64( float128 a STATUS_PARAM ) 4343 { 4344 flag aSign; 4345 int32 aExp, shiftCount; 4346 bits64 aSig0, aSig1; 4347 4348 aSig1 = extractFloat128Frac1( a ); 4349 aSig0 = extractFloat128Frac0( a ); 4350 aExp = extractFloat128Exp( a ); 4351 aSign = extractFloat128Sign( a ); 4352 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4353 shiftCount = 0x402F - aExp; 4354 if ( shiftCount <= 0 ) { 4355 if ( 0x403E < aExp ) { 4356 float_raise( float_flag_invalid STATUS_VAR); 4357 if ( ! aSign 4358 || ( ( aExp == 0x7FFF ) 4359 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4360 ) 4361 ) { 4362 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4363 } 4364 return (sbits64) LIT64( 0x8000000000000000 ); 4365 } 4366 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4367 } 4368 else { 4369 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4370 } 4371 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 4372 4373 } 4374 4375 /*---------------------------------------------------------------------------- 4376 | Returns the result of converting the quadruple-precision floating-point 4377 | value `a' to the 64-bit two's complement integer format. The conversion 4378 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4379 | Arithmetic, except that the conversion is always rounded toward zero. 4380 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4381 | the conversion overflows, the largest integer with the same sign as `a' is 4382 | returned. 4383 *----------------------------------------------------------------------------*/ 4384 4385 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 4386 { 4387 flag aSign; 4388 int32 aExp, shiftCount; 4389 bits64 aSig0, aSig1; 4390 int64 z; 4391 4392 aSig1 = extractFloat128Frac1( a ); 4393 aSig0 = extractFloat128Frac0( a ); 4394 aExp = extractFloat128Exp( a ); 4395 aSign = extractFloat128Sign( a ); 4396 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4397 shiftCount = aExp - 0x402F; 4398 if ( 0 < shiftCount ) { 4399 if ( 0x403E <= aExp ) { 4400 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4401 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4402 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4403 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 4404 } 4405 else { 4406 float_raise( float_flag_invalid STATUS_VAR); 4407 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4408 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4409 } 4410 } 4411 return (sbits64) LIT64( 0x8000000000000000 ); 4412 } 4413 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4414 if ( (bits64) ( aSig1<<shiftCount ) ) { 4415 STATUS(float_exception_flags) |= float_flag_inexact; 4416 } 4417 } 4418 else { 4419 if ( aExp < 0x3FFF ) { 4420 if ( aExp | aSig0 | aSig1 ) { 4421 STATUS(float_exception_flags) |= float_flag_inexact; 4422 } 4423 return 0; 4424 } 4425 z = aSig0>>( - shiftCount ); 4426 if ( aSig1 4427 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4428 STATUS(float_exception_flags) |= float_flag_inexact; 4429 } 4430 } 4431 if ( aSign ) z = - z; 4432 return z; 4433 4434 } 4435 4436 /*---------------------------------------------------------------------------- 4437 | Returns the result of converting the quadruple-precision floating-point 4438 | value `a' to the single-precision floating-point format. The conversion 4439 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4440 | Arithmetic. 4441 *----------------------------------------------------------------------------*/ 4442 4443 float32 float128_to_float32( float128 a STATUS_PARAM ) 4444 { 4445 flag aSign; 4446 int32 aExp; 4447 bits64 aSig0, aSig1; 4448 bits32 zSig; 4449 4450 aSig1 = extractFloat128Frac1( a ); 4451 aSig0 = extractFloat128Frac0( a ); 4452 aExp = extractFloat128Exp( a ); 4453 aSign = extractFloat128Sign( a ); 4454 if ( aExp == 0x7FFF ) { 4455 if ( aSig0 | aSig1 ) { 4456 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) ); 4457 } 4458 return packFloat32( aSign, 0xFF, 0 ); 4459 } 4460 aSig0 |= ( aSig1 != 0 ); 4461 shift64RightJamming( aSig0, 18, &aSig0 ); 4462 zSig = aSig0; 4463 if ( aExp || zSig ) { 4464 zSig |= 0x40000000; 4465 aExp -= 0x3F81; 4466 } 4467 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 4468 4469 } 4470 4471 /*---------------------------------------------------------------------------- 4472 | Returns the result of converting the quadruple-precision floating-point 4473 | value `a' to the double-precision floating-point format. The conversion 4474 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4475 | Arithmetic. 4476 *----------------------------------------------------------------------------*/ 4477 4478 float64 float128_to_float64( float128 a STATUS_PARAM ) 4479 { 4480 flag aSign; 4481 int32 aExp; 4482 bits64 aSig0, aSig1; 4483 4484 aSig1 = extractFloat128Frac1( a ); 4485 aSig0 = extractFloat128Frac0( a ); 4486 aExp = extractFloat128Exp( a ); 4487 aSign = extractFloat128Sign( a ); 4488 if ( aExp == 0x7FFF ) { 4489 if ( aSig0 | aSig1 ) { 4490 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) ); 4491 } 4492 return packFloat64( aSign, 0x7FF, 0 ); 4493 } 4494 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4495 aSig0 |= ( aSig1 != 0 ); 4496 if ( aExp || aSig0 ) { 4497 aSig0 |= LIT64( 0x4000000000000000 ); 4498 aExp -= 0x3C01; 4499 } 4500 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 4501 4502 } 4503 4504 #ifdef FLOATX80 4505 4506 /*---------------------------------------------------------------------------- 4507 | Returns the result of converting the quadruple-precision floating-point 4508 | value `a' to the extended double-precision floating-point format. The 4509 | conversion is performed according to the IEC/IEEE Standard for Binary 4510 | Floating-Point Arithmetic. 4511 *----------------------------------------------------------------------------*/ 4512 4513 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 4514 { 4515 flag aSign; 4516 int32 aExp; 4517 bits64 aSig0, aSig1; 4518 4519 aSig1 = extractFloat128Frac1( a ); 4520 aSig0 = extractFloat128Frac0( a ); 4521 aExp = extractFloat128Exp( a ); 4522 aSign = extractFloat128Sign( a ); 4523 if ( aExp == 0x7FFF ) { 4524 if ( aSig0 | aSig1 ) { 4525 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) ); 4526 } 4527 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4528 } 4529 if ( aExp == 0 ) { 4530 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4531 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4532 } 4533 else { 4534 aSig0 |= LIT64( 0x0001000000000000 ); 4535 } 4536 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4537 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 4538 4539 } 4540 4541 #endif 4542 4543 /*---------------------------------------------------------------------------- 4544 | Rounds the quadruple-precision floating-point value `a' to an integer, and 4545 | returns the result as a quadruple-precision floating-point value. The 4546 | operation is performed according to the IEC/IEEE Standard for Binary 4547 | Floating-Point Arithmetic. 4548 *----------------------------------------------------------------------------*/ 4549 4550 float128 float128_round_to_int( float128 a STATUS_PARAM ) 4551 { 4552 flag aSign; 4553 int32 aExp; 4554 bits64 lastBitMask, roundBitsMask; 4555 int8 roundingMode; 4556 float128 z; 4557 4558 aExp = extractFloat128Exp( a ); 4559 if ( 0x402F <= aExp ) { 4560 if ( 0x406F <= aExp ) { 4561 if ( ( aExp == 0x7FFF ) 4562 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4563 ) { 4564 return propagateFloat128NaN( a, a STATUS_VAR ); 4565 } 4566 return a; 4567 } 4568 lastBitMask = 1; 4569 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4570 roundBitsMask = lastBitMask - 1; 4571 z = a; 4572 roundingMode = STATUS(float_rounding_mode); 4573 if ( roundingMode == float_round_nearest_even ) { 4574 if ( lastBitMask ) { 4575 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4576 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4577 } 4578 else { 4579 if ( (sbits64) z.low < 0 ) { 4580 ++z.high; 4581 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4582 } 4583 } 4584 } 4585 else if ( roundingMode != float_round_to_zero ) { 4586 if ( extractFloat128Sign( z ) 4587 ^ ( roundingMode == float_round_up ) ) { 4588 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4589 } 4590 } 4591 z.low &= ~ roundBitsMask; 4592 } 4593 else { 4594 if ( aExp < 0x3FFF ) { 4595 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4596 STATUS(float_exception_flags) |= float_flag_inexact; 4597 aSign = extractFloat128Sign( a ); 4598 switch ( STATUS(float_rounding_mode) ) { 4599 case float_round_nearest_even: 4600 if ( ( aExp == 0x3FFE ) 4601 && ( extractFloat128Frac0( a ) 4602 | extractFloat128Frac1( a ) ) 4603 ) { 4604 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4605 } 4606 break; 4607 case float_round_down: 4608 return 4609 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4610 : packFloat128( 0, 0, 0, 0 ); 4611 case float_round_up: 4612 return 4613 aSign ? packFloat128( 1, 0, 0, 0 ) 4614 : packFloat128( 0, 0x3FFF, 0, 0 ); 4615 } 4616 return packFloat128( aSign, 0, 0, 0 ); 4617 } 4618 lastBitMask = 1; 4619 lastBitMask <<= 0x402F - aExp; 4620 roundBitsMask = lastBitMask - 1; 4621 z.low = 0; 4622 z.high = a.high; 4623 roundingMode = STATUS(float_rounding_mode); 4624 if ( roundingMode == float_round_nearest_even ) { 4625 z.high += lastBitMask>>1; 4626 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4627 z.high &= ~ lastBitMask; 4628 } 4629 } 4630 else if ( roundingMode != float_round_to_zero ) { 4631 if ( extractFloat128Sign( z ) 4632 ^ ( roundingMode == float_round_up ) ) { 4633 z.high |= ( a.low != 0 ); 4634 z.high += roundBitsMask; 4635 } 4636 } 4637 z.high &= ~ roundBitsMask; 4638 } 4639 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4640 STATUS(float_exception_flags) |= float_flag_inexact; 4641 } 4642 return z; 4643 4644 } 4645 4646 /*---------------------------------------------------------------------------- 4647 | Returns the result of adding the absolute values of the quadruple-precision 4648 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4649 | before being returned. `zSign' is ignored if the result is a NaN. 4650 | The addition is performed according to the IEC/IEEE Standard for Binary 4651 | Floating-Point Arithmetic. 4652 *----------------------------------------------------------------------------*/ 4653 4654 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 4655 { 4656 int32 aExp, bExp, zExp; 4657 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4658 int32 expDiff; 4659 4660 aSig1 = extractFloat128Frac1( a ); 4661 aSig0 = extractFloat128Frac0( a ); 4662 aExp = extractFloat128Exp( a ); 4663 bSig1 = extractFloat128Frac1( b ); 4664 bSig0 = extractFloat128Frac0( b ); 4665 bExp = extractFloat128Exp( b ); 4666 expDiff = aExp - bExp; 4667 if ( 0 < expDiff ) { 4668 if ( aExp == 0x7FFF ) { 4669 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4670 return a; 4671 } 4672 if ( bExp == 0 ) { 4673 --expDiff; 4674 } 4675 else { 4676 bSig0 |= LIT64( 0x0001000000000000 ); 4677 } 4678 shift128ExtraRightJamming( 4679 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4680 zExp = aExp; 4681 } 4682 else if ( expDiff < 0 ) { 4683 if ( bExp == 0x7FFF ) { 4684 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4685 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4686 } 4687 if ( aExp == 0 ) { 4688 ++expDiff; 4689 } 4690 else { 4691 aSig0 |= LIT64( 0x0001000000000000 ); 4692 } 4693 shift128ExtraRightJamming( 4694 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4695 zExp = bExp; 4696 } 4697 else { 4698 if ( aExp == 0x7FFF ) { 4699 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4700 return propagateFloat128NaN( a, b STATUS_VAR ); 4701 } 4702 return a; 4703 } 4704 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4705 if ( aExp == 0 ) { 4706 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); 4707 return packFloat128( zSign, 0, zSig0, zSig1 ); 4708 } 4709 zSig2 = 0; 4710 zSig0 |= LIT64( 0x0002000000000000 ); 4711 zExp = aExp; 4712 goto shiftRight1; 4713 } 4714 aSig0 |= LIT64( 0x0001000000000000 ); 4715 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4716 --zExp; 4717 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4718 ++zExp; 4719 shiftRight1: 4720 shift128ExtraRightJamming( 4721 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4722 roundAndPack: 4723 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 4724 4725 } 4726 4727 /*---------------------------------------------------------------------------- 4728 | Returns the result of subtracting the absolute values of the quadruple- 4729 | precision floating-point values `a' and `b'. If `zSign' is 1, the 4730 | difference is negated before being returned. `zSign' is ignored if the 4731 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4732 | Standard for Binary Floating-Point Arithmetic. 4733 *----------------------------------------------------------------------------*/ 4734 4735 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 4736 { 4737 int32 aExp, bExp, zExp; 4738 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4739 int32 expDiff; 4740 float128 z; 4741 4742 aSig1 = extractFloat128Frac1( a ); 4743 aSig0 = extractFloat128Frac0( a ); 4744 aExp = extractFloat128Exp( a ); 4745 bSig1 = extractFloat128Frac1( b ); 4746 bSig0 = extractFloat128Frac0( b ); 4747 bExp = extractFloat128Exp( b ); 4748 expDiff = aExp - bExp; 4749 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4750 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4751 if ( 0 < expDiff ) goto aExpBigger; 4752 if ( expDiff < 0 ) goto bExpBigger; 4753 if ( aExp == 0x7FFF ) { 4754 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4755 return propagateFloat128NaN( a, b STATUS_VAR ); 4756 } 4757 float_raise( float_flag_invalid STATUS_VAR); 4758 z.low = float128_default_nan_low; 4759 z.high = float128_default_nan_high; 4760 return z; 4761 } 4762 if ( aExp == 0 ) { 4763 aExp = 1; 4764 bExp = 1; 4765 } 4766 if ( bSig0 < aSig0 ) goto aBigger; 4767 if ( aSig0 < bSig0 ) goto bBigger; 4768 if ( bSig1 < aSig1 ) goto aBigger; 4769 if ( aSig1 < bSig1 ) goto bBigger; 4770 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 4771 bExpBigger: 4772 if ( bExp == 0x7FFF ) { 4773 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4774 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4775 } 4776 if ( aExp == 0 ) { 4777 ++expDiff; 4778 } 4779 else { 4780 aSig0 |= LIT64( 0x4000000000000000 ); 4781 } 4782 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4783 bSig0 |= LIT64( 0x4000000000000000 ); 4784 bBigger: 4785 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4786 zExp = bExp; 4787 zSign ^= 1; 4788 goto normalizeRoundAndPack; 4789 aExpBigger: 4790 if ( aExp == 0x7FFF ) { 4791 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4792 return a; 4793 } 4794 if ( bExp == 0 ) { 4795 --expDiff; 4796 } 4797 else { 4798 bSig0 |= LIT64( 0x4000000000000000 ); 4799 } 4800 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4801 aSig0 |= LIT64( 0x4000000000000000 ); 4802 aBigger: 4803 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4804 zExp = aExp; 4805 normalizeRoundAndPack: 4806 --zExp; 4807 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 4808 4809 } 4810 4811 /*---------------------------------------------------------------------------- 4812 | Returns the result of adding the quadruple-precision floating-point values 4813 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4814 | for Binary Floating-Point Arithmetic. 4815 *----------------------------------------------------------------------------*/ 4816 4817 float128 float128_add( float128 a, float128 b STATUS_PARAM ) 4818 { 4819 flag aSign, bSign; 4820 4821 aSign = extractFloat128Sign( a ); 4822 bSign = extractFloat128Sign( b ); 4823 if ( aSign == bSign ) { 4824 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 4825 } 4826 else { 4827 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 4828 } 4829 4830 } 4831 4832 /*---------------------------------------------------------------------------- 4833 | Returns the result of subtracting the quadruple-precision floating-point 4834 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4835 | Standard for Binary Floating-Point Arithmetic. 4836 *----------------------------------------------------------------------------*/ 4837 4838 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 4839 { 4840 flag aSign, bSign; 4841 4842 aSign = extractFloat128Sign( a ); 4843 bSign = extractFloat128Sign( b ); 4844 if ( aSign == bSign ) { 4845 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 4846 } 4847 else { 4848 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 4849 } 4850 4851 } 4852 4853 /*---------------------------------------------------------------------------- 4854 | Returns the result of multiplying the quadruple-precision floating-point 4855 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4856 | Standard for Binary Floating-Point Arithmetic. 4857 *----------------------------------------------------------------------------*/ 4858 4859 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 4860 { 4861 flag aSign, bSign, zSign; 4862 int32 aExp, bExp, zExp; 4863 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4864 float128 z; 4865 4866 aSig1 = extractFloat128Frac1( a ); 4867 aSig0 = extractFloat128Frac0( a ); 4868 aExp = extractFloat128Exp( a ); 4869 aSign = extractFloat128Sign( a ); 4870 bSig1 = extractFloat128Frac1( b ); 4871 bSig0 = extractFloat128Frac0( b ); 4872 bExp = extractFloat128Exp( b ); 4873 bSign = extractFloat128Sign( b ); 4874 zSign = aSign ^ bSign; 4875 if ( aExp == 0x7FFF ) { 4876 if ( ( aSig0 | aSig1 ) 4877 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4878 return propagateFloat128NaN( a, b STATUS_VAR ); 4879 } 4880 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4881 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4882 } 4883 if ( bExp == 0x7FFF ) { 4884 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4885 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4886 invalid: 4887 float_raise( float_flag_invalid STATUS_VAR); 4888 z.low = float128_default_nan_low; 4889 z.high = float128_default_nan_high; 4890 return z; 4891 } 4892 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4893 } 4894 if ( aExp == 0 ) { 4895 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4896 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4897 } 4898 if ( bExp == 0 ) { 4899 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4900 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4901 } 4902 zExp = aExp + bExp - 0x4000; 4903 aSig0 |= LIT64( 0x0001000000000000 ); 4904 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4905 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4906 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4907 zSig2 |= ( zSig3 != 0 ); 4908 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4909 shift128ExtraRightJamming( 4910 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4911 ++zExp; 4912 } 4913 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 4914 4915 } 4916 4917 /*---------------------------------------------------------------------------- 4918 | Returns the result of dividing the quadruple-precision floating-point value 4919 | `a' by the corresponding value `b'. The operation is performed according to 4920 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4921 *----------------------------------------------------------------------------*/ 4922 4923 float128 float128_div( float128 a, float128 b STATUS_PARAM ) 4924 { 4925 flag aSign, bSign, zSign; 4926 int32 aExp, bExp, zExp; 4927 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4928 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4929 float128 z; 4930 4931 aSig1 = extractFloat128Frac1( a ); 4932 aSig0 = extractFloat128Frac0( a ); 4933 aExp = extractFloat128Exp( a ); 4934 aSign = extractFloat128Sign( a ); 4935 bSig1 = extractFloat128Frac1( b ); 4936 bSig0 = extractFloat128Frac0( b ); 4937 bExp = extractFloat128Exp( b ); 4938 bSign = extractFloat128Sign( b ); 4939 zSign = aSign ^ bSign; 4940 if ( aExp == 0x7FFF ) { 4941 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4942 if ( bExp == 0x7FFF ) { 4943 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4944 goto invalid; 4945 } 4946 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4947 } 4948 if ( bExp == 0x7FFF ) { 4949 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4950 return packFloat128( zSign, 0, 0, 0 ); 4951 } 4952 if ( bExp == 0 ) { 4953 if ( ( bSig0 | bSig1 ) == 0 ) { 4954 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4955 invalid: 4956 float_raise( float_flag_invalid STATUS_VAR); 4957 z.low = float128_default_nan_low; 4958 z.high = float128_default_nan_high; 4959 return z; 4960 } 4961 float_raise( float_flag_divbyzero STATUS_VAR); 4962 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4963 } 4964 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4965 } 4966 if ( aExp == 0 ) { 4967 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4968 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4969 } 4970 zExp = aExp - bExp + 0x3FFD; 4971 shortShift128Left( 4972 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4973 shortShift128Left( 4974 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4975 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4976 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4977 ++zExp; 4978 } 4979 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4980 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4981 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4982 while ( (sbits64) rem0 < 0 ) { 4983 --zSig0; 4984 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4985 } 4986 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 4987 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 4988 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 4989 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 4990 while ( (sbits64) rem1 < 0 ) { 4991 --zSig1; 4992 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 4993 } 4994 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4995 } 4996 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 4997 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 4998 4999 } 5000 5001 /*---------------------------------------------------------------------------- 5002 | Returns the remainder of the quadruple-precision floating-point value `a' 5003 | with respect to the corresponding value `b'. The operation is performed 5004 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5005 *----------------------------------------------------------------------------*/ 5006 5007 float128 float128_rem( float128 a, float128 b STATUS_PARAM ) 5008 { 5009 flag aSign, bSign, zSign; 5010 int32 aExp, bExp, expDiff; 5011 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5012 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5013 sbits64 sigMean0; 5014 float128 z; 5015 5016 aSig1 = extractFloat128Frac1( a ); 5017 aSig0 = extractFloat128Frac0( a ); 5018 aExp = extractFloat128Exp( a ); 5019 aSign = extractFloat128Sign( a ); 5020 bSig1 = extractFloat128Frac1( b ); 5021 bSig0 = extractFloat128Frac0( b ); 5022 bExp = extractFloat128Exp( b ); 5023 bSign = extractFloat128Sign( b ); 5024 if ( aExp == 0x7FFF ) { 5025 if ( ( aSig0 | aSig1 ) 5026 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5027 return propagateFloat128NaN( a, b STATUS_VAR ); 5028 } 5029 goto invalid; 5030 } 5031 if ( bExp == 0x7FFF ) { 5032 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5033 return a; 5034 } 5035 if ( bExp == 0 ) { 5036 if ( ( bSig0 | bSig1 ) == 0 ) { 5037 invalid: 5038 float_raise( float_flag_invalid STATUS_VAR); 5039 z.low = float128_default_nan_low; 5040 z.high = float128_default_nan_high; 5041 return z; 5042 } 5043 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5044 } 5045 if ( aExp == 0 ) { 5046 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5047 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5048 } 5049 expDiff = aExp - bExp; 5050 if ( expDiff < -1 ) return a; 5051 shortShift128Left( 5052 aSig0 | LIT64( 0x0001000000000000 ), 5053 aSig1, 5054 15 - ( expDiff < 0 ), 5055 &aSig0, 5056 &aSig1 5057 ); 5058 shortShift128Left( 5059 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5060 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5061 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5062 expDiff -= 64; 5063 while ( 0 < expDiff ) { 5064 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5065 q = ( 4 < q ) ? q - 4 : 0; 5066 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5067 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5068 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5069 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5070 expDiff -= 61; 5071 } 5072 if ( -64 < expDiff ) { 5073 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5074 q = ( 4 < q ) ? q - 4 : 0; 5075 q >>= - expDiff; 5076 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5077 expDiff += 52; 5078 if ( expDiff < 0 ) { 5079 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5080 } 5081 else { 5082 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5083 } 5084 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5085 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5086 } 5087 else { 5088 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5089 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5090 } 5091 do { 5092 alternateASig0 = aSig0; 5093 alternateASig1 = aSig1; 5094 ++q; 5095 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5096 } while ( 0 <= (sbits64) aSig0 ); 5097 add128( 5098 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5099 if ( ( sigMean0 < 0 ) 5100 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5101 aSig0 = alternateASig0; 5102 aSig1 = alternateASig1; 5103 } 5104 zSign = ( (sbits64) aSig0 < 0 ); 5105 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5106 return 5107 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); 5108 5109 } 5110 5111 /*---------------------------------------------------------------------------- 5112 | Returns the square root of the quadruple-precision floating-point value `a'. 5113 | The operation is performed according to the IEC/IEEE Standard for Binary 5114 | Floating-Point Arithmetic. 5115 *----------------------------------------------------------------------------*/ 5116 5117 float128 float128_sqrt( float128 a STATUS_PARAM ) 5118 { 5119 flag aSign; 5120 int32 aExp, zExp; 5121 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5122 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5123 float128 z; 5124 5125 aSig1 = extractFloat128Frac1( a ); 5126 aSig0 = extractFloat128Frac0( a ); 5127 aExp = extractFloat128Exp( a ); 5128 aSign = extractFloat128Sign( a ); 5129 if ( aExp == 0x7FFF ) { 5130 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); 5131 if ( ! aSign ) return a; 5132 goto invalid; 5133 } 5134 if ( aSign ) { 5135 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5136 invalid: 5137 float_raise( float_flag_invalid STATUS_VAR); 5138 z.low = float128_default_nan_low; 5139 z.high = float128_default_nan_high; 5140 return z; 5141 } 5142 if ( aExp == 0 ) { 5143 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5144 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5145 } 5146 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5147 aSig0 |= LIT64( 0x0001000000000000 ); 5148 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5149 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5150 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5151 doubleZSig0 = zSig0<<1; 5152 mul64To128( zSig0, zSig0, &term0, &term1 ); 5153 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5154 while ( (sbits64) rem0 < 0 ) { 5155 --zSig0; 5156 doubleZSig0 -= 2; 5157 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5158 } 5159 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5160 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5161 if ( zSig1 == 0 ) zSig1 = 1; 5162 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5163 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5164 mul64To128( zSig1, zSig1, &term2, &term3 ); 5165 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5166 while ( (sbits64) rem1 < 0 ) { 5167 --zSig1; 5168 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5169 term3 |= 1; 5170 term2 |= doubleZSig0; 5171 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5172 } 5173 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5174 } 5175 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5176 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5177 5178 } 5179 5180 /*---------------------------------------------------------------------------- 5181 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5182 | the corresponding value `b', and 0 otherwise. The comparison is performed 5183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5184 *----------------------------------------------------------------------------*/ 5185 5186 int float128_eq( float128 a, float128 b STATUS_PARAM ) 5187 { 5188 5189 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5190 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5191 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5192 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5193 ) { 5194 if ( float128_is_signaling_nan( a ) 5195 || float128_is_signaling_nan( b ) ) { 5196 float_raise( float_flag_invalid STATUS_VAR); 5197 } 5198 return 0; 5199 } 5200 return 5201 ( a.low == b.low ) 5202 && ( ( a.high == b.high ) 5203 || ( ( a.low == 0 ) 5204 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5205 ); 5206 5207 } 5208 5209 /*---------------------------------------------------------------------------- 5210 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5211 | or equal to the corresponding value `b', and 0 otherwise. The comparison 5212 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5213 | Arithmetic. 5214 *----------------------------------------------------------------------------*/ 5215 5216 int float128_le( float128 a, float128 b STATUS_PARAM ) 5217 { 5218 flag aSign, bSign; 5219 5220 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5221 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5222 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5223 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5224 ) { 5225 float_raise( float_flag_invalid STATUS_VAR); 5226 return 0; 5227 } 5228 aSign = extractFloat128Sign( a ); 5229 bSign = extractFloat128Sign( b ); 5230 if ( aSign != bSign ) { 5231 return 5232 aSign 5233 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5234 == 0 ); 5235 } 5236 return 5237 aSign ? le128( b.high, b.low, a.high, a.low ) 5238 : le128( a.high, a.low, b.high, b.low ); 5239 5240 } 5241 5242 /*---------------------------------------------------------------------------- 5243 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5244 | the corresponding value `b', and 0 otherwise. The comparison is performed 5245 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5246 *----------------------------------------------------------------------------*/ 5247 5248 int float128_lt( float128 a, float128 b STATUS_PARAM ) 5249 { 5250 flag aSign, bSign; 5251 5252 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5253 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5254 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5255 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5256 ) { 5257 float_raise( float_flag_invalid STATUS_VAR); 5258 return 0; 5259 } 5260 aSign = extractFloat128Sign( a ); 5261 bSign = extractFloat128Sign( b ); 5262 if ( aSign != bSign ) { 5263 return 5264 aSign 5265 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5266 != 0 ); 5267 } 5268 return 5269 aSign ? lt128( b.high, b.low, a.high, a.low ) 5270 : lt128( a.high, a.low, b.high, b.low ); 5271 5272 } 5273 5274 /*---------------------------------------------------------------------------- 5275 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5276 | the corresponding value `b', and 0 otherwise. The invalid exception is 5277 | raised if either operand is a NaN. Otherwise, the comparison is performed 5278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5279 *----------------------------------------------------------------------------*/ 5280 5281 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM ) 5282 { 5283 5284 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5285 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5286 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5287 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5288 ) { 5289 float_raise( float_flag_invalid STATUS_VAR); 5290 return 0; 5291 } 5292 return 5293 ( a.low == b.low ) 5294 && ( ( a.high == b.high ) 5295 || ( ( a.low == 0 ) 5296 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5297 ); 5298 5299 } 5300 5301 /*---------------------------------------------------------------------------- 5302 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5303 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5304 | cause an exception. Otherwise, the comparison is performed according to the 5305 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5306 *----------------------------------------------------------------------------*/ 5307 5308 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 5309 { 5310 flag aSign, bSign; 5311 5312 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5313 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5314 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5315 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5316 ) { 5317 if ( float128_is_signaling_nan( a ) 5318 || float128_is_signaling_nan( b ) ) { 5319 float_raise( float_flag_invalid STATUS_VAR); 5320 } 5321 return 0; 5322 } 5323 aSign = extractFloat128Sign( a ); 5324 bSign = extractFloat128Sign( b ); 5325 if ( aSign != bSign ) { 5326 return 5327 aSign 5328 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5329 == 0 ); 5330 } 5331 return 5332 aSign ? le128( b.high, b.low, a.high, a.low ) 5333 : le128( a.high, a.low, b.high, b.low ); 5334 5335 } 5336 5337 /*---------------------------------------------------------------------------- 5338 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5339 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5340 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5341 | Standard for Binary Floating-Point Arithmetic. 5342 *----------------------------------------------------------------------------*/ 5343 5344 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 5345 { 5346 flag aSign, bSign; 5347 5348 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5349 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5350 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5351 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5352 ) { 5353 if ( float128_is_signaling_nan( a ) 5354 || float128_is_signaling_nan( b ) ) { 5355 float_raise( float_flag_invalid STATUS_VAR); 5356 } 5357 return 0; 5358 } 5359 aSign = extractFloat128Sign( a ); 5360 bSign = extractFloat128Sign( b ); 5361 if ( aSign != bSign ) { 5362 return 5363 aSign 5364 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5365 != 0 ); 5366 } 5367 return 5368 aSign ? lt128( b.high, b.low, a.high, a.low ) 5369 : lt128( a.high, a.low, b.high, b.low ); 5370 5371 } 5372 5373 #endif 5374 5375 /* misc functions */ 5376 float32 uint32_to_float32( unsigned int a STATUS_PARAM ) 5377 { 5378 return int64_to_float32(a STATUS_VAR); 5379 } 5380 5381 float64 uint32_to_float64( unsigned int a STATUS_PARAM ) 5382 { 5383 return int64_to_float64(a STATUS_VAR); 5384 } 5385 5386 unsigned int float32_to_uint32( float32 a STATUS_PARAM ) 5387 { 5388 int64_t v; 5389 unsigned int res; 5390 5391 v = float32_to_int64(a STATUS_VAR); 5392 if (v < 0) { 5393 res = 0; 5394 float_raise( float_flag_invalid STATUS_VAR); 5395 } else if (v > 0xffffffff) { 5396 res = 0xffffffff; 5397 float_raise( float_flag_invalid STATUS_VAR); 5398 } else { 5399 res = v; 5400 } 5401 return res; 5402 } 5403 5404 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 5405 { 5406 int64_t v; 5407 unsigned int res; 5408 5409 v = float32_to_int64_round_to_zero(a STATUS_VAR); 5410 if (v < 0) { 5411 res = 0; 5412 float_raise( float_flag_invalid STATUS_VAR); 5413 } else if (v > 0xffffffff) { 5414 res = 0xffffffff; 5415 float_raise( float_flag_invalid STATUS_VAR); 5416 } else { 5417 res = v; 5418 } 5419 return res; 5420 } 5421 5422 unsigned int float64_to_uint32( float64 a STATUS_PARAM ) 5423 { 5424 int64_t v; 5425 unsigned int res; 5426 5427 v = float64_to_int64(a STATUS_VAR); 5428 if (v < 0) { 5429 res = 0; 5430 float_raise( float_flag_invalid STATUS_VAR); 5431 } else if (v > 0xffffffff) { 5432 res = 0xffffffff; 5433 float_raise( float_flag_invalid STATUS_VAR); 5434 } else { 5435 res = v; 5436 } 5437 return res; 5438 } 5439 5440 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 5441 { 5442 int64_t v; 5443 unsigned int res; 5444 5445 v = float64_to_int64_round_to_zero(a STATUS_VAR); 5446 if (v < 0) { 5447 res = 0; 5448 float_raise( float_flag_invalid STATUS_VAR); 5449 } else if (v > 0xffffffff) { 5450 res = 0xffffffff; 5451 float_raise( float_flag_invalid STATUS_VAR); 5452 } else { 5453 res = v; 5454 } 5455 return res; 5456 } 5457 5458 /* FIXME: This looks broken. */ 5459 uint64_t float64_to_uint64 (float64 a STATUS_PARAM) 5460 { 5461 int64_t v; 5462 5463 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5464 v += float64_val(a); 5465 v = float64_to_int64(make_float64(v) STATUS_VAR); 5466 5467 return v - INT64_MIN; 5468 } 5469 5470 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 5471 { 5472 int64_t v; 5473 5474 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5475 v += float64_val(a); 5476 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); 5477 5478 return v - INT64_MIN; 5479 } 5480 5481 #define COMPARE(s, nan_exp) \ 5482 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ 5483 int is_quiet STATUS_PARAM ) \ 5484 { \ 5485 flag aSign, bSign; \ 5486 bits ## s av, bv; \ 5487 \ 5488 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 5489 extractFloat ## s ## Frac( a ) ) || \ 5490 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 5491 extractFloat ## s ## Frac( b ) )) { \ 5492 if (!is_quiet || \ 5493 float ## s ## _is_signaling_nan( a ) || \ 5494 float ## s ## _is_signaling_nan( b ) ) { \ 5495 float_raise( float_flag_invalid STATUS_VAR); \ 5496 } \ 5497 return float_relation_unordered; \ 5498 } \ 5499 aSign = extractFloat ## s ## Sign( a ); \ 5500 bSign = extractFloat ## s ## Sign( b ); \ 5501 av = float ## s ## _val(a); \ 5502 bv = float ## s ## _val(b); \ 5503 if ( aSign != bSign ) { \ 5504 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \ 5505 /* zero case */ \ 5506 return float_relation_equal; \ 5507 } else { \ 5508 return 1 - (2 * aSign); \ 5509 } \ 5510 } else { \ 5511 if (av == bv) { \ 5512 return float_relation_equal; \ 5513 } else { \ 5514 return 1 - 2 * (aSign ^ ( av < bv )); \ 5515 } \ 5516 } \ 5517 } \ 5518 \ 5519 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 5520 { \ 5521 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 5522 } \ 5523 \ 5524 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 5525 { \ 5526 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 5527 } 5528 5529 COMPARE(32, 0xff) 5530 COMPARE(64, 0x7ff) 5531 5532 INLINE int float128_compare_internal( float128 a, float128 b, 5533 int is_quiet STATUS_PARAM ) 5534 { 5535 flag aSign, bSign; 5536 5537 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 5538 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 5539 ( ( extractFloat128Exp( b ) == 0x7fff ) && 5540 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 5541 if (!is_quiet || 5542 float128_is_signaling_nan( a ) || 5543 float128_is_signaling_nan( b ) ) { 5544 float_raise( float_flag_invalid STATUS_VAR); 5545 } 5546 return float_relation_unordered; 5547 } 5548 aSign = extractFloat128Sign( a ); 5549 bSign = extractFloat128Sign( b ); 5550 if ( aSign != bSign ) { 5551 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 5552 /* zero case */ 5553 return float_relation_equal; 5554 } else { 5555 return 1 - (2 * aSign); 5556 } 5557 } else { 5558 if (a.low == b.low && a.high == b.high) { 5559 return float_relation_equal; 5560 } else { 5561 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 5562 } 5563 } 5564 } 5565 5566 int float128_compare( float128 a, float128 b STATUS_PARAM ) 5567 { 5568 return float128_compare_internal(a, b, 0 STATUS_VAR); 5569 } 5570 5571 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 5572 { 5573 return float128_compare_internal(a, b, 1 STATUS_VAR); 5574 } 5575 5576 /* Multiply A by 2 raised to the power N. */ 5577 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 5578 { 5579 flag aSign; 5580 int16 aExp; 5581 bits32 aSig; 5582 5583 aSig = extractFloat32Frac( a ); 5584 aExp = extractFloat32Exp( a ); 5585 aSign = extractFloat32Sign( a ); 5586 5587 if ( aExp == 0xFF ) { 5588 return a; 5589 } 5590 if ( aExp != 0 ) 5591 aSig |= 0x00800000; 5592 else if ( aSig == 0 ) 5593 return a; 5594 5595 aExp += n - 1; 5596 aSig <<= 7; 5597 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 5598 } 5599 5600 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 5601 { 5602 flag aSign; 5603 int16 aExp; 5604 bits64 aSig; 5605 5606 aSig = extractFloat64Frac( a ); 5607 aExp = extractFloat64Exp( a ); 5608 aSign = extractFloat64Sign( a ); 5609 5610 if ( aExp == 0x7FF ) { 5611 return a; 5612 } 5613 if ( aExp != 0 ) 5614 aSig |= LIT64( 0x0010000000000000 ); 5615 else if ( aSig == 0 ) 5616 return a; 5617 5618 aExp += n - 1; 5619 aSig <<= 10; 5620 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 5621 } 5622 5623 #ifdef FLOATX80 5624 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 5625 { 5626 flag aSign; 5627 int16 aExp; 5628 bits64 aSig; 5629 5630 aSig = extractFloatx80Frac( a ); 5631 aExp = extractFloatx80Exp( a ); 5632 aSign = extractFloatx80Sign( a ); 5633 5634 if ( aExp == 0x7FF ) { 5635 return a; 5636 } 5637 if (aExp == 0 && aSig == 0) 5638 return a; 5639 5640 aExp += n; 5641 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), 5642 aSign, aExp, aSig, 0 STATUS_VAR ); 5643 } 5644 #endif 5645 5646 #ifdef FLOAT128 5647 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 5648 { 5649 flag aSign; 5650 int32 aExp; 5651 bits64 aSig0, aSig1; 5652 5653 aSig1 = extractFloat128Frac1( a ); 5654 aSig0 = extractFloat128Frac0( a ); 5655 aExp = extractFloat128Exp( a ); 5656 aSign = extractFloat128Sign( a ); 5657 if ( aExp == 0x7FFF ) { 5658 return a; 5659 } 5660 if ( aExp != 0 ) 5661 aSig0 |= LIT64( 0x0001000000000000 ); 5662 else if ( aSig0 == 0 && aSig1 == 0 ) 5663 return a; 5664 5665 aExp += n - 1; 5666 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 5667 STATUS_VAR ); 5668 5669 } 5670 #endif 5671