1 /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 
43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 #if defined(LIBC_SCCS) && !defined(lint) 49 __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $"); 50 #endif /* LIBC_SCCS and not lint */ 51 52 #ifdef SOFTFLOAT_FOR_GCC 53 #include "softfloat-for-gcc.h" 54 #endif 55 56 #include "milieu.h" 57 #include "softfloat.h" 58 59 /* 60 * Conversions between floats as stored in memory and floats as 61 * SoftFloat uses them 62 */ 63 #ifndef FLOAT64_DEMANGLE 64 #define FLOAT64_DEMANGLE(a) (a) 65 #endif 66 #ifndef FLOAT64_MANGLE 67 #define FLOAT64_MANGLE(a) (a) 68 #endif 69 70 /* 71 ------------------------------------------------------------------------------- 72 Floating-point rounding mode, extended double-precision rounding precision, 73 and exception flags. 74 ------------------------------------------------------------------------------- 75 */ 76 #ifndef set_float_rounding_mode 77 fp_rnd float_rounding_mode = float_round_nearest_even; 78 fp_except float_exception_flags = 0; 79 #endif 80 #ifndef set_float_exception_inexact_flag 81 #define set_float_exception_inexact_flag() \ 82 ((void)(float_exception_flags |= float_flag_inexact)) 83 #endif 84 #ifdef FLOATX80 85 int8 floatx80_rounding_precision = 80; 86 #endif 87 88 /* 89 ------------------------------------------------------------------------------- 90 Primitive arithmetic functions, including multi-word arithmetic, and 91 division and square root approximations. (Can be specialized to target if 92 desired.) 
93 ------------------------------------------------------------------------------- 94 */ 95 #include "softfloat-macros" 96 97 /* 98 ------------------------------------------------------------------------------- 99 Functions and definitions to determine: (1) whether tininess for underflow 100 is detected before or after rounding by default, (2) what (if anything) 101 happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 are propagated from function inputs to output. These details are target- 104 specific. 105 ------------------------------------------------------------------------------- 106 */ 107 #include "softfloat-specialize" 108 109 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 110 /* 111 ------------------------------------------------------------------------------- 112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 113 and 7, and returns the properly rounded 32-bit integer corresponding to the 114 input. If `zSign' is 1, the input is negated before being converted to an 115 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 116 is simply rounded to an integer, with the inexact exception raised if the 117 input cannot be represented exactly as an integer. However, if the fixed- 118 point input is too large, the invalid exception is raised and the largest 119 positive or negative integer is returned. 120 ------------------------------------------------------------------------------- 121 */ 122 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 123 { 124 int8 roundingMode; 125 flag roundNearestEven; 126 int8 roundIncrement, roundBits; 127 int32 z; 128 129 roundingMode = float_rounding_mode; 130 roundNearestEven = ( roundingMode == float_round_nearest_even ); 131 roundIncrement = 0x40; 132 if ( ! 
roundNearestEven ) { 133 if ( roundingMode == float_round_to_zero ) { 134 roundIncrement = 0; 135 } 136 else { 137 roundIncrement = 0x7F; 138 if ( zSign ) { 139 if ( roundingMode == float_round_up ) roundIncrement = 0; 140 } 141 else { 142 if ( roundingMode == float_round_down ) roundIncrement = 0; 143 } 144 } 145 } 146 roundBits = (int8)(absZ & 0x7F); 147 absZ = ( absZ + roundIncrement )>>7; 148 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 149 z = (int32)absZ; 150 if ( zSign ) z = - z; 151 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 152 float_raise( float_flag_invalid ); 153 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 154 } 155 if ( roundBits ) set_float_exception_inexact_flag(); 156 return z; 157 158 } 159 160 /* 161 ------------------------------------------------------------------------------- 162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 163 `absZ1', with binary point between bits 63 and 64 (between the input words), 164 and returns the properly rounded 64-bit integer corresponding to the input. 165 If `zSign' is 1, the input is negated before being converted to an integer. 166 Ordinarily, the fixed-point input is simply rounded to an integer, with 167 the inexact exception raised if the input cannot be represented exactly as 168 an integer. However, if the fixed-point input is too large, the invalid 169 exception is raised and the largest positive or negative integer is 170 returned. 171 ------------------------------------------------------------------------------- 172 */ 173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 174 { 175 int8 roundingMode; 176 flag roundNearestEven, increment; 177 int64 z; 178 179 roundingMode = float_rounding_mode; 180 roundNearestEven = ( roundingMode == float_round_nearest_even ); 181 increment = ( (sbits64) absZ1 < 0 ); 182 if ( ! 
roundNearestEven ) { 183 if ( roundingMode == float_round_to_zero ) { 184 increment = 0; 185 } 186 else { 187 if ( zSign ) { 188 increment = ( roundingMode == float_round_down ) && absZ1; 189 } 190 else { 191 increment = ( roundingMode == float_round_up ) && absZ1; 192 } 193 } 194 } 195 if ( increment ) { 196 ++absZ0; 197 if ( absZ0 == 0 ) goto overflow; 198 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 199 } 200 z = absZ0; 201 if ( zSign ) z = - z; 202 if ( z && ( ( z < 0 ) ^ zSign ) ) { 203 overflow: 204 float_raise( float_flag_invalid ); 205 return 206 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 207 : LIT64( 0x7FFFFFFFFFFFFFFF ); 208 } 209 if ( absZ1 ) set_float_exception_inexact_flag(); 210 return z; 211 212 } 213 #endif 214 215 /* 216 ------------------------------------------------------------------------------- 217 Returns the fraction bits of the single-precision floating-point value `a'. 218 ------------------------------------------------------------------------------- 219 */ 220 INLINE bits32 extractFloat32Frac( float32 a ) 221 { 222 223 return a & 0x007FFFFF; 224 225 } 226 227 /* 228 ------------------------------------------------------------------------------- 229 Returns the exponent bits of the single-precision floating-point value `a'. 230 ------------------------------------------------------------------------------- 231 */ 232 INLINE int16 extractFloat32Exp( float32 a ) 233 { 234 235 return ( a>>23 ) & 0xFF; 236 237 } 238 239 /* 240 ------------------------------------------------------------------------------- 241 Returns the sign bit of the single-precision floating-point value `a'. 
242 ------------------------------------------------------------------------------- 243 */ 244 INLINE flag extractFloat32Sign( float32 a ) 245 { 246 247 return a>>31; 248 249 } 250 251 /* 252 ------------------------------------------------------------------------------- 253 Normalizes the subnormal single-precision floating-point value represented 254 by the denormalized significand `aSig'. The normalized exponent and 255 significand are stored at the locations pointed to by `zExpPtr' and 256 `zSigPtr', respectively. 257 ------------------------------------------------------------------------------- 258 */ 259 static void 260 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 261 { 262 int8 shiftCount; 263 264 shiftCount = countLeadingZeros32( aSig ) - 8; 265 *zSigPtr = aSig<<shiftCount; 266 *zExpPtr = 1 - shiftCount; 267 268 } 269 270 /* 271 ------------------------------------------------------------------------------- 272 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 273 single-precision floating-point value, returning the result. After being 274 shifted into the proper positions, the three fields are simply added 275 together to form the result. This means that any integer portion of `zSig' 276 will be added into the exponent. Since a properly normalized significand 277 will have an integer portion equal to 1, the `zExp' input should be 1 less 278 than the desired result exponent whenever `zSig' is a complete, normalized 279 significand. 
280 ------------------------------------------------------------------------------- 281 */ 282 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 283 { 284 285 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 286 287 } 288 289 /* 290 ------------------------------------------------------------------------------- 291 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 292 and significand `zSig', and returns the proper single-precision floating- 293 point value corresponding to the abstract input. Ordinarily, the abstract 294 value is simply rounded and packed into the single-precision format, with 295 the inexact exception raised if the abstract input cannot be represented 296 exactly. However, if the abstract value is too large, the overflow and 297 inexact exceptions are raised and an infinity or maximal finite value is 298 returned. If the abstract value is too small, the input value is rounded to 299 a subnormal number, and the underflow and inexact exceptions are raised if 300 the abstract input cannot be represented exactly as a subnormal single- 301 precision floating-point number. 302 The input significand `zSig' has its binary point between bits 30 303 and 29, which is 7 bits to the left of the usual location. This shifted 304 significand must be normalized or smaller. If `zSig' is not normalized, 305 `zExp' must be 0; in that case, the result returned is a subnormal number, 306 and it must not require rounding. In the usual case that `zSig' is 307 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 308 The handling of underflow and overflow follows the IEC/IEEE Standard for 309 Binary Floating-Point Arithmetic. 
310 ------------------------------------------------------------------------------- 311 */ 312 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 313 { 314 int8 roundingMode; 315 flag roundNearestEven; 316 int8 roundIncrement, roundBits; 317 flag isTiny; 318 319 roundingMode = float_rounding_mode; 320 roundNearestEven = ( roundingMode == float_round_nearest_even ); 321 roundIncrement = 0x40; 322 if ( ! roundNearestEven ) { 323 if ( roundingMode == float_round_to_zero ) { 324 roundIncrement = 0; 325 } 326 else { 327 roundIncrement = 0x7F; 328 if ( zSign ) { 329 if ( roundingMode == float_round_up ) roundIncrement = 0; 330 } 331 else { 332 if ( roundingMode == float_round_down ) roundIncrement = 0; 333 } 334 } 335 } 336 roundBits = zSig & 0x7F; 337 if ( 0xFD <= (bits16) zExp ) { 338 if ( ( 0xFD < zExp ) 339 || ( ( zExp == 0xFD ) 340 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 341 ) { 342 float_raise( float_flag_overflow | float_flag_inexact ); 343 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 344 } 345 if ( zExp < 0 ) { 346 isTiny = 347 ( float_detect_tininess == float_tininess_before_rounding ) 348 || ( zExp < -1 ) 349 || ( zSig + roundIncrement < 0x80000000U ); 350 shift32RightJamming( zSig, - zExp, &zSig ); 351 zExp = 0; 352 roundBits = zSig & 0x7F; 353 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 354 } 355 } 356 if ( roundBits ) set_float_exception_inexact_flag(); 357 zSig = ( zSig + roundIncrement )>>7; 358 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 359 if ( zSig == 0 ) zExp = 0; 360 return packFloat32( zSign, zExp, zSig ); 361 362 } 363 364 /* 365 ------------------------------------------------------------------------------- 366 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 367 and significand `zSig', and returns the proper single-precision floating- 368 point value corresponding to the abstract input. 
This routine is just like 369 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 370 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 371 floating-point exponent. 372 ------------------------------------------------------------------------------- 373 */ 374 static float32 375 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 376 { 377 int8 shiftCount; 378 379 shiftCount = countLeadingZeros32( zSig ) - 1; 380 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 381 382 } 383 384 /* 385 ------------------------------------------------------------------------------- 386 Returns the fraction bits of the double-precision floating-point value `a'. 387 ------------------------------------------------------------------------------- 388 */ 389 INLINE bits64 extractFloat64Frac( float64 a ) 390 { 391 392 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 393 394 } 395 396 /* 397 ------------------------------------------------------------------------------- 398 Returns the exponent bits of the double-precision floating-point value `a'. 399 ------------------------------------------------------------------------------- 400 */ 401 INLINE int16 extractFloat64Exp( float64 a ) 402 { 403 404 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF); 405 406 } 407 408 /* 409 ------------------------------------------------------------------------------- 410 Returns the sign bit of the double-precision floating-point value `a'. 411 ------------------------------------------------------------------------------- 412 */ 413 INLINE flag extractFloat64Sign( float64 a ) 414 { 415 416 return (flag)(FLOAT64_DEMANGLE(a) >> 63); 417 418 } 419 420 /* 421 ------------------------------------------------------------------------------- 422 Normalizes the subnormal double-precision floating-point value represented 423 by the denormalized significand `aSig'. 
The normalized exponent and 424 significand are stored at the locations pointed to by `zExpPtr' and 425 `zSigPtr', respectively. 426 ------------------------------------------------------------------------------- 427 */ 428 static void 429 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 430 { 431 int8 shiftCount; 432 433 shiftCount = countLeadingZeros64( aSig ) - 11; 434 *zSigPtr = aSig<<shiftCount; 435 *zExpPtr = 1 - shiftCount; 436 437 } 438 439 /* 440 ------------------------------------------------------------------------------- 441 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 442 double-precision floating-point value, returning the result. After being 443 shifted into the proper positions, the three fields are simply added 444 together to form the result. This means that any integer portion of `zSig' 445 will be added into the exponent. Since a properly normalized significand 446 will have an integer portion equal to 1, the `zExp' input should be 1 less 447 than the desired result exponent whenever `zSig' is a complete, normalized 448 significand. 449 ------------------------------------------------------------------------------- 450 */ 451 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 452 { 453 454 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 455 ( ( (bits64) zExp )<<52 ) + zSig ); 456 457 } 458 459 /* 460 ------------------------------------------------------------------------------- 461 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 462 and significand `zSig', and returns the proper double-precision floating- 463 point value corresponding to the abstract input. Ordinarily, the abstract 464 value is simply rounded and packed into the double-precision format, with 465 the inexact exception raised if the abstract input cannot be represented 466 exactly. 
However, if the abstract value is too large, the overflow and 467 inexact exceptions are raised and an infinity or maximal finite value is 468 returned. If the abstract value is too small, the input value is rounded to 469 a subnormal number, and the underflow and inexact exceptions are raised if 470 the abstract input cannot be represented exactly as a subnormal double- 471 precision floating-point number. 472 The input significand `zSig' has its binary point between bits 62 473 and 61, which is 10 bits to the left of the usual location. This shifted 474 significand must be normalized or smaller. If `zSig' is not normalized, 475 `zExp' must be 0; in that case, the result returned is a subnormal number, 476 and it must not require rounding. In the usual case that `zSig' is 477 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 478 The handling of underflow and overflow follows the IEC/IEEE Standard for 479 Binary Floating-Point Arithmetic. 480 ------------------------------------------------------------------------------- 481 */ 482 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 483 { 484 int8 roundingMode; 485 flag roundNearestEven; 486 int16 roundIncrement, roundBits; 487 flag isTiny; 488 489 roundingMode = float_rounding_mode; 490 roundNearestEven = ( roundingMode == float_round_nearest_even ); 491 roundIncrement = 0x200; 492 if ( ! 
roundNearestEven ) { 493 if ( roundingMode == float_round_to_zero ) { 494 roundIncrement = 0; 495 } 496 else { 497 roundIncrement = 0x3FF; 498 if ( zSign ) { 499 if ( roundingMode == float_round_up ) roundIncrement = 0; 500 } 501 else { 502 if ( roundingMode == float_round_down ) roundIncrement = 0; 503 } 504 } 505 } 506 roundBits = (int16)(zSig & 0x3FF); 507 if ( 0x7FD <= (bits16) zExp ) { 508 if ( ( 0x7FD < zExp ) 509 || ( ( zExp == 0x7FD ) 510 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 511 ) { 512 float_raise( float_flag_overflow | float_flag_inexact ); 513 return FLOAT64_MANGLE( 514 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 515 ( roundIncrement == 0 )); 516 } 517 if ( zExp < 0 ) { 518 isTiny = 519 ( float_detect_tininess == float_tininess_before_rounding ) 520 || ( zExp < -1 ) 521 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) ); 522 shift64RightJamming( zSig, - zExp, &zSig ); 523 zExp = 0; 524 roundBits = (int16)(zSig & 0x3FF); 525 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 526 } 527 } 528 if ( roundBits ) set_float_exception_inexact_flag(); 529 zSig = ( zSig + roundIncrement )>>10; 530 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 531 if ( zSig == 0 ) zExp = 0; 532 return packFloat64( zSign, zExp, zSig ); 533 534 } 535 536 /* 537 ------------------------------------------------------------------------------- 538 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 539 and significand `zSig', and returns the proper double-precision floating- 540 point value corresponding to the abstract input. This routine is just like 541 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 542 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 543 floating-point exponent. 
544 ------------------------------------------------------------------------------- 545 */ 546 static float64 547 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 548 { 549 int8 shiftCount; 550 551 shiftCount = countLeadingZeros64( zSig ) - 1; 552 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 553 554 } 555 556 #ifdef FLOATX80 557 558 /* 559 ------------------------------------------------------------------------------- 560 Returns the fraction bits of the extended double-precision floating-point 561 value `a'. 562 ------------------------------------------------------------------------------- 563 */ 564 INLINE bits64 extractFloatx80Frac( floatx80 a ) 565 { 566 567 return a.low; 568 569 } 570 571 /* 572 ------------------------------------------------------------------------------- 573 Returns the exponent bits of the extended double-precision floating-point 574 value `a'. 575 ------------------------------------------------------------------------------- 576 */ 577 INLINE int32 extractFloatx80Exp( floatx80 a ) 578 { 579 580 return a.high & 0x7FFF; 581 582 } 583 584 /* 585 ------------------------------------------------------------------------------- 586 Returns the sign bit of the extended double-precision floating-point value 587 `a'. 588 ------------------------------------------------------------------------------- 589 */ 590 INLINE flag extractFloatx80Sign( floatx80 a ) 591 { 592 593 return a.high>>15; 594 595 } 596 597 /* 598 ------------------------------------------------------------------------------- 599 Normalizes the subnormal extended double-precision floating-point value 600 represented by the denormalized significand `aSig'. The normalized exponent 601 and significand are stored at the locations pointed to by `zExpPtr' and 602 `zSigPtr', respectively. 
603 ------------------------------------------------------------------------------- 604 */ 605 static void 606 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 607 { 608 int8 shiftCount; 609 610 shiftCount = countLeadingZeros64( aSig ); 611 *zSigPtr = aSig<<shiftCount; 612 *zExpPtr = 1 - shiftCount; 613 614 } 615 616 /* 617 ------------------------------------------------------------------------------- 618 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 619 extended double-precision floating-point value, returning the result. 620 ------------------------------------------------------------------------------- 621 */ 622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 623 { 624 floatx80 z; 625 626 z.low = zSig; 627 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 628 return z; 629 630 } 631 632 /* 633 ------------------------------------------------------------------------------- 634 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 635 and extended significand formed by the concatenation of `zSig0' and `zSig1', 636 and returns the proper extended double-precision floating-point value 637 corresponding to the abstract input. Ordinarily, the abstract value is 638 rounded and packed into the extended double-precision format, with the 639 inexact exception raised if the abstract input cannot be represented 640 exactly. However, if the abstract value is too large, the overflow and 641 inexact exceptions are raised and an infinity or maximal finite value is 642 returned. If the abstract value is too small, the input value is rounded to 643 a subnormal number, and the underflow and inexact exceptions are raised if 644 the abstract input cannot be represented exactly as a subnormal extended 645 double-precision floating-point number. 646 If `roundingPrecision' is 32 or 64, the result is rounded to the same 647 number of bits as single or double precision, respectively. 
Otherwise, the 648 result is rounded to the full precision of the extended double-precision 649 format. 650 The input significand must be normalized or smaller. If the input 651 significand is not normalized, `zExp' must be 0; in that case, the result 652 returned is a subnormal number, and it must not require rounding. The 653 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 654 Floating-Point Arithmetic. 655 ------------------------------------------------------------------------------- 656 */ 657 static floatx80 658 roundAndPackFloatx80( 659 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 660 ) 661 { 662 int8 roundingMode; 663 flag roundNearestEven, increment, isTiny; 664 int64 roundIncrement, roundMask, roundBits; 665 666 roundingMode = float_rounding_mode; 667 roundNearestEven = ( roundingMode == float_round_nearest_even ); 668 if ( roundingPrecision == 80 ) goto precision80; 669 if ( roundingPrecision == 64 ) { 670 roundIncrement = LIT64( 0x0000000000000400 ); 671 roundMask = LIT64( 0x00000000000007FF ); 672 } 673 else if ( roundingPrecision == 32 ) { 674 roundIncrement = LIT64( 0x0000008000000000 ); 675 roundMask = LIT64( 0x000000FFFFFFFFFF ); 676 } 677 else { 678 goto precision80; 679 } 680 zSig0 |= ( zSig1 != 0 ); 681 if ( ! 
roundNearestEven ) { 682 if ( roundingMode == float_round_to_zero ) { 683 roundIncrement = 0; 684 } 685 else { 686 roundIncrement = roundMask; 687 if ( zSign ) { 688 if ( roundingMode == float_round_up ) roundIncrement = 0; 689 } 690 else { 691 if ( roundingMode == float_round_down ) roundIncrement = 0; 692 } 693 } 694 } 695 roundBits = zSig0 & roundMask; 696 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 697 if ( ( 0x7FFE < zExp ) 698 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 699 ) { 700 goto overflow; 701 } 702 if ( zExp <= 0 ) { 703 isTiny = 704 ( float_detect_tininess == float_tininess_before_rounding ) 705 || ( zExp < 0 ) 706 || ( zSig0 <= zSig0 + roundIncrement ); 707 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 708 zExp = 0; 709 roundBits = zSig0 & roundMask; 710 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 711 if ( roundBits ) set_float_exception_inexact_flag(); 712 zSig0 += roundIncrement; 713 if ( (sbits64) zSig0 < 0 ) zExp = 1; 714 roundIncrement = roundMask + 1; 715 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 716 roundMask |= roundIncrement; 717 } 718 zSig0 &= ~ roundMask; 719 return packFloatx80( zSign, zExp, zSig0 ); 720 } 721 } 722 if ( roundBits ) set_float_exception_inexact_flag(); 723 zSig0 += roundIncrement; 724 if ( zSig0 < roundIncrement ) { 725 ++zExp; 726 zSig0 = LIT64( 0x8000000000000000 ); 727 } 728 roundIncrement = roundMask + 1; 729 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 730 roundMask |= roundIncrement; 731 } 732 zSig0 &= ~ roundMask; 733 if ( zSig0 == 0 ) zExp = 0; 734 return packFloatx80( zSign, zExp, zSig0 ); 735 precision80: 736 increment = ( (sbits64) zSig1 < 0 ); 737 if ( ! 
roundNearestEven ) { 738 if ( roundingMode == float_round_to_zero ) { 739 increment = 0; 740 } 741 else { 742 if ( zSign ) { 743 increment = ( roundingMode == float_round_down ) && zSig1; 744 } 745 else { 746 increment = ( roundingMode == float_round_up ) && zSig1; 747 } 748 } 749 } 750 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 751 if ( ( 0x7FFE < zExp ) 752 || ( ( zExp == 0x7FFE ) 753 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 754 && increment 755 ) 756 ) { 757 roundMask = 0; 758 overflow: 759 float_raise( float_flag_overflow | float_flag_inexact ); 760 if ( ( roundingMode == float_round_to_zero ) 761 || ( zSign && ( roundingMode == float_round_up ) ) 762 || ( ! zSign && ( roundingMode == float_round_down ) ) 763 ) { 764 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 765 } 766 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 767 } 768 if ( zExp <= 0 ) { 769 isTiny = 770 ( float_detect_tininess == float_tininess_before_rounding ) 771 || ( zExp < 0 ) 772 || ! increment 773 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 774 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 775 zExp = 0; 776 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 777 if ( zSig1 ) set_float_exception_inexact_flag(); 778 if ( roundNearestEven ) { 779 increment = ( (sbits64) zSig1 < 0 ); 780 } 781 else { 782 if ( zSign ) { 783 increment = ( roundingMode == float_round_down ) && zSig1; 784 } 785 else { 786 increment = ( roundingMode == float_round_up ) && zSig1; 787 } 788 } 789 if ( increment ) { 790 ++zSig0; 791 zSig0 &= 792 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 793 if ( (sbits64) zSig0 < 0 ) zExp = 1; 794 } 795 return packFloatx80( zSign, zExp, zSig0 ); 796 } 797 } 798 if ( zSig1 ) set_float_exception_inexact_flag(); 799 if ( increment ) { 800 ++zSig0; 801 if ( zSig0 == 0 ) { 802 ++zExp; 803 zSig0 = LIT64( 0x8000000000000000 ); 804 } 805 else { 806 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 807 } 808 } 
809 else { 810 if ( zSig0 == 0 ) zExp = 0; 811 } 812 return packFloatx80( zSign, zExp, zSig0 ); 813 814 } 815 816 /* 817 ------------------------------------------------------------------------------- 818 Takes an abstract floating-point value having sign `zSign', exponent 819 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 820 and returns the proper extended double-precision floating-point value 821 corresponding to the abstract input. This routine is just like 822 `roundAndPackFloatx80' except that the input significand does not have to be 823 normalized. 824 ------------------------------------------------------------------------------- 825 */ 826 static floatx80 827 normalizeRoundAndPackFloatx80( 828 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 829 ) 830 { 831 int8 shiftCount; 832 833 if ( zSig0 == 0 ) { 834 zSig0 = zSig1; 835 zSig1 = 0; 836 zExp -= 64; 837 } 838 shiftCount = countLeadingZeros64( zSig0 ); 839 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 840 zExp -= shiftCount; 841 return 842 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 843 844 } 845 846 #endif 847 848 #ifdef FLOAT128 849 850 /* 851 ------------------------------------------------------------------------------- 852 Returns the least-significant 64 fraction bits of the quadruple-precision 853 floating-point value `a'. 854 ------------------------------------------------------------------------------- 855 */ 856 INLINE bits64 extractFloat128Frac1( float128 a ) 857 { 858 859 return a.low; 860 861 } 862 863 /* 864 ------------------------------------------------------------------------------- 865 Returns the most-significant 48 fraction bits of the quadruple-precision 866 floating-point value `a'. 
867 ------------------------------------------------------------------------------- 868 */ 869 INLINE bits64 extractFloat128Frac0( float128 a ) 870 { 871 872 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 873 874 } 875 876 /* 877 ------------------------------------------------------------------------------- 878 Returns the exponent bits of the quadruple-precision floating-point value 879 `a'. 880 ------------------------------------------------------------------------------- 881 */ 882 INLINE int32 extractFloat128Exp( float128 a ) 883 { 884 885 return (int32)((a.high >> 48) & 0x7FFF); 886 887 } 888 889 /* 890 ------------------------------------------------------------------------------- 891 Returns the sign bit of the quadruple-precision floating-point value `a'. 892 ------------------------------------------------------------------------------- 893 */ 894 INLINE flag extractFloat128Sign( float128 a ) 895 { 896 897 return (flag)(a.high >> 63); 898 899 } 900 901 /* 902 ------------------------------------------------------------------------------- 903 Normalizes the subnormal quadruple-precision floating-point value 904 represented by the denormalized significand formed by the concatenation of 905 `aSig0' and `aSig1'. The normalized exponent is stored at the location 906 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 907 significand are stored at the location pointed to by `zSig0Ptr', and the 908 least significant 64 bits of the normalized significand are stored at the 909 location pointed to by `zSig1Ptr'. 
910 ------------------------------------------------------------------------------- 911 */ 912 static void 913 normalizeFloat128Subnormal( 914 bits64 aSig0, 915 bits64 aSig1, 916 int32 *zExpPtr, 917 bits64 *zSig0Ptr, 918 bits64 *zSig1Ptr 919 ) 920 { 921 int8 shiftCount; 922 923 if ( aSig0 == 0 ) { 924 shiftCount = countLeadingZeros64( aSig1 ) - 15; 925 if ( shiftCount < 0 ) { 926 *zSig0Ptr = aSig1>>( - shiftCount ); 927 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 928 } 929 else { 930 *zSig0Ptr = aSig1<<shiftCount; 931 *zSig1Ptr = 0; 932 } 933 *zExpPtr = - shiftCount - 63; 934 } 935 else { 936 shiftCount = countLeadingZeros64( aSig0 ) - 15; 937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 938 *zExpPtr = 1 - shiftCount; 939 } 940 941 } 942 943 /* 944 ------------------------------------------------------------------------------- 945 Packs the sign `zSign', the exponent `zExp', and the significand formed 946 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 947 floating-point value, returning the result. After being shifted into the 948 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 949 added together to form the most significant 32 bits of the result. This 950 means that any integer portion of `zSig0' will be added into the exponent. 951 Since a properly normalized significand will have an integer portion equal 952 to 1, the `zExp' input should be 1 less than the desired result exponent 953 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 954 significand. 
955 ------------------------------------------------------------------------------- 956 */ 957 INLINE float128 958 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 959 { 960 float128 z; 961 962 z.low = zSig1; 963 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 964 return z; 965 966 } 967 968 /* 969 ------------------------------------------------------------------------------- 970 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 971 and extended significand formed by the concatenation of `zSig0', `zSig1', 972 and `zSig2', and returns the proper quadruple-precision floating-point value 973 corresponding to the abstract input. Ordinarily, the abstract value is 974 simply rounded and packed into the quadruple-precision format, with the 975 inexact exception raised if the abstract input cannot be represented 976 exactly. However, if the abstract value is too large, the overflow and 977 inexact exceptions are raised and an infinity or maximal finite value is 978 returned. If the abstract value is too small, the input value is rounded to 979 a subnormal number, and the underflow and inexact exceptions are raised if 980 the abstract input cannot be represented exactly as a subnormal quadruple- 981 precision floating-point number. 982 The input significand must be normalized or smaller. If the input 983 significand is not normalized, `zExp' must be 0; in that case, the result 984 returned is a subnormal number, and it must not require rounding. In the 985 usual case that the input significand is normalized, `zExp' must be 1 less 986 than the ``true'' floating-point exponent. The handling of underflow and 987 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
988 ------------------------------------------------------------------------------- 989 */ 990 static float128 991 roundAndPackFloat128( 992 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 993 { 994 int8 roundingMode; 995 flag roundNearestEven, increment, isTiny; 996 997 roundingMode = float_rounding_mode; 998 roundNearestEven = ( roundingMode == float_round_nearest_even ); 999 increment = ( (sbits64) zSig2 < 0 ); 1000 if ( ! roundNearestEven ) { 1001 if ( roundingMode == float_round_to_zero ) { 1002 increment = 0; 1003 } 1004 else { 1005 if ( zSign ) { 1006 increment = ( roundingMode == float_round_down ) && zSig2; 1007 } 1008 else { 1009 increment = ( roundingMode == float_round_up ) && zSig2; 1010 } 1011 } 1012 } 1013 if ( 0x7FFD <= (bits32) zExp ) { 1014 if ( ( 0x7FFD < zExp ) 1015 || ( ( zExp == 0x7FFD ) 1016 && eq128( 1017 LIT64( 0x0001FFFFFFFFFFFF ), 1018 LIT64( 0xFFFFFFFFFFFFFFFF ), 1019 zSig0, 1020 zSig1 1021 ) 1022 && increment 1023 ) 1024 ) { 1025 float_raise( float_flag_overflow | float_flag_inexact ); 1026 if ( ( roundingMode == float_round_to_zero ) 1027 || ( zSign && ( roundingMode == float_round_up ) ) 1028 || ( ! zSign && ( roundingMode == float_round_down ) ) 1029 ) { 1030 return 1031 packFloat128( 1032 zSign, 1033 0x7FFE, 1034 LIT64( 0x0000FFFFFFFFFFFF ), 1035 LIT64( 0xFFFFFFFFFFFFFFFF ) 1036 ); 1037 } 1038 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1039 } 1040 if ( zExp < 0 ) { 1041 isTiny = 1042 ( float_detect_tininess == float_tininess_before_rounding ) 1043 || ( zExp < -1 ) 1044 || ! 
increment 1045 || lt128( 1046 zSig0, 1047 zSig1, 1048 LIT64( 0x0001FFFFFFFFFFFF ), 1049 LIT64( 0xFFFFFFFFFFFFFFFF ) 1050 ); 1051 shift128ExtraRightJamming( 1052 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1053 zExp = 0; 1054 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1055 if ( roundNearestEven ) { 1056 increment = ( (sbits64) zSig2 < 0 ); 1057 } 1058 else { 1059 if ( zSign ) { 1060 increment = ( roundingMode == float_round_down ) && zSig2; 1061 } 1062 else { 1063 increment = ( roundingMode == float_round_up ) && zSig2; 1064 } 1065 } 1066 } 1067 } 1068 if ( zSig2 ) set_float_exception_inexact_flag(); 1069 if ( increment ) { 1070 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1071 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1072 } 1073 else { 1074 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1075 } 1076 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1077 1078 } 1079 1080 /* 1081 ------------------------------------------------------------------------------- 1082 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1083 and significand formed by the concatenation of `zSig0' and `zSig1', and 1084 returns the proper quadruple-precision floating-point value corresponding 1085 to the abstract input. This routine is just like `roundAndPackFloat128' 1086 except that the input significand has fewer bits and does not have to be 1087 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1088 point exponent. 
1089 ------------------------------------------------------------------------------- 1090 */ 1091 static float128 1092 normalizeRoundAndPackFloat128( 1093 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1094 { 1095 int8 shiftCount; 1096 bits64 zSig2; 1097 1098 if ( zSig0 == 0 ) { 1099 zSig0 = zSig1; 1100 zSig1 = 0; 1101 zExp -= 64; 1102 } 1103 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1104 if ( 0 <= shiftCount ) { 1105 zSig2 = 0; 1106 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1107 } 1108 else { 1109 shift128ExtraRightJamming( 1110 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1111 } 1112 zExp -= shiftCount; 1113 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1114 1115 } 1116 1117 #endif 1118 1119 /* 1120 ------------------------------------------------------------------------------- 1121 Returns the result of converting the 32-bit two's complement integer `a' 1122 to the single-precision floating-point format. The conversion is performed 1123 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1124 ------------------------------------------------------------------------------- 1125 */ 1126 float32 int32_to_float32( int32 a ) 1127 { 1128 flag zSign; 1129 1130 if ( a == 0 ) return 0; 1131 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1132 zSign = ( a < 0 ); 1133 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a)); 1134 1135 } 1136 1137 float32 uint32_to_float32( uint32 a ) 1138 { 1139 if ( a == 0 ) return 0; 1140 if ( a & (bits32) 0x80000000 ) 1141 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); 1142 return normalizeRoundAndPackFloat32( 0, 0x9C, a ); 1143 } 1144 1145 1146 /* 1147 ------------------------------------------------------------------------------- 1148 Returns the result of converting the 32-bit two's complement integer `a' 1149 to the double-precision floating-point format. 
The conversion is performed 1150 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1151 ------------------------------------------------------------------------------- 1152 */ 1153 float64 int32_to_float64( int32 a ) 1154 { 1155 flag zSign; 1156 uint32 absA; 1157 int8 shiftCount; 1158 bits64 zSig; 1159 1160 if ( a == 0 ) return 0; 1161 zSign = ( a < 0 ); 1162 absA = zSign ? - a : a; 1163 shiftCount = countLeadingZeros32( absA ) + 21; 1164 zSig = absA; 1165 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1166 1167 } 1168 1169 float64 uint32_to_float64( uint32 a ) 1170 { 1171 int8 shiftCount; 1172 bits64 zSig = a; 1173 1174 if ( a == 0 ) return 0; 1175 shiftCount = countLeadingZeros32( a ) + 21; 1176 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); 1177 1178 } 1179 1180 #ifdef FLOATX80 1181 1182 /* 1183 ------------------------------------------------------------------------------- 1184 Returns the result of converting the 32-bit two's complement integer `a' 1185 to the extended double-precision floating-point format. The conversion 1186 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1187 Arithmetic. 1188 ------------------------------------------------------------------------------- 1189 */ 1190 floatx80 int32_to_floatx80( int32 a ) 1191 { 1192 flag zSign; 1193 uint32 absA; 1194 int8 shiftCount; 1195 bits64 zSig; 1196 1197 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1198 zSign = ( a < 0 ); 1199 absA = zSign ? 
- a : a; 1200 shiftCount = countLeadingZeros32( absA ) + 32; 1201 zSig = absA; 1202 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1203 1204 } 1205 1206 floatx80 uint32_to_floatx80( uint32 a ) 1207 { 1208 int8 shiftCount; 1209 bits64 zSig = a; 1210 1211 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1212 shiftCount = countLeadingZeros32( a ) + 32; 1213 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); 1214 1215 } 1216 1217 #endif 1218 1219 #ifdef FLOAT128 1220 1221 /* 1222 ------------------------------------------------------------------------------- 1223 Returns the result of converting the 32-bit two's complement integer `a' to 1224 the quadruple-precision floating-point format. The conversion is performed 1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1226 ------------------------------------------------------------------------------- 1227 */ 1228 float128 int32_to_float128( int32 a ) 1229 { 1230 flag zSign; 1231 uint32 absA; 1232 int8 shiftCount; 1233 bits64 zSig0; 1234 1235 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1236 zSign = ( a < 0 ); 1237 absA = zSign ? - a : a; 1238 shiftCount = countLeadingZeros32( absA ) + 17; 1239 zSig0 = absA; 1240 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1241 1242 } 1243 1244 float128 uint32_to_float128( uint32 a ) 1245 { 1246 int8 shiftCount; 1247 bits64 zSig0 = a; 1248 1249 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1250 shiftCount = countLeadingZeros32( a ) + 17; 1251 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1252 1253 } 1254 1255 #endif 1256 1257 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1258 /* 1259 ------------------------------------------------------------------------------- 1260 Returns the result of converting the 64-bit two's complement integer `a' 1261 to the single-precision floating-point format. 
The conversion is performed 1262 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1263 ------------------------------------------------------------------------------- 1264 */ 1265 float32 int64_to_float32( int64 a ) 1266 { 1267 flag zSign; 1268 uint64 absA; 1269 int8 shiftCount; 1270 1271 if ( a == 0 ) return 0; 1272 zSign = ( a < 0 ); 1273 absA = zSign ? - a : a; 1274 shiftCount = countLeadingZeros64( absA ) - 40; 1275 if ( 0 <= shiftCount ) { 1276 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1277 } 1278 else { 1279 shiftCount += 7; 1280 if ( shiftCount < 0 ) { 1281 shift64RightJamming( absA, - shiftCount, &absA ); 1282 } 1283 else { 1284 absA <<= shiftCount; 1285 } 1286 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1287 } 1288 1289 } 1290 1291 /* 1292 ------------------------------------------------------------------------------- 1293 Returns the result of converting the 64-bit two's complement integer `a' 1294 to the double-precision floating-point format. The conversion is performed 1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1296 ------------------------------------------------------------------------------- 1297 */ 1298 float64 int64_to_float64( int64 a ) 1299 { 1300 flag zSign; 1301 1302 if ( a == 0 ) return 0; 1303 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1304 return packFloat64( 1, 0x43E, 0 ); 1305 } 1306 zSign = ( a < 0 ); 1307 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1308 1309 } 1310 1311 #ifdef FLOATX80 1312 1313 /* 1314 ------------------------------------------------------------------------------- 1315 Returns the result of converting the 64-bit two's complement integer `a' 1316 to the extended double-precision floating-point format. The conversion 1317 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1318 Arithmetic. 
1319 ------------------------------------------------------------------------------- 1320 */ 1321 floatx80 int64_to_floatx80( int64 a ) 1322 { 1323 flag zSign; 1324 uint64 absA; 1325 int8 shiftCount; 1326 1327 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1328 zSign = ( a < 0 ); 1329 absA = zSign ? - a : a; 1330 shiftCount = countLeadingZeros64( absA ); 1331 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1332 1333 } 1334 1335 #endif 1336 1337 #endif /* !SOFTFLOAT_FOR_GCC */ 1338 1339 #ifdef FLOAT128 1340 1341 /* 1342 ------------------------------------------------------------------------------- 1343 Returns the result of converting the 64-bit two's complement integer `a' to 1344 the quadruple-precision floating-point format. The conversion is performed 1345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1346 ------------------------------------------------------------------------------- 1347 */ 1348 float128 int64_to_float128( int64 a ) 1349 { 1350 flag zSign; 1351 uint64 absA; 1352 int8 shiftCount; 1353 int32 zExp; 1354 bits64 zSig0, zSig1; 1355 1356 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1357 zSign = ( a < 0 ); 1358 absA = zSign ? - a : a; 1359 shiftCount = countLeadingZeros64( absA ) + 49; 1360 zExp = 0x406E - shiftCount; 1361 if ( 64 <= shiftCount ) { 1362 zSig1 = 0; 1363 zSig0 = absA; 1364 shiftCount -= 64; 1365 } 1366 else { 1367 zSig1 = absA; 1368 zSig0 = 0; 1369 } 1370 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1371 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1372 1373 } 1374 1375 #endif 1376 1377 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1378 /* 1379 ------------------------------------------------------------------------------- 1380 Returns the result of converting the single-precision floating-point value 1381 `a' to the 32-bit two's complement integer format. 
The conversion is 1382 performed according to the IEC/IEEE Standard for Binary Floating-Point 1383 Arithmetic---which means in particular that the conversion is rounded 1384 according to the current rounding mode. If `a' is a NaN, the largest 1385 positive integer is returned. Otherwise, if the conversion overflows, the 1386 largest integer with the same sign as `a' is returned. 1387 ------------------------------------------------------------------------------- 1388 */ 1389 int32 float32_to_int32( float32 a ) 1390 { 1391 flag aSign; 1392 int16 aExp, shiftCount; 1393 bits32 aSig; 1394 bits64 aSig64; 1395 1396 aSig = extractFloat32Frac( a ); 1397 aExp = extractFloat32Exp( a ); 1398 aSign = extractFloat32Sign( a ); 1399 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1400 if ( aExp ) aSig |= 0x00800000; 1401 shiftCount = 0xAF - aExp; 1402 aSig64 = aSig; 1403 aSig64 <<= 32; 1404 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1405 return roundAndPackInt32( aSign, aSig64 ); 1406 1407 } 1408 #endif /* !SOFTFLOAT_FOR_GCC */ 1409 1410 /* 1411 ------------------------------------------------------------------------------- 1412 Returns the result of converting the single-precision floating-point value 1413 `a' to the 32-bit two's complement integer format. The conversion is 1414 performed according to the IEC/IEEE Standard for Binary Floating-Point 1415 Arithmetic, except that the conversion is always rounded toward zero. 1416 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1417 the conversion overflows, the largest integer with the same sign as `a' is 1418 returned. 
1419 ------------------------------------------------------------------------------- 1420 */ 1421 int32 float32_to_int32_round_to_zero( float32 a ) 1422 { 1423 flag aSign; 1424 int16 aExp, shiftCount; 1425 bits32 aSig; 1426 int32 z; 1427 1428 aSig = extractFloat32Frac( a ); 1429 aExp = extractFloat32Exp( a ); 1430 aSign = extractFloat32Sign( a ); 1431 shiftCount = aExp - 0x9E; 1432 if ( 0 <= shiftCount ) { 1433 if ( a != 0xCF000000 ) { 1434 float_raise( float_flag_invalid ); 1435 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1436 } 1437 return (sbits32) 0x80000000; 1438 } 1439 else if ( aExp <= 0x7E ) { 1440 if ( aExp | aSig ) set_float_exception_inexact_flag(); 1441 return 0; 1442 } 1443 aSig = ( aSig | 0x00800000 )<<8; 1444 z = aSig>>( - shiftCount ); 1445 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1446 set_float_exception_inexact_flag(); 1447 } 1448 if ( aSign ) z = - z; 1449 return z; 1450 1451 } 1452 1453 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1454 /* 1455 ------------------------------------------------------------------------------- 1456 Returns the result of converting the single-precision floating-point value 1457 `a' to the 64-bit two's complement integer format. The conversion is 1458 performed according to the IEC/IEEE Standard for Binary Floating-Point 1459 Arithmetic---which means in particular that the conversion is rounded 1460 according to the current rounding mode. If `a' is a NaN, the largest 1461 positive integer is returned. Otherwise, if the conversion overflows, the 1462 largest integer with the same sign as `a' is returned. 
1463 ------------------------------------------------------------------------------- 1464 */ 1465 int64 float32_to_int64( float32 a ) 1466 { 1467 flag aSign; 1468 int16 aExp, shiftCount; 1469 bits32 aSig; 1470 bits64 aSig64, aSigExtra; 1471 1472 aSig = extractFloat32Frac( a ); 1473 aExp = extractFloat32Exp( a ); 1474 aSign = extractFloat32Sign( a ); 1475 shiftCount = 0xBE - aExp; 1476 if ( shiftCount < 0 ) { 1477 float_raise( float_flag_invalid ); 1478 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1479 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1480 } 1481 return (sbits64) LIT64( 0x8000000000000000 ); 1482 } 1483 if ( aExp ) aSig |= 0x00800000; 1484 aSig64 = aSig; 1485 aSig64 <<= 40; 1486 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1487 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1488 1489 } 1490 1491 /* 1492 ------------------------------------------------------------------------------- 1493 Returns the result of converting the single-precision floating-point value 1494 `a' to the 64-bit two's complement integer format. The conversion is 1495 performed according to the IEC/IEEE Standard for Binary Floating-Point 1496 Arithmetic, except that the conversion is always rounded toward zero. If 1497 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1498 conversion overflows, the largest integer with the same sign as `a' is 1499 returned. 1500 ------------------------------------------------------------------------------- 1501 */ 1502 int64 float32_to_int64_round_to_zero( float32 a ) 1503 { 1504 flag aSign; 1505 int16 aExp, shiftCount; 1506 bits32 aSig; 1507 bits64 aSig64; 1508 int64 z; 1509 1510 aSig = extractFloat32Frac( a ); 1511 aExp = extractFloat32Exp( a ); 1512 aSign = extractFloat32Sign( a ); 1513 shiftCount = aExp - 0xBE; 1514 if ( 0 <= shiftCount ) { 1515 if ( a != 0xDF000000 ) { 1516 float_raise( float_flag_invalid ); 1517 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1518 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1519 } 1520 } 1521 return (sbits64) LIT64( 0x8000000000000000 ); 1522 } 1523 else if ( aExp <= 0x7E ) { 1524 if ( aExp | aSig ) set_float_exception_inexact_flag(); 1525 return 0; 1526 } 1527 aSig64 = aSig | 0x00800000; 1528 aSig64 <<= 40; 1529 z = aSig64>>( - shiftCount ); 1530 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1531 set_float_exception_inexact_flag(); 1532 } 1533 if ( aSign ) z = - z; 1534 return z; 1535 1536 } 1537 #endif /* !SOFTFLOAT_FOR_GCC */ 1538 1539 /* 1540 ------------------------------------------------------------------------------- 1541 Returns the result of converting the single-precision floating-point value 1542 `a' to the double-precision floating-point format. The conversion is 1543 performed according to the IEC/IEEE Standard for Binary Floating-Point 1544 Arithmetic. 1545 ------------------------------------------------------------------------------- 1546 */ 1547 float64 float32_to_float64( float32 a ) 1548 { 1549 flag aSign; 1550 int16 aExp; 1551 bits32 aSig; 1552 1553 aSig = extractFloat32Frac( a ); 1554 aExp = extractFloat32Exp( a ); 1555 aSign = extractFloat32Sign( a ); 1556 if ( aExp == 0xFF ) { 1557 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1558 return packFloat64( aSign, 0x7FF, 0 ); 1559 } 1560 if ( aExp == 0 ) { 1561 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1562 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1563 --aExp; 1564 } 1565 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1566 1567 } 1568 1569 #ifdef FLOATX80 1570 1571 /* 1572 ------------------------------------------------------------------------------- 1573 Returns the result of converting the single-precision floating-point value 1574 `a' to the extended double-precision floating-point format. The conversion 1575 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1576 Arithmetic. 
1577 ------------------------------------------------------------------------------- 1578 */ 1579 floatx80 float32_to_floatx80( float32 a ) 1580 { 1581 flag aSign; 1582 int16 aExp; 1583 bits32 aSig; 1584 1585 aSig = extractFloat32Frac( a ); 1586 aExp = extractFloat32Exp( a ); 1587 aSign = extractFloat32Sign( a ); 1588 if ( aExp == 0xFF ) { 1589 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1590 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1591 } 1592 if ( aExp == 0 ) { 1593 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1594 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1595 } 1596 aSig |= 0x00800000; 1597 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1598 1599 } 1600 1601 #endif 1602 1603 #ifdef FLOAT128 1604 1605 /* 1606 ------------------------------------------------------------------------------- 1607 Returns the result of converting the single-precision floating-point value 1608 `a' to the double-precision floating-point format. The conversion is 1609 performed according to the IEC/IEEE Standard for Binary Floating-Point 1610 Arithmetic. 
1611 ------------------------------------------------------------------------------- 1612 */ 1613 float128 float32_to_float128( float32 a ) 1614 { 1615 flag aSign; 1616 int16 aExp; 1617 bits32 aSig; 1618 1619 aSig = extractFloat32Frac( a ); 1620 aExp = extractFloat32Exp( a ); 1621 aSign = extractFloat32Sign( a ); 1622 if ( aExp == 0xFF ) { 1623 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1624 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1625 } 1626 if ( aExp == 0 ) { 1627 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1628 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1629 --aExp; 1630 } 1631 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1632 1633 } 1634 1635 #endif 1636 1637 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1638 /* 1639 ------------------------------------------------------------------------------- 1640 Rounds the single-precision floating-point value `a' to an integer, and 1641 returns the result as a single-precision floating-point value. The 1642 operation is performed according to the IEC/IEEE Standard for Binary 1643 Floating-Point Arithmetic. 
1644 ------------------------------------------------------------------------------- 1645 */ 1646 float32 float32_round_to_int( float32 a ) 1647 { 1648 flag aSign; 1649 int16 aExp; 1650 bits32 lastBitMask, roundBitsMask; 1651 int8 roundingMode; 1652 float32 z; 1653 1654 aExp = extractFloat32Exp( a ); 1655 if ( 0x96 <= aExp ) { 1656 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1657 return propagateFloat32NaN( a, a ); 1658 } 1659 return a; 1660 } 1661 if ( aExp <= 0x7E ) { 1662 if ( (bits32) ( a<<1 ) == 0 ) return a; 1663 set_float_exception_inexact_flag(); 1664 aSign = extractFloat32Sign( a ); 1665 switch ( float_rounding_mode ) { 1666 case float_round_nearest_even: 1667 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1668 return packFloat32( aSign, 0x7F, 0 ); 1669 } 1670 break; 1671 case float_round_to_zero: 1672 break; 1673 case float_round_down: 1674 return aSign ? 0xBF800000 : 0; 1675 case float_round_up: 1676 return aSign ? 0x80000000 : 0x3F800000; 1677 } 1678 return packFloat32( aSign, 0, 0 ); 1679 } 1680 lastBitMask = 1; 1681 lastBitMask <<= 0x96 - aExp; 1682 roundBitsMask = lastBitMask - 1; 1683 z = a; 1684 roundingMode = float_rounding_mode; 1685 if ( roundingMode == float_round_nearest_even ) { 1686 z += lastBitMask>>1; 1687 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1688 } 1689 else if ( roundingMode != float_round_to_zero ) { 1690 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1691 z += roundBitsMask; 1692 } 1693 } 1694 z &= ~ roundBitsMask; 1695 if ( z != a ) set_float_exception_inexact_flag(); 1696 return z; 1697 1698 } 1699 #endif /* !SOFTFLOAT_FOR_GCC */ 1700 1701 /* 1702 ------------------------------------------------------------------------------- 1703 Returns the result of adding the absolute values of the single-precision 1704 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1705 before being returned. `zSign' is ignored if the result is a NaN. 
1706 The addition is performed according to the IEC/IEEE Standard for Binary 1707 Floating-Point Arithmetic. 1708 ------------------------------------------------------------------------------- 1709 */ 1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1711 { 1712 int16 aExp, bExp, zExp; 1713 bits32 aSig, bSig, zSig; 1714 int16 expDiff; 1715 1716 aSig = extractFloat32Frac( a ); 1717 aExp = extractFloat32Exp( a ); 1718 bSig = extractFloat32Frac( b ); 1719 bExp = extractFloat32Exp( b ); 1720 expDiff = aExp - bExp; 1721 aSig <<= 6; 1722 bSig <<= 6; 1723 if ( 0 < expDiff ) { 1724 if ( aExp == 0xFF ) { 1725 if ( aSig ) return propagateFloat32NaN( a, b ); 1726 return a; 1727 } 1728 if ( bExp == 0 ) { 1729 --expDiff; 1730 } 1731 else { 1732 bSig |= 0x20000000; 1733 } 1734 shift32RightJamming( bSig, expDiff, &bSig ); 1735 zExp = aExp; 1736 } 1737 else if ( expDiff < 0 ) { 1738 if ( bExp == 0xFF ) { 1739 if ( bSig ) return propagateFloat32NaN( a, b ); 1740 return packFloat32( zSign, 0xFF, 0 ); 1741 } 1742 if ( aExp == 0 ) { 1743 ++expDiff; 1744 } 1745 else { 1746 aSig |= 0x20000000; 1747 } 1748 shift32RightJamming( aSig, - expDiff, &aSig ); 1749 zExp = bExp; 1750 } 1751 else { 1752 if ( aExp == 0xFF ) { 1753 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1754 return a; 1755 } 1756 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1757 zSig = 0x40000000 + aSig + bSig; 1758 zExp = aExp; 1759 goto roundAndPack; 1760 } 1761 aSig |= 0x20000000; 1762 zSig = ( aSig + bSig )<<1; 1763 --zExp; 1764 if ( (sbits32) zSig < 0 ) { 1765 zSig = aSig + bSig; 1766 ++zExp; 1767 } 1768 roundAndPack: 1769 return roundAndPackFloat32( zSign, zExp, zSig ); 1770 1771 } 1772 1773 /* 1774 ------------------------------------------------------------------------------- 1775 Returns the result of subtracting the absolute values of the single- 1776 precision floating-point values `a' and `b'. 
If `zSign' is 1, the 1777 difference is negated before being returned. `zSign' is ignored if the 1778 result is a NaN. The subtraction is performed according to the IEC/IEEE 1779 Standard for Binary Floating-Point Arithmetic. 1780 ------------------------------------------------------------------------------- 1781 */ 1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1783 { 1784 int16 aExp, bExp, zExp; 1785 bits32 aSig, bSig, zSig; 1786 int16 expDiff; 1787 1788 aSig = extractFloat32Frac( a ); 1789 aExp = extractFloat32Exp( a ); 1790 bSig = extractFloat32Frac( b ); 1791 bExp = extractFloat32Exp( b ); 1792 expDiff = aExp - bExp; 1793 aSig <<= 7; 1794 bSig <<= 7; 1795 if ( 0 < expDiff ) goto aExpBigger; 1796 if ( expDiff < 0 ) goto bExpBigger; 1797 if ( aExp == 0xFF ) { 1798 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1799 float_raise( float_flag_invalid ); 1800 return float32_default_nan; 1801 } 1802 if ( aExp == 0 ) { 1803 aExp = 1; 1804 bExp = 1; 1805 } 1806 if ( bSig < aSig ) goto aBigger; 1807 if ( aSig < bSig ) goto bBigger; 1808 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1809 bExpBigger: 1810 if ( bExp == 0xFF ) { 1811 if ( bSig ) return propagateFloat32NaN( a, b ); 1812 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1813 } 1814 if ( aExp == 0 ) { 1815 ++expDiff; 1816 } 1817 else { 1818 aSig |= 0x40000000; 1819 } 1820 shift32RightJamming( aSig, - expDiff, &aSig ); 1821 bSig |= 0x40000000; 1822 bBigger: 1823 zSig = bSig - aSig; 1824 zExp = bExp; 1825 zSign ^= 1; 1826 goto normalizeRoundAndPack; 1827 aExpBigger: 1828 if ( aExp == 0xFF ) { 1829 if ( aSig ) return propagateFloat32NaN( a, b ); 1830 return a; 1831 } 1832 if ( bExp == 0 ) { 1833 --expDiff; 1834 } 1835 else { 1836 bSig |= 0x40000000; 1837 } 1838 shift32RightJamming( bSig, expDiff, &bSig ); 1839 aSig |= 0x40000000; 1840 aBigger: 1841 zSig = aSig - bSig; 1842 zExp = aExp; 1843 normalizeRoundAndPack: 1844 --zExp; 1845 return normalizeRoundAndPackFloat32( 
zSign, zExp, zSig ); 1846 1847 } 1848 1849 /* 1850 ------------------------------------------------------------------------------- 1851 Returns the result of adding the single-precision floating-point values `a' 1852 and `b'. The operation is performed according to the IEC/IEEE Standard for 1853 Binary Floating-Point Arithmetic. 1854 ------------------------------------------------------------------------------- 1855 */ 1856 float32 float32_add( float32 a, float32 b ) 1857 { 1858 flag aSign, bSign; 1859 1860 aSign = extractFloat32Sign( a ); 1861 bSign = extractFloat32Sign( b ); 1862 if ( aSign == bSign ) { 1863 return addFloat32Sigs( a, b, aSign ); 1864 } 1865 else { 1866 return subFloat32Sigs( a, b, aSign ); 1867 } 1868 1869 } 1870 1871 /* 1872 ------------------------------------------------------------------------------- 1873 Returns the result of subtracting the single-precision floating-point values 1874 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1875 for Binary Floating-Point Arithmetic. 1876 ------------------------------------------------------------------------------- 1877 */ 1878 float32 float32_sub( float32 a, float32 b ) 1879 { 1880 flag aSign, bSign; 1881 1882 aSign = extractFloat32Sign( a ); 1883 bSign = extractFloat32Sign( b ); 1884 if ( aSign == bSign ) { 1885 return subFloat32Sigs( a, b, aSign ); 1886 } 1887 else { 1888 return addFloat32Sigs( a, b, aSign ); 1889 } 1890 1891 } 1892 1893 /* 1894 ------------------------------------------------------------------------------- 1895 Returns the result of multiplying the single-precision floating-point values 1896 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1897 for Binary Floating-Point Arithmetic. 
1898 ------------------------------------------------------------------------------- 1899 */ 1900 float32 float32_mul( float32 a, float32 b ) 1901 { 1902 flag aSign, bSign, zSign; 1903 int16 aExp, bExp, zExp; 1904 bits32 aSig, bSig; 1905 bits64 zSig64; 1906 bits32 zSig; 1907 1908 aSig = extractFloat32Frac( a ); 1909 aExp = extractFloat32Exp( a ); 1910 aSign = extractFloat32Sign( a ); 1911 bSig = extractFloat32Frac( b ); 1912 bExp = extractFloat32Exp( b ); 1913 bSign = extractFloat32Sign( b ); 1914 zSign = aSign ^ bSign; 1915 if ( aExp == 0xFF ) { 1916 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1917 return propagateFloat32NaN( a, b ); 1918 } 1919 if ( ( bExp | bSig ) == 0 ) { 1920 float_raise( float_flag_invalid ); 1921 return float32_default_nan; 1922 } 1923 return packFloat32( zSign, 0xFF, 0 ); 1924 } 1925 if ( bExp == 0xFF ) { 1926 if ( bSig ) return propagateFloat32NaN( a, b ); 1927 if ( ( aExp | aSig ) == 0 ) { 1928 float_raise( float_flag_invalid ); 1929 return float32_default_nan; 1930 } 1931 return packFloat32( zSign, 0xFF, 0 ); 1932 } 1933 if ( aExp == 0 ) { 1934 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1935 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1936 } 1937 if ( bExp == 0 ) { 1938 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1939 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1940 } 1941 zExp = aExp + bExp - 0x7F; 1942 aSig = ( aSig | 0x00800000 )<<7; 1943 bSig = ( bSig | 0x00800000 )<<8; 1944 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1945 zSig = (bits32)zSig64; 1946 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1947 zSig <<= 1; 1948 --zExp; 1949 } 1950 return roundAndPackFloat32( zSign, zExp, zSig ); 1951 1952 } 1953 1954 /* 1955 ------------------------------------------------------------------------------- 1956 Returns the result of dividing the single-precision floating-point value `a' 1957 by the corresponding value `b'. 
The operation is performed according to the 1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1959 ------------------------------------------------------------------------------- 1960 */ 1961 float32 float32_div( float32 a, float32 b ) 1962 { 1963 flag aSign, bSign, zSign; 1964 int16 aExp, bExp, zExp; 1965 bits32 aSig, bSig, zSig; 1966 1967 aSig = extractFloat32Frac( a ); 1968 aExp = extractFloat32Exp( a ); 1969 aSign = extractFloat32Sign( a ); 1970 bSig = extractFloat32Frac( b ); 1971 bExp = extractFloat32Exp( b ); 1972 bSign = extractFloat32Sign( b ); 1973 zSign = aSign ^ bSign; 1974 if ( aExp == 0xFF ) { 1975 if ( aSig ) return propagateFloat32NaN( a, b ); 1976 if ( bExp == 0xFF ) { 1977 if ( bSig ) return propagateFloat32NaN( a, b ); 1978 float_raise( float_flag_invalid ); 1979 return float32_default_nan; 1980 } 1981 return packFloat32( zSign, 0xFF, 0 ); 1982 } 1983 if ( bExp == 0xFF ) { 1984 if ( bSig ) return propagateFloat32NaN( a, b ); 1985 return packFloat32( zSign, 0, 0 ); 1986 } 1987 if ( bExp == 0 ) { 1988 if ( bSig == 0 ) { 1989 if ( ( aExp | aSig ) == 0 ) { 1990 float_raise( float_flag_invalid ); 1991 return float32_default_nan; 1992 } 1993 float_raise( float_flag_divbyzero ); 1994 return packFloat32( zSign, 0xFF, 0 ); 1995 } 1996 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1997 } 1998 if ( aExp == 0 ) { 1999 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2000 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2001 } 2002 zExp = aExp - bExp + 0x7D; 2003 aSig = ( aSig | 0x00800000 )<<7; 2004 bSig = ( bSig | 0x00800000 )<<8; 2005 if ( bSig <= ( aSig + aSig ) ) { 2006 aSig >>= 1; 2007 ++zExp; 2008 } 2009 zSig = (bits32)((((bits64) aSig) << 32) / bSig); 2010 if ( ( zSig & 0x3F ) == 0 ) { 2011 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2012 } 2013 return roundAndPackFloat32( zSign, zExp, zSig ); 2014 2015 } 2016 2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2018 /* 2019 
-------------------------------------------------------------------------------
Returns the remainder of the single-precision floating-point value `a'
with respect to the corresponding value `b'. The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
float32 float32_rem( float32 a, float32 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, expDiff;
    bits32 aSig, bSig;
    bits32 q;
    bits64 aSig64, bSig64, q64;
    bits32 alternateASig;
    sbits32 sigMean;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* bSign is extracted here but not read again in this function. */
    bSign = extractFloat32Sign( b );
    if ( aExp == 0xFF ) {
        /* a has exponent 0xFF: propagate if either operand has exponent 0xFF
           with a non-zero fraction, otherwise raise invalid. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN( a, b );
        }
        float_raise( float_flag_invalid );
        return float32_default_nan;
    }
    if ( bExp == 0xFF ) {
        /* b has exponent 0xFF and zero fraction: a is returned unchanged. */
        if ( bSig ) return propagateFloat32NaN( a, b );
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* b is zero: invalid operation. */
            float_raise( float_flag_invalid );
            return float32_default_nan;
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* a is zero: returned unchanged. */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Exponent gap below 32: the reduction is done with a single
           64-by-32-bit division. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* A gap below -1 returns a unchanged. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (bits64) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Exponent gap of 32 or more: reduce in steps of 62 exponent units
           using 64-bit quotient estimates, each lowered by 2 (floored at 0). */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (bits64) aSig )<<40;
        bSig64 = ( (bits64) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig (counting in q) until the value goes negative
       when viewed as signed; alternateASig holds the last value before the
       final subtraction. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (sbits32) aSig );
    /* Choose between the two candidates: the earlier one is taken when the
       sum of both is negative, or is zero while q is odd. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative candidate is negated and flips the result sign. */
    zSign = ( (sbits32) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );

}
#endif /* !SOFTFLOAT_FOR_GCC */

#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
/*
-------------------------------------------------------------------------------
Returns the square root of the single-precision floating-point value `a'.
The operation is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
float32 float32_sqrt( float32 a )
{
    flag aSign;
    int16 aExp, zExp;
    bits32 aSig, zSig;
    bits64 rem, term;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        /* Exponent 0xFF: non-zero fraction is propagated; zero fraction with
           clear sign is returned as is; zero fraction with set sign is
           invalid. */
        if ( aSig ) return propagateFloat32NaN( a, 0 );
        if ( ! aSign ) return a;
        float_raise( float_flag_invalid );
        return float32_default_nan;
    }
    if ( aSign ) {
        /* Sign set: only an all-zero exponent and fraction is accepted (and
           returned unchanged); everything else is invalid. */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise( float_flag_invalid );
        return float32_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return 0;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    if ( ( zSig & 0x7F ) <= 5 ) {
        /* The low seven bits of the estimate are small: refine it with an
           exact remainder computation. */
        if ( zSig < 2 ) {
            /* The `+ 2' above wrapped around: use 0x7FFFFFFF directly and
               skip the final right shift. */
            zSig = 0x7FFFFFFF;
            goto roundAndPack;
        }
        aSig >>= aExp & 1;
        term = ( (bits64) zSig ) * zSig;
        rem = ( ( (bits64) aSig )<<32 ) - term;
        /* Step the estimate down while its square exceeds the operand. */
        while ( (sbits64) rem < 0 ) {
            --zSig;
            rem += ( ( (bits64) zSig )<<1 ) | 1;
        }
        /* A non-zero remainder is recorded in the lowest bit. */
        zSig |= ( rem != 0 );
    }
    shift32RightJamming( zSig, 1, &zSig );
 roundAndPack:
    return roundAndPackFloat32( 0, zExp, zSig );

}
#endif /* !SOFTFLOAT_FOR_GCC */

/*
-------------------------------------------------------------------------------
Returns 1 if the single-precision floating-point value `a' is equal to
the corresponding value `b', and 0 otherwise. The comparison is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 ------------------------------------------------------------------------------- 2183 */ 2184 flag float32_eq( float32 a, float32 b ) 2185 { 2186 2187 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2188 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2189 ) { 2190 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2191 float_raise( float_flag_invalid ); 2192 } 2193 return 0; 2194 } 2195 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2196 2197 } 2198 2199 /* 2200 ------------------------------------------------------------------------------- 2201 Returns 1 if the single-precision floating-point value `a' is less than 2202 or equal to the corresponding value `b', and 0 otherwise. The comparison 2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2204 Arithmetic. 2205 ------------------------------------------------------------------------------- 2206 */ 2207 flag float32_le( float32 a, float32 b ) 2208 { 2209 flag aSign, bSign; 2210 2211 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2212 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2213 ) { 2214 float_raise( float_flag_invalid ); 2215 return 0; 2216 } 2217 aSign = extractFloat32Sign( a ); 2218 bSign = extractFloat32Sign( b ); 2219 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2220 return ( a == b ) || ( aSign ^ ( a < b ) ); 2221 2222 } 2223 2224 /* 2225 ------------------------------------------------------------------------------- 2226 Returns 1 if the single-precision floating-point value `a' is less than 2227 the corresponding value `b', and 0 otherwise. The comparison is performed 2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2229 ------------------------------------------------------------------------------- 2230 */ 2231 flag float32_lt( float32 a, float32 b ) 2232 { 2233 flag aSign, bSign; 2234 2235 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2236 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2237 ) { 2238 float_raise( float_flag_invalid ); 2239 return 0; 2240 } 2241 aSign = extractFloat32Sign( a ); 2242 bSign = extractFloat32Sign( b ); 2243 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2244 return ( a != b ) && ( aSign ^ ( a < b ) ); 2245 2246 } 2247 2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2249 /* 2250 ------------------------------------------------------------------------------- 2251 Returns 1 if the single-precision floating-point value `a' is equal to 2252 the corresponding value `b', and 0 otherwise. The invalid exception is 2253 raised if either operand is a NaN. Otherwise, the comparison is performed 2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2255 ------------------------------------------------------------------------------- 2256 */ 2257 flag float32_eq_signaling( float32 a, float32 b ) 2258 { 2259 2260 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2261 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2262 ) { 2263 float_raise( float_flag_invalid ); 2264 return 0; 2265 } 2266 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2267 2268 } 2269 2270 /* 2271 ------------------------------------------------------------------------------- 2272 Returns 1 if the single-precision floating-point value `a' is less than or 2273 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2274 cause an exception. Otherwise, the comparison is performed according to the 2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2276 ------------------------------------------------------------------------------- 2277 */ 2278 flag float32_le_quiet( float32 a, float32 b ) 2279 { 2280 flag aSign, bSign; 2281 2282 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2283 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2284 ) { 2285 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2286 float_raise( float_flag_invalid ); 2287 } 2288 return 0; 2289 } 2290 aSign = extractFloat32Sign( a ); 2291 bSign = extractFloat32Sign( b ); 2292 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2293 return ( a == b ) || ( aSign ^ ( a < b ) ); 2294 2295 } 2296 2297 /* 2298 ------------------------------------------------------------------------------- 2299 Returns 1 if the single-precision floating-point value `a' is less than 2300 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2301 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2302 Standard for Binary Floating-Point Arithmetic. 
2303 ------------------------------------------------------------------------------- 2304 */ 2305 flag float32_lt_quiet( float32 a, float32 b ) 2306 { 2307 flag aSign, bSign; 2308 2309 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2310 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2311 ) { 2312 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2313 float_raise( float_flag_invalid ); 2314 } 2315 return 0; 2316 } 2317 aSign = extractFloat32Sign( a ); 2318 bSign = extractFloat32Sign( b ); 2319 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2320 return ( a != b ) && ( aSign ^ ( a < b ) ); 2321 2322 } 2323 #endif /* !SOFTFLOAT_FOR_GCC */ 2324 2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2326 /* 2327 ------------------------------------------------------------------------------- 2328 Returns the result of converting the double-precision floating-point value 2329 `a' to the 32-bit two's complement integer format. The conversion is 2330 performed according to the IEC/IEEE Standard for Binary Floating-Point 2331 Arithmetic---which means in particular that the conversion is rounded 2332 according to the current rounding mode. If `a' is a NaN, the largest 2333 positive integer is returned. Otherwise, if the conversion overflows, the 2334 largest integer with the same sign as `a' is returned. 
2335 ------------------------------------------------------------------------------- 2336 */ 2337 int32 float64_to_int32( float64 a ) 2338 { 2339 flag aSign; 2340 int16 aExp, shiftCount; 2341 bits64 aSig; 2342 2343 aSig = extractFloat64Frac( a ); 2344 aExp = extractFloat64Exp( a ); 2345 aSign = extractFloat64Sign( a ); 2346 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2347 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2348 shiftCount = 0x42C - aExp; 2349 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2350 return roundAndPackInt32( aSign, aSig ); 2351 2352 } 2353 #endif /* !SOFTFLOAT_FOR_GCC */ 2354 2355 /* 2356 ------------------------------------------------------------------------------- 2357 Returns the result of converting the double-precision floating-point value 2358 `a' to the 32-bit two's complement integer format. The conversion is 2359 performed according to the IEC/IEEE Standard for Binary Floating-Point 2360 Arithmetic, except that the conversion is always rounded toward zero. 2361 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2362 the conversion overflows, the largest integer with the same sign as `a' is 2363 returned. 
2364 ------------------------------------------------------------------------------- 2365 */ 2366 int32 float64_to_int32_round_to_zero( float64 a ) 2367 { 2368 flag aSign; 2369 int16 aExp, shiftCount; 2370 bits64 aSig, savedASig; 2371 int32 z; 2372 2373 aSig = extractFloat64Frac( a ); 2374 aExp = extractFloat64Exp( a ); 2375 aSign = extractFloat64Sign( a ); 2376 if ( 0x41E < aExp ) { 2377 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2378 goto invalid; 2379 } 2380 else if ( aExp < 0x3FF ) { 2381 if ( aExp || aSig ) set_float_exception_inexact_flag(); 2382 return 0; 2383 } 2384 aSig |= LIT64( 0x0010000000000000 ); 2385 shiftCount = 0x433 - aExp; 2386 savedASig = aSig; 2387 aSig >>= shiftCount; 2388 z = (int32)aSig; 2389 if ( aSign ) z = - z; 2390 if ( ( z < 0 ) ^ aSign ) { 2391 invalid: 2392 float_raise( float_flag_invalid ); 2393 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2394 } 2395 if ( ( aSig<<shiftCount ) != savedASig ) { 2396 set_float_exception_inexact_flag(); 2397 } 2398 return z; 2399 2400 } 2401 2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2403 /* 2404 ------------------------------------------------------------------------------- 2405 Returns the result of converting the double-precision floating-point value 2406 `a' to the 64-bit two's complement integer format. The conversion is 2407 performed according to the IEC/IEEE Standard for Binary Floating-Point 2408 Arithmetic---which means in particular that the conversion is rounded 2409 according to the current rounding mode. If `a' is a NaN, the largest 2410 positive integer is returned. Otherwise, if the conversion overflows, the 2411 largest integer with the same sign as `a' is returned. 
2412 ------------------------------------------------------------------------------- 2413 */ 2414 int64 float64_to_int64( float64 a ) 2415 { 2416 flag aSign; 2417 int16 aExp, shiftCount; 2418 bits64 aSig, aSigExtra; 2419 2420 aSig = extractFloat64Frac( a ); 2421 aExp = extractFloat64Exp( a ); 2422 aSign = extractFloat64Sign( a ); 2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2424 shiftCount = 0x433 - aExp; 2425 if ( shiftCount <= 0 ) { 2426 if ( 0x43E < aExp ) { 2427 float_raise( float_flag_invalid ); 2428 if ( ! aSign 2429 || ( ( aExp == 0x7FF ) 2430 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2431 ) { 2432 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2433 } 2434 return (sbits64) LIT64( 0x8000000000000000 ); 2435 } 2436 aSigExtra = 0; 2437 aSig <<= - shiftCount; 2438 } 2439 else { 2440 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2441 } 2442 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2443 2444 } 2445 2446 /* 2447 ------------------------------------------------------------------------------- 2448 Returns the result of converting the double-precision floating-point value 2449 `a' to the 64-bit two's complement integer format. The conversion is 2450 performed according to the IEC/IEEE Standard for Binary Floating-Point 2451 Arithmetic, except that the conversion is always rounded toward zero. 2452 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2453 the conversion overflows, the largest integer with the same sign as `a' is 2454 returned. 
2455 ------------------------------------------------------------------------------- 2456 */ 2457 int64 float64_to_int64_round_to_zero( float64 a ) 2458 { 2459 flag aSign; 2460 int16 aExp, shiftCount; 2461 bits64 aSig; 2462 int64 z; 2463 2464 aSig = extractFloat64Frac( a ); 2465 aExp = extractFloat64Exp( a ); 2466 aSign = extractFloat64Sign( a ); 2467 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2468 shiftCount = aExp - 0x433; 2469 if ( 0 <= shiftCount ) { 2470 if ( 0x43E <= aExp ) { 2471 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2472 float_raise( float_flag_invalid ); 2473 if ( ! aSign 2474 || ( ( aExp == 0x7FF ) 2475 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2476 ) { 2477 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2478 } 2479 } 2480 return (sbits64) LIT64( 0x8000000000000000 ); 2481 } 2482 z = aSig<<shiftCount; 2483 } 2484 else { 2485 if ( aExp < 0x3FE ) { 2486 if ( aExp | aSig ) set_float_exception_inexact_flag(); 2487 return 0; 2488 } 2489 z = aSig>>( - shiftCount ); 2490 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2491 set_float_exception_inexact_flag(); 2492 } 2493 } 2494 if ( aSign ) z = - z; 2495 return z; 2496 2497 } 2498 #endif /* !SOFTFLOAT_FOR_GCC */ 2499 2500 /* 2501 ------------------------------------------------------------------------------- 2502 Returns the result of converting the double-precision floating-point value 2503 `a' to the single-precision floating-point format. The conversion is 2504 performed according to the IEC/IEEE Standard for Binary Floating-Point 2505 Arithmetic. 
2506 ------------------------------------------------------------------------------- 2507 */ 2508 float32 float64_to_float32( float64 a ) 2509 { 2510 flag aSign; 2511 int16 aExp; 2512 bits64 aSig; 2513 bits32 zSig; 2514 2515 aSig = extractFloat64Frac( a ); 2516 aExp = extractFloat64Exp( a ); 2517 aSign = extractFloat64Sign( a ); 2518 if ( aExp == 0x7FF ) { 2519 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2520 return packFloat32( aSign, 0xFF, 0 ); 2521 } 2522 shift64RightJamming( aSig, 22, &aSig ); 2523 zSig = (bits32)aSig; 2524 if ( aExp || zSig ) { 2525 zSig |= 0x40000000; 2526 aExp -= 0x381; 2527 } 2528 return roundAndPackFloat32( aSign, aExp, zSig ); 2529 2530 } 2531 2532 #ifdef FLOATX80 2533 2534 /* 2535 ------------------------------------------------------------------------------- 2536 Returns the result of converting the double-precision floating-point value 2537 `a' to the extended double-precision floating-point format. The conversion 2538 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2539 Arithmetic. 
2540 ------------------------------------------------------------------------------- 2541 */ 2542 floatx80 float64_to_floatx80( float64 a ) 2543 { 2544 flag aSign; 2545 int16 aExp; 2546 bits64 aSig; 2547 2548 aSig = extractFloat64Frac( a ); 2549 aExp = extractFloat64Exp( a ); 2550 aSign = extractFloat64Sign( a ); 2551 if ( aExp == 0x7FF ) { 2552 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2553 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2554 } 2555 if ( aExp == 0 ) { 2556 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2557 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2558 } 2559 return 2560 packFloatx80( 2561 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2562 2563 } 2564 2565 #endif 2566 2567 #ifdef FLOAT128 2568 2569 /* 2570 ------------------------------------------------------------------------------- 2571 Returns the result of converting the double-precision floating-point value 2572 `a' to the quadruple-precision floating-point format. The conversion is 2573 performed according to the IEC/IEEE Standard for Binary Floating-Point 2574 Arithmetic. 
2575 ------------------------------------------------------------------------------- 2576 */ 2577 float128 float64_to_float128( float64 a ) 2578 { 2579 flag aSign; 2580 int16 aExp; 2581 bits64 aSig, zSig0, zSig1; 2582 2583 aSig = extractFloat64Frac( a ); 2584 aExp = extractFloat64Exp( a ); 2585 aSign = extractFloat64Sign( a ); 2586 if ( aExp == 0x7FF ) { 2587 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2588 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2589 } 2590 if ( aExp == 0 ) { 2591 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2592 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2593 --aExp; 2594 } 2595 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2596 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2597 2598 } 2599 2600 #endif 2601 2602 #ifndef SOFTFLOAT_FOR_GCC 2603 /* 2604 ------------------------------------------------------------------------------- 2605 Rounds the double-precision floating-point value `a' to an integer, and 2606 returns the result as a double-precision floating-point value. The 2607 operation is performed according to the IEC/IEEE Standard for Binary 2608 Floating-Point Arithmetic. 
-------------------------------------------------------------------------------
*/
float64 float64_round_to_int( float64 a )
{
    flag aSign;
    int16 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    float64 z;

    aExp = extractFloat64Exp( a );
    if ( 0x433 <= aExp ) {
        /* Exponent field of 0x433 or more: the value is returned as is,
           except that exponent 0x7FF with a non-zero fraction goes through
           the NaN propagator. */
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
            return propagateFloat64NaN( a, a );
        }
        return a;
    }
    if ( aExp < 0x3FF ) {
        /* Exponent field below 0x3FF.  A pattern that is zero apart from
           its top bit is returned unchanged; everything else is inexact and
           rounds to a signed zero or a signed one depending on the mode. */
        if ( (bits64) ( a<<1 ) == 0 ) return a;
        set_float_exception_inexact_flag();
        aSign = extractFloat64Sign( a );
        switch ( float_rounding_mode ) {
         case float_round_nearest_even:
            /* Exponent 0x3FE with a non-zero fraction rounds away from
               zero to exponent 0x3FF, zero fraction. */
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
                return packFloat64( aSign, 0x3FF, 0 );
            }
            break;
         case float_round_to_zero:
            break;
         case float_round_down:
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
         case float_round_up:
            return
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
        }
        /* Remaining cases yield a zero carrying a's sign. */
        return packFloat64( aSign, 0, 0 );
    }
    /* lastBitMask is a single bit at position 0x433 - aExp; roundBitsMask
       covers every bit below it. */
    lastBitMask = 1;
    lastBitMask <<= 0x433 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    roundingMode = float_rounding_mode;
    if ( roundingMode == float_round_nearest_even ) {
        /* Add half of lastBitMask directly to the bit pattern; when the
           bits below lastBitMask come out all zero, lastBitMask itself is
           cleared as well. */
        z += lastBitMask>>1;
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
    }
    else if ( roundingMode != float_round_to_zero ) {
        /* Directed rounding: roundBitsMask is added when the sign of z
           differs from ( roundingMode == float_round_up ). */
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
            z += roundBitsMask;
        }
    }
    /* Clear the bits below lastBitMask; any change from the input is
       reported as inexact. */
    z &= ~ roundBitsMask;
    if ( z != a ) set_float_exception_inexact_flag();
    return z;

}
#endif

/*
-------------------------------------------------------------------------------
Returns the result of adding the absolute values of the double-precision
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
before being returned.
`zSign' is ignored if the result is a NaN.
The addition is performed according to the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up 9 bits before any alignment shift. */
    aSig <<= 9;
    bSig <<= 9;
    if ( 0 < expDiff ) {
        /* a has the larger exponent field. */
        if ( aExp == 0x7FF ) {
            if ( aSig ) return propagateFloat64NaN( a, b );
            return a;
        }
        if ( bExp == 0 ) {
            /* No leading bit is inserted for b; the shift distance shrinks
               by one instead. */
            --expDiff;
        }
        else {
            bSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent field. */
        if ( bExp == 0x7FF ) {
            if ( bSig ) return propagateFloat64NaN( a, b );
            return packFloat64( zSign, 0x7FF, 0 );
        }
        if ( aExp == 0 ) {
            /* No leading bit is inserted for a; the shift distance shrinks
               by one instead. */
            ++expDiff;
        }
        else {
            aSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponent fields. */
        if ( aExp == 0x7FF ) {
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
            return a;
        }
        /* Both exponent fields zero: the fractions are added and packed
           directly with a zero exponent field, without rounding. */
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    /* Unequal exponents: the operand that was not shifted is still in aSig
       or bSig without the 0x2000000000000000 bit in one of the two branches;
       OR-ing it into aSig here covers the branch where only bSig received
       it. */
    aSig |= LIT64( 0x2000000000000000 );
    /* Try the sum shifted up by one with a decremented exponent; if that
       sets the top bit, fall back to the unshifted sum and restore the
       exponent. */
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (sbits64) zSig < 0 ) {
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat64( zSign, zExp, zSig );

}

/*
-------------------------------------------------------------------------------
Returns the result of subtracting the absolute values of the double-
precision floating-point values `a' and `b'. If `zSign' is 1, the
difference is negated before being returned. `zSign' is ignored if the
result is a NaN. The subtraction is performed according to the IEC/IEEE
Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up 10 bits before any alignment shift. */
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* From here to the `bExpBigger' label the exponent fields are equal. */
    if ( aExp == 0x7FF ) {
        /* A non-zero fraction in either operand goes to the NaN propagator;
           otherwise the invalid flag is raised and the default NaN returned. */
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
        float_raise( float_flag_invalid );
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        /* Zero exponent fields are treated as exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: a zero is packed whose sign bit is set only when the
       rounding mode is float_round_down. */
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
 bExpBigger:
    /* Reached when b's exponent field is the larger one. */
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b );
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        /* No leading bit is inserted; the shift distance shrinks by one. */
        ++expDiff;
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );
    }
    /* Align a to b; shift64RightJamming is given the positive distance. */
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    /* b has the larger magnitude: compute b - a and invert the result sign. */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    /* Reached when a's exponent field is the larger one. */
    if ( aExp == 0x7FF ) {
        if ( aSig ) return propagateFloat64NaN( a, b );
        return a;
    }
    if ( bExp == 0 ) {
        /* No leading bit is inserted; the shift distance shrinks by one. */
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    /* a has the larger magnitude: compute a - b, result sign stays zSign. */
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    /* The exponent is decremented once before normalization and rounding. */
    --zExp;
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );

}

/*
-------------------------------------------------------------------------------
Returns the result of adding the double-precision floating-point values `a'
and `b'. The operation is performed according to the IEC/IEEE Standard for
Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
float64 float64_add( float64 a, float64 b )
{
    flag aSign, bSign;

    aSign = extractFloat64Sign( a );
    bSign = extractFloat64Sign( b );
    /* Equal signs: the magnitudes are added.  Different signs: the magnitudes
       are subtracted.  In both cases a's sign is passed as `zSign'. */
    if ( aSign == bSign ) {
        return addFloat64Sigs( a, b, aSign );
    }
    else {
        return subFloat64Sigs( a, b, aSign );
    }

}

/*
-------------------------------------------------------------------------------
Returns the result of subtracting the double-precision floating-point values
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
float64 float64_sub( float64 a, float64 b )
{
    flag aSign, bSign;

    aSign = extractFloat64Sign( a );
    bSign = extractFloat64Sign( b );
    /* Mirror image of float64_add: equal signs subtract the magnitudes,
       different signs add them.  a's sign is passed as `zSign'. */
    if ( aSign == bSign ) {
        return subFloat64Sigs( a, b, aSign );
    }
    else {
        return addFloat64Sigs( a, b, aSign );
    }

}

/*
-------------------------------------------------------------------------------
Returns the result of multiplying the double-precision floating-point values
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
2864 ------------------------------------------------------------------------------- 2865 */ 2866 float64 float64_mul( float64 a, float64 b ) 2867 { 2868 flag aSign, bSign, zSign; 2869 int16 aExp, bExp, zExp; 2870 bits64 aSig, bSig, zSig0, zSig1; 2871 2872 aSig = extractFloat64Frac( a ); 2873 aExp = extractFloat64Exp( a ); 2874 aSign = extractFloat64Sign( a ); 2875 bSig = extractFloat64Frac( b ); 2876 bExp = extractFloat64Exp( b ); 2877 bSign = extractFloat64Sign( b ); 2878 zSign = aSign ^ bSign; 2879 if ( aExp == 0x7FF ) { 2880 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2881 return propagateFloat64NaN( a, b ); 2882 } 2883 if ( ( bExp | bSig ) == 0 ) { 2884 float_raise( float_flag_invalid ); 2885 return float64_default_nan; 2886 } 2887 return packFloat64( zSign, 0x7FF, 0 ); 2888 } 2889 if ( bExp == 0x7FF ) { 2890 if ( bSig ) return propagateFloat64NaN( a, b ); 2891 if ( ( aExp | aSig ) == 0 ) { 2892 float_raise( float_flag_invalid ); 2893 return float64_default_nan; 2894 } 2895 return packFloat64( zSign, 0x7FF, 0 ); 2896 } 2897 if ( aExp == 0 ) { 2898 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2899 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2900 } 2901 if ( bExp == 0 ) { 2902 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2903 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2904 } 2905 zExp = aExp + bExp - 0x3FF; 2906 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2907 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2908 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2909 zSig0 |= ( zSig1 != 0 ); 2910 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2911 zSig0 <<= 1; 2912 --zExp; 2913 } 2914 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2915 2916 } 2917 2918 /* 2919 ------------------------------------------------------------------------------- 2920 Returns the result of dividing the double-precision floating-point value `a' 2921 by the corresponding value `b'. 
The operation is performed according to 2922 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2923 ------------------------------------------------------------------------------- 2924 */ 2925 float64 float64_div( float64 a, float64 b ) 2926 { 2927 flag aSign, bSign, zSign; 2928 int16 aExp, bExp, zExp; 2929 bits64 aSig, bSig, zSig; 2930 bits64 rem0, rem1; 2931 bits64 term0, term1; 2932 2933 aSig = extractFloat64Frac( a ); 2934 aExp = extractFloat64Exp( a ); 2935 aSign = extractFloat64Sign( a ); 2936 bSig = extractFloat64Frac( b ); 2937 bExp = extractFloat64Exp( b ); 2938 bSign = extractFloat64Sign( b ); 2939 zSign = aSign ^ bSign; 2940 if ( aExp == 0x7FF ) { 2941 if ( aSig ) return propagateFloat64NaN( a, b ); 2942 if ( bExp == 0x7FF ) { 2943 if ( bSig ) return propagateFloat64NaN( a, b ); 2944 float_raise( float_flag_invalid ); 2945 return float64_default_nan; 2946 } 2947 return packFloat64( zSign, 0x7FF, 0 ); 2948 } 2949 if ( bExp == 0x7FF ) { 2950 if ( bSig ) return propagateFloat64NaN( a, b ); 2951 return packFloat64( zSign, 0, 0 ); 2952 } 2953 if ( bExp == 0 ) { 2954 if ( bSig == 0 ) { 2955 if ( ( aExp | aSig ) == 0 ) { 2956 float_raise( float_flag_invalid ); 2957 return float64_default_nan; 2958 } 2959 float_raise( float_flag_divbyzero ); 2960 return packFloat64( zSign, 0x7FF, 0 ); 2961 } 2962 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2963 } 2964 if ( aExp == 0 ) { 2965 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2966 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2967 } 2968 zExp = aExp - bExp + 0x3FD; 2969 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2970 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2971 if ( bSig <= ( aSig + aSig ) ) { 2972 aSig >>= 1; 2973 ++zExp; 2974 } 2975 zSig = estimateDiv128To64( aSig, 0, bSig ); 2976 if ( ( zSig & 0x1FF ) <= 2 ) { 2977 mul64To128( bSig, zSig, &term0, &term1 ); 2978 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2979 while ( (sbits64) rem0 < 0 ) { 2980 --zSig; 2981 add128( 
rem0, rem1, 0, bSig, &rem0, &rem1 ); 2982 } 2983 zSig |= ( rem1 != 0 ); 2984 } 2985 return roundAndPackFloat64( zSign, zExp, zSig ); 2986 2987 } 2988 2989 #ifndef SOFTFLOAT_FOR_GCC 2990 /* 2991 ------------------------------------------------------------------------------- 2992 Returns the remainder of the double-precision floating-point value `a' 2993 with respect to the corresponding value `b'. The operation is performed 2994 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2995 ------------------------------------------------------------------------------- 2996 */ 2997 float64 float64_rem( float64 a, float64 b ) 2998 { 2999 flag aSign, bSign, zSign; 3000 int16 aExp, bExp, expDiff; 3001 bits64 aSig, bSig; 3002 bits64 q, alternateASig; 3003 sbits64 sigMean; 3004 3005 aSig = extractFloat64Frac( a ); 3006 aExp = extractFloat64Exp( a ); 3007 aSign = extractFloat64Sign( a ); 3008 bSig = extractFloat64Frac( b ); 3009 bExp = extractFloat64Exp( b ); 3010 bSign = extractFloat64Sign( b ); 3011 if ( aExp == 0x7FF ) { 3012 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3013 return propagateFloat64NaN( a, b ); 3014 } 3015 float_raise( float_flag_invalid ); 3016 return float64_default_nan; 3017 } 3018 if ( bExp == 0x7FF ) { 3019 if ( bSig ) return propagateFloat64NaN( a, b ); 3020 return a; 3021 } 3022 if ( bExp == 0 ) { 3023 if ( bSig == 0 ) { 3024 float_raise( float_flag_invalid ); 3025 return float64_default_nan; 3026 } 3027 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3028 } 3029 if ( aExp == 0 ) { 3030 if ( aSig == 0 ) return a; 3031 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3032 } 3033 expDiff = aExp - bExp; 3034 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3035 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3036 if ( expDiff < 0 ) { 3037 if ( expDiff < -1 ) return a; 3038 aSig >>= 1; 3039 } 3040 q = ( bSig <= aSig ); 3041 if ( q ) aSig -= bSig; 3042 expDiff -= 64; 3043 while ( 0 < expDiff ) { 3044 q = estimateDiv128To64( 
aSig, 0, bSig ); 3045 q = ( 2 < q ) ? q - 2 : 0; 3046 aSig = - ( ( bSig>>2 ) * q ); 3047 expDiff -= 62; 3048 } 3049 expDiff += 64; 3050 if ( 0 < expDiff ) { 3051 q = estimateDiv128To64( aSig, 0, bSig ); 3052 q = ( 2 < q ) ? q - 2 : 0; 3053 q >>= 64 - expDiff; 3054 bSig >>= 2; 3055 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3056 } 3057 else { 3058 aSig >>= 2; 3059 bSig >>= 2; 3060 } 3061 do { 3062 alternateASig = aSig; 3063 ++q; 3064 aSig -= bSig; 3065 } while ( 0 <= (sbits64) aSig ); 3066 sigMean = aSig + alternateASig; 3067 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3068 aSig = alternateASig; 3069 } 3070 zSign = ( (sbits64) aSig < 0 ); 3071 if ( zSign ) aSig = - aSig; 3072 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3073 3074 } 3075 3076 /* 3077 ------------------------------------------------------------------------------- 3078 Returns the square root of the double-precision floating-point value `a'. 3079 The operation is performed according to the IEC/IEEE Standard for Binary 3080 Floating-Point Arithmetic. 3081 ------------------------------------------------------------------------------- 3082 */ 3083 float64 float64_sqrt( float64 a ) 3084 { 3085 flag aSign; 3086 int16 aExp, zExp; 3087 bits64 aSig, zSig, doubleZSig; 3088 bits64 rem0, rem1, term0, term1; 3089 3090 aSig = extractFloat64Frac( a ); 3091 aExp = extractFloat64Exp( a ); 3092 aSign = extractFloat64Sign( a ); 3093 if ( aExp == 0x7FF ) { 3094 if ( aSig ) return propagateFloat64NaN( a, a ); 3095 if ( ! 
aSign ) return a; 3096 float_raise( float_flag_invalid ); 3097 return float64_default_nan; 3098 } 3099 if ( aSign ) { 3100 if ( ( aExp | aSig ) == 0 ) return a; 3101 float_raise( float_flag_invalid ); 3102 return float64_default_nan; 3103 } 3104 if ( aExp == 0 ) { 3105 if ( aSig == 0 ) return 0; 3106 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3107 } 3108 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3109 aSig |= LIT64( 0x0010000000000000 ); 3110 zSig = estimateSqrt32( aExp, aSig>>21 ); 3111 aSig <<= 9 - ( aExp & 1 ); 3112 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3113 if ( ( zSig & 0x1FF ) <= 5 ) { 3114 doubleZSig = zSig<<1; 3115 mul64To128( zSig, zSig, &term0, &term1 ); 3116 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3117 while ( (sbits64) rem0 < 0 ) { 3118 --zSig; 3119 doubleZSig -= 2; 3120 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3121 } 3122 zSig |= ( ( rem0 | rem1 ) != 0 ); 3123 } 3124 return roundAndPackFloat64( 0, zExp, zSig ); 3125 3126 } 3127 #endif 3128 3129 /* 3130 ------------------------------------------------------------------------------- 3131 Returns 1 if the double-precision floating-point value `a' is equal to the 3132 corresponding value `b', and 0 otherwise. The comparison is performed 3133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3134 ------------------------------------------------------------------------------- 3135 */ 3136 flag float64_eq( float64 a, float64 b ) 3137 { 3138 3139 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3140 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3141 ) { 3142 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3143 float_raise( float_flag_invalid ); 3144 } 3145 return 0; 3146 } 3147 return ( a == b ) || 3148 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3149 3150 } 3151 3152 /* 3153 ------------------------------------------------------------------------------- 3154 Returns 1 if the double-precision floating-point value `a' is less than or 3155 equal to the corresponding value `b', and 0 otherwise. The comparison is 3156 performed according to the IEC/IEEE Standard for Binary Floating-Point 3157 Arithmetic. 3158 ------------------------------------------------------------------------------- 3159 */ 3160 flag float64_le( float64 a, float64 b ) 3161 { 3162 flag aSign, bSign; 3163 3164 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3165 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3166 ) { 3167 float_raise( float_flag_invalid ); 3168 return 0; 3169 } 3170 aSign = extractFloat64Sign( a ); 3171 bSign = extractFloat64Sign( b ); 3172 if ( aSign != bSign ) 3173 return aSign || 3174 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3175 0 ); 3176 return ( a == b ) || 3177 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3178 3179 } 3180 3181 /* 3182 ------------------------------------------------------------------------------- 3183 Returns 1 if the double-precision floating-point value `a' is less than 3184 the corresponding value `b', and 0 otherwise. The comparison is performed 3185 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3186 ------------------------------------------------------------------------------- 3187 */ 3188 flag float64_lt( float64 a, float64 b ) 3189 { 3190 flag aSign, bSign; 3191 3192 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3193 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3194 ) { 3195 float_raise( float_flag_invalid ); 3196 return 0; 3197 } 3198 aSign = extractFloat64Sign( a ); 3199 bSign = extractFloat64Sign( b ); 3200 if ( aSign != bSign ) 3201 return aSign && 3202 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3203 0 ); 3204 return ( a != b ) && 3205 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3206 3207 } 3208 3209 #ifndef SOFTFLOAT_FOR_GCC 3210 /* 3211 ------------------------------------------------------------------------------- 3212 Returns 1 if the double-precision floating-point value `a' is equal to the 3213 corresponding value `b', and 0 otherwise. The invalid exception is raised 3214 if either operand is a NaN. Otherwise, the comparison is performed 3215 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3216 ------------------------------------------------------------------------------- 3217 */ 3218 flag float64_eq_signaling( float64 a, float64 b ) 3219 { 3220 3221 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3222 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3223 ) { 3224 float_raise( float_flag_invalid ); 3225 return 0; 3226 } 3227 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3228 3229 } 3230 3231 /* 3232 ------------------------------------------------------------------------------- 3233 Returns 1 if the double-precision floating-point value `a' is less than or 3234 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3235 cause an exception. Otherwise, the comparison is performed according to the 3236 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3237 ------------------------------------------------------------------------------- 3238 */ 3239 flag float64_le_quiet( float64 a, float64 b ) 3240 { 3241 flag aSign, bSign; 3242 3243 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3244 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3245 ) { 3246 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3247 float_raise( float_flag_invalid ); 3248 } 3249 return 0; 3250 } 3251 aSign = extractFloat64Sign( a ); 3252 bSign = extractFloat64Sign( b ); 3253 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3254 return ( a == b ) || ( aSign ^ ( a < b ) ); 3255 3256 } 3257 3258 /* 3259 ------------------------------------------------------------------------------- 3260 Returns 1 if the double-precision floating-point value `a' is less than 3261 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3262 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3263 Standard for Binary Floating-Point Arithmetic. 
3264 ------------------------------------------------------------------------------- 3265 */ 3266 flag float64_lt_quiet( float64 a, float64 b ) 3267 { 3268 flag aSign, bSign; 3269 3270 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3271 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3272 ) { 3273 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3274 float_raise( float_flag_invalid ); 3275 } 3276 return 0; 3277 } 3278 aSign = extractFloat64Sign( a ); 3279 bSign = extractFloat64Sign( b ); 3280 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3281 return ( a != b ) && ( aSign ^ ( a < b ) ); 3282 3283 } 3284 #endif 3285 3286 #ifdef FLOATX80 3287 3288 /* 3289 ------------------------------------------------------------------------------- 3290 Returns the result of converting the extended double-precision floating- 3291 point value `a' to the 32-bit two's complement integer format. The 3292 conversion is performed according to the IEC/IEEE Standard for Binary 3293 Floating-Point Arithmetic---which means in particular that the conversion 3294 is rounded according to the current rounding mode. If `a' is a NaN, the 3295 largest positive integer is returned. Otherwise, if the conversion 3296 overflows, the largest integer with the same sign as `a' is returned. 
3297 ------------------------------------------------------------------------------- 3298 */ 3299 int32 floatx80_to_int32( floatx80 a ) 3300 { 3301 flag aSign; 3302 int32 aExp, shiftCount; 3303 bits64 aSig; 3304 3305 aSig = extractFloatx80Frac( a ); 3306 aExp = extractFloatx80Exp( a ); 3307 aSign = extractFloatx80Sign( a ); 3308 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3309 shiftCount = 0x4037 - aExp; 3310 if ( shiftCount <= 0 ) shiftCount = 1; 3311 shift64RightJamming( aSig, shiftCount, &aSig ); 3312 return roundAndPackInt32( aSign, aSig ); 3313 3314 } 3315 3316 /* 3317 ------------------------------------------------------------------------------- 3318 Returns the result of converting the extended double-precision floating- 3319 point value `a' to the 32-bit two's complement integer format. The 3320 conversion is performed according to the IEC/IEEE Standard for Binary 3321 Floating-Point Arithmetic, except that the conversion is always rounded 3322 toward zero. If `a' is a NaN, the largest positive integer is returned. 3323 Otherwise, if the conversion overflows, the largest integer with the same 3324 sign as `a' is returned. 3325 ------------------------------------------------------------------------------- 3326 */ 3327 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3328 { 3329 flag aSign; 3330 int32 aExp, shiftCount; 3331 bits64 aSig, savedASig; 3332 int32 z; 3333 3334 aSig = extractFloatx80Frac( a ); 3335 aExp = extractFloatx80Exp( a ); 3336 aSign = extractFloatx80Sign( a ); 3337 if ( 0x401E < aExp ) { 3338 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3339 goto invalid; 3340 } 3341 else if ( aExp < 0x3FFF ) { 3342 if ( aExp || aSig ) set_float_exception_inexact_flag(); 3343 return 0; 3344 } 3345 shiftCount = 0x403E - aExp; 3346 savedASig = aSig; 3347 aSig >>= shiftCount; 3348 z = aSig; 3349 if ( aSign ) z = - z; 3350 if ( ( z < 0 ) ^ aSign ) { 3351 invalid: 3352 float_raise( float_flag_invalid ); 3353 return aSign ? 
(sbits32) 0x80000000 : 0x7FFFFFFF; 3354 } 3355 if ( ( aSig<<shiftCount ) != savedASig ) { 3356 set_float_exception_inexact_flag(); 3357 } 3358 return z; 3359 3360 } 3361 3362 /* 3363 ------------------------------------------------------------------------------- 3364 Returns the result of converting the extended double-precision floating- 3365 point value `a' to the 64-bit two's complement integer format. The 3366 conversion is performed according to the IEC/IEEE Standard for Binary 3367 Floating-Point Arithmetic---which means in particular that the conversion 3368 is rounded according to the current rounding mode. If `a' is a NaN, 3369 the largest positive integer is returned. Otherwise, if the conversion 3370 overflows, the largest integer with the same sign as `a' is returned. 3371 ------------------------------------------------------------------------------- 3372 */ 3373 int64 floatx80_to_int64( floatx80 a ) 3374 { 3375 flag aSign; 3376 int32 aExp, shiftCount; 3377 bits64 aSig, aSigExtra; 3378 3379 aSig = extractFloatx80Frac( a ); 3380 aExp = extractFloatx80Exp( a ); 3381 aSign = extractFloatx80Sign( a ); 3382 shiftCount = 0x403E - aExp; 3383 if ( shiftCount <= 0 ) { 3384 if ( shiftCount ) { 3385 float_raise( float_flag_invalid ); 3386 if ( ! aSign 3387 || ( ( aExp == 0x7FFF ) 3388 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3389 ) { 3390 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3391 } 3392 return (sbits64) LIT64( 0x8000000000000000 ); 3393 } 3394 aSigExtra = 0; 3395 } 3396 else { 3397 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3398 } 3399 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3400 3401 } 3402 3403 /* 3404 ------------------------------------------------------------------------------- 3405 Returns the result of converting the extended double-precision floating- 3406 point value `a' to the 64-bit two's complement integer format. 
The 3407 conversion is performed according to the IEC/IEEE Standard for Binary 3408 Floating-Point Arithmetic, except that the conversion is always rounded 3409 toward zero. If `a' is a NaN, the largest positive integer is returned. 3410 Otherwise, if the conversion overflows, the largest integer with the same 3411 sign as `a' is returned. 3412 ------------------------------------------------------------------------------- 3413 */ 3414 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3415 { 3416 flag aSign; 3417 int32 aExp, shiftCount; 3418 bits64 aSig; 3419 int64 z; 3420 3421 aSig = extractFloatx80Frac( a ); 3422 aExp = extractFloatx80Exp( a ); 3423 aSign = extractFloatx80Sign( a ); 3424 shiftCount = aExp - 0x403E; 3425 if ( 0 <= shiftCount ) { 3426 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3427 if ( ( a.high != 0xC03E ) || aSig ) { 3428 float_raise( float_flag_invalid ); 3429 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3430 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3431 } 3432 } 3433 return (sbits64) LIT64( 0x8000000000000000 ); 3434 } 3435 else if ( aExp < 0x3FFF ) { 3436 if ( aExp | aSig ) set_float_exception_inexact_flag(); 3437 return 0; 3438 } 3439 z = aSig>>( - shiftCount ); 3440 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3441 set_float_exception_inexact_flag(); 3442 } 3443 if ( aSign ) z = - z; 3444 return z; 3445 3446 } 3447 3448 /* 3449 ------------------------------------------------------------------------------- 3450 Returns the result of converting the extended double-precision floating- 3451 point value `a' to the single-precision floating-point format. The 3452 conversion is performed according to the IEC/IEEE Standard for Binary 3453 Floating-Point Arithmetic. 
3454 ------------------------------------------------------------------------------- 3455 */ 3456 float32 floatx80_to_float32( floatx80 a ) 3457 { 3458 flag aSign; 3459 int32 aExp; 3460 bits64 aSig; 3461 3462 aSig = extractFloatx80Frac( a ); 3463 aExp = extractFloatx80Exp( a ); 3464 aSign = extractFloatx80Sign( a ); 3465 if ( aExp == 0x7FFF ) { 3466 if ( (bits64) ( aSig<<1 ) ) { 3467 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3468 } 3469 return packFloat32( aSign, 0xFF, 0 ); 3470 } 3471 shift64RightJamming( aSig, 33, &aSig ); 3472 if ( aExp || aSig ) aExp -= 0x3F81; 3473 return roundAndPackFloat32( aSign, aExp, aSig ); 3474 3475 } 3476 3477 /* 3478 ------------------------------------------------------------------------------- 3479 Returns the result of converting the extended double-precision floating- 3480 point value `a' to the double-precision floating-point format. The 3481 conversion is performed according to the IEC/IEEE Standard for Binary 3482 Floating-Point Arithmetic. 3483 ------------------------------------------------------------------------------- 3484 */ 3485 float64 floatx80_to_float64( floatx80 a ) 3486 { 3487 flag aSign; 3488 int32 aExp; 3489 bits64 aSig, zSig; 3490 3491 aSig = extractFloatx80Frac( a ); 3492 aExp = extractFloatx80Exp( a ); 3493 aSign = extractFloatx80Sign( a ); 3494 if ( aExp == 0x7FFF ) { 3495 if ( (bits64) ( aSig<<1 ) ) { 3496 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3497 } 3498 return packFloat64( aSign, 0x7FF, 0 ); 3499 } 3500 shift64RightJamming( aSig, 1, &zSig ); 3501 if ( aExp || aSig ) aExp -= 0x3C01; 3502 return roundAndPackFloat64( aSign, aExp, zSig ); 3503 3504 } 3505 3506 #ifdef FLOAT128 3507 3508 /* 3509 ------------------------------------------------------------------------------- 3510 Returns the result of converting the extended double-precision floating- 3511 point value `a' to the quadruple-precision floating-point format. 
The 3512 conversion is performed according to the IEC/IEEE Standard for Binary 3513 Floating-Point Arithmetic. 3514 ------------------------------------------------------------------------------- 3515 */ 3516 float128 floatx80_to_float128( floatx80 a ) 3517 { 3518 flag aSign; 3519 int16 aExp; 3520 bits64 aSig, zSig0, zSig1; 3521 3522 aSig = extractFloatx80Frac( a ); 3523 aExp = extractFloatx80Exp( a ); 3524 aSign = extractFloatx80Sign( a ); 3525 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3526 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3527 } 3528 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3529 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3530 3531 } 3532 3533 #endif 3534 3535 /* 3536 ------------------------------------------------------------------------------- 3537 Rounds the extended double-precision floating-point value `a' to an integer, 3538 and returns the result as an extended quadruple-precision floating-point 3539 value. The operation is performed according to the IEC/IEEE Standard for 3540 Binary Floating-Point Arithmetic. 
3541 ------------------------------------------------------------------------------- 3542 */ 3543 floatx80 floatx80_round_to_int( floatx80 a ) 3544 { 3545 flag aSign; 3546 int32 aExp; 3547 bits64 lastBitMask, roundBitsMask; 3548 int8 roundingMode; 3549 floatx80 z; 3550 3551 aExp = extractFloatx80Exp( a ); 3552 if ( 0x403E <= aExp ) { 3553 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3554 return propagateFloatx80NaN( a, a ); 3555 } 3556 return a; 3557 } 3558 if ( aExp < 0x3FFF ) { 3559 if ( ( aExp == 0 ) 3560 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3561 return a; 3562 } 3563 set_float_exception_inexact_flag(); 3564 aSign = extractFloatx80Sign( a ); 3565 switch ( float_rounding_mode ) { 3566 case float_round_nearest_even: 3567 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3568 ) { 3569 return 3570 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3571 } 3572 break; 3573 case float_round_to_zero: 3574 break; 3575 case float_round_down: 3576 return 3577 aSign ? 3578 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3579 : packFloatx80( 0, 0, 0 ); 3580 case float_round_up: 3581 return 3582 aSign ? 
packFloatx80( 1, 0, 0 ) 3583 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3584 } 3585 return packFloatx80( aSign, 0, 0 ); 3586 } 3587 lastBitMask = 1; 3588 lastBitMask <<= 0x403E - aExp; 3589 roundBitsMask = lastBitMask - 1; 3590 z = a; 3591 roundingMode = float_rounding_mode; 3592 if ( roundingMode == float_round_nearest_even ) { 3593 z.low += lastBitMask>>1; 3594 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3595 } 3596 else if ( roundingMode != float_round_to_zero ) { 3597 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3598 z.low += roundBitsMask; 3599 } 3600 } 3601 z.low &= ~ roundBitsMask; 3602 if ( z.low == 0 ) { 3603 ++z.high; 3604 z.low = LIT64( 0x8000000000000000 ); 3605 } 3606 if ( z.low != a.low ) set_float_exception_inexact_flag(); 3607 return z; 3608 3609 } 3610 3611 /* 3612 ------------------------------------------------------------------------------- 3613 Returns the result of adding the absolute values of the extended double- 3614 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3615 negated before being returned. `zSign' is ignored if the result is a NaN. 3616 The addition is performed according to the IEC/IEEE Standard for Binary 3617 Floating-Point Arithmetic. 
3618 ------------------------------------------------------------------------------- 3619 */ 3620 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3621 { 3622 int32 aExp, bExp, zExp; 3623 bits64 aSig, bSig, zSig0, zSig1; 3624 int32 expDiff; 3625 3626 aSig = extractFloatx80Frac( a ); 3627 aExp = extractFloatx80Exp( a ); 3628 bSig = extractFloatx80Frac( b ); 3629 bExp = extractFloatx80Exp( b ); 3630 expDiff = aExp - bExp; 3631 if ( 0 < expDiff ) { 3632 if ( aExp == 0x7FFF ) { 3633 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3634 return a; 3635 } 3636 if ( bExp == 0 ) --expDiff; 3637 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3638 zExp = aExp; 3639 } 3640 else if ( expDiff < 0 ) { 3641 if ( bExp == 0x7FFF ) { 3642 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3643 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3644 } 3645 if ( aExp == 0 ) ++expDiff; 3646 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3647 zExp = bExp; 3648 } 3649 else { 3650 if ( aExp == 0x7FFF ) { 3651 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3652 return propagateFloatx80NaN( a, b ); 3653 } 3654 return a; 3655 } 3656 zSig1 = 0; 3657 zSig0 = aSig + bSig; 3658 if ( aExp == 0 ) { 3659 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3660 goto roundAndPack; 3661 } 3662 zExp = aExp; 3663 goto shiftRight1; 3664 } 3665 zSig0 = aSig + bSig; 3666 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3667 shiftRight1: 3668 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3669 zSig0 |= LIT64( 0x8000000000000000 ); 3670 ++zExp; 3671 roundAndPack: 3672 return 3673 roundAndPackFloatx80( 3674 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3675 3676 } 3677 3678 /* 3679 ------------------------------------------------------------------------------- 3680 Returns the result of subtracting the absolute values of the extended 3681 double-precision floating-point values `a' and `b'. 
If `zSign' is 1, the 3682 difference is negated before being returned. `zSign' is ignored if the 3683 result is a NaN. The subtraction is performed according to the IEC/IEEE 3684 Standard for Binary Floating-Point Arithmetic. 3685 ------------------------------------------------------------------------------- 3686 */ 3687 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3688 { 3689 int32 aExp, bExp, zExp; 3690 bits64 aSig, bSig, zSig0, zSig1; 3691 int32 expDiff; 3692 floatx80 z; 3693 3694 aSig = extractFloatx80Frac( a ); 3695 aExp = extractFloatx80Exp( a ); 3696 bSig = extractFloatx80Frac( b ); 3697 bExp = extractFloatx80Exp( b ); 3698 expDiff = aExp - bExp; 3699 if ( 0 < expDiff ) goto aExpBigger; 3700 if ( expDiff < 0 ) goto bExpBigger; 3701 if ( aExp == 0x7FFF ) { 3702 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3703 return propagateFloatx80NaN( a, b ); 3704 } 3705 float_raise( float_flag_invalid ); 3706 z.low = floatx80_default_nan_low; 3707 z.high = floatx80_default_nan_high; 3708 return z; 3709 } 3710 if ( aExp == 0 ) { 3711 aExp = 1; 3712 bExp = 1; 3713 } 3714 zSig1 = 0; 3715 if ( bSig < aSig ) goto aBigger; 3716 if ( aSig < bSig ) goto bBigger; 3717 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3718 bExpBigger: 3719 if ( bExp == 0x7FFF ) { 3720 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3721 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3722 } 3723 if ( aExp == 0 ) ++expDiff; 3724 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3725 bBigger: 3726 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3727 zExp = bExp; 3728 zSign ^= 1; 3729 goto normalizeRoundAndPack; 3730 aExpBigger: 3731 if ( aExp == 0x7FFF ) { 3732 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3733 return a; 3734 } 3735 if ( bExp == 0 ) --expDiff; 3736 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3737 aBigger: 3738 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3739 
zExp = aExp; 3740 normalizeRoundAndPack: 3741 return 3742 normalizeRoundAndPackFloatx80( 3743 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3744 3745 } 3746 3747 /* 3748 ------------------------------------------------------------------------------- 3749 Returns the result of adding the extended double-precision floating-point 3750 values `a' and `b'. The operation is performed according to the IEC/IEEE 3751 Standard for Binary Floating-Point Arithmetic. 3752 ------------------------------------------------------------------------------- 3753 */ 3754 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3755 { 3756 flag aSign, bSign; 3757 3758 aSign = extractFloatx80Sign( a ); 3759 bSign = extractFloatx80Sign( b ); 3760 if ( aSign == bSign ) { 3761 return addFloatx80Sigs( a, b, aSign ); 3762 } 3763 else { 3764 return subFloatx80Sigs( a, b, aSign ); 3765 } 3766 3767 } 3768 3769 /* 3770 ------------------------------------------------------------------------------- 3771 Returns the result of subtracting the extended double-precision floating- 3772 point values `a' and `b'. The operation is performed according to the 3773 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3774 ------------------------------------------------------------------------------- 3775 */ 3776 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3777 { 3778 flag aSign, bSign; 3779 3780 aSign = extractFloatx80Sign( a ); 3781 bSign = extractFloatx80Sign( b ); 3782 if ( aSign == bSign ) { 3783 return subFloatx80Sigs( a, b, aSign ); 3784 } 3785 else { 3786 return addFloatx80Sigs( a, b, aSign ); 3787 } 3788 3789 } 3790 3791 /* 3792 ------------------------------------------------------------------------------- 3793 Returns the result of multiplying the extended double-precision floating- 3794 point values `a' and `b'. The operation is performed according to the 3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3796 ------------------------------------------------------------------------------- 3797 */ 3798 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3799 { 3800 flag aSign, bSign, zSign; 3801 int32 aExp, bExp, zExp; 3802 bits64 aSig, bSig, zSig0, zSig1; 3803 floatx80 z; 3804 3805 aSig = extractFloatx80Frac( a ); 3806 aExp = extractFloatx80Exp( a ); 3807 aSign = extractFloatx80Sign( a ); 3808 bSig = extractFloatx80Frac( b ); 3809 bExp = extractFloatx80Exp( b ); 3810 bSign = extractFloatx80Sign( b ); 3811 zSign = aSign ^ bSign; 3812 if ( aExp == 0x7FFF ) { 3813 if ( (bits64) ( aSig<<1 ) 3814 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3815 return propagateFloatx80NaN( a, b ); 3816 } 3817 if ( ( bExp | bSig ) == 0 ) goto invalid; 3818 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3819 } 3820 if ( bExp == 0x7FFF ) { 3821 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3822 if ( ( aExp | aSig ) == 0 ) { 3823 invalid: 3824 float_raise( float_flag_invalid ); 3825 z.low = floatx80_default_nan_low; 3826 z.high = floatx80_default_nan_high; 3827 return z; 3828 } 3829 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3830 } 3831 if ( aExp == 0 ) { 3832 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3833 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3834 } 3835 if ( bExp == 0 ) { 3836 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3837 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3838 } 3839 zExp = aExp + bExp - 0x3FFE; 3840 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3841 if ( 0 < (sbits64) zSig0 ) { 3842 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3843 --zExp; 3844 } 3845 return 3846 roundAndPackFloatx80( 3847 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3848 3849 } 3850 3851 /* 3852 ------------------------------------------------------------------------------- 3853 Returns the result of dividing the extended double-precision floating-point 3854 value `a' by the 
corresponding value `b'. The operation is performed 3855 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3856 ------------------------------------------------------------------------------- 3857 */ 3858 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3859 { 3860 flag aSign, bSign, zSign; 3861 int32 aExp, bExp, zExp; 3862 bits64 aSig, bSig, zSig0, zSig1; 3863 bits64 rem0, rem1, rem2, term0, term1, term2; 3864 floatx80 z; 3865 3866 aSig = extractFloatx80Frac( a ); 3867 aExp = extractFloatx80Exp( a ); 3868 aSign = extractFloatx80Sign( a ); 3869 bSig = extractFloatx80Frac( b ); 3870 bExp = extractFloatx80Exp( b ); 3871 bSign = extractFloatx80Sign( b ); 3872 zSign = aSign ^ bSign; 3873 if ( aExp == 0x7FFF ) { 3874 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3875 if ( bExp == 0x7FFF ) { 3876 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3877 goto invalid; 3878 } 3879 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3880 } 3881 if ( bExp == 0x7FFF ) { 3882 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3883 return packFloatx80( zSign, 0, 0 ); 3884 } 3885 if ( bExp == 0 ) { 3886 if ( bSig == 0 ) { 3887 if ( ( aExp | aSig ) == 0 ) { 3888 invalid: 3889 float_raise( float_flag_invalid ); 3890 z.low = floatx80_default_nan_low; 3891 z.high = floatx80_default_nan_high; 3892 return z; 3893 } 3894 float_raise( float_flag_divbyzero ); 3895 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3896 } 3897 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3898 } 3899 if ( aExp == 0 ) { 3900 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3901 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3902 } 3903 zExp = aExp - bExp + 0x3FFE; 3904 rem1 = 0; 3905 if ( bSig <= aSig ) { 3906 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3907 ++zExp; 3908 } 3909 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3910 mul64To128( bSig, zSig0, &term0, &term1 ); 3911 sub128( aSig, rem1, 
term0, term1, &rem0, &rem1 ); 3912 while ( (sbits64) rem0 < 0 ) { 3913 --zSig0; 3914 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3915 } 3916 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3917 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3918 mul64To128( bSig, zSig1, &term1, &term2 ); 3919 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3920 while ( (sbits64) rem1 < 0 ) { 3921 --zSig1; 3922 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3923 } 3924 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3925 } 3926 return 3927 roundAndPackFloatx80( 3928 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3929 3930 } 3931 3932 /* 3933 ------------------------------------------------------------------------------- 3934 Returns the remainder of the extended double-precision floating-point value 3935 `a' with respect to the corresponding value `b'. The operation is performed 3936 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3937 ------------------------------------------------------------------------------- 3938 */ 3939 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3940 { 3941 flag aSign, bSign, zSign; 3942 int32 aExp, bExp, expDiff; 3943 bits64 aSig0, aSig1, bSig; 3944 bits64 q, term0, term1, alternateASig0, alternateASig1; 3945 floatx80 z; 3946 3947 aSig0 = extractFloatx80Frac( a ); 3948 aExp = extractFloatx80Exp( a ); 3949 aSign = extractFloatx80Sign( a ); 3950 bSig = extractFloatx80Frac( b ); 3951 bExp = extractFloatx80Exp( b ); 3952 bSign = extractFloatx80Sign( b ); 3953 if ( aExp == 0x7FFF ) { 3954 if ( (bits64) ( aSig0<<1 ) 3955 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3956 return propagateFloatx80NaN( a, b ); 3957 } 3958 goto invalid; 3959 } 3960 if ( bExp == 0x7FFF ) { 3961 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3962 return a; 3963 } 3964 if ( bExp == 0 ) { 3965 if ( bSig == 0 ) { 3966 invalid: 3967 float_raise( float_flag_invalid ); 3968 z.low = floatx80_default_nan_low; 3969 z.high = floatx80_default_nan_high; 3970 
return z; 3971 } 3972 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3973 } 3974 if ( aExp == 0 ) { 3975 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3976 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3977 } 3978 bSig |= LIT64( 0x8000000000000000 ); 3979 zSign = aSign; 3980 expDiff = aExp - bExp; 3981 aSig1 = 0; 3982 if ( expDiff < 0 ) { 3983 if ( expDiff < -1 ) return a; 3984 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3985 expDiff = 0; 3986 } 3987 q = ( bSig <= aSig0 ); 3988 if ( q ) aSig0 -= bSig; 3989 expDiff -= 64; 3990 while ( 0 < expDiff ) { 3991 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3992 q = ( 2 < q ) ? q - 2 : 0; 3993 mul64To128( bSig, q, &term0, &term1 ); 3994 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3995 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3996 expDiff -= 62; 3997 } 3998 expDiff += 64; 3999 if ( 0 < expDiff ) { 4000 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4001 q = ( 2 < q ) ? q - 2 : 0; 4002 q >>= 64 - expDiff; 4003 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4004 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4005 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4006 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4007 ++q; 4008 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4009 } 4010 } 4011 else { 4012 term1 = 0; 4013 term0 = bSig; 4014 } 4015 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4016 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4017 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4018 && ( q & 1 ) ) 4019 ) { 4020 aSig0 = alternateASig0; 4021 aSig1 = alternateASig1; 4022 zSign = ! zSign; 4023 } 4024 return 4025 normalizeRoundAndPackFloatx80( 4026 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4027 4028 } 4029 4030 /* 4031 ------------------------------------------------------------------------------- 4032 Returns the square root of the extended double-precision floating-point 4033 value `a'. 
The operation is performed according to the IEC/IEEE Standard 4034 for Binary Floating-Point Arithmetic. 4035 ------------------------------------------------------------------------------- 4036 */ 4037 floatx80 floatx80_sqrt( floatx80 a ) 4038 { 4039 flag aSign; 4040 int32 aExp, zExp; 4041 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4042 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4043 floatx80 z; 4044 4045 aSig0 = extractFloatx80Frac( a ); 4046 aExp = extractFloatx80Exp( a ); 4047 aSign = extractFloatx80Sign( a ); 4048 if ( aExp == 0x7FFF ) { 4049 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4050 if ( ! aSign ) return a; 4051 goto invalid; 4052 } 4053 if ( aSign ) { 4054 if ( ( aExp | aSig0 ) == 0 ) return a; 4055 invalid: 4056 float_raise( float_flag_invalid ); 4057 z.low = floatx80_default_nan_low; 4058 z.high = floatx80_default_nan_high; 4059 return z; 4060 } 4061 if ( aExp == 0 ) { 4062 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4063 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4064 } 4065 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4066 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4067 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4068 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4069 doubleZSig0 = zSig0<<1; 4070 mul64To128( zSig0, zSig0, &term0, &term1 ); 4071 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4072 while ( (sbits64) rem0 < 0 ) { 4073 --zSig0; 4074 doubleZSig0 -= 2; 4075 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4076 } 4077 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4078 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4079 if ( zSig1 == 0 ) zSig1 = 1; 4080 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4081 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4082 mul64To128( zSig1, zSig1, &term2, &term3 ); 4083 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4084 while ( (sbits64) rem1 < 0 ) { 4085 
--zSig1; 4086 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4087 term3 |= 1; 4088 term2 |= doubleZSig0; 4089 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4090 } 4091 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4092 } 4093 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4094 zSig0 |= doubleZSig0; 4095 return 4096 roundAndPackFloatx80( 4097 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4098 4099 } 4100 4101 /* 4102 ------------------------------------------------------------------------------- 4103 Returns 1 if the extended double-precision floating-point value `a' is 4104 equal to the corresponding value `b', and 0 otherwise. The comparison is 4105 performed according to the IEC/IEEE Standard for Binary Floating-Point 4106 Arithmetic. 4107 ------------------------------------------------------------------------------- 4108 */ 4109 flag floatx80_eq( floatx80 a, floatx80 b ) 4110 { 4111 4112 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4113 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4114 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4115 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4116 ) { 4117 if ( floatx80_is_signaling_nan( a ) 4118 || floatx80_is_signaling_nan( b ) ) { 4119 float_raise( float_flag_invalid ); 4120 } 4121 return 0; 4122 } 4123 return 4124 ( a.low == b.low ) 4125 && ( ( a.high == b.high ) 4126 || ( ( a.low == 0 ) 4127 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4128 ); 4129 4130 } 4131 4132 /* 4133 ------------------------------------------------------------------------------- 4134 Returns 1 if the extended double-precision floating-point value `a' is 4135 less than or equal to the corresponding value `b', and 0 otherwise. The 4136 comparison is performed according to the IEC/IEEE Standard for Binary 4137 Floating-Point Arithmetic. 
4138 ------------------------------------------------------------------------------- 4139 */ 4140 flag floatx80_le( floatx80 a, floatx80 b ) 4141 { 4142 flag aSign, bSign; 4143 4144 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4145 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4146 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4147 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4148 ) { 4149 float_raise( float_flag_invalid ); 4150 return 0; 4151 } 4152 aSign = extractFloatx80Sign( a ); 4153 bSign = extractFloatx80Sign( b ); 4154 if ( aSign != bSign ) { 4155 return 4156 aSign 4157 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4158 == 0 ); 4159 } 4160 return 4161 aSign ? le128( b.high, b.low, a.high, a.low ) 4162 : le128( a.high, a.low, b.high, b.low ); 4163 4164 } 4165 4166 /* 4167 ------------------------------------------------------------------------------- 4168 Returns 1 if the extended double-precision floating-point value `a' is 4169 less than the corresponding value `b', and 0 otherwise. The comparison 4170 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4171 Arithmetic. 4172 ------------------------------------------------------------------------------- 4173 */ 4174 flag floatx80_lt( floatx80 a, floatx80 b ) 4175 { 4176 flag aSign, bSign; 4177 4178 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4179 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4180 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4181 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4182 ) { 4183 float_raise( float_flag_invalid ); 4184 return 0; 4185 } 4186 aSign = extractFloatx80Sign( a ); 4187 bSign = extractFloatx80Sign( b ); 4188 if ( aSign != bSign ) { 4189 return 4190 aSign 4191 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4192 != 0 ); 4193 } 4194 return 4195 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 4196 : lt128( a.high, a.low, b.high, b.low ); 4197 4198 } 4199 4200 /* 4201 ------------------------------------------------------------------------------- 4202 Returns 1 if the extended double-precision floating-point value `a' is equal 4203 to the corresponding value `b', and 0 otherwise. The invalid exception is 4204 raised if either operand is a NaN. Otherwise, the comparison is performed 4205 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4206 ------------------------------------------------------------------------------- 4207 */ 4208 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4209 { 4210 4211 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4212 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4213 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4214 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4215 ) { 4216 float_raise( float_flag_invalid ); 4217 return 0; 4218 } 4219 return 4220 ( a.low == b.low ) 4221 && ( ( a.high == b.high ) 4222 || ( ( a.low == 0 ) 4223 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4224 ); 4225 4226 } 4227 4228 /* 4229 ------------------------------------------------------------------------------- 4230 Returns 1 if the extended double-precision floating-point value `a' is less 4231 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4232 do not cause an exception. Otherwise, the comparison is performed according 4233 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4234 ------------------------------------------------------------------------------- 4235 */ 4236 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4237 { 4238 flag aSign, bSign; 4239 4240 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4241 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4242 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4243 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4244 ) { 4245 if ( floatx80_is_signaling_nan( a ) 4246 || floatx80_is_signaling_nan( b ) ) { 4247 float_raise( float_flag_invalid ); 4248 } 4249 return 0; 4250 } 4251 aSign = extractFloatx80Sign( a ); 4252 bSign = extractFloatx80Sign( b ); 4253 if ( aSign != bSign ) { 4254 return 4255 aSign 4256 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4257 == 0 ); 4258 } 4259 return 4260 aSign ? le128( b.high, b.low, a.high, a.low ) 4261 : le128( a.high, a.low, b.high, b.low ); 4262 4263 } 4264 4265 /* 4266 ------------------------------------------------------------------------------- 4267 Returns 1 if the extended double-precision floating-point value `a' is less 4268 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4269 an exception. Otherwise, the comparison is performed according to the 4270 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4271 ------------------------------------------------------------------------------- 4272 */ 4273 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4274 { 4275 flag aSign, bSign; 4276 4277 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4278 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4279 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4280 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4281 ) { 4282 if ( floatx80_is_signaling_nan( a ) 4283 || floatx80_is_signaling_nan( b ) ) { 4284 float_raise( float_flag_invalid ); 4285 } 4286 return 0; 4287 } 4288 aSign = extractFloatx80Sign( a ); 4289 bSign = extractFloatx80Sign( b ); 4290 if ( aSign != bSign ) { 4291 return 4292 aSign 4293 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4294 != 0 ); 4295 } 4296 return 4297 aSign ? lt128( b.high, b.low, a.high, a.low ) 4298 : lt128( a.high, a.low, b.high, b.low ); 4299 4300 } 4301 4302 #endif 4303 4304 #ifdef FLOAT128 4305 4306 /* 4307 ------------------------------------------------------------------------------- 4308 Returns the result of converting the quadruple-precision floating-point 4309 value `a' to the 32-bit two's complement integer format. The conversion 4310 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4311 Arithmetic---which means in particular that the conversion is rounded 4312 according to the current rounding mode. If `a' is a NaN, the largest 4313 positive integer is returned. Otherwise, if the conversion overflows, the 4314 largest integer with the same sign as `a' is returned. 
4315 ------------------------------------------------------------------------------- 4316 */ 4317 int32 float128_to_int32( float128 a ) 4318 { 4319 flag aSign; 4320 int32 aExp, shiftCount; 4321 bits64 aSig0, aSig1; 4322 4323 aSig1 = extractFloat128Frac1( a ); 4324 aSig0 = extractFloat128Frac0( a ); 4325 aExp = extractFloat128Exp( a ); 4326 aSign = extractFloat128Sign( a ); 4327 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4328 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4329 aSig0 |= ( aSig1 != 0 ); 4330 shiftCount = 0x4028 - aExp; 4331 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4332 return roundAndPackInt32( aSign, aSig0 ); 4333 4334 } 4335 4336 /* 4337 ------------------------------------------------------------------------------- 4338 Returns the result of converting the quadruple-precision floating-point 4339 value `a' to the 32-bit two's complement integer format. The conversion 4340 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4341 Arithmetic, except that the conversion is always rounded toward zero. If 4342 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4343 conversion overflows, the largest integer with the same sign as `a' is 4344 returned. 
4345 ------------------------------------------------------------------------------- 4346 */ 4347 int32 float128_to_int32_round_to_zero( float128 a ) 4348 { 4349 flag aSign; 4350 int32 aExp, shiftCount; 4351 bits64 aSig0, aSig1, savedASig; 4352 int32 z; 4353 4354 aSig1 = extractFloat128Frac1( a ); 4355 aSig0 = extractFloat128Frac0( a ); 4356 aExp = extractFloat128Exp( a ); 4357 aSign = extractFloat128Sign( a ); 4358 aSig0 |= ( aSig1 != 0 ); 4359 if ( 0x401E < aExp ) { 4360 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4361 goto invalid; 4362 } 4363 else if ( aExp < 0x3FFF ) { 4364 if ( aExp || aSig0 ) set_float_exception_inexact_flag(); 4365 return 0; 4366 } 4367 aSig0 |= LIT64( 0x0001000000000000 ); 4368 shiftCount = 0x402F - aExp; 4369 savedASig = aSig0; 4370 aSig0 >>= shiftCount; 4371 z = (int32)aSig0; 4372 if ( aSign ) z = - z; 4373 if ( ( z < 0 ) ^ aSign ) { 4374 invalid: 4375 float_raise( float_flag_invalid ); 4376 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4377 } 4378 if ( ( aSig0<<shiftCount ) != savedASig ) { 4379 set_float_exception_inexact_flag(); 4380 } 4381 return z; 4382 4383 } 4384 4385 /* 4386 ------------------------------------------------------------------------------- 4387 Returns the result of converting the quadruple-precision floating-point 4388 value `a' to the 64-bit two's complement integer format. The conversion 4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4390 Arithmetic---which means in particular that the conversion is rounded 4391 according to the current rounding mode. If `a' is a NaN, the largest 4392 positive integer is returned. Otherwise, if the conversion overflows, the 4393 largest integer with the same sign as `a' is returned. 
4394 ------------------------------------------------------------------------------- 4395 */ 4396 int64 float128_to_int64( float128 a ) 4397 { 4398 flag aSign; 4399 int32 aExp, shiftCount; 4400 bits64 aSig0, aSig1; 4401 4402 aSig1 = extractFloat128Frac1( a ); 4403 aSig0 = extractFloat128Frac0( a ); 4404 aExp = extractFloat128Exp( a ); 4405 aSign = extractFloat128Sign( a ); 4406 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4407 shiftCount = 0x402F - aExp; 4408 if ( shiftCount <= 0 ) { 4409 if ( 0x403E < aExp ) { 4410 float_raise( float_flag_invalid ); 4411 if ( ! aSign 4412 || ( ( aExp == 0x7FFF ) 4413 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4414 ) 4415 ) { 4416 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4417 } 4418 return (sbits64) LIT64( 0x8000000000000000 ); 4419 } 4420 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4421 } 4422 else { 4423 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4424 } 4425 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4426 4427 } 4428 4429 /* 4430 ------------------------------------------------------------------------------- 4431 Returns the result of converting the quadruple-precision floating-point 4432 value `a' to the 64-bit two's complement integer format. The conversion 4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4434 Arithmetic, except that the conversion is always rounded toward zero. 4435 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4436 the conversion overflows, the largest integer with the same sign as `a' is 4437 returned. 
4438 ------------------------------------------------------------------------------- 4439 */ 4440 int64 float128_to_int64_round_to_zero( float128 a ) 4441 { 4442 flag aSign; 4443 int32 aExp, shiftCount; 4444 bits64 aSig0, aSig1; 4445 int64 z; 4446 4447 aSig1 = extractFloat128Frac1( a ); 4448 aSig0 = extractFloat128Frac0( a ); 4449 aExp = extractFloat128Exp( a ); 4450 aSign = extractFloat128Sign( a ); 4451 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4452 shiftCount = aExp - 0x402F; 4453 if ( 0 < shiftCount ) { 4454 if ( 0x403E <= aExp ) { 4455 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4456 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4457 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4458 if ( aSig1 ) set_float_exception_inexact_flag(); 4459 } 4460 else { 4461 float_raise( float_flag_invalid ); 4462 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4463 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4464 } 4465 } 4466 return (sbits64) LIT64( 0x8000000000000000 ); 4467 } 4468 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4469 if ( (bits64) ( aSig1<<shiftCount ) ) { 4470 set_float_exception_inexact_flag(); 4471 } 4472 } 4473 else { 4474 if ( aExp < 0x3FFF ) { 4475 if ( aExp | aSig0 | aSig1 ) { 4476 set_float_exception_inexact_flag(); 4477 } 4478 return 0; 4479 } 4480 z = aSig0>>( - shiftCount ); 4481 if ( aSig1 4482 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4483 set_float_exception_inexact_flag(); 4484 } 4485 } 4486 if ( aSign ) z = - z; 4487 return z; 4488 4489 } 4490 4491 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ 4492 && defined(SOFTFLOAT_NEED_FIXUNS) 4493 /* 4494 * just like above - but do not care for overflow of signed results 4495 */ 4496 uint64 float128_to_uint64_round_to_zero( float128 a ) 4497 { 4498 flag aSign; 4499 int32 aExp, shiftCount; 4500 bits64 aSig0, aSig1; 4501 uint64 z; 4502 4503 aSig1 = extractFloat128Frac1( a ); 4504 aSig0 = extractFloat128Frac0( a ); 4505 aExp = 
extractFloat128Exp( a ); 4506 aSign = extractFloat128Sign( a ); 4507 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4508 shiftCount = aExp - 0x402F; 4509 if ( 0 < shiftCount ) { 4510 if ( 0x403F <= aExp ) { 4511 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4512 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4513 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4514 if ( aSig1 ) set_float_exception_inexact_flag(); 4515 } 4516 else { 4517 float_raise( float_flag_invalid ); 4518 } 4519 return LIT64( 0xFFFFFFFFFFFFFFFF ); 4520 } 4521 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4522 if ( (bits64) ( aSig1<<shiftCount ) ) { 4523 set_float_exception_inexact_flag(); 4524 } 4525 } 4526 else { 4527 if ( aExp < 0x3FFF ) { 4528 if ( aExp | aSig0 | aSig1 ) { 4529 set_float_exception_inexact_flag(); 4530 } 4531 return 0; 4532 } 4533 z = aSig0>>( - shiftCount ); 4534 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4535 set_float_exception_inexact_flag(); 4536 } 4537 } 4538 if ( aSign ) z = - z; 4539 return z; 4540 4541 } 4542 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ 4543 4544 /* 4545 ------------------------------------------------------------------------------- 4546 Returns the result of converting the quadruple-precision floating-point 4547 value `a' to the single-precision floating-point format. The conversion 4548 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4549 Arithmetic. 
4550 ------------------------------------------------------------------------------- 4551 */ 4552 float32 float128_to_float32( float128 a ) 4553 { 4554 flag aSign; 4555 int32 aExp; 4556 bits64 aSig0, aSig1; 4557 bits32 zSig; 4558 4559 aSig1 = extractFloat128Frac1( a ); 4560 aSig0 = extractFloat128Frac0( a ); 4561 aExp = extractFloat128Exp( a ); 4562 aSign = extractFloat128Sign( a ); 4563 if ( aExp == 0x7FFF ) { 4564 if ( aSig0 | aSig1 ) { 4565 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4566 } 4567 return packFloat32( aSign, 0xFF, 0 ); 4568 } 4569 aSig0 |= ( aSig1 != 0 ); 4570 shift64RightJamming( aSig0, 18, &aSig0 ); 4571 zSig = (bits32)aSig0; 4572 if ( aExp || zSig ) { 4573 zSig |= 0x40000000; 4574 aExp -= 0x3F81; 4575 } 4576 return roundAndPackFloat32( aSign, aExp, zSig ); 4577 4578 } 4579 4580 /* 4581 ------------------------------------------------------------------------------- 4582 Returns the result of converting the quadruple-precision floating-point 4583 value `a' to the double-precision floating-point format. The conversion 4584 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4585 Arithmetic. 
4586 ------------------------------------------------------------------------------- 4587 */ 4588 float64 float128_to_float64( float128 a ) 4589 { 4590 flag aSign; 4591 int32 aExp; 4592 bits64 aSig0, aSig1; 4593 4594 aSig1 = extractFloat128Frac1( a ); 4595 aSig0 = extractFloat128Frac0( a ); 4596 aExp = extractFloat128Exp( a ); 4597 aSign = extractFloat128Sign( a ); 4598 if ( aExp == 0x7FFF ) { 4599 if ( aSig0 | aSig1 ) { 4600 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4601 } 4602 return packFloat64( aSign, 0x7FF, 0 ); 4603 } 4604 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4605 aSig0 |= ( aSig1 != 0 ); 4606 if ( aExp || aSig0 ) { 4607 aSig0 |= LIT64( 0x4000000000000000 ); 4608 aExp -= 0x3C01; 4609 } 4610 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4611 4612 } 4613 4614 #ifdef FLOATX80 4615 4616 /* 4617 ------------------------------------------------------------------------------- 4618 Returns the result of converting the quadruple-precision floating-point 4619 value `a' to the extended double-precision floating-point format. The 4620 conversion is performed according to the IEC/IEEE Standard for Binary 4621 Floating-Point Arithmetic. 
4622 ------------------------------------------------------------------------------- 4623 */ 4624 floatx80 float128_to_floatx80( float128 a ) 4625 { 4626 flag aSign; 4627 int32 aExp; 4628 bits64 aSig0, aSig1; 4629 4630 aSig1 = extractFloat128Frac1( a ); 4631 aSig0 = extractFloat128Frac0( a ); 4632 aExp = extractFloat128Exp( a ); 4633 aSign = extractFloat128Sign( a ); 4634 if ( aExp == 0x7FFF ) { 4635 if ( aSig0 | aSig1 ) { 4636 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4637 } 4638 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4639 } 4640 if ( aExp == 0 ) { 4641 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4642 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4643 } 4644 else { 4645 aSig0 |= LIT64( 0x0001000000000000 ); 4646 } 4647 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4648 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4649 4650 } 4651 4652 #endif 4653 4654 /* 4655 ------------------------------------------------------------------------------- 4656 Rounds the quadruple-precision floating-point value `a' to an integer, and 4657 returns the result as a quadruple-precision floating-point value. The 4658 operation is performed according to the IEC/IEEE Standard for Binary 4659 Floating-Point Arithmetic. 
4660 ------------------------------------------------------------------------------- 4661 */ 4662 float128 float128_round_to_int( float128 a ) 4663 { 4664 flag aSign; 4665 int32 aExp; 4666 bits64 lastBitMask, roundBitsMask; 4667 int8 roundingMode; 4668 float128 z; 4669 4670 aExp = extractFloat128Exp( a ); 4671 if ( 0x402F <= aExp ) { 4672 if ( 0x406F <= aExp ) { 4673 if ( ( aExp == 0x7FFF ) 4674 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4675 ) { 4676 return propagateFloat128NaN( a, a ); 4677 } 4678 return a; 4679 } 4680 lastBitMask = 1; 4681 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4682 roundBitsMask = lastBitMask - 1; 4683 z = a; 4684 roundingMode = float_rounding_mode; 4685 if ( roundingMode == float_round_nearest_even ) { 4686 if ( lastBitMask ) { 4687 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4688 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4689 } 4690 else { 4691 if ( (sbits64) z.low < 0 ) { 4692 ++z.high; 4693 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4694 } 4695 } 4696 } 4697 else if ( roundingMode != float_round_to_zero ) { 4698 if ( extractFloat128Sign( z ) 4699 ^ ( roundingMode == float_round_up ) ) { 4700 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4701 } 4702 } 4703 z.low &= ~ roundBitsMask; 4704 } 4705 else { 4706 if ( aExp < 0x3FFF ) { 4707 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4708 set_float_exception_inexact_flag(); 4709 aSign = extractFloat128Sign( a ); 4710 switch ( float_rounding_mode ) { 4711 case float_round_nearest_even: 4712 if ( ( aExp == 0x3FFE ) 4713 && ( extractFloat128Frac0( a ) 4714 | extractFloat128Frac1( a ) ) 4715 ) { 4716 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4717 } 4718 break; 4719 case float_round_to_zero: 4720 break; 4721 case float_round_down: 4722 return 4723 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4724 : packFloat128( 0, 0, 0, 0 ); 4725 case float_round_up: 4726 return 4727 aSign ? 
packFloat128( 1, 0, 0, 0 ) 4728 : packFloat128( 0, 0x3FFF, 0, 0 ); 4729 } 4730 return packFloat128( aSign, 0, 0, 0 ); 4731 } 4732 lastBitMask = 1; 4733 lastBitMask <<= 0x402F - aExp; 4734 roundBitsMask = lastBitMask - 1; 4735 z.low = 0; 4736 z.high = a.high; 4737 roundingMode = float_rounding_mode; 4738 if ( roundingMode == float_round_nearest_even ) { 4739 z.high += lastBitMask>>1; 4740 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4741 z.high &= ~ lastBitMask; 4742 } 4743 } 4744 else if ( roundingMode != float_round_to_zero ) { 4745 if ( extractFloat128Sign( z ) 4746 ^ ( roundingMode == float_round_up ) ) { 4747 z.high |= ( a.low != 0 ); 4748 z.high += roundBitsMask; 4749 } 4750 } 4751 z.high &= ~ roundBitsMask; 4752 } 4753 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4754 set_float_exception_inexact_flag(); 4755 } 4756 return z; 4757 4758 } 4759 4760 /* 4761 ------------------------------------------------------------------------------- 4762 Returns the result of adding the absolute values of the quadruple-precision 4763 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4764 before being returned. `zSign' is ignored if the result is a NaN. 4765 The addition is performed according to the IEC/IEEE Standard for Binary 4766 Floating-Point Arithmetic. 
4767 ------------------------------------------------------------------------------- 4768 */ 4769 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4770 { 4771 int32 aExp, bExp, zExp; 4772 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4773 int32 expDiff; 4774 4775 aSig1 = extractFloat128Frac1( a ); 4776 aSig0 = extractFloat128Frac0( a ); 4777 aExp = extractFloat128Exp( a ); 4778 bSig1 = extractFloat128Frac1( b ); 4779 bSig0 = extractFloat128Frac0( b ); 4780 bExp = extractFloat128Exp( b ); 4781 expDiff = aExp - bExp; 4782 if ( 0 < expDiff ) { 4783 if ( aExp == 0x7FFF ) { 4784 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4785 return a; 4786 } 4787 if ( bExp == 0 ) { 4788 --expDiff; 4789 } 4790 else { 4791 bSig0 |= LIT64( 0x0001000000000000 ); 4792 } 4793 shift128ExtraRightJamming( 4794 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4795 zExp = aExp; 4796 } 4797 else if ( expDiff < 0 ) { 4798 if ( bExp == 0x7FFF ) { 4799 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4800 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4801 } 4802 if ( aExp == 0 ) { 4803 ++expDiff; 4804 } 4805 else { 4806 aSig0 |= LIT64( 0x0001000000000000 ); 4807 } 4808 shift128ExtraRightJamming( 4809 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4810 zExp = bExp; 4811 } 4812 else { 4813 if ( aExp == 0x7FFF ) { 4814 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4815 return propagateFloat128NaN( a, b ); 4816 } 4817 return a; 4818 } 4819 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4820 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4821 zSig2 = 0; 4822 zSig0 |= LIT64( 0x0002000000000000 ); 4823 zExp = aExp; 4824 goto shiftRight1; 4825 } 4826 aSig0 |= LIT64( 0x0001000000000000 ); 4827 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4828 --zExp; 4829 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4830 ++zExp; 4831 shiftRight1: 4832 shift128ExtraRightJamming( 4833 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, 
&zSig2 ); 4834 roundAndPack: 4835 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4836 4837 } 4838 4839 /* 4840 ------------------------------------------------------------------------------- 4841 Returns the result of subtracting the absolute values of the quadruple- 4842 precision floating-point values `a' and `b'. If `zSign' is 1, the 4843 difference is negated before being returned. `zSign' is ignored if the 4844 result is a NaN. The subtraction is performed according to the IEC/IEEE 4845 Standard for Binary Floating-Point Arithmetic. 4846 ------------------------------------------------------------------------------- 4847 */ 4848 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4849 { 4850 int32 aExp, bExp, zExp; 4851 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4852 int32 expDiff; 4853 float128 z; 4854 4855 aSig1 = extractFloat128Frac1( a ); 4856 aSig0 = extractFloat128Frac0( a ); 4857 aExp = extractFloat128Exp( a ); 4858 bSig1 = extractFloat128Frac1( b ); 4859 bSig0 = extractFloat128Frac0( b ); 4860 bExp = extractFloat128Exp( b ); 4861 expDiff = aExp - bExp; 4862 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4863 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4864 if ( 0 < expDiff ) goto aExpBigger; 4865 if ( expDiff < 0 ) goto bExpBigger; 4866 if ( aExp == 0x7FFF ) { 4867 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4868 return propagateFloat128NaN( a, b ); 4869 } 4870 float_raise( float_flag_invalid ); 4871 z.low = float128_default_nan_low; 4872 z.high = float128_default_nan_high; 4873 return z; 4874 } 4875 if ( aExp == 0 ) { 4876 aExp = 1; 4877 bExp = 1; 4878 } 4879 if ( bSig0 < aSig0 ) goto aBigger; 4880 if ( aSig0 < bSig0 ) goto bBigger; 4881 if ( bSig1 < aSig1 ) goto aBigger; 4882 if ( aSig1 < bSig1 ) goto bBigger; 4883 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4884 bExpBigger: 4885 if ( bExp == 0x7FFF ) { 4886 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 
4887 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4888 } 4889 if ( aExp == 0 ) { 4890 ++expDiff; 4891 } 4892 else { 4893 aSig0 |= LIT64( 0x4000000000000000 ); 4894 } 4895 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4896 bSig0 |= LIT64( 0x4000000000000000 ); 4897 bBigger: 4898 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4899 zExp = bExp; 4900 zSign ^= 1; 4901 goto normalizeRoundAndPack; 4902 aExpBigger: 4903 if ( aExp == 0x7FFF ) { 4904 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4905 return a; 4906 } 4907 if ( bExp == 0 ) { 4908 --expDiff; 4909 } 4910 else { 4911 bSig0 |= LIT64( 0x4000000000000000 ); 4912 } 4913 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4914 aSig0 |= LIT64( 0x4000000000000000 ); 4915 aBigger: 4916 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4917 zExp = aExp; 4918 normalizeRoundAndPack: 4919 --zExp; 4920 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4921 4922 } 4923 4924 /* 4925 ------------------------------------------------------------------------------- 4926 Returns the result of adding the quadruple-precision floating-point values 4927 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4928 for Binary Floating-Point Arithmetic. 4929 ------------------------------------------------------------------------------- 4930 */ 4931 float128 float128_add( float128 a, float128 b ) 4932 { 4933 flag aSign, bSign; 4934 4935 aSign = extractFloat128Sign( a ); 4936 bSign = extractFloat128Sign( b ); 4937 if ( aSign == bSign ) { 4938 return addFloat128Sigs( a, b, aSign ); 4939 } 4940 else { 4941 return subFloat128Sigs( a, b, aSign ); 4942 } 4943 4944 } 4945 4946 /* 4947 ------------------------------------------------------------------------------- 4948 Returns the result of subtracting the quadruple-precision floating-point 4949 values `a' and `b'. 
The operation is performed according to the IEC/IEEE 4950 Standard for Binary Floating-Point Arithmetic. 4951 ------------------------------------------------------------------------------- 4952 */ 4953 float128 float128_sub( float128 a, float128 b ) 4954 { 4955 flag aSign, bSign; 4956 4957 aSign = extractFloat128Sign( a ); 4958 bSign = extractFloat128Sign( b ); 4959 if ( aSign == bSign ) { 4960 return subFloat128Sigs( a, b, aSign ); 4961 } 4962 else { 4963 return addFloat128Sigs( a, b, aSign ); 4964 } 4965 4966 } 4967 4968 /* 4969 ------------------------------------------------------------------------------- 4970 Returns the result of multiplying the quadruple-precision floating-point 4971 values `a' and `b'. The operation is performed according to the IEC/IEEE 4972 Standard for Binary Floating-Point Arithmetic. 4973 ------------------------------------------------------------------------------- 4974 */ 4975 float128 float128_mul( float128 a, float128 b ) 4976 { 4977 flag aSign, bSign, zSign; 4978 int32 aExp, bExp, zExp; 4979 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4980 float128 z; 4981 4982 aSig1 = extractFloat128Frac1( a ); 4983 aSig0 = extractFloat128Frac0( a ); 4984 aExp = extractFloat128Exp( a ); 4985 aSign = extractFloat128Sign( a ); 4986 bSig1 = extractFloat128Frac1( b ); 4987 bSig0 = extractFloat128Frac0( b ); 4988 bExp = extractFloat128Exp( b ); 4989 bSign = extractFloat128Sign( b ); 4990 zSign = aSign ^ bSign; 4991 if ( aExp == 0x7FFF ) { 4992 if ( ( aSig0 | aSig1 ) 4993 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4994 return propagateFloat128NaN( a, b ); 4995 } 4996 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4997 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4998 } 4999 if ( bExp == 0x7FFF ) { 5000 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5001 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5002 invalid: 5003 float_raise( float_flag_invalid ); 5004 z.low = float128_default_nan_low; 5005 z.high = 
float128_default_nan_high; 5006 return z; 5007 } 5008 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5009 } 5010 if ( aExp == 0 ) { 5011 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5012 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5013 } 5014 if ( bExp == 0 ) { 5015 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5016 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5017 } 5018 zExp = aExp + bExp - 0x4000; 5019 aSig0 |= LIT64( 0x0001000000000000 ); 5020 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5021 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5022 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5023 zSig2 |= ( zSig3 != 0 ); 5024 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5025 shift128ExtraRightJamming( 5026 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5027 ++zExp; 5028 } 5029 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5030 5031 } 5032 5033 /* 5034 ------------------------------------------------------------------------------- 5035 Returns the result of dividing the quadruple-precision floating-point value 5036 `a' by the corresponding value `b'. The operation is performed according to 5037 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5038 ------------------------------------------------------------------------------- 5039 */ 5040 float128 float128_div( float128 a, float128 b ) 5041 { 5042 flag aSign, bSign, zSign; 5043 int32 aExp, bExp, zExp; 5044 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5045 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5046 float128 z; 5047 5048 aSig1 = extractFloat128Frac1( a ); 5049 aSig0 = extractFloat128Frac0( a ); 5050 aExp = extractFloat128Exp( a ); 5051 aSign = extractFloat128Sign( a ); 5052 bSig1 = extractFloat128Frac1( b ); 5053 bSig0 = extractFloat128Frac0( b ); 5054 bExp = extractFloat128Exp( b ); 5055 bSign = extractFloat128Sign( b ); 5056 zSign = aSign ^ bSign; 5057 if ( aExp == 0x7FFF ) { 5058 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5059 if ( bExp == 0x7FFF ) { 5060 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5061 goto invalid; 5062 } 5063 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5064 } 5065 if ( bExp == 0x7FFF ) { 5066 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5067 return packFloat128( zSign, 0, 0, 0 ); 5068 } 5069 if ( bExp == 0 ) { 5070 if ( ( bSig0 | bSig1 ) == 0 ) { 5071 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5072 invalid: 5073 float_raise( float_flag_invalid ); 5074 z.low = float128_default_nan_low; 5075 z.high = float128_default_nan_high; 5076 return z; 5077 } 5078 float_raise( float_flag_divbyzero ); 5079 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5080 } 5081 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5082 } 5083 if ( aExp == 0 ) { 5084 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5085 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5086 } 5087 zExp = aExp - bExp + 0x3FFD; 5088 shortShift128Left( 5089 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5090 shortShift128Left( 5091 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5092 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 
5093 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5094 ++zExp; 5095 } 5096 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5097 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5098 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5099 while ( (sbits64) rem0 < 0 ) { 5100 --zSig0; 5101 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5102 } 5103 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5104 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5105 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5106 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5107 while ( (sbits64) rem1 < 0 ) { 5108 --zSig1; 5109 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5110 } 5111 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5112 } 5113 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5115 5116 } 5117 5118 /* 5119 ------------------------------------------------------------------------------- 5120 Returns the remainder of the quadruple-precision floating-point value `a' 5121 with respect to the corresponding value `b'. The operation is performed 5122 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5123 ------------------------------------------------------------------------------- 5124 */ 5125 float128 float128_rem( float128 a, float128 b ) 5126 { 5127 flag aSign, zSign; 5128 int32 aExp, bExp, expDiff; 5129 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5130 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5131 sbits64 sigMean0; 5132 float128 z; 5133 5134 aSig1 = extractFloat128Frac1( a ); 5135 aSig0 = extractFloat128Frac0( a ); 5136 aExp = extractFloat128Exp( a ); 5137 aSign = extractFloat128Sign( a ); 5138 bSig1 = extractFloat128Frac1( b ); 5139 bSig0 = extractFloat128Frac0( b ); 5140 bExp = extractFloat128Exp( b ); 5141 if ( aExp == 0x7FFF ) { 5142 if ( ( aSig0 | aSig1 ) 5143 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5144 return propagateFloat128NaN( a, b ); 5145 } 5146 goto invalid; 5147 } 5148 if ( bExp == 0x7FFF ) { 5149 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5150 return a; 5151 } 5152 if ( bExp == 0 ) { 5153 if ( ( bSig0 | bSig1 ) == 0 ) { 5154 invalid: 5155 float_raise( float_flag_invalid ); 5156 z.low = float128_default_nan_low; 5157 z.high = float128_default_nan_high; 5158 return z; 5159 } 5160 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5161 } 5162 if ( aExp == 0 ) { 5163 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5164 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5165 } 5166 expDiff = aExp - bExp; 5167 if ( expDiff < -1 ) return a; 5168 shortShift128Left( 5169 aSig0 | LIT64( 0x0001000000000000 ), 5170 aSig1, 5171 15 - ( expDiff < 0 ), 5172 &aSig0, 5173 &aSig1 5174 ); 5175 shortShift128Left( 5176 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5177 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5178 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5179 expDiff -= 64; 5180 while ( 0 < expDiff ) { 5181 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5182 q = ( 4 < q ) ? 
q - 4 : 0; 5183 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5184 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5185 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5186 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5187 expDiff -= 61; 5188 } 5189 if ( -64 < expDiff ) { 5190 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5191 q = ( 4 < q ) ? q - 4 : 0; 5192 q >>= - expDiff; 5193 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5194 expDiff += 52; 5195 if ( expDiff < 0 ) { 5196 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5197 } 5198 else { 5199 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5200 } 5201 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5202 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5203 } 5204 else { 5205 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5206 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5207 } 5208 do { 5209 alternateASig0 = aSig0; 5210 alternateASig1 = aSig1; 5211 ++q; 5212 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5213 } while ( 0 <= (sbits64) aSig0 ); 5214 add128( 5215 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5216 if ( ( sigMean0 < 0 ) 5217 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5218 aSig0 = alternateASig0; 5219 aSig1 = alternateASig1; 5220 } 5221 zSign = ( (sbits64) aSig0 < 0 ); 5222 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5223 return 5224 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5225 5226 } 5227 5228 /* 5229 ------------------------------------------------------------------------------- 5230 Returns the square root of the quadruple-precision floating-point value `a'. 5231 The operation is performed according to the IEC/IEEE Standard for Binary 5232 Floating-Point Arithmetic. 
5233 ------------------------------------------------------------------------------- 5234 */ 5235 float128 float128_sqrt( float128 a ) 5236 { 5237 flag aSign; 5238 int32 aExp, zExp; 5239 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5240 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5241 float128 z; 5242 5243 aSig1 = extractFloat128Frac1( a ); 5244 aSig0 = extractFloat128Frac0( a ); 5245 aExp = extractFloat128Exp( a ); 5246 aSign = extractFloat128Sign( a ); 5247 if ( aExp == 0x7FFF ) { 5248 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5249 if ( ! aSign ) return a; 5250 goto invalid; 5251 } 5252 if ( aSign ) { 5253 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5254 invalid: 5255 float_raise( float_flag_invalid ); 5256 z.low = float128_default_nan_low; 5257 z.high = float128_default_nan_high; 5258 return z; 5259 } 5260 if ( aExp == 0 ) { 5261 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5262 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5263 } 5264 zExp = (int32) ( (aExp - 0x3FFF) >> 1) + 0x3FFE; 5265 aSig0 |= LIT64( 0x0001000000000000 ); 5266 zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17)); 5267 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5268 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5269 doubleZSig0 = zSig0<<1; 5270 mul64To128( zSig0, zSig0, &term0, &term1 ); 5271 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5272 while ( (sbits64) rem0 < 0 ) { 5273 --zSig0; 5274 doubleZSig0 -= 2; 5275 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5276 } 5277 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5278 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5279 if ( zSig1 == 0 ) zSig1 = 1; 5280 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5281 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5282 mul64To128( zSig1, zSig1, &term2, &term3 ); 5283 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5284 while ( 
(sbits64) rem1 < 0 ) { 5285 --zSig1; 5286 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5287 term3 |= 1; 5288 term2 |= doubleZSig0; 5289 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5290 } 5291 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5292 } 5293 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5294 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5295 5296 } 5297 5298 /* 5299 ------------------------------------------------------------------------------- 5300 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5301 the corresponding value `b', and 0 otherwise. The comparison is performed 5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5303 ------------------------------------------------------------------------------- 5304 */ 5305 flag float128_eq( float128 a, float128 b ) 5306 { 5307 5308 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5309 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5310 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5311 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5312 ) { 5313 if ( float128_is_signaling_nan( a ) 5314 || float128_is_signaling_nan( b ) ) { 5315 float_raise( float_flag_invalid ); 5316 } 5317 return 0; 5318 } 5319 return 5320 ( a.low == b.low ) 5321 && ( ( a.high == b.high ) 5322 || ( ( a.low == 0 ) 5323 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5324 ); 5325 5326 } 5327 5328 /* 5329 ------------------------------------------------------------------------------- 5330 Returns 1 if the quadruple-precision floating-point value `a' is less than 5331 or equal to the corresponding value `b', and 0 otherwise. The comparison 5332 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5333 Arithmetic. 
5334 ------------------------------------------------------------------------------- 5335 */ 5336 flag float128_le( float128 a, float128 b ) 5337 { 5338 flag aSign, bSign; 5339 5340 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5341 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5342 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5343 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5344 ) { 5345 float_raise( float_flag_invalid ); 5346 return 0; 5347 } 5348 aSign = extractFloat128Sign( a ); 5349 bSign = extractFloat128Sign( b ); 5350 if ( aSign != bSign ) { 5351 return 5352 aSign 5353 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5354 == 0 ); 5355 } 5356 return 5357 aSign ? le128( b.high, b.low, a.high, a.low ) 5358 : le128( a.high, a.low, b.high, b.low ); 5359 5360 } 5361 5362 /* 5363 ------------------------------------------------------------------------------- 5364 Returns 1 if the quadruple-precision floating-point value `a' is less than 5365 the corresponding value `b', and 0 otherwise. The comparison is performed 5366 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5367 ------------------------------------------------------------------------------- 5368 */ 5369 flag float128_lt( float128 a, float128 b ) 5370 { 5371 flag aSign, bSign; 5372 5373 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5374 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5375 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5376 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5377 ) { 5378 float_raise( float_flag_invalid ); 5379 return 0; 5380 } 5381 aSign = extractFloat128Sign( a ); 5382 bSign = extractFloat128Sign( b ); 5383 if ( aSign != bSign ) { 5384 return 5385 aSign 5386 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5387 != 0 ); 5388 } 5389 return 5390 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 5391 : lt128( a.high, a.low, b.high, b.low ); 5392 5393 } 5394 5395 /* 5396 ------------------------------------------------------------------------------- 5397 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5398 the corresponding value `b', and 0 otherwise. The invalid exception is 5399 raised if either operand is a NaN. Otherwise, the comparison is performed 5400 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5401 ------------------------------------------------------------------------------- 5402 */ 5403 flag float128_eq_signaling( float128 a, float128 b ) 5404 { 5405 5406 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5407 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5408 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5409 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5410 ) { 5411 float_raise( float_flag_invalid ); 5412 return 0; 5413 } 5414 return 5415 ( a.low == b.low ) 5416 && ( ( a.high == b.high ) 5417 || ( ( a.low == 0 ) 5418 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5419 ); 5420 5421 } 5422 5423 /* 5424 ------------------------------------------------------------------------------- 5425 Returns 1 if the quadruple-precision floating-point value `a' is less than 5426 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5427 cause an exception. Otherwise, the comparison is performed according to the 5428 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5429 ------------------------------------------------------------------------------- 5430 */ 5431 flag float128_le_quiet( float128 a, float128 b ) 5432 { 5433 flag aSign, bSign; 5434 5435 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5436 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5437 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5438 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5439 ) { 5440 if ( float128_is_signaling_nan( a ) 5441 || float128_is_signaling_nan( b ) ) { 5442 float_raise( float_flag_invalid ); 5443 } 5444 return 0; 5445 } 5446 aSign = extractFloat128Sign( a ); 5447 bSign = extractFloat128Sign( b ); 5448 if ( aSign != bSign ) { 5449 return 5450 aSign 5451 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5452 == 0 ); 5453 } 5454 return 5455 aSign ? le128( b.high, b.low, a.high, a.low ) 5456 : le128( a.high, a.low, b.high, b.low ); 5457 5458 } 5459 5460 /* 5461 ------------------------------------------------------------------------------- 5462 Returns 1 if the quadruple-precision floating-point value `a' is less than 5463 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5464 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5465 Standard for Binary Floating-Point Arithmetic. 
5466 ------------------------------------------------------------------------------- 5467 */ 5468 flag float128_lt_quiet( float128 a, float128 b ) 5469 { 5470 flag aSign, bSign; 5471 5472 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5473 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5474 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5475 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5476 ) { 5477 if ( float128_is_signaling_nan( a ) 5478 || float128_is_signaling_nan( b ) ) { 5479 float_raise( float_flag_invalid ); 5480 } 5481 return 0; 5482 } 5483 aSign = extractFloat128Sign( a ); 5484 bSign = extractFloat128Sign( b ); 5485 if ( aSign != bSign ) { 5486 return 5487 aSign 5488 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5489 != 0 ); 5490 } 5491 return 5492 aSign ? lt128( b.high, b.low, a.high, a.low ) 5493 : lt128( a.high, a.low, b.high, b.low ); 5494 5495 } 5496 5497 #endif 5498 5499 5500 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5501 5502 /* 5503 * These two routines are not part of the original softfloat distribution. 5504 * 5505 * They are based on the corresponding conversions to integer but return 5506 * unsigned numbers instead since these functions are required by GCC. 5507 * 5508 * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97 5509 * 5510 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5511 */ 5512 5513 /* 5514 ------------------------------------------------------------------------------- 5515 Returns the result of converting the double-precision floating-point value 5516 `a' to the 32-bit unsigned integer format. The conversion is 5517 performed according to the IEC/IEEE Standard for Binary Floating-point 5518 Arithmetic, except that the conversion is always rounded toward zero. If 5519 `a' is a NaN, the largest positive integer is returned. If the conversion 5520 overflows, the largest integer positive is returned. 
5521 ------------------------------------------------------------------------------- 5522 */ 5523 uint32 float64_to_uint32_round_to_zero( float64 a ) 5524 { 5525 flag aSign; 5526 int16 aExp, shiftCount; 5527 bits64 aSig, savedASig; 5528 uint32 z; 5529 5530 aSig = extractFloat64Frac( a ); 5531 aExp = extractFloat64Exp( a ); 5532 aSign = extractFloat64Sign( a ); 5533 5534 if (aSign) { 5535 float_raise( float_flag_invalid ); 5536 return(0); 5537 } 5538 5539 if ( 0x41E < aExp ) { 5540 float_raise( float_flag_invalid ); 5541 return 0xffffffff; 5542 } 5543 else if ( aExp < 0x3FF ) { 5544 if ( aExp || aSig ) set_float_exception_inexact_flag(); 5545 return 0; 5546 } 5547 aSig |= LIT64( 0x0010000000000000 ); 5548 shiftCount = 0x433 - aExp; 5549 savedASig = aSig; 5550 aSig >>= shiftCount; 5551 z = (uint32)aSig; 5552 if ( ( aSig<<shiftCount ) != savedASig ) { 5553 set_float_exception_inexact_flag(); 5554 } 5555 return z; 5556 5557 } 5558 5559 /* 5560 ------------------------------------------------------------------------------- 5561 Returns the result of converting the single-precision floating-point value 5562 `a' to the 32-bit unsigned integer format. The conversion is 5563 performed according to the IEC/IEEE Standard for Binary Floating-point 5564 Arithmetic, except that the conversion is always rounded toward zero. If 5565 `a' is a NaN, the largest positive integer is returned. If the conversion 5566 overflows, the largest positive integer is returned. 
5567 ------------------------------------------------------------------------------- 5568 */ 5569 uint32 float32_to_uint32_round_to_zero( float32 a ) 5570 { 5571 flag aSign; 5572 int16 aExp, shiftCount; 5573 bits32 aSig; 5574 uint32 z; 5575 5576 aSig = extractFloat32Frac( a ); 5577 aExp = extractFloat32Exp( a ); 5578 aSign = extractFloat32Sign( a ); 5579 shiftCount = aExp - 0x9E; 5580 5581 if (aSign) { 5582 float_raise( float_flag_invalid ); 5583 return(0); 5584 } 5585 if ( 0 < shiftCount ) { 5586 float_raise( float_flag_invalid ); 5587 return 0xFFFFFFFF; 5588 } 5589 else if ( aExp <= 0x7E ) { 5590 if ( aExp | aSig ) set_float_exception_inexact_flag(); 5591 return 0; 5592 } 5593 aSig = ( aSig | 0x800000 )<<8; 5594 z = aSig>>( - shiftCount ); 5595 if ( aSig<<( shiftCount & 31 ) ) { 5596 set_float_exception_inexact_flag(); 5597 } 5598 return z; 5599 5600 } 5601 5602 #endif 5603