Home | History | Annotate | Download | only in bits64
      1 /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */
      2 
      3 /*
      4  * This version hacked for use with gcc -msoft-float by bjh21.
      5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
      6  *  itself).
      7  */
      8 
      9 /*
     10  * Things you may want to define:
     11  *
     12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
     13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
     14  *   properly renamed.
     15  */
     16 
     17 /*
     18 ===============================================================================
     19 
     20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
     21 Arithmetic Package, Release 2a.
     22 
     23 Written by John R. Hauser.  This work was made possible in part by the
     24 International Computer Science Institute, located at Suite 600, 1947 Center
     25 Street, Berkeley, California 94704.  Funding was partially provided by the
     26 National Science Foundation under grant MIP-9311980.  The original version
     27 of this code was written as part of a project to build a fixed-point vector
     28 processor in collaboration with the University of California at Berkeley,
     29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
     31 arithmetic/SoftFloat.html'.
     32 
     33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
     34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
     35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
     36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
     37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
     38 
     39 Derivative works are acceptable, even for commercial purposes, so long as
     40 (1) they include prominent notice that the work is derivative, and (2) they
     41 include prominent notice akin to these four paragraphs for those parts of
     42 this code that are retained.
     43 
     44 ===============================================================================
     45 */
     46 
     47 #include <sys/cdefs.h>
     48 #if defined(LIBC_SCCS) && !defined(lint)
     49 __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $");
     50 #endif /* LIBC_SCCS and not lint */
     51 
     52 #ifdef SOFTFLOAT_FOR_GCC
     53 #include "softfloat-for-gcc.h"
     54 #endif
     55 
     56 #include "milieu.h"
     57 #include "softfloat.h"
     58 
     59 /*
     60  * Conversions between floats as stored in memory and floats as
     61  * SoftFloat uses them
     62  */
     63 #ifndef FLOAT64_DEMANGLE
     64 #define FLOAT64_DEMANGLE(a) (a)
     65 #endif
     66 #ifndef FLOAT64_MANGLE
     67 #define FLOAT64_MANGLE(a)   (a)
     68 #endif
     69 
     70 /*
     71 -------------------------------------------------------------------------------
     72 Floating-point rounding mode, extended double-precision rounding precision,
     73 and exception flags.
     74 -------------------------------------------------------------------------------
     75 */
     76 #ifndef set_float_rounding_mode
     77 fp_rnd float_rounding_mode = float_round_nearest_even;
     78 fp_except float_exception_flags = 0;
     79 #endif
     80 #ifndef set_float_exception_inexact_flag
     81 #define set_float_exception_inexact_flag() \
     82     ((void)(float_exception_flags |= float_flag_inexact))
     83 #endif
     84 #ifdef FLOATX80
     85 int8 floatx80_rounding_precision = 80;
     86 #endif
     87 
     88 /*
     89 -------------------------------------------------------------------------------
     90 Primitive arithmetic functions, including multi-word arithmetic, and
     91 division and square root approximations.  (Can be specialized to target if
     92 desired.)
     93 -------------------------------------------------------------------------------
     94 */
     95 #include "softfloat-macros"
     96 
     97 /*
     98 -------------------------------------------------------------------------------
     99 Functions and definitions to determine:  (1) whether tininess for underflow
    100 is detected before or after rounding by default, (2) what (if anything)
    101 happens when exceptions are raised, (3) how signaling NaNs are distinguished
    102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
    103 are propagated from function inputs to output.  These details are target-
    104 specific.
    105 -------------------------------------------------------------------------------
    106 */
    107 #include "softfloat-specialize"
    108 
    109 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
    110 /*
    111 -------------------------------------------------------------------------------
    112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
    113 and 7, and returns the properly rounded 32-bit integer corresponding to the
    114 input.  If `zSign' is 1, the input is negated before being converted to an
    115 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
    116 is simply rounded to an integer, with the inexact exception raised if the
    117 input cannot be represented exactly as an integer.  However, if the fixed-
    118 point input is too large, the invalid exception is raised and the largest
    119 positive or negative integer is returned.
    120 -------------------------------------------------------------------------------
    121 */
    122 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
    123 {
    124     int8 roundingMode;
    125     flag roundNearestEven;
    126     int8 roundIncrement, roundBits;
    127     int32 z;
    128 
    129     roundingMode = float_rounding_mode;
    130     roundNearestEven = ( roundingMode == float_round_nearest_even );
    131     roundIncrement = 0x40;
    132     if ( ! roundNearestEven ) {
    133         if ( roundingMode == float_round_to_zero ) {
    134             roundIncrement = 0;
    135         }
    136         else {
    137             roundIncrement = 0x7F;
    138             if ( zSign ) {
    139                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    140             }
    141             else {
    142                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    143             }
    144         }
    145     }
    146     roundBits = (int8)(absZ & 0x7F);
    147     absZ = ( absZ + roundIncrement )>>7;
    148     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    149     z = (int32)absZ;
    150     if ( zSign ) z = - z;
    151     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    152         float_raise( float_flag_invalid );
    153         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    154     }
    155     if ( roundBits ) set_float_exception_inexact_flag();
    156     return z;
    157 
    158 }
    159 
    160 /*
    161 -------------------------------------------------------------------------------
    162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    163 `absZ1', with binary point between bits 63 and 64 (between the input words),
    164 and returns the properly rounded 64-bit integer corresponding to the input.
    165 If `zSign' is 1, the input is negated before being converted to an integer.
    166 Ordinarily, the fixed-point input is simply rounded to an integer, with
    167 the inexact exception raised if the input cannot be represented exactly as
    168 an integer.  However, if the fixed-point input is too large, the invalid
    169 exception is raised and the largest positive or negative integer is
    170 returned.
    171 -------------------------------------------------------------------------------
    172 */
    173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
    174 {
    175     int8 roundingMode;
    176     flag roundNearestEven, increment;
    177     int64 z;
    178 
    179     roundingMode = float_rounding_mode;
    180     roundNearestEven = ( roundingMode == float_round_nearest_even );
    181     increment = ( (sbits64) absZ1 < 0 );
    182     if ( ! roundNearestEven ) {
    183         if ( roundingMode == float_round_to_zero ) {
    184             increment = 0;
    185         }
    186         else {
    187             if ( zSign ) {
    188                 increment = ( roundingMode == float_round_down ) && absZ1;
    189             }
    190             else {
    191                 increment = ( roundingMode == float_round_up ) && absZ1;
    192             }
    193         }
    194     }
    195     if ( increment ) {
    196         ++absZ0;
    197         if ( absZ0 == 0 ) goto overflow;
    198         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    199     }
    200     z = absZ0;
    201     if ( zSign ) z = - z;
    202     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    203  overflow:
    204         float_raise( float_flag_invalid );
    205         return
    206               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    207             : LIT64( 0x7FFFFFFFFFFFFFFF );
    208     }
    209     if ( absZ1 ) set_float_exception_inexact_flag();
    210     return z;
    211 
    212 }
    213 #endif
    214 
    215 /*
    216 -------------------------------------------------------------------------------
    217 Returns the fraction bits of the single-precision floating-point value `a'.
    218 -------------------------------------------------------------------------------
    219 */
    220 INLINE bits32 extractFloat32Frac( float32 a )
    221 {
    222 
    223     return a & 0x007FFFFF;
    224 
    225 }
    226 
    227 /*
    228 -------------------------------------------------------------------------------
    229 Returns the exponent bits of the single-precision floating-point value `a'.
    230 -------------------------------------------------------------------------------
    231 */
    232 INLINE int16 extractFloat32Exp( float32 a )
    233 {
    234 
    235     return ( a>>23 ) & 0xFF;
    236 
    237 }
    238 
    239 /*
    240 -------------------------------------------------------------------------------
    241 Returns the sign bit of the single-precision floating-point value `a'.
    242 -------------------------------------------------------------------------------
    243 */
    244 INLINE flag extractFloat32Sign( float32 a )
    245 {
    246 
    247     return a>>31;
    248 
    249 }
    250 
    251 /*
    252 -------------------------------------------------------------------------------
    253 Normalizes the subnormal single-precision floating-point value represented
    254 by the denormalized significand `aSig'.  The normalized exponent and
    255 significand are stored at the locations pointed to by `zExpPtr' and
    256 `zSigPtr', respectively.
    257 -------------------------------------------------------------------------------
    258 */
    259 static void
    260  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    261 {
    262     int8 shiftCount;
    263 
    264     shiftCount = countLeadingZeros32( aSig ) - 8;
    265     *zSigPtr = aSig<<shiftCount;
    266     *zExpPtr = 1 - shiftCount;
    267 
    268 }
    269 
    270 /*
    271 -------------------------------------------------------------------------------
    272 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    273 single-precision floating-point value, returning the result.  After being
    274 shifted into the proper positions, the three fields are simply added
    275 together to form the result.  This means that any integer portion of `zSig'
    276 will be added into the exponent.  Since a properly normalized significand
    277 will have an integer portion equal to 1, the `zExp' input should be 1 less
    278 than the desired result exponent whenever `zSig' is a complete, normalized
    279 significand.
    280 -------------------------------------------------------------------------------
    281 */
    282 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    283 {
    284 
    285     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
    286 
    287 }
    288 
    289 /*
    290 -------------------------------------------------------------------------------
    291 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    292 and significand `zSig', and returns the proper single-precision floating-
    293 point value corresponding to the abstract input.  Ordinarily, the abstract
    294 value is simply rounded and packed into the single-precision format, with
    295 the inexact exception raised if the abstract input cannot be represented
    296 exactly.  However, if the abstract value is too large, the overflow and
    297 inexact exceptions are raised and an infinity or maximal finite value is
    298 returned.  If the abstract value is too small, the input value is rounded to
    299 a subnormal number, and the underflow and inexact exceptions are raised if
    300 the abstract input cannot be represented exactly as a subnormal single-
    301 precision floating-point number.
    302     The input significand `zSig' has its binary point between bits 30
    303 and 29, which is 7 bits to the left of the usual location.  This shifted
    304 significand must be normalized or smaller.  If `zSig' is not normalized,
    305 `zExp' must be 0; in that case, the result returned is a subnormal number,
    306 and it must not require rounding.  In the usual case that `zSig' is
    307 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    308 The handling of underflow and overflow follows the IEC/IEEE Standard for
    309 Binary Floating-Point Arithmetic.
    310 -------------------------------------------------------------------------------
    311 */
    312 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    313 {
    314     int8 roundingMode;
    315     flag roundNearestEven;
    316     int8 roundIncrement, roundBits;
    317     flag isTiny;
    318 
    319     roundingMode = float_rounding_mode;
    320     roundNearestEven = ( roundingMode == float_round_nearest_even );
    321     roundIncrement = 0x40;
    322     if ( ! roundNearestEven ) {
    323         if ( roundingMode == float_round_to_zero ) {
    324             roundIncrement = 0;
    325         }
    326         else {
    327             roundIncrement = 0x7F;
    328             if ( zSign ) {
    329                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    330             }
    331             else {
    332                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    333             }
    334         }
    335     }
    336     roundBits = zSig & 0x7F;
    337     if ( 0xFD <= (bits16) zExp ) {
    338         if (    ( 0xFD < zExp )
    339              || (    ( zExp == 0xFD )
    340                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    341            ) {
    342             float_raise( float_flag_overflow | float_flag_inexact );
    343             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
    344         }
    345         if ( zExp < 0 ) {
    346             isTiny =
    347                    ( float_detect_tininess == float_tininess_before_rounding )
    348                 || ( zExp < -1 )
    349                 || ( zSig + roundIncrement < 0x80000000U );
    350             shift32RightJamming( zSig, - zExp, &zSig );
    351             zExp = 0;
    352             roundBits = zSig & 0x7F;
    353             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    354         }
    355     }
    356     if ( roundBits ) set_float_exception_inexact_flag();
    357     zSig = ( zSig + roundIncrement )>>7;
    358     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    359     if ( zSig == 0 ) zExp = 0;
    360     return packFloat32( zSign, zExp, zSig );
    361 
    362 }
    363 
    364 /*
    365 -------------------------------------------------------------------------------
    366 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    367 and significand `zSig', and returns the proper single-precision floating-
    368 point value corresponding to the abstract input.  This routine is just like
    369 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    370 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    371 floating-point exponent.
    372 -------------------------------------------------------------------------------
    373 */
    374 static float32
    375  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    376 {
    377     int8 shiftCount;
    378 
    379     shiftCount = countLeadingZeros32( zSig ) - 1;
    380     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
    381 
    382 }
    383 
    384 /*
    385 -------------------------------------------------------------------------------
    386 Returns the fraction bits of the double-precision floating-point value `a'.
    387 -------------------------------------------------------------------------------
    388 */
    389 INLINE bits64 extractFloat64Frac( float64 a )
    390 {
    391 
    392     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
    393 
    394 }
    395 
    396 /*
    397 -------------------------------------------------------------------------------
    398 Returns the exponent bits of the double-precision floating-point value `a'.
    399 -------------------------------------------------------------------------------
    400 */
    401 INLINE int16 extractFloat64Exp( float64 a )
    402 {
    403 
    404     return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
    405 
    406 }
    407 
    408 /*
    409 -------------------------------------------------------------------------------
    410 Returns the sign bit of the double-precision floating-point value `a'.
    411 -------------------------------------------------------------------------------
    412 */
    413 INLINE flag extractFloat64Sign( float64 a )
    414 {
    415 
    416     return (flag)(FLOAT64_DEMANGLE(a) >> 63);
    417 
    418 }
    419 
    420 /*
    421 -------------------------------------------------------------------------------
    422 Normalizes the subnormal double-precision floating-point value represented
    423 by the denormalized significand `aSig'.  The normalized exponent and
    424 significand are stored at the locations pointed to by `zExpPtr' and
    425 `zSigPtr', respectively.
    426 -------------------------------------------------------------------------------
    427 */
    428 static void
    429  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    430 {
    431     int8 shiftCount;
    432 
    433     shiftCount = countLeadingZeros64( aSig ) - 11;
    434     *zSigPtr = aSig<<shiftCount;
    435     *zExpPtr = 1 - shiftCount;
    436 
    437 }
    438 
    439 /*
    440 -------------------------------------------------------------------------------
    441 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    442 double-precision floating-point value, returning the result.  After being
    443 shifted into the proper positions, the three fields are simply added
    444 together to form the result.  This means that any integer portion of `zSig'
    445 will be added into the exponent.  Since a properly normalized significand
    446 will have an integer portion equal to 1, the `zExp' input should be 1 less
    447 than the desired result exponent whenever `zSig' is a complete, normalized
    448 significand.
    449 -------------------------------------------------------------------------------
    450 */
    451 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    452 {
    453 
    454     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
    455                            ( ( (bits64) zExp )<<52 ) + zSig );
    456 
    457 }
    458 
    459 /*
    460 -------------------------------------------------------------------------------
    461 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    462 and significand `zSig', and returns the proper double-precision floating-
    463 point value corresponding to the abstract input.  Ordinarily, the abstract
    464 value is simply rounded and packed into the double-precision format, with
    465 the inexact exception raised if the abstract input cannot be represented
    466 exactly.  However, if the abstract value is too large, the overflow and
    467 inexact exceptions are raised and an infinity or maximal finite value is
    468 returned.  If the abstract value is too small, the input value is rounded to
    469 a subnormal number, and the underflow and inexact exceptions are raised if
    470 the abstract input cannot be represented exactly as a subnormal double-
    471 precision floating-point number.
    472     The input significand `zSig' has its binary point between bits 62
    473 and 61, which is 10 bits to the left of the usual location.  This shifted
    474 significand must be normalized or smaller.  If `zSig' is not normalized,
    475 `zExp' must be 0; in that case, the result returned is a subnormal number,
    476 and it must not require rounding.  In the usual case that `zSig' is
    477 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    478 The handling of underflow and overflow follows the IEC/IEEE Standard for
    479 Binary Floating-Point Arithmetic.
    480 -------------------------------------------------------------------------------
    481 */
    482 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    483 {
    484     int8 roundingMode;
    485     flag roundNearestEven;
    486     int16 roundIncrement, roundBits;
    487     flag isTiny;
    488 
    489     roundingMode = float_rounding_mode;
    490     roundNearestEven = ( roundingMode == float_round_nearest_even );
    491     roundIncrement = 0x200;
    492     if ( ! roundNearestEven ) {
    493         if ( roundingMode == float_round_to_zero ) {
    494             roundIncrement = 0;
    495         }
    496         else {
    497             roundIncrement = 0x3FF;
    498             if ( zSign ) {
    499                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    500             }
    501             else {
    502                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    503             }
    504         }
    505     }
    506     roundBits = (int16)(zSig & 0x3FF);
    507     if ( 0x7FD <= (bits16) zExp ) {
    508         if (    ( 0x7FD < zExp )
    509              || (    ( zExp == 0x7FD )
    510                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    511            ) {
    512             float_raise( float_flag_overflow | float_flag_inexact );
    513             return FLOAT64_MANGLE(
    514                        FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
    515                        ( roundIncrement == 0 ));
    516         }
    517         if ( zExp < 0 ) {
    518             isTiny =
    519                    ( float_detect_tininess == float_tininess_before_rounding )
    520                 || ( zExp < -1 )
    521                 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
    522             shift64RightJamming( zSig, - zExp, &zSig );
    523             zExp = 0;
    524             roundBits = (int16)(zSig & 0x3FF);
    525             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    526         }
    527     }
    528     if ( roundBits ) set_float_exception_inexact_flag();
    529     zSig = ( zSig + roundIncrement )>>10;
    530     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    531     if ( zSig == 0 ) zExp = 0;
    532     return packFloat64( zSign, zExp, zSig );
    533 
    534 }
    535 
    536 /*
    537 -------------------------------------------------------------------------------
    538 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    539 and significand `zSig', and returns the proper double-precision floating-
    540 point value corresponding to the abstract input.  This routine is just like
    541 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    542 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    543 floating-point exponent.
    544 -------------------------------------------------------------------------------
    545 */
    546 static float64
    547  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    548 {
    549     int8 shiftCount;
    550 
    551     shiftCount = countLeadingZeros64( zSig ) - 1;
    552     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
    553 
    554 }
    555 
    556 #ifdef FLOATX80
    557 
    558 /*
    559 -------------------------------------------------------------------------------
    560 Returns the fraction bits of the extended double-precision floating-point
    561 value `a'.
    562 -------------------------------------------------------------------------------
    563 */
    564 INLINE bits64 extractFloatx80Frac( floatx80 a )
    565 {
    566 
    567     return a.low;
    568 
    569 }
    570 
    571 /*
    572 -------------------------------------------------------------------------------
    573 Returns the exponent bits of the extended double-precision floating-point
    574 value `a'.
    575 -------------------------------------------------------------------------------
    576 */
    577 INLINE int32 extractFloatx80Exp( floatx80 a )
    578 {
    579 
    580     return a.high & 0x7FFF;
    581 
    582 }
    583 
    584 /*
    585 -------------------------------------------------------------------------------
    586 Returns the sign bit of the extended double-precision floating-point value
    587 `a'.
    588 -------------------------------------------------------------------------------
    589 */
    590 INLINE flag extractFloatx80Sign( floatx80 a )
    591 {
    592 
    593     return a.high>>15;
    594 
    595 }
    596 
    597 /*
    598 -------------------------------------------------------------------------------
    599 Normalizes the subnormal extended double-precision floating-point value
    600 represented by the denormalized significand `aSig'.  The normalized exponent
    601 and significand are stored at the locations pointed to by `zExpPtr' and
    602 `zSigPtr', respectively.
    603 -------------------------------------------------------------------------------
    604 */
    605 static void
    606  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    607 {
    608     int8 shiftCount;
    609 
    610     shiftCount = countLeadingZeros64( aSig );
    611     *zSigPtr = aSig<<shiftCount;
    612     *zExpPtr = 1 - shiftCount;
    613 
    614 }
    615 
    616 /*
    617 -------------------------------------------------------------------------------
    618 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    619 extended double-precision floating-point value, returning the result.
    620 -------------------------------------------------------------------------------
    621 */
    622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    623 {
    624     floatx80 z;
    625 
    626     z.low = zSig;
    627     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    628     return z;
    629 
    630 }
    631 
    632 /*
    633 -------------------------------------------------------------------------------
    634 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    635 and extended significand formed by the concatenation of `zSig0' and `zSig1',
    636 and returns the proper extended double-precision floating-point value
    637 corresponding to the abstract input.  Ordinarily, the abstract value is
    638 rounded and packed into the extended double-precision format, with the
    639 inexact exception raised if the abstract input cannot be represented
    640 exactly.  However, if the abstract value is too large, the overflow and
    641 inexact exceptions are raised and an infinity or maximal finite value is
    642 returned.  If the abstract value is too small, the input value is rounded to
    643 a subnormal number, and the underflow and inexact exceptions are raised if
    644 the abstract input cannot be represented exactly as a subnormal extended
    645 double-precision floating-point number.
    646     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    647 number of bits as single or double precision, respectively.  Otherwise, the
    648 result is rounded to the full precision of the extended double-precision
    649 format.
    650     The input significand must be normalized or smaller.  If the input
    651 significand is not normalized, `zExp' must be 0; in that case, the result
    652 returned is a subnormal number, and it must not require rounding.  The
    653 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    654 Floating-Point Arithmetic.
    655 -------------------------------------------------------------------------------
    656 */
    657 static floatx80
    658  roundAndPackFloatx80(
    659      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    660  )
    661 {
    662     int8 roundingMode;
    663     flag roundNearestEven, increment, isTiny;
    664     int64 roundIncrement, roundMask, roundBits;
    665 
    666     roundingMode = float_rounding_mode;
    667     roundNearestEven = ( roundingMode == float_round_nearest_even );
    668     if ( roundingPrecision == 80 ) goto precision80;
    669     if ( roundingPrecision == 64 ) {
    670         roundIncrement = LIT64( 0x0000000000000400 );
    671         roundMask = LIT64( 0x00000000000007FF );
    672     }
    673     else if ( roundingPrecision == 32 ) {
    674         roundIncrement = LIT64( 0x0000008000000000 );
    675         roundMask = LIT64( 0x000000FFFFFFFFFF );
    676     }
    677     else {
    678         goto precision80;
    679     }
    680     zSig0 |= ( zSig1 != 0 );
    681     if ( ! roundNearestEven ) {
    682         if ( roundingMode == float_round_to_zero ) {
    683             roundIncrement = 0;
    684         }
    685         else {
    686             roundIncrement = roundMask;
    687             if ( zSign ) {
    688                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    689             }
    690             else {
    691                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    692             }
    693         }
    694     }
    695     roundBits = zSig0 & roundMask;
    696     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    697         if (    ( 0x7FFE < zExp )
    698              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    699            ) {
    700             goto overflow;
    701         }
    702         if ( zExp <= 0 ) {
    703             isTiny =
    704                    ( float_detect_tininess == float_tininess_before_rounding )
    705                 || ( zExp < 0 )
    706                 || ( zSig0 <= zSig0 + roundIncrement );
    707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    708             zExp = 0;
    709             roundBits = zSig0 & roundMask;
    710             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    711             if ( roundBits ) set_float_exception_inexact_flag();
    712             zSig0 += roundIncrement;
    713             if ( (sbits64) zSig0 < 0 ) zExp = 1;
    714             roundIncrement = roundMask + 1;
    715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    716                 roundMask |= roundIncrement;
    717             }
    718             zSig0 &= ~ roundMask;
    719             return packFloatx80( zSign, zExp, zSig0 );
    720         }
    721     }
    722     if ( roundBits ) set_float_exception_inexact_flag();
    723     zSig0 += roundIncrement;
    724     if ( zSig0 < roundIncrement ) {
    725         ++zExp;
    726         zSig0 = LIT64( 0x8000000000000000 );
    727     }
    728     roundIncrement = roundMask + 1;
    729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    730         roundMask |= roundIncrement;
    731     }
    732     zSig0 &= ~ roundMask;
    733     if ( zSig0 == 0 ) zExp = 0;
    734     return packFloatx80( zSign, zExp, zSig0 );
    735  precision80:
    736     increment = ( (sbits64) zSig1 < 0 );
    737     if ( ! roundNearestEven ) {
    738         if ( roundingMode == float_round_to_zero ) {
    739             increment = 0;
    740         }
    741         else {
    742             if ( zSign ) {
    743                 increment = ( roundingMode == float_round_down ) && zSig1;
    744             }
    745             else {
    746                 increment = ( roundingMode == float_round_up ) && zSig1;
    747             }
    748         }
    749     }
    750     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    751         if (    ( 0x7FFE < zExp )
    752              || (    ( zExp == 0x7FFE )
    753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    754                   && increment
    755                 )
    756            ) {
    757             roundMask = 0;
    758  overflow:
    759             float_raise( float_flag_overflow | float_flag_inexact );
    760             if (    ( roundingMode == float_round_to_zero )
    761                  || ( zSign && ( roundingMode == float_round_up ) )
    762                  || ( ! zSign && ( roundingMode == float_round_down ) )
    763                ) {
    764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    765             }
    766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    767         }
    768         if ( zExp <= 0 ) {
    769             isTiny =
    770                    ( float_detect_tininess == float_tininess_before_rounding )
    771                 || ( zExp < 0 )
    772                 || ! increment
    773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    775             zExp = 0;
    776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
    777             if ( zSig1 ) set_float_exception_inexact_flag();
    778             if ( roundNearestEven ) {
    779                 increment = ( (sbits64) zSig1 < 0 );
    780             }
    781             else {
    782                 if ( zSign ) {
    783                     increment = ( roundingMode == float_round_down ) && zSig1;
    784                 }
    785                 else {
    786                     increment = ( roundingMode == float_round_up ) && zSig1;
    787                 }
    788             }
    789             if ( increment ) {
    790                 ++zSig0;
    791                 zSig0 &=
    792                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    793                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    794             }
    795             return packFloatx80( zSign, zExp, zSig0 );
    796         }
    797     }
    798     if ( zSig1 ) set_float_exception_inexact_flag();
    799     if ( increment ) {
    800         ++zSig0;
    801         if ( zSig0 == 0 ) {
    802             ++zExp;
    803             zSig0 = LIT64( 0x8000000000000000 );
    804         }
    805         else {
    806             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    807         }
    808     }
    809     else {
    810         if ( zSig0 == 0 ) zExp = 0;
    811     }
    812     return packFloatx80( zSign, zExp, zSig0 );
    813 
    814 }
    815 
    816 /*
    817 -------------------------------------------------------------------------------
    818 Takes an abstract floating-point value having sign `zSign', exponent
    819 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    820 and returns the proper extended double-precision floating-point value
    821 corresponding to the abstract input.  This routine is just like
    822 `roundAndPackFloatx80' except that the input significand does not have to be
    823 normalized.
    824 -------------------------------------------------------------------------------
    825 */
    826 static floatx80
    827  normalizeRoundAndPackFloatx80(
    828      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    829  )
    830 {
    831     int8 shiftCount;
    832 
    833     if ( zSig0 == 0 ) {
    834         zSig0 = zSig1;
    835         zSig1 = 0;
    836         zExp -= 64;
    837     }
    838     shiftCount = countLeadingZeros64( zSig0 );
    839     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    840     zExp -= shiftCount;
    841     return
    842         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
    843 
    844 }
    845 
    846 #endif
    847 
    848 #ifdef FLOAT128
    849 
    850 /*
    851 -------------------------------------------------------------------------------
    852 Returns the least-significant 64 fraction bits of the quadruple-precision
    853 floating-point value `a'.
    854 -------------------------------------------------------------------------------
    855 */
    856 INLINE bits64 extractFloat128Frac1( float128 a )
    857 {
    858 
    859     return a.low;
    860 
    861 }
    862 
    863 /*
    864 -------------------------------------------------------------------------------
    865 Returns the most-significant 48 fraction bits of the quadruple-precision
    866 floating-point value `a'.
    867 -------------------------------------------------------------------------------
    868 */
    869 INLINE bits64 extractFloat128Frac0( float128 a )
    870 {
    871 
    872     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    873 
    874 }
    875 
    876 /*
    877 -------------------------------------------------------------------------------
    878 Returns the exponent bits of the quadruple-precision floating-point value
    879 `a'.
    880 -------------------------------------------------------------------------------
    881 */
    882 INLINE int32 extractFloat128Exp( float128 a )
    883 {
    884 
    885     return (int32)((a.high >> 48) & 0x7FFF);
    886 
    887 }
    888 
    889 /*
    890 -------------------------------------------------------------------------------
    891 Returns the sign bit of the quadruple-precision floating-point value `a'.
    892 -------------------------------------------------------------------------------
    893 */
    894 INLINE flag extractFloat128Sign( float128 a )
    895 {
    896 
    897     return (flag)(a.high >> 63);
    898 
    899 }
    900 
    901 /*
    902 -------------------------------------------------------------------------------
    903 Normalizes the subnormal quadruple-precision floating-point value
    904 represented by the denormalized significand formed by the concatenation of
    905 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    906 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    907 significand are stored at the location pointed to by `zSig0Ptr', and the
    908 least significant 64 bits of the normalized significand are stored at the
    909 location pointed to by `zSig1Ptr'.
    910 -------------------------------------------------------------------------------
    911 */
    912 static void
    913  normalizeFloat128Subnormal(
    914      bits64 aSig0,
    915      bits64 aSig1,
    916      int32 *zExpPtr,
    917      bits64 *zSig0Ptr,
    918      bits64 *zSig1Ptr
    919  )
    920 {
    921     int8 shiftCount;
    922 
    923     if ( aSig0 == 0 ) {
    924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    925         if ( shiftCount < 0 ) {
    926             *zSig0Ptr = aSig1>>( - shiftCount );
    927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    928         }
    929         else {
    930             *zSig0Ptr = aSig1<<shiftCount;
    931             *zSig1Ptr = 0;
    932         }
    933         *zExpPtr = - shiftCount - 63;
    934     }
    935     else {
    936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    938         *zExpPtr = 1 - shiftCount;
    939     }
    940 
    941 }
    942 
    943 /*
    944 -------------------------------------------------------------------------------
    945 Packs the sign `zSign', the exponent `zExp', and the significand formed
    946 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    947 floating-point value, returning the result.  After being shifted into the
    948 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    949 added together to form the most significant 32 bits of the result.  This
    950 means that any integer portion of `zSig0' will be added into the exponent.
    951 Since a properly normalized significand will have an integer portion equal
    952 to 1, the `zExp' input should be 1 less than the desired result exponent
    953 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    954 significand.
    955 -------------------------------------------------------------------------------
    956 */
    957 INLINE float128
    958  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    959 {
    960     float128 z;
    961 
    962     z.low = zSig1;
    963     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    964     return z;
    965 
    966 }
    967 
    968 /*
    969 -------------------------------------------------------------------------------
    970 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    971 and extended significand formed by the concatenation of `zSig0', `zSig1',
    972 and `zSig2', and returns the proper quadruple-precision floating-point value
    973 corresponding to the abstract input.  Ordinarily, the abstract value is
    974 simply rounded and packed into the quadruple-precision format, with the
    975 inexact exception raised if the abstract input cannot be represented
    976 exactly.  However, if the abstract value is too large, the overflow and
    977 inexact exceptions are raised and an infinity or maximal finite value is
    978 returned.  If the abstract value is too small, the input value is rounded to
    979 a subnormal number, and the underflow and inexact exceptions are raised if
    980 the abstract input cannot be represented exactly as a subnormal quadruple-
    981 precision floating-point number.
    982     The input significand must be normalized or smaller.  If the input
    983 significand is not normalized, `zExp' must be 0; in that case, the result
    984 returned is a subnormal number, and it must not require rounding.  In the
    985 usual case that the input significand is normalized, `zExp' must be 1 less
    986 than the ``true'' floating-point exponent.  The handling of underflow and
    987 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    988 -------------------------------------------------------------------------------
    989 */
    990 static float128
    991  roundAndPackFloat128(
    992      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
    993 {
    994     int8 roundingMode;
    995     flag roundNearestEven, increment, isTiny;
    996 
    997     roundingMode = float_rounding_mode;
    998     roundNearestEven = ( roundingMode == float_round_nearest_even );
    999     increment = ( (sbits64) zSig2 < 0 );
   1000     if ( ! roundNearestEven ) {
   1001         if ( roundingMode == float_round_to_zero ) {
   1002             increment = 0;
   1003         }
   1004         else {
   1005             if ( zSign ) {
   1006                 increment = ( roundingMode == float_round_down ) && zSig2;
   1007             }
   1008             else {
   1009                 increment = ( roundingMode == float_round_up ) && zSig2;
   1010             }
   1011         }
   1012     }
   1013     if ( 0x7FFD <= (bits32) zExp ) {
   1014         if (    ( 0x7FFD < zExp )
   1015              || (    ( zExp == 0x7FFD )
   1016                   && eq128(
   1017                          LIT64( 0x0001FFFFFFFFFFFF ),
   1018                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1019                          zSig0,
   1020                          zSig1
   1021                      )
   1022                   && increment
   1023                 )
   1024            ) {
   1025             float_raise( float_flag_overflow | float_flag_inexact );
   1026             if (    ( roundingMode == float_round_to_zero )
   1027                  || ( zSign && ( roundingMode == float_round_up ) )
   1028                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1029                ) {
   1030                 return
   1031                     packFloat128(
   1032                         zSign,
   1033                         0x7FFE,
   1034                         LIT64( 0x0000FFFFFFFFFFFF ),
   1035                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1036                     );
   1037             }
   1038             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1039         }
   1040         if ( zExp < 0 ) {
   1041             isTiny =
   1042                    ( float_detect_tininess == float_tininess_before_rounding )
   1043                 || ( zExp < -1 )
   1044                 || ! increment
   1045                 || lt128(
   1046                        zSig0,
   1047                        zSig1,
   1048                        LIT64( 0x0001FFFFFFFFFFFF ),
   1049                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1050                    );
   1051             shift128ExtraRightJamming(
   1052                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1053             zExp = 0;
   1054             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
   1055             if ( roundNearestEven ) {
   1056                 increment = ( (sbits64) zSig2 < 0 );
   1057             }
   1058             else {
   1059                 if ( zSign ) {
   1060                     increment = ( roundingMode == float_round_down ) && zSig2;
   1061                 }
   1062                 else {
   1063                     increment = ( roundingMode == float_round_up ) && zSig2;
   1064                 }
   1065             }
   1066         }
   1067     }
   1068     if ( zSig2 ) set_float_exception_inexact_flag();
   1069     if ( increment ) {
   1070         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1071         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1072     }
   1073     else {
   1074         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1075     }
   1076     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1077 
   1078 }
   1079 
   1080 /*
   1081 -------------------------------------------------------------------------------
   1082 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1083 and significand formed by the concatenation of `zSig0' and `zSig1', and
   1084 returns the proper quadruple-precision floating-point value corresponding
   1085 to the abstract input.  This routine is just like `roundAndPackFloat128'
   1086 except that the input significand has fewer bits and does not have to be
   1087 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1088 point exponent.
   1089 -------------------------------------------------------------------------------
   1090 */
   1091 static float128
   1092  normalizeRoundAndPackFloat128(
   1093      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
   1094 {
   1095     int8 shiftCount;
   1096     bits64 zSig2;
   1097 
   1098     if ( zSig0 == 0 ) {
   1099         zSig0 = zSig1;
   1100         zSig1 = 0;
   1101         zExp -= 64;
   1102     }
   1103     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1104     if ( 0 <= shiftCount ) {
   1105         zSig2 = 0;
   1106         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1107     }
   1108     else {
   1109         shift128ExtraRightJamming(
   1110             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1111     }
   1112     zExp -= shiftCount;
   1113     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   1114 
   1115 }
   1116 
   1117 #endif
   1118 
   1119 /*
   1120 -------------------------------------------------------------------------------
   1121 Returns the result of converting the 32-bit two's complement integer `a'
   1122 to the single-precision floating-point format.  The conversion is performed
   1123 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1124 -------------------------------------------------------------------------------
   1125 */
   1126 float32 int32_to_float32( int32 a )
   1127 {
   1128     flag zSign;
   1129 
   1130     if ( a == 0 ) return 0;
   1131     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1132     zSign = ( a < 0 );
   1133     return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
   1134 
   1135 }
   1136 
   1137 float32 uint32_to_float32( uint32 a )
   1138 {
   1139     if ( a == 0 ) return 0;
   1140     if ( a & (bits32) 0x80000000 )
   1141         return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
   1142     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
   1143 }
   1144 
   1145 
   1146 /*
   1147 -------------------------------------------------------------------------------
   1148 Returns the result of converting the 32-bit two's complement integer `a'
   1149 to the double-precision floating-point format.  The conversion is performed
   1150 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1151 -------------------------------------------------------------------------------
   1152 */
   1153 float64 int32_to_float64( int32 a )
   1154 {
   1155     flag zSign;
   1156     uint32 absA;
   1157     int8 shiftCount;
   1158     bits64 zSig;
   1159 
   1160     if ( a == 0 ) return 0;
   1161     zSign = ( a < 0 );
   1162     absA = zSign ? - a : a;
   1163     shiftCount = countLeadingZeros32( absA ) + 21;
   1164     zSig = absA;
   1165     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1166 
   1167 }
   1168 
   1169 float64 uint32_to_float64( uint32 a )
   1170 {
   1171     int8 shiftCount;
   1172     bits64 zSig = a;
   1173 
   1174     if ( a == 0 ) return 0;
   1175     shiftCount = countLeadingZeros32( a ) + 21;
   1176     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
   1177 
   1178 }
   1179 
   1180 #ifdef FLOATX80
   1181 
   1182 /*
   1183 -------------------------------------------------------------------------------
   1184 Returns the result of converting the 32-bit two's complement integer `a'
   1185 to the extended double-precision floating-point format.  The conversion
   1186 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1187 Arithmetic.
   1188 -------------------------------------------------------------------------------
   1189 */
   1190 floatx80 int32_to_floatx80( int32 a )
   1191 {
   1192     flag zSign;
   1193     uint32 absA;
   1194     int8 shiftCount;
   1195     bits64 zSig;
   1196 
   1197     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1198     zSign = ( a < 0 );
   1199     absA = zSign ? - a : a;
   1200     shiftCount = countLeadingZeros32( absA ) + 32;
   1201     zSig = absA;
   1202     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1203 
   1204 }
   1205 
   1206 floatx80 uint32_to_floatx80( uint32 a )
   1207 {
   1208     int8 shiftCount;
   1209     bits64 zSig = a;
   1210 
   1211     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1212     shiftCount = countLeadingZeros32( a ) + 32;
   1213     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
   1214 
   1215 }
   1216 
   1217 #endif
   1218 
   1219 #ifdef FLOAT128
   1220 
   1221 /*
   1222 -------------------------------------------------------------------------------
   1223 Returns the result of converting the 32-bit two's complement integer `a' to
   1224 the quadruple-precision floating-point format.  The conversion is performed
   1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1226 -------------------------------------------------------------------------------
   1227 */
   1228 float128 int32_to_float128( int32 a )
   1229 {
   1230     flag zSign;
   1231     uint32 absA;
   1232     int8 shiftCount;
   1233     bits64 zSig0;
   1234 
   1235     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1236     zSign = ( a < 0 );
   1237     absA = zSign ? - a : a;
   1238     shiftCount = countLeadingZeros32( absA ) + 17;
   1239     zSig0 = absA;
   1240     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1241 
   1242 }
   1243 
   1244 float128 uint32_to_float128( uint32 a )
   1245 {
   1246     int8 shiftCount;
   1247     bits64 zSig0 = a;
   1248 
   1249     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1250     shiftCount = countLeadingZeros32( a ) + 17;
   1251     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1252 
   1253 }
   1254 
   1255 #endif
   1256 
   1257 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
   1258 /*
   1259 -------------------------------------------------------------------------------
   1260 Returns the result of converting the 64-bit two's complement integer `a'
   1261 to the single-precision floating-point format.  The conversion is performed
   1262 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1263 -------------------------------------------------------------------------------
   1264 */
   1265 float32 int64_to_float32( int64 a )
   1266 {
   1267     flag zSign;
   1268     uint64 absA;
   1269     int8 shiftCount;
   1270 
   1271     if ( a == 0 ) return 0;
   1272     zSign = ( a < 0 );
   1273     absA = zSign ? - a : a;
   1274     shiftCount = countLeadingZeros64( absA ) - 40;
   1275     if ( 0 <= shiftCount ) {
   1276         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1277     }
   1278     else {
   1279         shiftCount += 7;
   1280         if ( shiftCount < 0 ) {
   1281             shift64RightJamming( absA, - shiftCount, &absA );
   1282         }
   1283         else {
   1284             absA <<= shiftCount;
   1285         }
   1286         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
   1287     }
   1288 
   1289 }
   1290 
   1291 /*
   1292 -------------------------------------------------------------------------------
   1293 Returns the result of converting the 64-bit two's complement integer `a'
   1294 to the double-precision floating-point format.  The conversion is performed
   1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1296 -------------------------------------------------------------------------------
   1297 */
   1298 float64 int64_to_float64( int64 a )
   1299 {
   1300     flag zSign;
   1301 
   1302     if ( a == 0 ) return 0;
   1303     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1304         return packFloat64( 1, 0x43E, 0 );
   1305     }
   1306     zSign = ( a < 0 );
   1307     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
   1308 
   1309 }
   1310 
   1311 #ifdef FLOATX80
   1312 
   1313 /*
   1314 -------------------------------------------------------------------------------
   1315 Returns the result of converting the 64-bit two's complement integer `a'
   1316 to the extended double-precision floating-point format.  The conversion
   1317 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1318 Arithmetic.
   1319 -------------------------------------------------------------------------------
   1320 */
   1321 floatx80 int64_to_floatx80( int64 a )
   1322 {
   1323     flag zSign;
   1324     uint64 absA;
   1325     int8 shiftCount;
   1326 
   1327     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1328     zSign = ( a < 0 );
   1329     absA = zSign ? - a : a;
   1330     shiftCount = countLeadingZeros64( absA );
   1331     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1332 
   1333 }
   1334 
   1335 #endif
   1336 
   1337 #endif /* !SOFTFLOAT_FOR_GCC */
   1338 
   1339 #ifdef FLOAT128
   1340 
   1341 /*
   1342 -------------------------------------------------------------------------------
   1343 Returns the result of converting the 64-bit two's complement integer `a' to
   1344 the quadruple-precision floating-point format.  The conversion is performed
   1345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1346 -------------------------------------------------------------------------------
   1347 */
   1348 float128 int64_to_float128( int64 a )
   1349 {
   1350     flag zSign;
   1351     uint64 absA;
   1352     int8 shiftCount;
   1353     int32 zExp;
   1354     bits64 zSig0, zSig1;
   1355 
   1356     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1357     zSign = ( a < 0 );
   1358     absA = zSign ? - a : a;
   1359     shiftCount = countLeadingZeros64( absA ) + 49;
   1360     zExp = 0x406E - shiftCount;
   1361     if ( 64 <= shiftCount ) {
   1362         zSig1 = 0;
   1363         zSig0 = absA;
   1364         shiftCount -= 64;
   1365     }
   1366     else {
   1367         zSig1 = absA;
   1368         zSig0 = 0;
   1369     }
   1370     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1371     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1372 
   1373 }
   1374 
   1375 #endif
   1376 
   1377 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1378 /*
   1379 -------------------------------------------------------------------------------
   1380 Returns the result of converting the single-precision floating-point value
   1381 `a' to the 32-bit two's complement integer format.  The conversion is
   1382 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1383 Arithmetic---which means in particular that the conversion is rounded
   1384 according to the current rounding mode.  If `a' is a NaN, the largest
   1385 positive integer is returned.  Otherwise, if the conversion overflows, the
   1386 largest integer with the same sign as `a' is returned.
   1387 -------------------------------------------------------------------------------
   1388 */
   1389 int32 float32_to_int32( float32 a )
   1390 {
   1391     flag aSign;
   1392     int16 aExp, shiftCount;
   1393     bits32 aSig;
   1394     bits64 aSig64;
   1395 
   1396     aSig = extractFloat32Frac( a );
   1397     aExp = extractFloat32Exp( a );
   1398     aSign = extractFloat32Sign( a );
   1399     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1400     if ( aExp ) aSig |= 0x00800000;
   1401     shiftCount = 0xAF - aExp;
   1402     aSig64 = aSig;
   1403     aSig64 <<= 32;
   1404     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1405     return roundAndPackInt32( aSign, aSig64 );
   1406 
   1407 }
   1408 #endif /* !SOFTFLOAT_FOR_GCC */
   1409 
   1410 /*
   1411 -------------------------------------------------------------------------------
   1412 Returns the result of converting the single-precision floating-point value
   1413 `a' to the 32-bit two's complement integer format.  The conversion is
   1414 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1415 Arithmetic, except that the conversion is always rounded toward zero.
   1416 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1417 the conversion overflows, the largest integer with the same sign as `a' is
   1418 returned.
   1419 -------------------------------------------------------------------------------
   1420 */
   1421 int32 float32_to_int32_round_to_zero( float32 a )
   1422 {
   1423     flag aSign;
   1424     int16 aExp, shiftCount;
   1425     bits32 aSig;
   1426     int32 z;
   1427 
   1428     aSig = extractFloat32Frac( a );
   1429     aExp = extractFloat32Exp( a );
   1430     aSign = extractFloat32Sign( a );
   1431     shiftCount = aExp - 0x9E;
   1432     if ( 0 <= shiftCount ) {
   1433         if ( a != 0xCF000000 ) {
   1434             float_raise( float_flag_invalid );
   1435             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1436         }
   1437         return (sbits32) 0x80000000;
   1438     }
   1439     else if ( aExp <= 0x7E ) {
   1440         if ( aExp | aSig ) set_float_exception_inexact_flag();
   1441         return 0;
   1442     }
   1443     aSig = ( aSig | 0x00800000 )<<8;
   1444     z = aSig>>( - shiftCount );
   1445     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1446         set_float_exception_inexact_flag();
   1447     }
   1448     if ( aSign ) z = - z;
   1449     return z;
   1450 
   1451 }
   1452 
   1453 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
   1454 /*
   1455 -------------------------------------------------------------------------------
   1456 Returns the result of converting the single-precision floating-point value
   1457 `a' to the 64-bit two's complement integer format.  The conversion is
   1458 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1459 Arithmetic---which means in particular that the conversion is rounded
   1460 according to the current rounding mode.  If `a' is a NaN, the largest
   1461 positive integer is returned.  Otherwise, if the conversion overflows, the
   1462 largest integer with the same sign as `a' is returned.
   1463 -------------------------------------------------------------------------------
   1464 */
   1465 int64 float32_to_int64( float32 a )
   1466 {
   1467     flag aSign;
   1468     int16 aExp, shiftCount;
   1469     bits32 aSig;
   1470     bits64 aSig64, aSigExtra;
   1471 
   1472     aSig = extractFloat32Frac( a );
   1473     aExp = extractFloat32Exp( a );
   1474     aSign = extractFloat32Sign( a );
   1475     shiftCount = 0xBE - aExp;
   1476     if ( shiftCount < 0 ) {
   1477         float_raise( float_flag_invalid );
   1478         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1479             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1480         }
   1481         return (sbits64) LIT64( 0x8000000000000000 );
   1482     }
   1483     if ( aExp ) aSig |= 0x00800000;
   1484     aSig64 = aSig;
   1485     aSig64 <<= 40;
   1486     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1487     return roundAndPackInt64( aSign, aSig64, aSigExtra );
   1488 
   1489 }
   1490 
   1491 /*
   1492 -------------------------------------------------------------------------------
   1493 Returns the result of converting the single-precision floating-point value
   1494 `a' to the 64-bit two's complement integer format.  The conversion is
   1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1496 Arithmetic, except that the conversion is always rounded toward zero.  If
   1497 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1498 conversion overflows, the largest integer with the same sign as `a' is
   1499 returned.
   1500 -------------------------------------------------------------------------------
   1501 */
   1502 int64 float32_to_int64_round_to_zero( float32 a )
   1503 {
   1504     flag aSign;
   1505     int16 aExp, shiftCount;
   1506     bits32 aSig;
   1507     bits64 aSig64;
   1508     int64 z;
   1509 
   1510     aSig = extractFloat32Frac( a );
   1511     aExp = extractFloat32Exp( a );
   1512     aSign = extractFloat32Sign( a );
   1513     shiftCount = aExp - 0xBE;
   1514     if ( 0 <= shiftCount ) {
   1515         if ( a != 0xDF000000 ) {
   1516             float_raise( float_flag_invalid );
   1517             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1518                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1519             }
   1520         }
   1521         return (sbits64) LIT64( 0x8000000000000000 );
   1522     }
   1523     else if ( aExp <= 0x7E ) {
   1524         if ( aExp | aSig ) set_float_exception_inexact_flag();
   1525         return 0;
   1526     }
   1527     aSig64 = aSig | 0x00800000;
   1528     aSig64 <<= 40;
   1529     z = aSig64>>( - shiftCount );
   1530     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1531         set_float_exception_inexact_flag();
   1532     }
   1533     if ( aSign ) z = - z;
   1534     return z;
   1535 
   1536 }
   1537 #endif /* !SOFTFLOAT_FOR_GCC */
   1538 
   1539 /*
   1540 -------------------------------------------------------------------------------
   1541 Returns the result of converting the single-precision floating-point value
   1542 `a' to the double-precision floating-point format.  The conversion is
   1543 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1544 Arithmetic.
   1545 -------------------------------------------------------------------------------
   1546 */
   1547 float64 float32_to_float64( float32 a )
   1548 {
   1549     flag aSign;
   1550     int16 aExp;
   1551     bits32 aSig;
   1552 
   1553     aSig = extractFloat32Frac( a );
   1554     aExp = extractFloat32Exp( a );
   1555     aSign = extractFloat32Sign( a );
   1556     if ( aExp == 0xFF ) {
   1557         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
   1558         return packFloat64( aSign, 0x7FF, 0 );
   1559     }
   1560     if ( aExp == 0 ) {
   1561         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1562         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1563         --aExp;
   1564     }
   1565     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1566 
   1567 }
   1568 
   1569 #ifdef FLOATX80
   1570 
   1571 /*
   1572 -------------------------------------------------------------------------------
   1573 Returns the result of converting the single-precision floating-point value
   1574 `a' to the extended double-precision floating-point format.  The conversion
   1575 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1576 Arithmetic.
   1577 -------------------------------------------------------------------------------
   1578 */
   1579 floatx80 float32_to_floatx80( float32 a )
   1580 {
   1581     flag aSign;
   1582     int16 aExp;
   1583     bits32 aSig;
   1584 
   1585     aSig = extractFloat32Frac( a );
   1586     aExp = extractFloat32Exp( a );
   1587     aSign = extractFloat32Sign( a );
   1588     if ( aExp == 0xFF ) {
   1589         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
   1590         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1591     }
   1592     if ( aExp == 0 ) {
   1593         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1594         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1595     }
   1596     aSig |= 0x00800000;
   1597     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1598 
   1599 }
   1600 
   1601 #endif
   1602 
   1603 #ifdef FLOAT128
   1604 
   1605 /*
   1606 -------------------------------------------------------------------------------
   1607 Returns the result of converting the single-precision floating-point value
   1608 `a' to the double-precision floating-point format.  The conversion is
   1609 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1610 Arithmetic.
   1611 -------------------------------------------------------------------------------
   1612 */
   1613 float128 float32_to_float128( float32 a )
   1614 {
   1615     flag aSign;
   1616     int16 aExp;
   1617     bits32 aSig;
   1618 
   1619     aSig = extractFloat32Frac( a );
   1620     aExp = extractFloat32Exp( a );
   1621     aSign = extractFloat32Sign( a );
   1622     if ( aExp == 0xFF ) {
   1623         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
   1624         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1625     }
   1626     if ( aExp == 0 ) {
   1627         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1628         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1629         --aExp;
   1630     }
   1631     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1632 
   1633 }
   1634 
   1635 #endif
   1636 
   1637 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1638 /*
   1639 -------------------------------------------------------------------------------
   1640 Rounds the single-precision floating-point value `a' to an integer, and
   1641 returns the result as a single-precision floating-point value.  The
   1642 operation is performed according to the IEC/IEEE Standard for Binary
   1643 Floating-Point Arithmetic.
   1644 -------------------------------------------------------------------------------
   1645 */
   1646 float32 float32_round_to_int( float32 a )
   1647 {
   1648     flag aSign;
   1649     int16 aExp;
   1650     bits32 lastBitMask, roundBitsMask;
   1651     int8 roundingMode;
   1652     float32 z;
   1653 
   1654     aExp = extractFloat32Exp( a );
   1655     if ( 0x96 <= aExp ) {
   1656         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1657             return propagateFloat32NaN( a, a );
   1658         }
   1659         return a;
   1660     }
   1661     if ( aExp <= 0x7E ) {
   1662         if ( (bits32) ( a<<1 ) == 0 ) return a;
   1663         set_float_exception_inexact_flag();
   1664         aSign = extractFloat32Sign( a );
   1665         switch ( float_rounding_mode ) {
   1666          case float_round_nearest_even:
   1667             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1668                 return packFloat32( aSign, 0x7F, 0 );
   1669             }
   1670             break;
   1671          case float_round_to_zero:
   1672             break;
   1673          case float_round_down:
   1674             return aSign ? 0xBF800000 : 0;
   1675          case float_round_up:
   1676             return aSign ? 0x80000000 : 0x3F800000;
   1677         }
   1678         return packFloat32( aSign, 0, 0 );
   1679     }
   1680     lastBitMask = 1;
   1681     lastBitMask <<= 0x96 - aExp;
   1682     roundBitsMask = lastBitMask - 1;
   1683     z = a;
   1684     roundingMode = float_rounding_mode;
   1685     if ( roundingMode == float_round_nearest_even ) {
   1686         z += lastBitMask>>1;
   1687         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1688     }
   1689     else if ( roundingMode != float_round_to_zero ) {
   1690         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1691             z += roundBitsMask;
   1692         }
   1693     }
   1694     z &= ~ roundBitsMask;
   1695     if ( z != a ) set_float_exception_inexact_flag();
   1696     return z;
   1697 
   1698 }
   1699 #endif /* !SOFTFLOAT_FOR_GCC */
   1700 
   1701 /*
   1702 -------------------------------------------------------------------------------
   1703 Returns the result of adding the absolute values of the single-precision
   1704 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1705 before being returned.  `zSign' is ignored if the result is a NaN.
   1706 The addition is performed according to the IEC/IEEE Standard for Binary
   1707 Floating-Point Arithmetic.
   1708 -------------------------------------------------------------------------------
   1709 */
   1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
   1711 {
   1712     int16 aExp, bExp, zExp;
   1713     bits32 aSig, bSig, zSig;
   1714     int16 expDiff;
   1715 
   1716     aSig = extractFloat32Frac( a );
   1717     aExp = extractFloat32Exp( a );
   1718     bSig = extractFloat32Frac( b );
   1719     bExp = extractFloat32Exp( b );
   1720     expDiff = aExp - bExp;
   1721     aSig <<= 6;
   1722     bSig <<= 6;
   1723     if ( 0 < expDiff ) {
   1724         if ( aExp == 0xFF ) {
   1725             if ( aSig ) return propagateFloat32NaN( a, b );
   1726             return a;
   1727         }
   1728         if ( bExp == 0 ) {
   1729             --expDiff;
   1730         }
   1731         else {
   1732             bSig |= 0x20000000;
   1733         }
   1734         shift32RightJamming( bSig, expDiff, &bSig );
   1735         zExp = aExp;
   1736     }
   1737     else if ( expDiff < 0 ) {
   1738         if ( bExp == 0xFF ) {
   1739             if ( bSig ) return propagateFloat32NaN( a, b );
   1740             return packFloat32( zSign, 0xFF, 0 );
   1741         }
   1742         if ( aExp == 0 ) {
   1743             ++expDiff;
   1744         }
   1745         else {
   1746             aSig |= 0x20000000;
   1747         }
   1748         shift32RightJamming( aSig, - expDiff, &aSig );
   1749         zExp = bExp;
   1750     }
   1751     else {
   1752         if ( aExp == 0xFF ) {
   1753             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1754             return a;
   1755         }
   1756         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1757         zSig = 0x40000000 + aSig + bSig;
   1758         zExp = aExp;
   1759         goto roundAndPack;
   1760     }
   1761     aSig |= 0x20000000;
   1762     zSig = ( aSig + bSig )<<1;
   1763     --zExp;
   1764     if ( (sbits32) zSig < 0 ) {
   1765         zSig = aSig + bSig;
   1766         ++zExp;
   1767     }
   1768  roundAndPack:
   1769     return roundAndPackFloat32( zSign, zExp, zSig );
   1770 
   1771 }
   1772 
   1773 /*
   1774 -------------------------------------------------------------------------------
   1775 Returns the result of subtracting the absolute values of the single-
   1776 precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1777 difference is negated before being returned.  `zSign' is ignored if the
   1778 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1779 Standard for Binary Floating-Point Arithmetic.
   1780 -------------------------------------------------------------------------------
   1781 */
   1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
   1783 {
   1784     int16 aExp, bExp, zExp;
   1785     bits32 aSig, bSig, zSig;
   1786     int16 expDiff;
   1787 
   1788     aSig = extractFloat32Frac( a );
   1789     aExp = extractFloat32Exp( a );
   1790     bSig = extractFloat32Frac( b );
   1791     bExp = extractFloat32Exp( b );
   1792     expDiff = aExp - bExp;
   1793     aSig <<= 7;
   1794     bSig <<= 7;
   1795     if ( 0 < expDiff ) goto aExpBigger;
   1796     if ( expDiff < 0 ) goto bExpBigger;
   1797     if ( aExp == 0xFF ) {
   1798         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1799         float_raise( float_flag_invalid );
   1800         return float32_default_nan;
   1801     }
   1802     if ( aExp == 0 ) {
   1803         aExp = 1;
   1804         bExp = 1;
   1805     }
   1806     if ( bSig < aSig ) goto aBigger;
   1807     if ( aSig < bSig ) goto bBigger;
   1808     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
   1809  bExpBigger:
   1810     if ( bExp == 0xFF ) {
   1811         if ( bSig ) return propagateFloat32NaN( a, b );
   1812         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1813     }
   1814     if ( aExp == 0 ) {
   1815         ++expDiff;
   1816     }
   1817     else {
   1818         aSig |= 0x40000000;
   1819     }
   1820     shift32RightJamming( aSig, - expDiff, &aSig );
   1821     bSig |= 0x40000000;
   1822  bBigger:
   1823     zSig = bSig - aSig;
   1824     zExp = bExp;
   1825     zSign ^= 1;
   1826     goto normalizeRoundAndPack;
   1827  aExpBigger:
   1828     if ( aExp == 0xFF ) {
   1829         if ( aSig ) return propagateFloat32NaN( a, b );
   1830         return a;
   1831     }
   1832     if ( bExp == 0 ) {
   1833         --expDiff;
   1834     }
   1835     else {
   1836         bSig |= 0x40000000;
   1837     }
   1838     shift32RightJamming( bSig, expDiff, &bSig );
   1839     aSig |= 0x40000000;
   1840  aBigger:
   1841     zSig = aSig - bSig;
   1842     zExp = aExp;
   1843  normalizeRoundAndPack:
   1844     --zExp;
   1845     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
   1846 
   1847 }
   1848 
   1849 /*
   1850 -------------------------------------------------------------------------------
   1851 Returns the result of adding the single-precision floating-point values `a'
   1852 and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1853 Binary Floating-Point Arithmetic.
   1854 -------------------------------------------------------------------------------
   1855 */
   1856 float32 float32_add( float32 a, float32 b )
   1857 {
   1858     flag aSign, bSign;
   1859 
   1860     aSign = extractFloat32Sign( a );
   1861     bSign = extractFloat32Sign( b );
   1862     if ( aSign == bSign ) {
   1863         return addFloat32Sigs( a, b, aSign );
   1864     }
   1865     else {
   1866         return subFloat32Sigs( a, b, aSign );
   1867     }
   1868 
   1869 }
   1870 
   1871 /*
   1872 -------------------------------------------------------------------------------
   1873 Returns the result of subtracting the single-precision floating-point values
   1874 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1875 for Binary Floating-Point Arithmetic.
   1876 -------------------------------------------------------------------------------
   1877 */
   1878 float32 float32_sub( float32 a, float32 b )
   1879 {
   1880     flag aSign, bSign;
   1881 
   1882     aSign = extractFloat32Sign( a );
   1883     bSign = extractFloat32Sign( b );
   1884     if ( aSign == bSign ) {
   1885         return subFloat32Sigs( a, b, aSign );
   1886     }
   1887     else {
   1888         return addFloat32Sigs( a, b, aSign );
   1889     }
   1890 
   1891 }
   1892 
   1893 /*
   1894 -------------------------------------------------------------------------------
   1895 Returns the result of multiplying the single-precision floating-point values
   1896 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1897 for Binary Floating-Point Arithmetic.
   1898 -------------------------------------------------------------------------------
   1899 */
   1900 float32 float32_mul( float32 a, float32 b )
   1901 {
   1902     flag aSign, bSign, zSign;
   1903     int16 aExp, bExp, zExp;
   1904     bits32 aSig, bSig;
   1905     bits64 zSig64;
   1906     bits32 zSig;
   1907 
   1908     aSig = extractFloat32Frac( a );
   1909     aExp = extractFloat32Exp( a );
   1910     aSign = extractFloat32Sign( a );
   1911     bSig = extractFloat32Frac( b );
   1912     bExp = extractFloat32Exp( b );
   1913     bSign = extractFloat32Sign( b );
   1914     zSign = aSign ^ bSign;
   1915     if ( aExp == 0xFF ) {
   1916         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1917             return propagateFloat32NaN( a, b );
   1918         }
   1919         if ( ( bExp | bSig ) == 0 ) {
   1920             float_raise( float_flag_invalid );
   1921             return float32_default_nan;
   1922         }
   1923         return packFloat32( zSign, 0xFF, 0 );
   1924     }
   1925     if ( bExp == 0xFF ) {
   1926         if ( bSig ) return propagateFloat32NaN( a, b );
   1927         if ( ( aExp | aSig ) == 0 ) {
   1928             float_raise( float_flag_invalid );
   1929             return float32_default_nan;
   1930         }
   1931         return packFloat32( zSign, 0xFF, 0 );
   1932     }
   1933     if ( aExp == 0 ) {
   1934         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1935         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1936     }
   1937     if ( bExp == 0 ) {
   1938         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1939         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1940     }
   1941     zExp = aExp + bExp - 0x7F;
   1942     aSig = ( aSig | 0x00800000 )<<7;
   1943     bSig = ( bSig | 0x00800000 )<<8;
   1944     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1945     zSig = (bits32)zSig64;
   1946     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1947         zSig <<= 1;
   1948         --zExp;
   1949     }
   1950     return roundAndPackFloat32( zSign, zExp, zSig );
   1951 
   1952 }
   1953 
   1954 /*
   1955 -------------------------------------------------------------------------------
   1956 Returns the result of dividing the single-precision floating-point value `a'
   1957 by the corresponding value `b'.  The operation is performed according to the
   1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1959 -------------------------------------------------------------------------------
   1960 */
   1961 float32 float32_div( float32 a, float32 b )
   1962 {
   1963     flag aSign, bSign, zSign;
   1964     int16 aExp, bExp, zExp;
   1965     bits32 aSig, bSig, zSig;
   1966 
   1967     aSig = extractFloat32Frac( a );
   1968     aExp = extractFloat32Exp( a );
   1969     aSign = extractFloat32Sign( a );
   1970     bSig = extractFloat32Frac( b );
   1971     bExp = extractFloat32Exp( b );
   1972     bSign = extractFloat32Sign( b );
   1973     zSign = aSign ^ bSign;
   1974     if ( aExp == 0xFF ) {
   1975         if ( aSig ) return propagateFloat32NaN( a, b );
   1976         if ( bExp == 0xFF ) {
   1977             if ( bSig ) return propagateFloat32NaN( a, b );
   1978             float_raise( float_flag_invalid );
   1979             return float32_default_nan;
   1980         }
   1981         return packFloat32( zSign, 0xFF, 0 );
   1982     }
   1983     if ( bExp == 0xFF ) {
   1984         if ( bSig ) return propagateFloat32NaN( a, b );
   1985         return packFloat32( zSign, 0, 0 );
   1986     }
   1987     if ( bExp == 0 ) {
   1988         if ( bSig == 0 ) {
   1989             if ( ( aExp | aSig ) == 0 ) {
   1990                 float_raise( float_flag_invalid );
   1991                 return float32_default_nan;
   1992             }
   1993             float_raise( float_flag_divbyzero );
   1994             return packFloat32( zSign, 0xFF, 0 );
   1995         }
   1996         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1997     }
   1998     if ( aExp == 0 ) {
   1999         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   2000         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2001     }
   2002     zExp = aExp - bExp + 0x7D;
   2003     aSig = ( aSig | 0x00800000 )<<7;
   2004     bSig = ( bSig | 0x00800000 )<<8;
   2005     if ( bSig <= ( aSig + aSig ) ) {
   2006         aSig >>= 1;
   2007         ++zExp;
   2008     }
   2009     zSig = (bits32)((((bits64) aSig) << 32) / bSig);
   2010     if ( ( zSig & 0x3F ) == 0 ) {
   2011         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   2012     }
   2013     return roundAndPackFloat32( zSign, zExp, zSig );
   2014 
   2015 }
   2016 
   2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2018 /*
   2019 -------------------------------------------------------------------------------
   2020 Returns the remainder of the single-precision floating-point value `a'
   2021 with respect to the corresponding value `b'.  The operation is performed
   2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2023 -------------------------------------------------------------------------------
   2024 */
   2025 float32 float32_rem( float32 a, float32 b )
   2026 {
   2027     flag aSign, bSign, zSign;
   2028     int16 aExp, bExp, expDiff;
   2029     bits32 aSig, bSig;
   2030     bits32 q;
   2031     bits64 aSig64, bSig64, q64;
   2032     bits32 alternateASig;
   2033     sbits32 sigMean;
   2034 
   2035     aSig = extractFloat32Frac( a );
   2036     aExp = extractFloat32Exp( a );
   2037     aSign = extractFloat32Sign( a );
   2038     bSig = extractFloat32Frac( b );
   2039     bExp = extractFloat32Exp( b );
   2040     bSign = extractFloat32Sign( b );
   2041     if ( aExp == 0xFF ) {
   2042         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   2043             return propagateFloat32NaN( a, b );
   2044         }
   2045         float_raise( float_flag_invalid );
   2046         return float32_default_nan;
   2047     }
   2048     if ( bExp == 0xFF ) {
   2049         if ( bSig ) return propagateFloat32NaN( a, b );
   2050         return a;
   2051     }
   2052     if ( bExp == 0 ) {
   2053         if ( bSig == 0 ) {
   2054             float_raise( float_flag_invalid );
   2055             return float32_default_nan;
   2056         }
   2057         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2058     }
   2059     if ( aExp == 0 ) {
   2060         if ( aSig == 0 ) return a;
   2061         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2062     }
   2063     expDiff = aExp - bExp;
   2064     aSig |= 0x00800000;
   2065     bSig |= 0x00800000;
   2066     if ( expDiff < 32 ) {
   2067         aSig <<= 8;
   2068         bSig <<= 8;
   2069         if ( expDiff < 0 ) {
   2070             if ( expDiff < -1 ) return a;
   2071             aSig >>= 1;
   2072         }
   2073         q = ( bSig <= aSig );
   2074         if ( q ) aSig -= bSig;
   2075         if ( 0 < expDiff ) {
   2076             q = ( ( (bits64) aSig )<<32 ) / bSig;
   2077             q >>= 32 - expDiff;
   2078             bSig >>= 2;
   2079             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2080         }
   2081         else {
   2082             aSig >>= 2;
   2083             bSig >>= 2;
   2084         }
   2085     }
   2086     else {
   2087         if ( bSig <= aSig ) aSig -= bSig;
   2088         aSig64 = ( (bits64) aSig )<<40;
   2089         bSig64 = ( (bits64) bSig )<<40;
   2090         expDiff -= 64;
   2091         while ( 0 < expDiff ) {
   2092             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2093             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2094             aSig64 = - ( ( bSig * q64 )<<38 );
   2095             expDiff -= 62;
   2096         }
   2097         expDiff += 64;
   2098         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2099         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2100         q = q64>>( 64 - expDiff );
   2101         bSig <<= 6;
   2102         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2103     }
   2104     do {
   2105         alternateASig = aSig;
   2106         ++q;
   2107         aSig -= bSig;
   2108     } while ( 0 <= (sbits32) aSig );
   2109     sigMean = aSig + alternateASig;
   2110     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2111         aSig = alternateASig;
   2112     }
   2113     zSign = ( (sbits32) aSig < 0 );
   2114     if ( zSign ) aSig = - aSig;
   2115     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
   2116 
   2117 }
   2118 #endif /* !SOFTFLOAT_FOR_GCC */
   2119 
   2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2121 /*
   2122 -------------------------------------------------------------------------------
   2123 Returns the square root of the single-precision floating-point value `a'.
   2124 The operation is performed according to the IEC/IEEE Standard for Binary
   2125 Floating-Point Arithmetic.
   2126 -------------------------------------------------------------------------------
   2127 */
   2128 float32 float32_sqrt( float32 a )
   2129 {
   2130     flag aSign;
   2131     int16 aExp, zExp;
   2132     bits32 aSig, zSig;
   2133     bits64 rem, term;
   2134 
   2135     aSig = extractFloat32Frac( a );
   2136     aExp = extractFloat32Exp( a );
   2137     aSign = extractFloat32Sign( a );
   2138     if ( aExp == 0xFF ) {
   2139         if ( aSig ) return propagateFloat32NaN( a, 0 );
   2140         if ( ! aSign ) return a;
   2141         float_raise( float_flag_invalid );
   2142         return float32_default_nan;
   2143     }
   2144     if ( aSign ) {
   2145         if ( ( aExp | aSig ) == 0 ) return a;
   2146         float_raise( float_flag_invalid );
   2147         return float32_default_nan;
   2148     }
   2149     if ( aExp == 0 ) {
   2150         if ( aSig == 0 ) return 0;
   2151         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2152     }
   2153     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2154     aSig = ( aSig | 0x00800000 )<<8;
   2155     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2156     if ( ( zSig & 0x7F ) <= 5 ) {
   2157         if ( zSig < 2 ) {
   2158             zSig = 0x7FFFFFFF;
   2159             goto roundAndPack;
   2160         }
   2161         aSig >>= aExp & 1;
   2162         term = ( (bits64) zSig ) * zSig;
   2163         rem = ( ( (bits64) aSig )<<32 ) - term;
   2164         while ( (sbits64) rem < 0 ) {
   2165             --zSig;
   2166             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2167         }
   2168         zSig |= ( rem != 0 );
   2169     }
   2170     shift32RightJamming( zSig, 1, &zSig );
   2171  roundAndPack:
   2172     return roundAndPackFloat32( 0, zExp, zSig );
   2173 
   2174 }
   2175 #endif /* !SOFTFLOAT_FOR_GCC */
   2176 
   2177 /*
   2178 -------------------------------------------------------------------------------
   2179 Returns 1 if the single-precision floating-point value `a' is equal to
   2180 the corresponding value `b', and 0 otherwise.  The comparison is performed
   2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2182 -------------------------------------------------------------------------------
   2183 */
   2184 flag float32_eq( float32 a, float32 b )
   2185 {
   2186 
   2187     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2188          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2189        ) {
   2190         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2191             float_raise( float_flag_invalid );
   2192         }
   2193         return 0;
   2194     }
   2195     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2196 
   2197 }
   2198 
   2199 /*
   2200 -------------------------------------------------------------------------------
   2201 Returns 1 if the single-precision floating-point value `a' is less than
   2202 or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2204 Arithmetic.
   2205 -------------------------------------------------------------------------------
   2206 */
   2207 flag float32_le( float32 a, float32 b )
   2208 {
   2209     flag aSign, bSign;
   2210 
   2211     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2212          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2213        ) {
   2214         float_raise( float_flag_invalid );
   2215         return 0;
   2216     }
   2217     aSign = extractFloat32Sign( a );
   2218     bSign = extractFloat32Sign( b );
   2219     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2220     return ( a == b ) || ( aSign ^ ( a < b ) );
   2221 
   2222 }
   2223 
   2224 /*
   2225 -------------------------------------------------------------------------------
   2226 Returns 1 if the single-precision floating-point value `a' is less than
   2227 the corresponding value `b', and 0 otherwise.  The comparison is performed
   2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2229 -------------------------------------------------------------------------------
   2230 */
   2231 flag float32_lt( float32 a, float32 b )
   2232 {
   2233     flag aSign, bSign;
   2234 
   2235     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2236          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2237        ) {
   2238         float_raise( float_flag_invalid );
   2239         return 0;
   2240     }
   2241     aSign = extractFloat32Sign( a );
   2242     bSign = extractFloat32Sign( b );
   2243     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2244     return ( a != b ) && ( aSign ^ ( a < b ) );
   2245 
   2246 }
   2247 
   2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2249 /*
   2250 -------------------------------------------------------------------------------
   2251 Returns 1 if the single-precision floating-point value `a' is equal to
   2252 the corresponding value `b', and 0 otherwise.  The invalid exception is
   2253 raised if either operand is a NaN.  Otherwise, the comparison is performed
   2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2255 -------------------------------------------------------------------------------
   2256 */
   2257 flag float32_eq_signaling( float32 a, float32 b )
   2258 {
   2259 
   2260     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2261          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2262        ) {
   2263         float_raise( float_flag_invalid );
   2264         return 0;
   2265     }
   2266     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2267 
   2268 }
   2269 
   2270 /*
   2271 -------------------------------------------------------------------------------
   2272 Returns 1 if the single-precision floating-point value `a' is less than or
   2273 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2274 cause an exception.  Otherwise, the comparison is performed according to the
   2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2276 -------------------------------------------------------------------------------
   2277 */
   2278 flag float32_le_quiet( float32 a, float32 b )
   2279 {
   2280     flag aSign, bSign;
   2281 
   2282     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2283          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2284        ) {
   2285         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2286             float_raise( float_flag_invalid );
   2287         }
   2288         return 0;
   2289     }
   2290     aSign = extractFloat32Sign( a );
   2291     bSign = extractFloat32Sign( b );
   2292     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2293     return ( a == b ) || ( aSign ^ ( a < b ) );
   2294 
   2295 }
   2296 
   2297 /*
   2298 -------------------------------------------------------------------------------
   2299 Returns 1 if the single-precision floating-point value `a' is less than
   2300 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2301 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2302 Standard for Binary Floating-Point Arithmetic.
   2303 -------------------------------------------------------------------------------
   2304 */
   2305 flag float32_lt_quiet( float32 a, float32 b )
   2306 {
   2307     flag aSign, bSign;
   2308 
   2309     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2310          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2311        ) {
   2312         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2313             float_raise( float_flag_invalid );
   2314         }
   2315         return 0;
   2316     }
   2317     aSign = extractFloat32Sign( a );
   2318     bSign = extractFloat32Sign( b );
   2319     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2320     return ( a != b ) && ( aSign ^ ( a < b ) );
   2321 
   2322 }
   2323 #endif /* !SOFTFLOAT_FOR_GCC */
   2324 
   2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2326 /*
   2327 -------------------------------------------------------------------------------
   2328 Returns the result of converting the double-precision floating-point value
   2329 `a' to the 32-bit two's complement integer format.  The conversion is
   2330 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2331 Arithmetic---which means in particular that the conversion is rounded
   2332 according to the current rounding mode.  If `a' is a NaN, the largest
   2333 positive integer is returned.  Otherwise, if the conversion overflows, the
   2334 largest integer with the same sign as `a' is returned.
   2335 -------------------------------------------------------------------------------
   2336 */
   2337 int32 float64_to_int32( float64 a )
   2338 {
   2339     flag aSign;
   2340     int16 aExp, shiftCount;
   2341     bits64 aSig;
   2342 
   2343     aSig = extractFloat64Frac( a );
   2344     aExp = extractFloat64Exp( a );
   2345     aSign = extractFloat64Sign( a );
   2346     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2347     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2348     shiftCount = 0x42C - aExp;
   2349     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2350     return roundAndPackInt32( aSign, aSig );
   2351 
   2352 }
   2353 #endif /* !SOFTFLOAT_FOR_GCC */
   2354 
   2355 /*
   2356 -------------------------------------------------------------------------------
   2357 Returns the result of converting the double-precision floating-point value
   2358 `a' to the 32-bit two's complement integer format.  The conversion is
   2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2360 Arithmetic, except that the conversion is always rounded toward zero.
   2361 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2362 the conversion overflows, the largest integer with the same sign as `a' is
   2363 returned.
   2364 -------------------------------------------------------------------------------
   2365 */
   2366 int32 float64_to_int32_round_to_zero( float64 a )
   2367 {
   2368     flag aSign;
   2369     int16 aExp, shiftCount;
   2370     bits64 aSig, savedASig;
   2371     int32 z;
   2372 
   2373     aSig = extractFloat64Frac( a );
   2374     aExp = extractFloat64Exp( a );
   2375     aSign = extractFloat64Sign( a );
   2376     if ( 0x41E < aExp ) {
   2377         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2378         goto invalid;
   2379     }
   2380     else if ( aExp < 0x3FF ) {
   2381         if ( aExp || aSig ) set_float_exception_inexact_flag();
   2382         return 0;
   2383     }
   2384     aSig |= LIT64( 0x0010000000000000 );
   2385     shiftCount = 0x433 - aExp;
   2386     savedASig = aSig;
   2387     aSig >>= shiftCount;
   2388     z = (int32)aSig;
   2389     if ( aSign ) z = - z;
   2390     if ( ( z < 0 ) ^ aSign ) {
   2391  invalid:
   2392         float_raise( float_flag_invalid );
   2393         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2394     }
   2395     if ( ( aSig<<shiftCount ) != savedASig ) {
   2396         set_float_exception_inexact_flag();
   2397     }
   2398     return z;
   2399 
   2400 }
   2401 
   2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2403 /*
   2404 -------------------------------------------------------------------------------
   2405 Returns the result of converting the double-precision floating-point value
   2406 `a' to the 64-bit two's complement integer format.  The conversion is
   2407 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2408 Arithmetic---which means in particular that the conversion is rounded
   2409 according to the current rounding mode.  If `a' is a NaN, the largest
   2410 positive integer is returned.  Otherwise, if the conversion overflows, the
   2411 largest integer with the same sign as `a' is returned.
   2412 -------------------------------------------------------------------------------
   2413 */
   2414 int64 float64_to_int64( float64 a )
   2415 {
   2416     flag aSign;
   2417     int16 aExp, shiftCount;
   2418     bits64 aSig, aSigExtra;
   2419 
   2420     aSig = extractFloat64Frac( a );
   2421     aExp = extractFloat64Exp( a );
   2422     aSign = extractFloat64Sign( a );
   2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2424     shiftCount = 0x433 - aExp;
   2425     if ( shiftCount <= 0 ) {
   2426         if ( 0x43E < aExp ) {
   2427             float_raise( float_flag_invalid );
   2428             if (    ! aSign
   2429                  || (    ( aExp == 0x7FF )
   2430                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2431                ) {
   2432                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2433             }
   2434             return (sbits64) LIT64( 0x8000000000000000 );
   2435         }
   2436         aSigExtra = 0;
   2437         aSig <<= - shiftCount;
   2438     }
   2439     else {
   2440         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2441     }
   2442     return roundAndPackInt64( aSign, aSig, aSigExtra );
   2443 
   2444 }
   2445 
   2446 /*
   2447 -------------------------------------------------------------------------------
   2448 Returns the result of converting the double-precision floating-point value
   2449 `a' to the 64-bit two's complement integer format.  The conversion is
   2450 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2451 Arithmetic, except that the conversion is always rounded toward zero.
   2452 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2453 the conversion overflows, the largest integer with the same sign as `a' is
   2454 returned.
   2455 -------------------------------------------------------------------------------
   2456 */
   2457 int64 float64_to_int64_round_to_zero( float64 a )
   2458 {
   2459     flag aSign;
   2460     int16 aExp, shiftCount;
   2461     bits64 aSig;
   2462     int64 z;
   2463 
   2464     aSig = extractFloat64Frac( a );
   2465     aExp = extractFloat64Exp( a );
   2466     aSign = extractFloat64Sign( a );
   2467     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2468     shiftCount = aExp - 0x433;
   2469     if ( 0 <= shiftCount ) {
   2470         if ( 0x43E <= aExp ) {
   2471             if ( a != LIT64( 0xC3E0000000000000 ) ) {
   2472                 float_raise( float_flag_invalid );
   2473                 if (    ! aSign
   2474                      || (    ( aExp == 0x7FF )
   2475                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2476                    ) {
   2477                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2478                 }
   2479             }
   2480             return (sbits64) LIT64( 0x8000000000000000 );
   2481         }
   2482         z = aSig<<shiftCount;
   2483     }
   2484     else {
   2485         if ( aExp < 0x3FE ) {
   2486             if ( aExp | aSig ) set_float_exception_inexact_flag();
   2487             return 0;
   2488         }
   2489         z = aSig>>( - shiftCount );
   2490         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2491             set_float_exception_inexact_flag();
   2492         }
   2493     }
   2494     if ( aSign ) z = - z;
   2495     return z;
   2496 
   2497 }
   2498 #endif /* !SOFTFLOAT_FOR_GCC */
   2499 
   2500 /*
   2501 -------------------------------------------------------------------------------
   2502 Returns the result of converting the double-precision floating-point value
   2503 `a' to the single-precision floating-point format.  The conversion is
   2504 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2505 Arithmetic.
   2506 -------------------------------------------------------------------------------
   2507 */
   2508 float32 float64_to_float32( float64 a )
   2509 {
   2510     flag aSign;
   2511     int16 aExp;
   2512     bits64 aSig;
   2513     bits32 zSig;
   2514 
   2515     aSig = extractFloat64Frac( a );
   2516     aExp = extractFloat64Exp( a );
   2517     aSign = extractFloat64Sign( a );
   2518     if ( aExp == 0x7FF ) {
   2519         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
   2520         return packFloat32( aSign, 0xFF, 0 );
   2521     }
   2522     shift64RightJamming( aSig, 22, &aSig );
   2523     zSig = (bits32)aSig;
   2524     if ( aExp || zSig ) {
   2525         zSig |= 0x40000000;
   2526         aExp -= 0x381;
   2527     }
   2528     return roundAndPackFloat32( aSign, aExp, zSig );
   2529 
   2530 }
   2531 
   2532 #ifdef FLOATX80
   2533 
   2534 /*
   2535 -------------------------------------------------------------------------------
   2536 Returns the result of converting the double-precision floating-point value
   2537 `a' to the extended double-precision floating-point format.  The conversion
   2538 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2539 Arithmetic.
   2540 -------------------------------------------------------------------------------
   2541 */
   2542 floatx80 float64_to_floatx80( float64 a )
   2543 {
   2544     flag aSign;
   2545     int16 aExp;
   2546     bits64 aSig;
   2547 
   2548     aSig = extractFloat64Frac( a );
   2549     aExp = extractFloat64Exp( a );
   2550     aSign = extractFloat64Sign( a );
   2551     if ( aExp == 0x7FF ) {
   2552         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
   2553         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2554     }
   2555     if ( aExp == 0 ) {
   2556         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2557         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2558     }
   2559     return
   2560         packFloatx80(
   2561             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2562 
   2563 }
   2564 
   2565 #endif
   2566 
   2567 #ifdef FLOAT128
   2568 
   2569 /*
   2570 -------------------------------------------------------------------------------
   2571 Returns the result of converting the double-precision floating-point value
   2572 `a' to the quadruple-precision floating-point format.  The conversion is
   2573 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2574 Arithmetic.
   2575 -------------------------------------------------------------------------------
   2576 */
   2577 float128 float64_to_float128( float64 a )
   2578 {
   2579     flag aSign;
   2580     int16 aExp;
   2581     bits64 aSig, zSig0, zSig1;
   2582 
   2583     aSig = extractFloat64Frac( a );
   2584     aExp = extractFloat64Exp( a );
   2585     aSign = extractFloat64Sign( a );
   2586     if ( aExp == 0x7FF ) {
   2587         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
   2588         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2589     }
   2590     if ( aExp == 0 ) {
   2591         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2592         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2593         --aExp;
   2594     }
   2595     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2596     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2597 
   2598 }
   2599 
   2600 #endif
   2601 
   2602 #ifndef SOFTFLOAT_FOR_GCC
   2603 /*
   2604 -------------------------------------------------------------------------------
   2605 Rounds the double-precision floating-point value `a' to an integer, and
   2606 returns the result as a double-precision floating-point value.  The
   2607 operation is performed according to the IEC/IEEE Standard for Binary
   2608 Floating-Point Arithmetic.
   2609 -------------------------------------------------------------------------------
   2610 */
   2611 float64 float64_round_to_int( float64 a )
   2612 {
   2613     flag aSign;
   2614     int16 aExp;
   2615     bits64 lastBitMask, roundBitsMask;
   2616     int8 roundingMode;
   2617     float64 z;
   2618 
   2619     aExp = extractFloat64Exp( a );
   2620     if ( 0x433 <= aExp ) {
   2621         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2622             return propagateFloat64NaN( a, a );
   2623         }
   2624         return a;
   2625     }
   2626     if ( aExp < 0x3FF ) {
   2627         if ( (bits64) ( a<<1 ) == 0 ) return a;
   2628         set_float_exception_inexact_flag();
   2629         aSign = extractFloat64Sign( a );
   2630         switch ( float_rounding_mode ) {
   2631          case float_round_nearest_even:
   2632             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2633                 return packFloat64( aSign, 0x3FF, 0 );
   2634             }
   2635             break;
   2636          case float_round_to_zero:
   2637             break;
   2638          case float_round_down:
   2639             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
   2640          case float_round_up:
   2641             return
   2642             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
   2643         }
   2644         return packFloat64( aSign, 0, 0 );
   2645     }
   2646     lastBitMask = 1;
   2647     lastBitMask <<= 0x433 - aExp;
   2648     roundBitsMask = lastBitMask - 1;
   2649     z = a;
   2650     roundingMode = float_rounding_mode;
   2651     if ( roundingMode == float_round_nearest_even ) {
   2652         z += lastBitMask>>1;
   2653         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2654     }
   2655     else if ( roundingMode != float_round_to_zero ) {
   2656         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   2657             z += roundBitsMask;
   2658         }
   2659     }
   2660     z &= ~ roundBitsMask;
   2661     if ( z != a ) set_float_exception_inexact_flag();
   2662     return z;
   2663 
   2664 }
   2665 #endif
   2666 
   2667 /*
   2668 -------------------------------------------------------------------------------
   2669 Returns the result of adding the absolute values of the double-precision
   2670 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2671 before being returned.  `zSign' is ignored if the result is a NaN.
   2672 The addition is performed according to the IEC/IEEE Standard for Binary
   2673 Floating-Point Arithmetic.
   2674 -------------------------------------------------------------------------------
   2675 */
   2676 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
   2677 {
   2678     int16 aExp, bExp, zExp;
   2679     bits64 aSig, bSig, zSig;
   2680     int16 expDiff;
   2681 
   2682     aSig = extractFloat64Frac( a );
   2683     aExp = extractFloat64Exp( a );
   2684     bSig = extractFloat64Frac( b );
   2685     bExp = extractFloat64Exp( b );
   2686     expDiff = aExp - bExp;
   2687     aSig <<= 9;
   2688     bSig <<= 9;
   2689     if ( 0 < expDiff ) {
   2690         if ( aExp == 0x7FF ) {
   2691             if ( aSig ) return propagateFloat64NaN( a, b );
   2692             return a;
   2693         }
   2694         if ( bExp == 0 ) {
   2695             --expDiff;
   2696         }
   2697         else {
   2698             bSig |= LIT64( 0x2000000000000000 );
   2699         }
   2700         shift64RightJamming( bSig, expDiff, &bSig );
   2701         zExp = aExp;
   2702     }
   2703     else if ( expDiff < 0 ) {
   2704         if ( bExp == 0x7FF ) {
   2705             if ( bSig ) return propagateFloat64NaN( a, b );
   2706             return packFloat64( zSign, 0x7FF, 0 );
   2707         }
   2708         if ( aExp == 0 ) {
   2709             ++expDiff;
   2710         }
   2711         else {
   2712             aSig |= LIT64( 0x2000000000000000 );
   2713         }
   2714         shift64RightJamming( aSig, - expDiff, &aSig );
   2715         zExp = bExp;
   2716     }
   2717     else {
   2718         if ( aExp == 0x7FF ) {
   2719             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2720             return a;
   2721         }
   2722         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2723         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2724         zExp = aExp;
   2725         goto roundAndPack;
   2726     }
   2727     aSig |= LIT64( 0x2000000000000000 );
   2728     zSig = ( aSig + bSig )<<1;
   2729     --zExp;
   2730     if ( (sbits64) zSig < 0 ) {
   2731         zSig = aSig + bSig;
   2732         ++zExp;
   2733     }
   2734  roundAndPack:
   2735     return roundAndPackFloat64( zSign, zExp, zSig );
   2736 
   2737 }
   2738 
   2739 /*
   2740 -------------------------------------------------------------------------------
   2741 Returns the result of subtracting the absolute values of the double-
   2742 precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2743 difference is negated before being returned.  `zSign' is ignored if the
   2744 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2745 Standard for Binary Floating-Point Arithmetic.
   2746 -------------------------------------------------------------------------------
   2747 */
   2748 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
   2749 {
   2750     int16 aExp, bExp, zExp;
   2751     bits64 aSig, bSig, zSig;
   2752     int16 expDiff;
   2753 
   2754     aSig = extractFloat64Frac( a );
   2755     aExp = extractFloat64Exp( a );
   2756     bSig = extractFloat64Frac( b );
   2757     bExp = extractFloat64Exp( b );
   2758     expDiff = aExp - bExp;
   2759     aSig <<= 10;
   2760     bSig <<= 10;
   2761     if ( 0 < expDiff ) goto aExpBigger;
   2762     if ( expDiff < 0 ) goto bExpBigger;
   2763     if ( aExp == 0x7FF ) {
   2764         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2765         float_raise( float_flag_invalid );
   2766         return float64_default_nan;
   2767     }
   2768     if ( aExp == 0 ) {
   2769         aExp = 1;
   2770         bExp = 1;
   2771     }
   2772     if ( bSig < aSig ) goto aBigger;
   2773     if ( aSig < bSig ) goto bBigger;
   2774     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
   2775  bExpBigger:
   2776     if ( bExp == 0x7FF ) {
   2777         if ( bSig ) return propagateFloat64NaN( a, b );
   2778         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2779     }
   2780     if ( aExp == 0 ) {
   2781         ++expDiff;
   2782     }
   2783     else {
   2784         aSig |= LIT64( 0x4000000000000000 );
   2785     }
   2786     shift64RightJamming( aSig, - expDiff, &aSig );
   2787     bSig |= LIT64( 0x4000000000000000 );
   2788  bBigger:
   2789     zSig = bSig - aSig;
   2790     zExp = bExp;
   2791     zSign ^= 1;
   2792     goto normalizeRoundAndPack;
   2793  aExpBigger:
   2794     if ( aExp == 0x7FF ) {
   2795         if ( aSig ) return propagateFloat64NaN( a, b );
   2796         return a;
   2797     }
   2798     if ( bExp == 0 ) {
   2799         --expDiff;
   2800     }
   2801     else {
   2802         bSig |= LIT64( 0x4000000000000000 );
   2803     }
   2804     shift64RightJamming( bSig, expDiff, &bSig );
   2805     aSig |= LIT64( 0x4000000000000000 );
   2806  aBigger:
   2807     zSig = aSig - bSig;
   2808     zExp = aExp;
   2809  normalizeRoundAndPack:
   2810     --zExp;
   2811     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
   2812 
   2813 }
   2814 
   2815 /*
   2816 -------------------------------------------------------------------------------
   2817 Returns the result of adding the double-precision floating-point values `a'
   2818 and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2819 Binary Floating-Point Arithmetic.
   2820 -------------------------------------------------------------------------------
   2821 */
   2822 float64 float64_add( float64 a, float64 b )
   2823 {
   2824     flag aSign, bSign;
   2825 
   2826     aSign = extractFloat64Sign( a );
   2827     bSign = extractFloat64Sign( b );
   2828     if ( aSign == bSign ) {
   2829         return addFloat64Sigs( a, b, aSign );
   2830     }
   2831     else {
   2832         return subFloat64Sigs( a, b, aSign );
   2833     }
   2834 
   2835 }
   2836 
   2837 /*
   2838 -------------------------------------------------------------------------------
   2839 Returns the result of subtracting the double-precision floating-point values
   2840 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2841 for Binary Floating-Point Arithmetic.
   2842 -------------------------------------------------------------------------------
   2843 */
   2844 float64 float64_sub( float64 a, float64 b )
   2845 {
   2846     flag aSign, bSign;
   2847 
   2848     aSign = extractFloat64Sign( a );
   2849     bSign = extractFloat64Sign( b );
   2850     if ( aSign == bSign ) {
   2851         return subFloat64Sigs( a, b, aSign );
   2852     }
   2853     else {
   2854         return addFloat64Sigs( a, b, aSign );
   2855     }
   2856 
   2857 }
   2858 
   2859 /*
   2860 -------------------------------------------------------------------------------
   2861 Returns the result of multiplying the double-precision floating-point values
   2862 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2863 for Binary Floating-Point Arithmetic.
   2864 -------------------------------------------------------------------------------
   2865 */
   2866 float64 float64_mul( float64 a, float64 b )
   2867 {
   2868     flag aSign, bSign, zSign;
   2869     int16 aExp, bExp, zExp;
   2870     bits64 aSig, bSig, zSig0, zSig1;
   2871 
   2872     aSig = extractFloat64Frac( a );
   2873     aExp = extractFloat64Exp( a );
   2874     aSign = extractFloat64Sign( a );
   2875     bSig = extractFloat64Frac( b );
   2876     bExp = extractFloat64Exp( b );
   2877     bSign = extractFloat64Sign( b );
   2878     zSign = aSign ^ bSign;
   2879     if ( aExp == 0x7FF ) {
   2880         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2881             return propagateFloat64NaN( a, b );
   2882         }
   2883         if ( ( bExp | bSig ) == 0 ) {
   2884             float_raise( float_flag_invalid );
   2885             return float64_default_nan;
   2886         }
   2887         return packFloat64( zSign, 0x7FF, 0 );
   2888     }
   2889     if ( bExp == 0x7FF ) {
   2890         if ( bSig ) return propagateFloat64NaN( a, b );
   2891         if ( ( aExp | aSig ) == 0 ) {
   2892             float_raise( float_flag_invalid );
   2893             return float64_default_nan;
   2894         }
   2895         return packFloat64( zSign, 0x7FF, 0 );
   2896     }
   2897     if ( aExp == 0 ) {
   2898         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2899         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2900     }
   2901     if ( bExp == 0 ) {
   2902         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2903         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2904     }
   2905     zExp = aExp + bExp - 0x3FF;
   2906     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2907     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2908     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2909     zSig0 |= ( zSig1 != 0 );
   2910     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2911         zSig0 <<= 1;
   2912         --zExp;
   2913     }
   2914     return roundAndPackFloat64( zSign, zExp, zSig0 );
   2915 
   2916 }
   2917 
   2918 /*
   2919 -------------------------------------------------------------------------------
   2920 Returns the result of dividing the double-precision floating-point value `a'
   2921 by the corresponding value `b'.  The operation is performed according to
   2922 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2923 -------------------------------------------------------------------------------
   2924 */
   2925 float64 float64_div( float64 a, float64 b )
   2926 {
   2927     flag aSign, bSign, zSign;
   2928     int16 aExp, bExp, zExp;
   2929     bits64 aSig, bSig, zSig;
   2930     bits64 rem0, rem1;
   2931     bits64 term0, term1;
   2932 
   2933     aSig = extractFloat64Frac( a );
   2934     aExp = extractFloat64Exp( a );
   2935     aSign = extractFloat64Sign( a );
   2936     bSig = extractFloat64Frac( b );
   2937     bExp = extractFloat64Exp( b );
   2938     bSign = extractFloat64Sign( b );
   2939     zSign = aSign ^ bSign;
   2940     if ( aExp == 0x7FF ) {
   2941         if ( aSig ) return propagateFloat64NaN( a, b );
   2942         if ( bExp == 0x7FF ) {
   2943             if ( bSig ) return propagateFloat64NaN( a, b );
   2944             float_raise( float_flag_invalid );
   2945             return float64_default_nan;
   2946         }
   2947         return packFloat64( zSign, 0x7FF, 0 );
   2948     }
   2949     if ( bExp == 0x7FF ) {
   2950         if ( bSig ) return propagateFloat64NaN( a, b );
   2951         return packFloat64( zSign, 0, 0 );
   2952     }
   2953     if ( bExp == 0 ) {
   2954         if ( bSig == 0 ) {
   2955             if ( ( aExp | aSig ) == 0 ) {
   2956                 float_raise( float_flag_invalid );
   2957                 return float64_default_nan;
   2958             }
   2959             float_raise( float_flag_divbyzero );
   2960             return packFloat64( zSign, 0x7FF, 0 );
   2961         }
   2962         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2963     }
   2964     if ( aExp == 0 ) {
   2965         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2966         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2967     }
   2968     zExp = aExp - bExp + 0x3FD;
   2969     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2970     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2971     if ( bSig <= ( aSig + aSig ) ) {
   2972         aSig >>= 1;
   2973         ++zExp;
   2974     }
   2975     zSig = estimateDiv128To64( aSig, 0, bSig );
   2976     if ( ( zSig & 0x1FF ) <= 2 ) {
   2977         mul64To128( bSig, zSig, &term0, &term1 );
   2978         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2979         while ( (sbits64) rem0 < 0 ) {
   2980             --zSig;
   2981             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   2982         }
   2983         zSig |= ( rem1 != 0 );
   2984     }
   2985     return roundAndPackFloat64( zSign, zExp, zSig );
   2986 
   2987 }
   2988 
   2989 #ifndef SOFTFLOAT_FOR_GCC
   2990 /*
   2991 -------------------------------------------------------------------------------
   2992 Returns the remainder of the double-precision floating-point value `a'
   2993 with respect to the corresponding value `b'.  The operation is performed
   2994 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2995 -------------------------------------------------------------------------------
   2996 */
   2997 float64 float64_rem( float64 a, float64 b )
   2998 {
   2999     flag aSign, bSign, zSign;
   3000     int16 aExp, bExp, expDiff;
   3001     bits64 aSig, bSig;
   3002     bits64 q, alternateASig;
   3003     sbits64 sigMean;
   3004 
   3005     aSig = extractFloat64Frac( a );
   3006     aExp = extractFloat64Exp( a );
   3007     aSign = extractFloat64Sign( a );
   3008     bSig = extractFloat64Frac( b );
   3009     bExp = extractFloat64Exp( b );
   3010     bSign = extractFloat64Sign( b );
   3011     if ( aExp == 0x7FF ) {
   3012         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   3013             return propagateFloat64NaN( a, b );
   3014         }
   3015         float_raise( float_flag_invalid );
   3016         return float64_default_nan;
   3017     }
   3018     if ( bExp == 0x7FF ) {
   3019         if ( bSig ) return propagateFloat64NaN( a, b );
   3020         return a;
   3021     }
   3022     if ( bExp == 0 ) {
   3023         if ( bSig == 0 ) {
   3024             float_raise( float_flag_invalid );
   3025             return float64_default_nan;
   3026         }
   3027         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3028     }
   3029     if ( aExp == 0 ) {
   3030         if ( aSig == 0 ) return a;
   3031         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3032     }
   3033     expDiff = aExp - bExp;
   3034     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   3035     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3036     if ( expDiff < 0 ) {
   3037         if ( expDiff < -1 ) return a;
   3038         aSig >>= 1;
   3039     }
   3040     q = ( bSig <= aSig );
   3041     if ( q ) aSig -= bSig;
   3042     expDiff -= 64;
   3043     while ( 0 < expDiff ) {
   3044         q = estimateDiv128To64( aSig, 0, bSig );
   3045         q = ( 2 < q ) ? q - 2 : 0;
   3046         aSig = - ( ( bSig>>2 ) * q );
   3047         expDiff -= 62;
   3048     }
   3049     expDiff += 64;
   3050     if ( 0 < expDiff ) {
   3051         q = estimateDiv128To64( aSig, 0, bSig );
   3052         q = ( 2 < q ) ? q - 2 : 0;
   3053         q >>= 64 - expDiff;
   3054         bSig >>= 2;
   3055         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   3056     }
   3057     else {
   3058         aSig >>= 2;
   3059         bSig >>= 2;
   3060     }
   3061     do {
   3062         alternateASig = aSig;
   3063         ++q;
   3064         aSig -= bSig;
   3065     } while ( 0 <= (sbits64) aSig );
   3066     sigMean = aSig + alternateASig;
   3067     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   3068         aSig = alternateASig;
   3069     }
   3070     zSign = ( (sbits64) aSig < 0 );
   3071     if ( zSign ) aSig = - aSig;
   3072     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
   3073 
   3074 }
   3075 
   3076 /*
   3077 -------------------------------------------------------------------------------
   3078 Returns the square root of the double-precision floating-point value `a'.
   3079 The operation is performed according to the IEC/IEEE Standard for Binary
   3080 Floating-Point Arithmetic.
   3081 -------------------------------------------------------------------------------
   3082 */
   3083 float64 float64_sqrt( float64 a )
   3084 {
   3085     flag aSign;
   3086     int16 aExp, zExp;
   3087     bits64 aSig, zSig, doubleZSig;
   3088     bits64 rem0, rem1, term0, term1;
   3089 
   3090     aSig = extractFloat64Frac( a );
   3091     aExp = extractFloat64Exp( a );
   3092     aSign = extractFloat64Sign( a );
   3093     if ( aExp == 0x7FF ) {
   3094         if ( aSig ) return propagateFloat64NaN( a, a );
   3095         if ( ! aSign ) return a;
   3096         float_raise( float_flag_invalid );
   3097         return float64_default_nan;
   3098     }
   3099     if ( aSign ) {
   3100         if ( ( aExp | aSig ) == 0 ) return a;
   3101         float_raise( float_flag_invalid );
   3102         return float64_default_nan;
   3103     }
   3104     if ( aExp == 0 ) {
   3105         if ( aSig == 0 ) return 0;
   3106         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3107     }
   3108     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3109     aSig |= LIT64( 0x0010000000000000 );
   3110     zSig = estimateSqrt32( aExp, aSig>>21 );
   3111     aSig <<= 9 - ( aExp & 1 );
   3112     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3113     if ( ( zSig & 0x1FF ) <= 5 ) {
   3114         doubleZSig = zSig<<1;
   3115         mul64To128( zSig, zSig, &term0, &term1 );
   3116         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3117         while ( (sbits64) rem0 < 0 ) {
   3118             --zSig;
   3119             doubleZSig -= 2;
   3120             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3121         }
   3122         zSig |= ( ( rem0 | rem1 ) != 0 );
   3123     }
   3124     return roundAndPackFloat64( 0, zExp, zSig );
   3125 
   3126 }
   3127 #endif
   3128 
   3129 /*
   3130 -------------------------------------------------------------------------------
   3131 Returns 1 if the double-precision floating-point value `a' is equal to the
   3132 corresponding value `b', and 0 otherwise.  The comparison is performed
   3133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3134 -------------------------------------------------------------------------------
   3135 */
   3136 flag float64_eq( float64 a, float64 b )
   3137 {
   3138 
   3139     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3140          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3141        ) {
   3142         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3143             float_raise( float_flag_invalid );
   3144         }
   3145         return 0;
   3146     }
   3147     return ( a == b ) ||
   3148         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
   3149 
   3150 }
   3151 
   3152 /*
   3153 -------------------------------------------------------------------------------
   3154 Returns 1 if the double-precision floating-point value `a' is less than or
   3155 equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3156 performed according to the IEC/IEEE Standard for Binary Floating-Point
   3157 Arithmetic.
   3158 -------------------------------------------------------------------------------
   3159 */
   3160 flag float64_le( float64 a, float64 b )
   3161 {
   3162     flag aSign, bSign;
   3163 
   3164     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3165          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3166        ) {
   3167         float_raise( float_flag_invalid );
   3168         return 0;
   3169     }
   3170     aSign = extractFloat64Sign( a );
   3171     bSign = extractFloat64Sign( b );
   3172     if ( aSign != bSign )
   3173         return aSign ||
   3174             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
   3175               0 );
   3176     return ( a == b ) ||
   3177         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3178 
   3179 }
   3180 
   3181 /*
   3182 -------------------------------------------------------------------------------
   3183 Returns 1 if the double-precision floating-point value `a' is less than
   3184 the corresponding value `b', and 0 otherwise.  The comparison is performed
   3185 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3186 -------------------------------------------------------------------------------
   3187 */
   3188 flag float64_lt( float64 a, float64 b )
   3189 {
   3190     flag aSign, bSign;
   3191 
   3192     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3193          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3194        ) {
   3195         float_raise( float_flag_invalid );
   3196         return 0;
   3197     }
   3198     aSign = extractFloat64Sign( a );
   3199     bSign = extractFloat64Sign( b );
   3200     if ( aSign != bSign )
   3201         return aSign &&
   3202             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
   3203               0 );
   3204     return ( a != b ) &&
   3205         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3206 
   3207 }
   3208 
   3209 #ifndef SOFTFLOAT_FOR_GCC
   3210 /*
   3211 -------------------------------------------------------------------------------
   3212 Returns 1 if the double-precision floating-point value `a' is equal to the
   3213 corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3214 if either operand is a NaN.  Otherwise, the comparison is performed
   3215 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3216 -------------------------------------------------------------------------------
   3217 */
   3218 flag float64_eq_signaling( float64 a, float64 b )
   3219 {
   3220 
   3221     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3222          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3223        ) {
   3224         float_raise( float_flag_invalid );
   3225         return 0;
   3226     }
   3227     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3228 
   3229 }
   3230 
   3231 /*
   3232 -------------------------------------------------------------------------------
   3233 Returns 1 if the double-precision floating-point value `a' is less than or
   3234 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3235 cause an exception.  Otherwise, the comparison is performed according to the
   3236 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3237 -------------------------------------------------------------------------------
   3238 */
   3239 flag float64_le_quiet( float64 a, float64 b )
   3240 {
   3241     flag aSign, bSign;
   3242 
   3243     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3244          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3245        ) {
   3246         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3247             float_raise( float_flag_invalid );
   3248         }
   3249         return 0;
   3250     }
   3251     aSign = extractFloat64Sign( a );
   3252     bSign = extractFloat64Sign( b );
   3253     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3254     return ( a == b ) || ( aSign ^ ( a < b ) );
   3255 
   3256 }
   3257 
   3258 /*
   3259 -------------------------------------------------------------------------------
   3260 Returns 1 if the double-precision floating-point value `a' is less than
   3261 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3262 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3263 Standard for Binary Floating-Point Arithmetic.
   3264 -------------------------------------------------------------------------------
   3265 */
   3266 flag float64_lt_quiet( float64 a, float64 b )
   3267 {
   3268     flag aSign, bSign;
   3269 
   3270     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3271          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3272        ) {
   3273         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3274             float_raise( float_flag_invalid );
   3275         }
   3276         return 0;
   3277     }
   3278     aSign = extractFloat64Sign( a );
   3279     bSign = extractFloat64Sign( b );
   3280     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   3281     return ( a != b ) && ( aSign ^ ( a < b ) );
   3282 
   3283 }
   3284 #endif
   3285 
   3286 #ifdef FLOATX80
   3287 
   3288 /*
   3289 -------------------------------------------------------------------------------
   3290 Returns the result of converting the extended double-precision floating-
   3291 point value `a' to the 32-bit two's complement integer format.  The
   3292 conversion is performed according to the IEC/IEEE Standard for Binary
   3293 Floating-Point Arithmetic---which means in particular that the conversion
   3294 is rounded according to the current rounding mode.  If `a' is a NaN, the
   3295 largest positive integer is returned.  Otherwise, if the conversion
   3296 overflows, the largest integer with the same sign as `a' is returned.
   3297 -------------------------------------------------------------------------------
   3298 */
   3299 int32 floatx80_to_int32( floatx80 a )
   3300 {
   3301     flag aSign;
   3302     int32 aExp, shiftCount;
   3303     bits64 aSig;
   3304 
   3305     aSig = extractFloatx80Frac( a );
   3306     aExp = extractFloatx80Exp( a );
   3307     aSign = extractFloatx80Sign( a );
   3308     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3309     shiftCount = 0x4037 - aExp;
   3310     if ( shiftCount <= 0 ) shiftCount = 1;
   3311     shift64RightJamming( aSig, shiftCount, &aSig );
   3312     return roundAndPackInt32( aSign, aSig );
   3313 
   3314 }
   3315 
   3316 /*
   3317 -------------------------------------------------------------------------------
   3318 Returns the result of converting the extended double-precision floating-
   3319 point value `a' to the 32-bit two's complement integer format.  The
   3320 conversion is performed according to the IEC/IEEE Standard for Binary
   3321 Floating-Point Arithmetic, except that the conversion is always rounded
   3322 toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3323 Otherwise, if the conversion overflows, the largest integer with the same
   3324 sign as `a' is returned.
   3325 -------------------------------------------------------------------------------
   3326 */
   3327 int32 floatx80_to_int32_round_to_zero( floatx80 a )
   3328 {
   3329     flag aSign;
   3330     int32 aExp, shiftCount;
   3331     bits64 aSig, savedASig;
   3332     int32 z;
   3333 
   3334     aSig = extractFloatx80Frac( a );
   3335     aExp = extractFloatx80Exp( a );
   3336     aSign = extractFloatx80Sign( a );
   3337     if ( 0x401E < aExp ) {
   3338         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3339         goto invalid;
   3340     }
   3341     else if ( aExp < 0x3FFF ) {
   3342         if ( aExp || aSig ) set_float_exception_inexact_flag();
   3343         return 0;
   3344     }
   3345     shiftCount = 0x403E - aExp;
   3346     savedASig = aSig;
   3347     aSig >>= shiftCount;
   3348     z = aSig;
   3349     if ( aSign ) z = - z;
   3350     if ( ( z < 0 ) ^ aSign ) {
   3351  invalid:
   3352         float_raise( float_flag_invalid );
   3353         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   3354     }
   3355     if ( ( aSig<<shiftCount ) != savedASig ) {
   3356         set_float_exception_inexact_flag();
   3357     }
   3358     return z;
   3359 
   3360 }
   3361 
   3362 /*
   3363 -------------------------------------------------------------------------------
   3364 Returns the result of converting the extended double-precision floating-
   3365 point value `a' to the 64-bit two's complement integer format.  The
   3366 conversion is performed according to the IEC/IEEE Standard for Binary
   3367 Floating-Point Arithmetic---which means in particular that the conversion
   3368 is rounded according to the current rounding mode.  If `a' is a NaN,
   3369 the largest positive integer is returned.  Otherwise, if the conversion
   3370 overflows, the largest integer with the same sign as `a' is returned.
   3371 -------------------------------------------------------------------------------
   3372 */
   3373 int64 floatx80_to_int64( floatx80 a )
   3374 {
   3375     flag aSign;
   3376     int32 aExp, shiftCount;
   3377     bits64 aSig, aSigExtra;
   3378 
   3379     aSig = extractFloatx80Frac( a );
   3380     aExp = extractFloatx80Exp( a );
   3381     aSign = extractFloatx80Sign( a );
   3382     shiftCount = 0x403E - aExp;
   3383     if ( shiftCount <= 0 ) {
   3384         if ( shiftCount ) {
   3385             float_raise( float_flag_invalid );
   3386             if (    ! aSign
   3387                  || (    ( aExp == 0x7FFF )
   3388                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3389                ) {
   3390                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3391             }
   3392             return (sbits64) LIT64( 0x8000000000000000 );
   3393         }
   3394         aSigExtra = 0;
   3395     }
   3396     else {
   3397         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3398     }
   3399     return roundAndPackInt64( aSign, aSig, aSigExtra );
   3400 
   3401 }
   3402 
   3403 /*
   3404 -------------------------------------------------------------------------------
   3405 Returns the result of converting the extended double-precision floating-
   3406 point value `a' to the 64-bit two's complement integer format.  The
   3407 conversion is performed according to the IEC/IEEE Standard for Binary
   3408 Floating-Point Arithmetic, except that the conversion is always rounded
   3409 toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3410 Otherwise, if the conversion overflows, the largest integer with the same
   3411 sign as `a' is returned.
   3412 -------------------------------------------------------------------------------
   3413 */
   3414 int64 floatx80_to_int64_round_to_zero( floatx80 a )
   3415 {
   3416     flag aSign;
   3417     int32 aExp, shiftCount;
   3418     bits64 aSig;
   3419     int64 z;
   3420 
   3421     aSig = extractFloatx80Frac( a );
   3422     aExp = extractFloatx80Exp( a );
   3423     aSign = extractFloatx80Sign( a );
   3424     shiftCount = aExp - 0x403E;
   3425     if ( 0 <= shiftCount ) {
   3426         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3427         if ( ( a.high != 0xC03E ) || aSig ) {
   3428             float_raise( float_flag_invalid );
   3429             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3430                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3431             }
   3432         }
   3433         return (sbits64) LIT64( 0x8000000000000000 );
   3434     }
   3435     else if ( aExp < 0x3FFF ) {
   3436         if ( aExp | aSig ) set_float_exception_inexact_flag();
   3437         return 0;
   3438     }
   3439     z = aSig>>( - shiftCount );
   3440     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3441         set_float_exception_inexact_flag();
   3442     }
   3443     if ( aSign ) z = - z;
   3444     return z;
   3445 
   3446 }
   3447 
   3448 /*
   3449 -------------------------------------------------------------------------------
   3450 Returns the result of converting the extended double-precision floating-
   3451 point value `a' to the single-precision floating-point format.  The
   3452 conversion is performed according to the IEC/IEEE Standard for Binary
   3453 Floating-Point Arithmetic.
   3454 -------------------------------------------------------------------------------
   3455 */
   3456 float32 floatx80_to_float32( floatx80 a )
   3457 {
   3458     flag aSign;
   3459     int32 aExp;
   3460     bits64 aSig;
   3461 
   3462     aSig = extractFloatx80Frac( a );
   3463     aExp = extractFloatx80Exp( a );
   3464     aSign = extractFloatx80Sign( a );
   3465     if ( aExp == 0x7FFF ) {
   3466         if ( (bits64) ( aSig<<1 ) ) {
   3467             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
   3468         }
   3469         return packFloat32( aSign, 0xFF, 0 );
   3470     }
   3471     shift64RightJamming( aSig, 33, &aSig );
   3472     if ( aExp || aSig ) aExp -= 0x3F81;
   3473     return roundAndPackFloat32( aSign, aExp, aSig );
   3474 
   3475 }
   3476 
   3477 /*
   3478 -------------------------------------------------------------------------------
   3479 Returns the result of converting the extended double-precision floating-
   3480 point value `a' to the double-precision floating-point format.  The
   3481 conversion is performed according to the IEC/IEEE Standard for Binary
   3482 Floating-Point Arithmetic.
   3483 -------------------------------------------------------------------------------
   3484 */
   3485 float64 floatx80_to_float64( floatx80 a )
   3486 {
   3487     flag aSign;
   3488     int32 aExp;
   3489     bits64 aSig, zSig;
   3490 
   3491     aSig = extractFloatx80Frac( a );
   3492     aExp = extractFloatx80Exp( a );
   3493     aSign = extractFloatx80Sign( a );
   3494     if ( aExp == 0x7FFF ) {
   3495         if ( (bits64) ( aSig<<1 ) ) {
   3496             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
   3497         }
   3498         return packFloat64( aSign, 0x7FF, 0 );
   3499     }
   3500     shift64RightJamming( aSig, 1, &zSig );
   3501     if ( aExp || aSig ) aExp -= 0x3C01;
   3502     return roundAndPackFloat64( aSign, aExp, zSig );
   3503 
   3504 }
   3505 
   3506 #ifdef FLOAT128
   3507 
   3508 /*
   3509 -------------------------------------------------------------------------------
   3510 Returns the result of converting the extended double-precision floating-
   3511 point value `a' to the quadruple-precision floating-point format.  The
   3512 conversion is performed according to the IEC/IEEE Standard for Binary
   3513 Floating-Point Arithmetic.
   3514 -------------------------------------------------------------------------------
   3515 */
   3516 float128 floatx80_to_float128( floatx80 a )
   3517 {
   3518     flag aSign;
   3519     int16 aExp;
   3520     bits64 aSig, zSig0, zSig1;
   3521 
   3522     aSig = extractFloatx80Frac( a );
   3523     aExp = extractFloatx80Exp( a );
   3524     aSign = extractFloatx80Sign( a );
   3525     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
   3526         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
   3527     }
   3528     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   3529     return packFloat128( aSign, aExp, zSig0, zSig1 );
   3530 
   3531 }
   3532 
   3533 #endif
   3534 
   3535 /*
   3536 -------------------------------------------------------------------------------
   3537 Rounds the extended double-precision floating-point value `a' to an integer,
   3538 and returns the result as an extended quadruple-precision floating-point
   3539 value.  The operation is performed according to the IEC/IEEE Standard for
   3540 Binary Floating-Point Arithmetic.
   3541 -------------------------------------------------------------------------------
   3542 */
   3543 floatx80 floatx80_round_to_int( floatx80 a )
   3544 {
   3545     flag aSign;
   3546     int32 aExp;
   3547     bits64 lastBitMask, roundBitsMask;
   3548     int8 roundingMode;
   3549     floatx80 z;
   3550 
   3551     aExp = extractFloatx80Exp( a );
   3552     if ( 0x403E <= aExp ) {
   3553         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3554             return propagateFloatx80NaN( a, a );
   3555         }
   3556         return a;
   3557     }
   3558     if ( aExp < 0x3FFF ) {
   3559         if (    ( aExp == 0 )
   3560              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3561             return a;
   3562         }
   3563         set_float_exception_inexact_flag();
   3564         aSign = extractFloatx80Sign( a );
   3565         switch ( float_rounding_mode ) {
   3566          case float_round_nearest_even:
   3567             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3568                ) {
   3569                 return
   3570                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3571             }
   3572             break;
   3573          case float_round_to_zero:
   3574             break;
   3575          case float_round_down:
   3576             return
   3577                   aSign ?
   3578                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3579                 : packFloatx80( 0, 0, 0 );
   3580          case float_round_up:
   3581             return
   3582                   aSign ? packFloatx80( 1, 0, 0 )
   3583                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3584         }
   3585         return packFloatx80( aSign, 0, 0 );
   3586     }
   3587     lastBitMask = 1;
   3588     lastBitMask <<= 0x403E - aExp;
   3589     roundBitsMask = lastBitMask - 1;
   3590     z = a;
   3591     roundingMode = float_rounding_mode;
   3592     if ( roundingMode == float_round_nearest_even ) {
   3593         z.low += lastBitMask>>1;
   3594         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3595     }
   3596     else if ( roundingMode != float_round_to_zero ) {
   3597         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3598             z.low += roundBitsMask;
   3599         }
   3600     }
   3601     z.low &= ~ roundBitsMask;
   3602     if ( z.low == 0 ) {
   3603         ++z.high;
   3604         z.low = LIT64( 0x8000000000000000 );
   3605     }
   3606     if ( z.low != a.low ) set_float_exception_inexact_flag();
   3607     return z;
   3608 
   3609 }
   3610 
   3611 /*
   3612 -------------------------------------------------------------------------------
   3613 Returns the result of adding the absolute values of the extended double-
   3614 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3615 negated before being returned.  `zSign' is ignored if the result is a NaN.
   3616 The addition is performed according to the IEC/IEEE Standard for Binary
   3617 Floating-Point Arithmetic.
   3618 -------------------------------------------------------------------------------
   3619 */
   3620 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3621 {
   3622     int32 aExp, bExp, zExp;
   3623     bits64 aSig, bSig, zSig0, zSig1;
   3624     int32 expDiff;
   3625 
   3626     aSig = extractFloatx80Frac( a );
   3627     aExp = extractFloatx80Exp( a );
   3628     bSig = extractFloatx80Frac( b );
   3629     bExp = extractFloatx80Exp( b );
   3630     expDiff = aExp - bExp;
   3631     if ( 0 < expDiff ) {
   3632         if ( aExp == 0x7FFF ) {
   3633             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3634             return a;
   3635         }
   3636         if ( bExp == 0 ) --expDiff;
   3637         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3638         zExp = aExp;
   3639     }
   3640     else if ( expDiff < 0 ) {
   3641         if ( bExp == 0x7FFF ) {
   3642             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3643             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3644         }
   3645         if ( aExp == 0 ) ++expDiff;
   3646         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3647         zExp = bExp;
   3648     }
   3649     else {
   3650         if ( aExp == 0x7FFF ) {
   3651             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3652                 return propagateFloatx80NaN( a, b );
   3653             }
   3654             return a;
   3655         }
   3656         zSig1 = 0;
   3657         zSig0 = aSig + bSig;
   3658         if ( aExp == 0 ) {
   3659             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3660             goto roundAndPack;
   3661         }
   3662         zExp = aExp;
   3663         goto shiftRight1;
   3664     }
   3665     zSig0 = aSig + bSig;
   3666     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
   3667  shiftRight1:
   3668     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3669     zSig0 |= LIT64( 0x8000000000000000 );
   3670     ++zExp;
   3671  roundAndPack:
   3672     return
   3673         roundAndPackFloatx80(
   3674             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3675 
   3676 }
   3677 
   3678 /*
   3679 -------------------------------------------------------------------------------
   3680 Returns the result of subtracting the absolute values of the extended
   3681 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3682 difference is negated before being returned.  `zSign' is ignored if the
   3683 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3684 Standard for Binary Floating-Point Arithmetic.
   3685 -------------------------------------------------------------------------------
   3686 */
   3687 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3688 {
   3689     int32 aExp, bExp, zExp;
   3690     bits64 aSig, bSig, zSig0, zSig1;
   3691     int32 expDiff;
   3692     floatx80 z;
   3693 
   3694     aSig = extractFloatx80Frac( a );
   3695     aExp = extractFloatx80Exp( a );
   3696     bSig = extractFloatx80Frac( b );
   3697     bExp = extractFloatx80Exp( b );
   3698     expDiff = aExp - bExp;
   3699     if ( 0 < expDiff ) goto aExpBigger;
   3700     if ( expDiff < 0 ) goto bExpBigger;
   3701     if ( aExp == 0x7FFF ) {
   3702         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3703             return propagateFloatx80NaN( a, b );
   3704         }
   3705         float_raise( float_flag_invalid );
   3706         z.low = floatx80_default_nan_low;
   3707         z.high = floatx80_default_nan_high;
   3708         return z;
   3709     }
   3710     if ( aExp == 0 ) {
   3711         aExp = 1;
   3712         bExp = 1;
   3713     }
   3714     zSig1 = 0;
   3715     if ( bSig < aSig ) goto aBigger;
   3716     if ( aSig < bSig ) goto bBigger;
   3717     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
   3718  bExpBigger:
   3719     if ( bExp == 0x7FFF ) {
   3720         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3721         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3722     }
   3723     if ( aExp == 0 ) ++expDiff;
   3724     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3725  bBigger:
   3726     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3727     zExp = bExp;
   3728     zSign ^= 1;
   3729     goto normalizeRoundAndPack;
   3730  aExpBigger:
   3731     if ( aExp == 0x7FFF ) {
   3732         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3733         return a;
   3734     }
   3735     if ( bExp == 0 ) --expDiff;
   3736     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3737  aBigger:
   3738     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3739     zExp = aExp;
   3740  normalizeRoundAndPack:
   3741     return
   3742         normalizeRoundAndPackFloatx80(
   3743             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3744 
   3745 }
   3746 
   3747 /*
   3748 -------------------------------------------------------------------------------
   3749 Returns the result of adding the extended double-precision floating-point
   3750 values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3751 Standard for Binary Floating-Point Arithmetic.
   3752 -------------------------------------------------------------------------------
   3753 */
   3754 floatx80 floatx80_add( floatx80 a, floatx80 b )
   3755 {
   3756     flag aSign, bSign;
   3757 
   3758     aSign = extractFloatx80Sign( a );
   3759     bSign = extractFloatx80Sign( b );
   3760     if ( aSign == bSign ) {
   3761         return addFloatx80Sigs( a, b, aSign );
   3762     }
   3763     else {
   3764         return subFloatx80Sigs( a, b, aSign );
   3765     }
   3766 
   3767 }
   3768 
   3769 /*
   3770 -------------------------------------------------------------------------------
   3771 Returns the result of subtracting the extended double-precision floating-
   3772 point values `a' and `b'.  The operation is performed according to the
   3773 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3774 -------------------------------------------------------------------------------
   3775 */
   3776 floatx80 floatx80_sub( floatx80 a, floatx80 b )
   3777 {
   3778     flag aSign, bSign;
   3779 
   3780     aSign = extractFloatx80Sign( a );
   3781     bSign = extractFloatx80Sign( b );
   3782     if ( aSign == bSign ) {
   3783         return subFloatx80Sigs( a, b, aSign );
   3784     }
   3785     else {
   3786         return addFloatx80Sigs( a, b, aSign );
   3787     }
   3788 
   3789 }
   3790 
   3791 /*
   3792 -------------------------------------------------------------------------------
   3793 Returns the result of multiplying the extended double-precision floating-
   3794 point values `a' and `b'.  The operation is performed according to the
   3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3796 -------------------------------------------------------------------------------
   3797 */
   3798 floatx80 floatx80_mul( floatx80 a, floatx80 b )
   3799 {
   3800     flag aSign, bSign, zSign;
   3801     int32 aExp, bExp, zExp;
   3802     bits64 aSig, bSig, zSig0, zSig1;
   3803     floatx80 z;
   3804 
   3805     aSig = extractFloatx80Frac( a );
   3806     aExp = extractFloatx80Exp( a );
   3807     aSign = extractFloatx80Sign( a );
   3808     bSig = extractFloatx80Frac( b );
   3809     bExp = extractFloatx80Exp( b );
   3810     bSign = extractFloatx80Sign( b );
   3811     zSign = aSign ^ bSign;
   3812     if ( aExp == 0x7FFF ) {
   3813         if (    (bits64) ( aSig<<1 )
   3814              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3815             return propagateFloatx80NaN( a, b );
   3816         }
   3817         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3818         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3819     }
   3820     if ( bExp == 0x7FFF ) {
   3821         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3822         if ( ( aExp | aSig ) == 0 ) {
   3823  invalid:
   3824             float_raise( float_flag_invalid );
   3825             z.low = floatx80_default_nan_low;
   3826             z.high = floatx80_default_nan_high;
   3827             return z;
   3828         }
   3829         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3830     }
   3831     if ( aExp == 0 ) {
   3832         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3833         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3834     }
   3835     if ( bExp == 0 ) {
   3836         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3837         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3838     }
   3839     zExp = aExp + bExp - 0x3FFE;
   3840     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3841     if ( 0 < (sbits64) zSig0 ) {
   3842         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3843         --zExp;
   3844     }
   3845     return
   3846         roundAndPackFloatx80(
   3847             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3848 
   3849 }
   3850 
   3851 /*
   3852 -------------------------------------------------------------------------------
   3853 Returns the result of dividing the extended double-precision floating-point
   3854 value `a' by the corresponding value `b'.  The operation is performed
   3855 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3856 -------------------------------------------------------------------------------
   3857 */
   3858 floatx80 floatx80_div( floatx80 a, floatx80 b )
   3859 {
   3860     flag aSign, bSign, zSign;
   3861     int32 aExp, bExp, zExp;
   3862     bits64 aSig, bSig, zSig0, zSig1;
   3863     bits64 rem0, rem1, rem2, term0, term1, term2;
   3864     floatx80 z;
   3865 
   3866     aSig = extractFloatx80Frac( a );
   3867     aExp = extractFloatx80Exp( a );
   3868     aSign = extractFloatx80Sign( a );
   3869     bSig = extractFloatx80Frac( b );
   3870     bExp = extractFloatx80Exp( b );
   3871     bSign = extractFloatx80Sign( b );
   3872     zSign = aSign ^ bSign;
   3873     if ( aExp == 0x7FFF ) {
   3874         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3875         if ( bExp == 0x7FFF ) {
   3876             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3877             goto invalid;
   3878         }
   3879         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3880     }
   3881     if ( bExp == 0x7FFF ) {
   3882         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3883         return packFloatx80( zSign, 0, 0 );
   3884     }
   3885     if ( bExp == 0 ) {
   3886         if ( bSig == 0 ) {
   3887             if ( ( aExp | aSig ) == 0 ) {
   3888  invalid:
   3889                 float_raise( float_flag_invalid );
   3890                 z.low = floatx80_default_nan_low;
   3891                 z.high = floatx80_default_nan_high;
   3892                 return z;
   3893             }
   3894             float_raise( float_flag_divbyzero );
   3895             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3896         }
   3897         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3898     }
   3899     if ( aExp == 0 ) {
   3900         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3901         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3902     }
   3903     zExp = aExp - bExp + 0x3FFE;
   3904     rem1 = 0;
   3905     if ( bSig <= aSig ) {
   3906         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3907         ++zExp;
   3908     }
   3909     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3910     mul64To128( bSig, zSig0, &term0, &term1 );
   3911     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3912     while ( (sbits64) rem0 < 0 ) {
   3913         --zSig0;
   3914         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3915     }
   3916     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   3917     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3918         mul64To128( bSig, zSig1, &term1, &term2 );
   3919         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3920         while ( (sbits64) rem1 < 0 ) {
   3921             --zSig1;
   3922             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3923         }
   3924         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3925     }
   3926     return
   3927         roundAndPackFloatx80(
   3928             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3929 
   3930 }
   3931 
   3932 /*
   3933 -------------------------------------------------------------------------------
   3934 Returns the remainder of the extended double-precision floating-point value
   3935 `a' with respect to the corresponding value `b'.  The operation is performed
   3936 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3937 -------------------------------------------------------------------------------
   3938 */
   3939 floatx80 floatx80_rem( floatx80 a, floatx80 b )
   3940 {
   3941     flag aSign, bSign, zSign;
   3942     int32 aExp, bExp, expDiff;
   3943     bits64 aSig0, aSig1, bSig;
   3944     bits64 q, term0, term1, alternateASig0, alternateASig1;
   3945     floatx80 z;
   3946 
   3947     aSig0 = extractFloatx80Frac( a );
   3948     aExp = extractFloatx80Exp( a );
   3949     aSign = extractFloatx80Sign( a );
   3950     bSig = extractFloatx80Frac( b );
   3951     bExp = extractFloatx80Exp( b );
   3952     bSign = extractFloatx80Sign( b );
   3953     if ( aExp == 0x7FFF ) {
   3954         if (    (bits64) ( aSig0<<1 )
   3955              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3956             return propagateFloatx80NaN( a, b );
   3957         }
   3958         goto invalid;
   3959     }
   3960     if ( bExp == 0x7FFF ) {
   3961         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3962         return a;
   3963     }
   3964     if ( bExp == 0 ) {
   3965         if ( bSig == 0 ) {
   3966  invalid:
   3967             float_raise( float_flag_invalid );
   3968             z.low = floatx80_default_nan_low;
   3969             z.high = floatx80_default_nan_high;
   3970             return z;
   3971         }
   3972         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3973     }
   3974     if ( aExp == 0 ) {
   3975         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3976         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3977     }
   3978     bSig |= LIT64( 0x8000000000000000 );
   3979     zSign = aSign;
   3980     expDiff = aExp - bExp;
   3981     aSig1 = 0;
   3982     if ( expDiff < 0 ) {
   3983         if ( expDiff < -1 ) return a;
   3984         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   3985         expDiff = 0;
   3986     }
   3987     q = ( bSig <= aSig0 );
   3988     if ( q ) aSig0 -= bSig;
   3989     expDiff -= 64;
   3990     while ( 0 < expDiff ) {
   3991         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3992         q = ( 2 < q ) ? q - 2 : 0;
   3993         mul64To128( bSig, q, &term0, &term1 );
   3994         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3995         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   3996         expDiff -= 62;
   3997     }
   3998     expDiff += 64;
   3999     if ( 0 < expDiff ) {
   4000         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4001         q = ( 2 < q ) ? q - 2 : 0;
   4002         q >>= 64 - expDiff;
   4003         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   4004         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4005         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   4006         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   4007             ++q;
   4008             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4009         }
   4010     }
   4011     else {
   4012         term1 = 0;
   4013         term0 = bSig;
   4014     }
   4015     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   4016     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4017          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4018               && ( q & 1 ) )
   4019        ) {
   4020         aSig0 = alternateASig0;
   4021         aSig1 = alternateASig1;
   4022         zSign = ! zSign;
   4023     }
   4024     return
   4025         normalizeRoundAndPackFloatx80(
   4026             80, zSign, bExp + expDiff, aSig0, aSig1 );
   4027 
   4028 }
   4029 
   4030 /*
   4031 -------------------------------------------------------------------------------
   4032 Returns the square root of the extended double-precision floating-point
   4033 value `a'.  The operation is performed according to the IEC/IEEE Standard
   4034 for Binary Floating-Point Arithmetic.
   4035 -------------------------------------------------------------------------------
   4036 */
   4037 floatx80 floatx80_sqrt( floatx80 a )
   4038 {
   4039     flag aSign;
   4040     int32 aExp, zExp;
   4041     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   4042     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4043     floatx80 z;
   4044 
   4045     aSig0 = extractFloatx80Frac( a );
   4046     aExp = extractFloatx80Exp( a );
   4047     aSign = extractFloatx80Sign( a );
   4048     if ( aExp == 0x7FFF ) {
   4049         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
   4050         if ( ! aSign ) return a;
   4051         goto invalid;
   4052     }
   4053     if ( aSign ) {
   4054         if ( ( aExp | aSig0 ) == 0 ) return a;
   4055  invalid:
   4056         float_raise( float_flag_invalid );
   4057         z.low = floatx80_default_nan_low;
   4058         z.high = floatx80_default_nan_high;
   4059         return z;
   4060     }
   4061     if ( aExp == 0 ) {
   4062         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4063         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4064     }
   4065     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   4066     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4067     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4068     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4069     doubleZSig0 = zSig0<<1;
   4070     mul64To128( zSig0, zSig0, &term0, &term1 );
   4071     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4072     while ( (sbits64) rem0 < 0 ) {
   4073         --zSig0;
   4074         doubleZSig0 -= 2;
   4075         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4076     }
   4077     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   4078     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4079         if ( zSig1 == 0 ) zSig1 = 1;
   4080         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   4081         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4082         mul64To128( zSig1, zSig1, &term2, &term3 );
   4083         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   4084         while ( (sbits64) rem1 < 0 ) {
   4085             --zSig1;
   4086             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4087             term3 |= 1;
   4088             term2 |= doubleZSig0;
   4089             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4090         }
   4091         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4092     }
   4093     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4094     zSig0 |= doubleZSig0;
   4095     return
   4096         roundAndPackFloatx80(
   4097             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
   4098 
   4099 }
   4100 
   4101 /*
   4102 -------------------------------------------------------------------------------
   4103 Returns 1 if the extended double-precision floating-point value `a' is
   4104 equal to the corresponding value `b', and 0 otherwise.  The comparison is
   4105 performed according to the IEC/IEEE Standard for Binary Floating-Point
   4106 Arithmetic.
   4107 -------------------------------------------------------------------------------
   4108 */
   4109 flag floatx80_eq( floatx80 a, floatx80 b )
   4110 {
   4111 
   4112     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4113               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4114          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4115               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4116        ) {
   4117         if (    floatx80_is_signaling_nan( a )
   4118              || floatx80_is_signaling_nan( b ) ) {
   4119             float_raise( float_flag_invalid );
   4120         }
   4121         return 0;
   4122     }
   4123     return
   4124            ( a.low == b.low )
   4125         && (    ( a.high == b.high )
   4126              || (    ( a.low == 0 )
   4127                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4128            );
   4129 
   4130 }
   4131 
   4132 /*
   4133 -------------------------------------------------------------------------------
   4134 Returns 1 if the extended double-precision floating-point value `a' is
   4135 less than or equal to the corresponding value `b', and 0 otherwise.  The
   4136 comparison is performed according to the IEC/IEEE Standard for Binary
   4137 Floating-Point Arithmetic.
   4138 -------------------------------------------------------------------------------
   4139 */
   4140 flag floatx80_le( floatx80 a, floatx80 b )
   4141 {
   4142     flag aSign, bSign;
   4143 
   4144     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4145               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4146          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4147               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4148        ) {
   4149         float_raise( float_flag_invalid );
   4150         return 0;
   4151     }
   4152     aSign = extractFloatx80Sign( a );
   4153     bSign = extractFloatx80Sign( b );
   4154     if ( aSign != bSign ) {
   4155         return
   4156                aSign
   4157             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4158                  == 0 );
   4159     }
   4160     return
   4161           aSign ? le128( b.high, b.low, a.high, a.low )
   4162         : le128( a.high, a.low, b.high, b.low );
   4163 
   4164 }
   4165 
   4166 /*
   4167 -------------------------------------------------------------------------------
   4168 Returns 1 if the extended double-precision floating-point value `a' is
   4169 less than the corresponding value `b', and 0 otherwise.  The comparison
   4170 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4171 Arithmetic.
   4172 -------------------------------------------------------------------------------
   4173 */
   4174 flag floatx80_lt( floatx80 a, floatx80 b )
   4175 {
   4176     flag aSign, bSign;
   4177 
   4178     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4179               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4180          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4181               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4182        ) {
   4183         float_raise( float_flag_invalid );
   4184         return 0;
   4185     }
   4186     aSign = extractFloatx80Sign( a );
   4187     bSign = extractFloatx80Sign( b );
   4188     if ( aSign != bSign ) {
   4189         return
   4190                aSign
   4191             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4192                  != 0 );
   4193     }
   4194     return
   4195           aSign ? lt128( b.high, b.low, a.high, a.low )
   4196         : lt128( a.high, a.low, b.high, b.low );
   4197 
   4198 }
   4199 
   4200 /*
   4201 -------------------------------------------------------------------------------
   4202 Returns 1 if the extended double-precision floating-point value `a' is equal
   4203 to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4204 raised if either operand is a NaN.  Otherwise, the comparison is performed
   4205 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4206 -------------------------------------------------------------------------------
   4207 */
   4208 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
   4209 {
   4210 
   4211     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4212               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4213          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4214               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4215        ) {
   4216         float_raise( float_flag_invalid );
   4217         return 0;
   4218     }
   4219     return
   4220            ( a.low == b.low )
   4221         && (    ( a.high == b.high )
   4222              || (    ( a.low == 0 )
   4223                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4224            );
   4225 
   4226 }
   4227 
   4228 /*
   4229 -------------------------------------------------------------------------------
   4230 Returns 1 if the extended double-precision floating-point value `a' is less
   4231 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4232 do not cause an exception.  Otherwise, the comparison is performed according
   4233 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4234 -------------------------------------------------------------------------------
   4235 */
   4236 flag floatx80_le_quiet( floatx80 a, floatx80 b )
   4237 {
   4238     flag aSign, bSign;
   4239 
   4240     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4241               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4242          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4243               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4244        ) {
   4245         if (    floatx80_is_signaling_nan( a )
   4246              || floatx80_is_signaling_nan( b ) ) {
   4247             float_raise( float_flag_invalid );
   4248         }
   4249         return 0;
   4250     }
   4251     aSign = extractFloatx80Sign( a );
   4252     bSign = extractFloatx80Sign( b );
   4253     if ( aSign != bSign ) {
   4254         return
   4255                aSign
   4256             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4257                  == 0 );
   4258     }
   4259     return
   4260           aSign ? le128( b.high, b.low, a.high, a.low )
   4261         : le128( a.high, a.low, b.high, b.low );
   4262 
   4263 }
   4264 
   4265 /*
   4266 -------------------------------------------------------------------------------
   4267 Returns 1 if the extended double-precision floating-point value `a' is less
   4268 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4269 an exception.  Otherwise, the comparison is performed according to the
   4270 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4271 -------------------------------------------------------------------------------
   4272 */
   4273 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
   4274 {
   4275     flag aSign, bSign;
   4276 
   4277     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4278               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4279          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4280               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4281        ) {
   4282         if (    floatx80_is_signaling_nan( a )
   4283              || floatx80_is_signaling_nan( b ) ) {
   4284             float_raise( float_flag_invalid );
   4285         }
   4286         return 0;
   4287     }
   4288     aSign = extractFloatx80Sign( a );
   4289     bSign = extractFloatx80Sign( b );
   4290     if ( aSign != bSign ) {
   4291         return
   4292                aSign
   4293             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4294                  != 0 );
   4295     }
   4296     return
   4297           aSign ? lt128( b.high, b.low, a.high, a.low )
   4298         : lt128( a.high, a.low, b.high, b.low );
   4299 
   4300 }
   4301 
   4302 #endif
   4303 
   4304 #ifdef FLOAT128
   4305 
   4306 /*
   4307 -------------------------------------------------------------------------------
   4308 Returns the result of converting the quadruple-precision floating-point
   4309 value `a' to the 32-bit two's complement integer format.  The conversion
   4310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4311 Arithmetic---which means in particular that the conversion is rounded
   4312 according to the current rounding mode.  If `a' is a NaN, the largest
   4313 positive integer is returned.  Otherwise, if the conversion overflows, the
   4314 largest integer with the same sign as `a' is returned.
   4315 -------------------------------------------------------------------------------
   4316 */
   4317 int32 float128_to_int32( float128 a )
   4318 {
   4319     flag aSign;
   4320     int32 aExp, shiftCount;
   4321     bits64 aSig0, aSig1;
   4322 
   4323     aSig1 = extractFloat128Frac1( a );
   4324     aSig0 = extractFloat128Frac0( a );
   4325     aExp = extractFloat128Exp( a );
   4326     aSign = extractFloat128Sign( a );
   4327     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4328     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4329     aSig0 |= ( aSig1 != 0 );
   4330     shiftCount = 0x4028 - aExp;
   4331     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4332     return roundAndPackInt32( aSign, aSig0 );
   4333 
   4334 }
   4335 
   4336 /*
   4337 -------------------------------------------------------------------------------
   4338 Returns the result of converting the quadruple-precision floating-point
   4339 value `a' to the 32-bit two's complement integer format.  The conversion
   4340 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4341 Arithmetic, except that the conversion is always rounded toward zero.  If
   4342 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4343 conversion overflows, the largest integer with the same sign as `a' is
   4344 returned.
   4345 -------------------------------------------------------------------------------
   4346 */
   4347 int32 float128_to_int32_round_to_zero( float128 a )
   4348 {
   4349     flag aSign;
   4350     int32 aExp, shiftCount;
   4351     bits64 aSig0, aSig1, savedASig;
   4352     int32 z;
   4353 
   4354     aSig1 = extractFloat128Frac1( a );
   4355     aSig0 = extractFloat128Frac0( a );
   4356     aExp = extractFloat128Exp( a );
   4357     aSign = extractFloat128Sign( a );
   4358     aSig0 |= ( aSig1 != 0 );
   4359     if ( 0x401E < aExp ) {
   4360         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4361         goto invalid;
   4362     }
   4363     else if ( aExp < 0x3FFF ) {
   4364         if ( aExp || aSig0 ) set_float_exception_inexact_flag();
   4365         return 0;
   4366     }
   4367     aSig0 |= LIT64( 0x0001000000000000 );
   4368     shiftCount = 0x402F - aExp;
   4369     savedASig = aSig0;
   4370     aSig0 >>= shiftCount;
   4371     z = (int32)aSig0;
   4372     if ( aSign ) z = - z;
   4373     if ( ( z < 0 ) ^ aSign ) {
   4374  invalid:
   4375         float_raise( float_flag_invalid );
   4376         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4377     }
   4378     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4379         set_float_exception_inexact_flag();
   4380     }
   4381     return z;
   4382 
   4383 }
   4384 
   4385 /*
   4386 -------------------------------------------------------------------------------
   4387 Returns the result of converting the quadruple-precision floating-point
   4388 value `a' to the 64-bit two's complement integer format.  The conversion
   4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4390 Arithmetic---which means in particular that the conversion is rounded
   4391 according to the current rounding mode.  If `a' is a NaN, the largest
   4392 positive integer is returned.  Otherwise, if the conversion overflows, the
   4393 largest integer with the same sign as `a' is returned.
   4394 -------------------------------------------------------------------------------
   4395 */
   4396 int64 float128_to_int64( float128 a )
   4397 {
   4398     flag aSign;
   4399     int32 aExp, shiftCount;
   4400     bits64 aSig0, aSig1;
   4401 
   4402     aSig1 = extractFloat128Frac1( a );
   4403     aSig0 = extractFloat128Frac0( a );
   4404     aExp = extractFloat128Exp( a );
   4405     aSign = extractFloat128Sign( a );
   4406     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4407     shiftCount = 0x402F - aExp;
   4408     if ( shiftCount <= 0 ) {
   4409         if ( 0x403E < aExp ) {
   4410             float_raise( float_flag_invalid );
   4411             if (    ! aSign
   4412                  || (    ( aExp == 0x7FFF )
   4413                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4414                     )
   4415                ) {
   4416                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4417             }
   4418             return (sbits64) LIT64( 0x8000000000000000 );
   4419         }
   4420         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4421     }
   4422     else {
   4423         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4424     }
   4425     return roundAndPackInt64( aSign, aSig0, aSig1 );
   4426 
   4427 }
   4428 
   4429 /*
   4430 -------------------------------------------------------------------------------
   4431 Returns the result of converting the quadruple-precision floating-point
   4432 value `a' to the 64-bit two's complement integer format.  The conversion
   4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4434 Arithmetic, except that the conversion is always rounded toward zero.
   4435 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4436 the conversion overflows, the largest integer with the same sign as `a' is
   4437 returned.
   4438 -------------------------------------------------------------------------------
   4439 */
   4440 int64 float128_to_int64_round_to_zero( float128 a )
   4441 {
   4442     flag aSign;
   4443     int32 aExp, shiftCount;
   4444     bits64 aSig0, aSig1;
   4445     int64 z;
   4446 
   4447     aSig1 = extractFloat128Frac1( a );
   4448     aSig0 = extractFloat128Frac0( a );
   4449     aExp = extractFloat128Exp( a );
   4450     aSign = extractFloat128Sign( a );
   4451     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4452     shiftCount = aExp - 0x402F;
   4453     if ( 0 < shiftCount ) {
   4454         if ( 0x403E <= aExp ) {
   4455             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4456             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4457                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4458                 if ( aSig1 ) set_float_exception_inexact_flag();
   4459             }
   4460             else {
   4461                 float_raise( float_flag_invalid );
   4462                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4463                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4464                 }
   4465             }
   4466             return (sbits64) LIT64( 0x8000000000000000 );
   4467         }
   4468         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4469         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4470             set_float_exception_inexact_flag();
   4471         }
   4472     }
   4473     else {
   4474         if ( aExp < 0x3FFF ) {
   4475             if ( aExp | aSig0 | aSig1 ) {
   4476                 set_float_exception_inexact_flag();
   4477             }
   4478             return 0;
   4479         }
   4480         z = aSig0>>( - shiftCount );
   4481         if (    aSig1
   4482              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4483             set_float_exception_inexact_flag();
   4484         }
   4485     }
   4486     if ( aSign ) z = - z;
   4487     return z;
   4488 
   4489 }
   4490 
   4491 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
   4492     && defined(SOFTFLOAT_NEED_FIXUNS)
   4493 /*
   4494  * just like above - but do not care for overflow of signed results
   4495  */
   4496 uint64 float128_to_uint64_round_to_zero( float128 a )
   4497 {
   4498     flag aSign;
   4499     int32 aExp, shiftCount;
   4500     bits64 aSig0, aSig1;
   4501     uint64 z;
   4502 
   4503     aSig1 = extractFloat128Frac1( a );
   4504     aSig0 = extractFloat128Frac0( a );
   4505     aExp = extractFloat128Exp( a );
   4506     aSign = extractFloat128Sign( a );
   4507     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4508     shiftCount = aExp - 0x402F;
   4509     if ( 0 < shiftCount ) {
   4510         if ( 0x403F <= aExp ) {
   4511             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4512             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4513                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4514                 if ( aSig1 ) set_float_exception_inexact_flag();
   4515             }
   4516             else {
   4517                 float_raise( float_flag_invalid );
   4518             }
   4519             return LIT64( 0xFFFFFFFFFFFFFFFF );
   4520         }
   4521         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4522         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4523             set_float_exception_inexact_flag();
   4524         }
   4525     }
   4526     else {
   4527         if ( aExp < 0x3FFF ) {
   4528             if ( aExp | aSig0 | aSig1 ) {
   4529                 set_float_exception_inexact_flag();
   4530             }
   4531             return 0;
   4532         }
   4533         z = aSig0>>( - shiftCount );
   4534         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4535             set_float_exception_inexact_flag();
   4536         }
   4537     }
   4538     if ( aSign ) z = - z;
   4539     return z;
   4540 
   4541 }
   4542 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
   4543 
   4544 /*
   4545 -------------------------------------------------------------------------------
   4546 Returns the result of converting the quadruple-precision floating-point
   4547 value `a' to the single-precision floating-point format.  The conversion
   4548 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4549 Arithmetic.
   4550 -------------------------------------------------------------------------------
   4551 */
   4552 float32 float128_to_float32( float128 a )
   4553 {
   4554     flag aSign;
   4555     int32 aExp;
   4556     bits64 aSig0, aSig1;
   4557     bits32 zSig;
   4558 
   4559     aSig1 = extractFloat128Frac1( a );
   4560     aSig0 = extractFloat128Frac0( a );
   4561     aExp = extractFloat128Exp( a );
   4562     aSign = extractFloat128Sign( a );
   4563     if ( aExp == 0x7FFF ) {
   4564         if ( aSig0 | aSig1 ) {
   4565             return commonNaNToFloat32( float128ToCommonNaN( a ) );
   4566         }
   4567         return packFloat32( aSign, 0xFF, 0 );
   4568     }
   4569     aSig0 |= ( aSig1 != 0 );
   4570     shift64RightJamming( aSig0, 18, &aSig0 );
   4571     zSig = (bits32)aSig0;
   4572     if ( aExp || zSig ) {
   4573         zSig |= 0x40000000;
   4574         aExp -= 0x3F81;
   4575     }
   4576     return roundAndPackFloat32( aSign, aExp, zSig );
   4577 
   4578 }
   4579 
   4580 /*
   4581 -------------------------------------------------------------------------------
   4582 Returns the result of converting the quadruple-precision floating-point
   4583 value `a' to the double-precision floating-point format.  The conversion
   4584 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4585 Arithmetic.
   4586 -------------------------------------------------------------------------------
   4587 */
   4588 float64 float128_to_float64( float128 a )
   4589 {
   4590     flag aSign;
   4591     int32 aExp;
   4592     bits64 aSig0, aSig1;
   4593 
   4594     aSig1 = extractFloat128Frac1( a );
   4595     aSig0 = extractFloat128Frac0( a );
   4596     aExp = extractFloat128Exp( a );
   4597     aSign = extractFloat128Sign( a );
   4598     if ( aExp == 0x7FFF ) {
   4599         if ( aSig0 | aSig1 ) {
   4600             return commonNaNToFloat64( float128ToCommonNaN( a ) );
   4601         }
   4602         return packFloat64( aSign, 0x7FF, 0 );
   4603     }
   4604     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4605     aSig0 |= ( aSig1 != 0 );
   4606     if ( aExp || aSig0 ) {
   4607         aSig0 |= LIT64( 0x4000000000000000 );
   4608         aExp -= 0x3C01;
   4609     }
   4610     return roundAndPackFloat64( aSign, aExp, aSig0 );
   4611 
   4612 }
   4613 
   4614 #ifdef FLOATX80
   4615 
   4616 /*
   4617 -------------------------------------------------------------------------------
   4618 Returns the result of converting the quadruple-precision floating-point
   4619 value `a' to the extended double-precision floating-point format.  The
   4620 conversion is performed according to the IEC/IEEE Standard for Binary
   4621 Floating-Point Arithmetic.
   4622 -------------------------------------------------------------------------------
   4623 */
   4624 floatx80 float128_to_floatx80( float128 a )
   4625 {
   4626     flag aSign;
   4627     int32 aExp;
   4628     bits64 aSig0, aSig1;
   4629 
   4630     aSig1 = extractFloat128Frac1( a );
   4631     aSig0 = extractFloat128Frac0( a );
   4632     aExp = extractFloat128Exp( a );
   4633     aSign = extractFloat128Sign( a );
   4634     if ( aExp == 0x7FFF ) {
   4635         if ( aSig0 | aSig1 ) {
   4636             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
   4637         }
   4638         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4639     }
   4640     if ( aExp == 0 ) {
   4641         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4642         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4643     }
   4644     else {
   4645         aSig0 |= LIT64( 0x0001000000000000 );
   4646     }
   4647     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   4648     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
   4649 
   4650 }
   4651 
   4652 #endif
   4653 
   4654 /*
   4655 -------------------------------------------------------------------------------
   4656 Rounds the quadruple-precision floating-point value `a' to an integer, and
   4657 returns the result as a quadruple-precision floating-point value.  The
   4658 operation is performed according to the IEC/IEEE Standard for Binary
   4659 Floating-Point Arithmetic.
   4660 -------------------------------------------------------------------------------
   4661 */
   4662 float128 float128_round_to_int( float128 a )
   4663 {
   4664     flag aSign;
   4665     int32 aExp;
   4666     bits64 lastBitMask, roundBitsMask;
   4667     int8 roundingMode;
   4668     float128 z;
   4669 
   4670     aExp = extractFloat128Exp( a );
   4671     if ( 0x402F <= aExp ) {
   4672         if ( 0x406F <= aExp ) {
   4673             if (    ( aExp == 0x7FFF )
   4674                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4675                ) {
   4676                 return propagateFloat128NaN( a, a );
   4677             }
   4678             return a;
   4679         }
   4680         lastBitMask = 1;
   4681         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
   4682         roundBitsMask = lastBitMask - 1;
   4683         z = a;
   4684         roundingMode = float_rounding_mode;
   4685         if ( roundingMode == float_round_nearest_even ) {
   4686             if ( lastBitMask ) {
   4687                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   4688                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4689             }
   4690             else {
   4691                 if ( (sbits64) z.low < 0 ) {
   4692                     ++z.high;
   4693                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
   4694                 }
   4695             }
   4696         }
   4697         else if ( roundingMode != float_round_to_zero ) {
   4698             if (   extractFloat128Sign( z )
   4699                  ^ ( roundingMode == float_round_up ) ) {
   4700                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   4701             }
   4702         }
   4703         z.low &= ~ roundBitsMask;
   4704     }
   4705     else {
   4706         if ( aExp < 0x3FFF ) {
   4707             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   4708             set_float_exception_inexact_flag();
   4709             aSign = extractFloat128Sign( a );
   4710             switch ( float_rounding_mode ) {
   4711              case float_round_nearest_even:
   4712                 if (    ( aExp == 0x3FFE )
   4713                      && (   extractFloat128Frac0( a )
   4714                           | extractFloat128Frac1( a ) )
   4715                    ) {
   4716                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4717                 }
   4718                 break;
   4719              case float_round_to_zero:
   4720                 break;
   4721              case float_round_down:
   4722                 return
   4723                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4724                     : packFloat128( 0, 0, 0, 0 );
   4725              case float_round_up:
   4726                 return
   4727                       aSign ? packFloat128( 1, 0, 0, 0 )
   4728                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4729             }
   4730             return packFloat128( aSign, 0, 0, 0 );
   4731         }
   4732         lastBitMask = 1;
   4733         lastBitMask <<= 0x402F - aExp;
   4734         roundBitsMask = lastBitMask - 1;
   4735         z.low = 0;
   4736         z.high = a.high;
   4737         roundingMode = float_rounding_mode;
   4738         if ( roundingMode == float_round_nearest_even ) {
   4739             z.high += lastBitMask>>1;
   4740             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   4741                 z.high &= ~ lastBitMask;
   4742             }
   4743         }
   4744         else if ( roundingMode != float_round_to_zero ) {
   4745             if (   extractFloat128Sign( z )
   4746                  ^ ( roundingMode == float_round_up ) ) {
   4747                 z.high |= ( a.low != 0 );
   4748                 z.high += roundBitsMask;
   4749             }
   4750         }
   4751         z.high &= ~ roundBitsMask;
   4752     }
   4753     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   4754         set_float_exception_inexact_flag();
   4755     }
   4756     return z;
   4757 
   4758 }
   4759 
   4760 /*
   4761 -------------------------------------------------------------------------------
   4762 Returns the result of adding the absolute values of the quadruple-precision
   4763 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4764 before being returned.  `zSign' is ignored if the result is a NaN.
   4765 The addition is performed according to the IEC/IEEE Standard for Binary
   4766 Floating-Point Arithmetic.
   4767 -------------------------------------------------------------------------------
   4768 */
   4769 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
   4770 {
   4771     int32 aExp, bExp, zExp;
   4772     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4773     int32 expDiff;
   4774 
   4775     aSig1 = extractFloat128Frac1( a );
   4776     aSig0 = extractFloat128Frac0( a );
   4777     aExp = extractFloat128Exp( a );
   4778     bSig1 = extractFloat128Frac1( b );
   4779     bSig0 = extractFloat128Frac0( b );
   4780     bExp = extractFloat128Exp( b );
   4781     expDiff = aExp - bExp;
   4782     if ( 0 < expDiff ) {
   4783         if ( aExp == 0x7FFF ) {
   4784             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4785             return a;
   4786         }
   4787         if ( bExp == 0 ) {
   4788             --expDiff;
   4789         }
   4790         else {
   4791             bSig0 |= LIT64( 0x0001000000000000 );
   4792         }
   4793         shift128ExtraRightJamming(
   4794             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   4795         zExp = aExp;
   4796     }
   4797     else if ( expDiff < 0 ) {
   4798         if ( bExp == 0x7FFF ) {
   4799             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4800             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4801         }
   4802         if ( aExp == 0 ) {
   4803             ++expDiff;
   4804         }
   4805         else {
   4806             aSig0 |= LIT64( 0x0001000000000000 );
   4807         }
   4808         shift128ExtraRightJamming(
   4809             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4810         zExp = bExp;
   4811     }
   4812     else {
   4813         if ( aExp == 0x7FFF ) {
   4814             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4815                 return propagateFloat128NaN( a, b );
   4816             }
   4817             return a;
   4818         }
   4819         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4820         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
   4821         zSig2 = 0;
   4822         zSig0 |= LIT64( 0x0002000000000000 );
   4823         zExp = aExp;
   4824         goto shiftRight1;
   4825     }
   4826     aSig0 |= LIT64( 0x0001000000000000 );
   4827     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4828     --zExp;
   4829     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   4830     ++zExp;
   4831  shiftRight1:
   4832     shift128ExtraRightJamming(
   4833         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4834  roundAndPack:
   4835     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4836 
   4837 }
   4838 
   4839 /*
   4840 -------------------------------------------------------------------------------
   4841 Returns the result of subtracting the absolute values of the quadruple-
   4842 precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4843 difference is negated before being returned.  `zSign' is ignored if the
   4844 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4845 Standard for Binary Floating-Point Arithmetic.
   4846 -------------------------------------------------------------------------------
   4847 */
   4848 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
   4849 {
   4850     int32 aExp, bExp, zExp;
   4851     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4852     int32 expDiff;
   4853     float128 z;
   4854 
   4855     aSig1 = extractFloat128Frac1( a );
   4856     aSig0 = extractFloat128Frac0( a );
   4857     aExp = extractFloat128Exp( a );
   4858     bSig1 = extractFloat128Frac1( b );
   4859     bSig0 = extractFloat128Frac0( b );
   4860     bExp = extractFloat128Exp( b );
   4861     expDiff = aExp - bExp;
   4862     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4863     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4864     if ( 0 < expDiff ) goto aExpBigger;
   4865     if ( expDiff < 0 ) goto bExpBigger;
   4866     if ( aExp == 0x7FFF ) {
   4867         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4868             return propagateFloat128NaN( a, b );
   4869         }
   4870         float_raise( float_flag_invalid );
   4871         z.low = float128_default_nan_low;
   4872         z.high = float128_default_nan_high;
   4873         return z;
   4874     }
   4875     if ( aExp == 0 ) {
   4876         aExp = 1;
   4877         bExp = 1;
   4878     }
   4879     if ( bSig0 < aSig0 ) goto aBigger;
   4880     if ( aSig0 < bSig0 ) goto bBigger;
   4881     if ( bSig1 < aSig1 ) goto aBigger;
   4882     if ( aSig1 < bSig1 ) goto bBigger;
   4883     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
   4884  bExpBigger:
   4885     if ( bExp == 0x7FFF ) {
   4886         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4887         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   4888     }
   4889     if ( aExp == 0 ) {
   4890         ++expDiff;
   4891     }
   4892     else {
   4893         aSig0 |= LIT64( 0x4000000000000000 );
   4894     }
   4895     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4896     bSig0 |= LIT64( 0x4000000000000000 );
   4897  bBigger:
   4898     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4899     zExp = bExp;
   4900     zSign ^= 1;
   4901     goto normalizeRoundAndPack;
   4902  aExpBigger:
   4903     if ( aExp == 0x7FFF ) {
   4904         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4905         return a;
   4906     }
   4907     if ( bExp == 0 ) {
   4908         --expDiff;
   4909     }
   4910     else {
   4911         bSig0 |= LIT64( 0x4000000000000000 );
   4912     }
   4913     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4914     aSig0 |= LIT64( 0x4000000000000000 );
   4915  aBigger:
   4916     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4917     zExp = aExp;
   4918  normalizeRoundAndPack:
   4919     --zExp;
   4920     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
   4921 
   4922 }
   4923 
   4924 /*
   4925 -------------------------------------------------------------------------------
   4926 Returns the result of adding the quadruple-precision floating-point values
   4927 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4928 for Binary Floating-Point Arithmetic.
   4929 -------------------------------------------------------------------------------
   4930 */
   4931 float128 float128_add( float128 a, float128 b )
   4932 {
   4933     flag aSign, bSign;
   4934 
   4935     aSign = extractFloat128Sign( a );
   4936     bSign = extractFloat128Sign( b );
   4937     if ( aSign == bSign ) {
   4938         return addFloat128Sigs( a, b, aSign );
   4939     }
   4940     else {
   4941         return subFloat128Sigs( a, b, aSign );
   4942     }
   4943 
   4944 }
   4945 
   4946 /*
   4947 -------------------------------------------------------------------------------
   4948 Returns the result of subtracting the quadruple-precision floating-point
   4949 values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4950 Standard for Binary Floating-Point Arithmetic.
   4951 -------------------------------------------------------------------------------
   4952 */
   4953 float128 float128_sub( float128 a, float128 b )
   4954 {
   4955     flag aSign, bSign;
   4956 
   4957     aSign = extractFloat128Sign( a );
   4958     bSign = extractFloat128Sign( b );
   4959     if ( aSign == bSign ) {
   4960         return subFloat128Sigs( a, b, aSign );
   4961     }
   4962     else {
   4963         return addFloat128Sigs( a, b, aSign );
   4964     }
   4965 
   4966 }
   4967 
   4968 /*
   4969 -------------------------------------------------------------------------------
   4970 Returns the result of multiplying the quadruple-precision floating-point
   4971 values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4972 Standard for Binary Floating-Point Arithmetic.
   4973 -------------------------------------------------------------------------------
   4974 */
   4975 float128 float128_mul( float128 a, float128 b )
   4976 {
   4977     flag aSign, bSign, zSign;
   4978     int32 aExp, bExp, zExp;
   4979     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   4980     float128 z;
   4981 
   4982     aSig1 = extractFloat128Frac1( a );
   4983     aSig0 = extractFloat128Frac0( a );
   4984     aExp = extractFloat128Exp( a );
   4985     aSign = extractFloat128Sign( a );
   4986     bSig1 = extractFloat128Frac1( b );
   4987     bSig0 = extractFloat128Frac0( b );
   4988     bExp = extractFloat128Exp( b );
   4989     bSign = extractFloat128Sign( b );
   4990     zSign = aSign ^ bSign;
   4991     if ( aExp == 0x7FFF ) {
   4992         if (    ( aSig0 | aSig1 )
   4993              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4994             return propagateFloat128NaN( a, b );
   4995         }
   4996         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   4997         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4998     }
   4999     if ( bExp == 0x7FFF ) {
   5000         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5001         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   5002  invalid:
   5003             float_raise( float_flag_invalid );
   5004             z.low = float128_default_nan_low;
   5005             z.high = float128_default_nan_high;
   5006             return z;
   5007         }
   5008         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5009     }
   5010     if ( aExp == 0 ) {
   5011         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5012         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5013     }
   5014     if ( bExp == 0 ) {
   5015         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5016         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5017     }
   5018     zExp = aExp + bExp - 0x4000;
   5019     aSig0 |= LIT64( 0x0001000000000000 );
   5020     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   5021     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   5022     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
   5023     zSig2 |= ( zSig3 != 0 );
   5024     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   5025         shift128ExtraRightJamming(
   5026             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   5027         ++zExp;
   5028     }
   5029     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   5030 
   5031 }
   5032 
   5033 /*
   5034 -------------------------------------------------------------------------------
   5035 Returns the result of dividing the quadruple-precision floating-point value
   5036 `a' by the corresponding value `b'.  The operation is performed according to
   5037 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5038 -------------------------------------------------------------------------------
   5039 */
   5040 float128 float128_div( float128 a, float128 b )
   5041 {
   5042     flag aSign, bSign, zSign;
   5043     int32 aExp, bExp, zExp;
   5044     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   5045     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5046     float128 z;
   5047 
   5048     aSig1 = extractFloat128Frac1( a );
   5049     aSig0 = extractFloat128Frac0( a );
   5050     aExp = extractFloat128Exp( a );
   5051     aSign = extractFloat128Sign( a );
   5052     bSig1 = extractFloat128Frac1( b );
   5053     bSig0 = extractFloat128Frac0( b );
   5054     bExp = extractFloat128Exp( b );
   5055     bSign = extractFloat128Sign( b );
   5056     zSign = aSign ^ bSign;
   5057     if ( aExp == 0x7FFF ) {
   5058         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   5059         if ( bExp == 0x7FFF ) {
   5060             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5061             goto invalid;
   5062         }
   5063         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5064     }
   5065     if ( bExp == 0x7FFF ) {
   5066         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5067         return packFloat128( zSign, 0, 0, 0 );
   5068     }
   5069     if ( bExp == 0 ) {
   5070         if ( ( bSig0 | bSig1 ) == 0 ) {
   5071             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   5072  invalid:
   5073                 float_raise( float_flag_invalid );
   5074                 z.low = float128_default_nan_low;
   5075                 z.high = float128_default_nan_high;
   5076                 return z;
   5077             }
   5078             float_raise( float_flag_divbyzero );
   5079             return packFloat128( zSign, 0x7FFF, 0, 0 );
   5080         }
   5081         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5082     }
   5083     if ( aExp == 0 ) {
   5084         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5085         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5086     }
   5087     zExp = aExp - bExp + 0x3FFD;
   5088     shortShift128Left(
   5089         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
   5090     shortShift128Left(
   5091         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5092     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
   5093         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   5094         ++zExp;
   5095     }
   5096     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5097     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   5098     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   5099     while ( (sbits64) rem0 < 0 ) {
   5100         --zSig0;
   5101         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   5102     }
   5103     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
   5104     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
   5105         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   5106         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   5107         while ( (sbits64) rem1 < 0 ) {
   5108             --zSig1;
   5109             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   5110         }
   5111         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5112     }
   5113     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
   5114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   5115 
   5116 }
   5117 
   5118 /*
   5119 -------------------------------------------------------------------------------
   5120 Returns the remainder of the quadruple-precision floating-point value `a'
   5121 with respect to the corresponding value `b'.  The operation is performed
   5122 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5123 -------------------------------------------------------------------------------
   5124 */
   5125 float128 float128_rem( float128 a, float128 b )
   5126 {
   5127     flag aSign, zSign;
   5128     int32 aExp, bExp, expDiff;
   5129     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   5130     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   5131     sbits64 sigMean0;
   5132     float128 z;
   5133 
   5134     aSig1 = extractFloat128Frac1( a );
   5135     aSig0 = extractFloat128Frac0( a );
   5136     aExp = extractFloat128Exp( a );
   5137     aSign = extractFloat128Sign( a );
   5138     bSig1 = extractFloat128Frac1( b );
   5139     bSig0 = extractFloat128Frac0( b );
   5140     bExp = extractFloat128Exp( b );
   5141     if ( aExp == 0x7FFF ) {
   5142         if (    ( aSig0 | aSig1 )
   5143              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5144             return propagateFloat128NaN( a, b );
   5145         }
   5146         goto invalid;
   5147     }
   5148     if ( bExp == 0x7FFF ) {
   5149         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5150         return a;
   5151     }
   5152     if ( bExp == 0 ) {
   5153         if ( ( bSig0 | bSig1 ) == 0 ) {
   5154  invalid:
   5155             float_raise( float_flag_invalid );
   5156             z.low = float128_default_nan_low;
   5157             z.high = float128_default_nan_high;
   5158             return z;
   5159         }
   5160         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5161     }
   5162     if ( aExp == 0 ) {
   5163         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   5164         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5165     }
   5166     expDiff = aExp - bExp;
   5167     if ( expDiff < -1 ) return a;
   5168     shortShift128Left(
   5169         aSig0 | LIT64( 0x0001000000000000 ),
   5170         aSig1,
   5171         15 - ( expDiff < 0 ),
   5172         &aSig0,
   5173         &aSig1
   5174     );
   5175     shortShift128Left(
   5176         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5177     q = le128( bSig0, bSig1, aSig0, aSig1 );
   5178     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5179     expDiff -= 64;
   5180     while ( 0 < expDiff ) {
   5181         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5182         q = ( 4 < q ) ? q - 4 : 0;
   5183         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5184         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5185         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5186         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5187         expDiff -= 61;
   5188     }
   5189     if ( -64 < expDiff ) {
   5190         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5191         q = ( 4 < q ) ? q - 4 : 0;
   5192         q >>= - expDiff;
   5193         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5194         expDiff += 52;
   5195         if ( expDiff < 0 ) {
   5196             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5197         }
   5198         else {
   5199             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5200         }
   5201         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5202         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5203     }
   5204     else {
   5205         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5206         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5207     }
   5208     do {
   5209         alternateASig0 = aSig0;
   5210         alternateASig1 = aSig1;
   5211         ++q;
   5212         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5213     } while ( 0 <= (sbits64) aSig0 );
   5214     add128(
   5215         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
   5216     if (    ( sigMean0 < 0 )
   5217          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5218         aSig0 = alternateASig0;
   5219         aSig1 = alternateASig1;
   5220     }
   5221     zSign = ( (sbits64) aSig0 < 0 );
   5222     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5223     return
   5224         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
   5225 
   5226 }
   5227 
   5228 /*
   5229 -------------------------------------------------------------------------------
   5230 Returns the square root of the quadruple-precision floating-point value `a'.
   5231 The operation is performed according to the IEC/IEEE Standard for Binary
   5232 Floating-Point Arithmetic.
   5233 -------------------------------------------------------------------------------
   5234 */
   5235 float128 float128_sqrt( float128 a )
   5236 {
   5237     flag aSign;
   5238     int32 aExp, zExp;
   5239     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5240     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5241     float128 z;
   5242 
   5243     aSig1 = extractFloat128Frac1( a );
   5244     aSig0 = extractFloat128Frac0( a );
   5245     aExp = extractFloat128Exp( a );
   5246     aSign = extractFloat128Sign( a );
   5247     if ( aExp == 0x7FFF ) {
   5248         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
   5249         if ( ! aSign ) return a;
   5250         goto invalid;
   5251     }
   5252     if ( aSign ) {
   5253         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5254  invalid:
   5255         float_raise( float_flag_invalid );
   5256         z.low = float128_default_nan_low;
   5257         z.high = float128_default_nan_high;
   5258         return z;
   5259     }
   5260     if ( aExp == 0 ) {
   5261         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5262         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5263     }
   5264     zExp = (int32) ( (aExp - 0x3FFF) >> 1) + 0x3FFE;
   5265     aSig0 |= LIT64( 0x0001000000000000 );
   5266     zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
   5267     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5268     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5269     doubleZSig0 = zSig0<<1;
   5270     mul64To128( zSig0, zSig0, &term0, &term1 );
   5271     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5272     while ( (sbits64) rem0 < 0 ) {
   5273         --zSig0;
   5274         doubleZSig0 -= 2;
   5275         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5276     }
   5277     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5278     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5279         if ( zSig1 == 0 ) zSig1 = 1;
   5280         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5281         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5282         mul64To128( zSig1, zSig1, &term2, &term3 );
   5283         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5284         while ( (sbits64) rem1 < 0 ) {
   5285             --zSig1;
   5286             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5287             term3 |= 1;
   5288             term2 |= doubleZSig0;
   5289             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5290         }
   5291         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5292     }
   5293     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5294     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
   5295 
   5296 }
   5297 
   5298 /*
   5299 -------------------------------------------------------------------------------
   5300 Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5301 the corresponding value `b', and 0 otherwise.  The comparison is performed
   5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5303 -------------------------------------------------------------------------------
   5304 */
   5305 flag float128_eq( float128 a, float128 b )
   5306 {
   5307 
   5308     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5309               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5310          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5311               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5312        ) {
   5313         if (    float128_is_signaling_nan( a )
   5314              || float128_is_signaling_nan( b ) ) {
   5315             float_raise( float_flag_invalid );
   5316         }
   5317         return 0;
   5318     }
   5319     return
   5320            ( a.low == b.low )
   5321         && (    ( a.high == b.high )
   5322              || (    ( a.low == 0 )
   5323                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5324            );
   5325 
   5326 }
   5327 
   5328 /*
   5329 -------------------------------------------------------------------------------
   5330 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5331 or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5332 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5333 Arithmetic.
   5334 -------------------------------------------------------------------------------
   5335 */
   5336 flag float128_le( float128 a, float128 b )
   5337 {
   5338     flag aSign, bSign;
   5339 
   5340     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5341               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5342          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5343               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5344        ) {
   5345         float_raise( float_flag_invalid );
   5346         return 0;
   5347     }
   5348     aSign = extractFloat128Sign( a );
   5349     bSign = extractFloat128Sign( b );
   5350     if ( aSign != bSign ) {
   5351         return
   5352                aSign
   5353             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5354                  == 0 );
   5355     }
   5356     return
   5357           aSign ? le128( b.high, b.low, a.high, a.low )
   5358         : le128( a.high, a.low, b.high, b.low );
   5359 
   5360 }
   5361 
   5362 /*
   5363 -------------------------------------------------------------------------------
   5364 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5365 the corresponding value `b', and 0 otherwise.  The comparison is performed
   5366 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5367 -------------------------------------------------------------------------------
   5368 */
   5369 flag float128_lt( float128 a, float128 b )
   5370 {
   5371     flag aSign, bSign;
   5372 
   5373     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5374               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5375          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5376               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5377        ) {
   5378         float_raise( float_flag_invalid );
   5379         return 0;
   5380     }
   5381     aSign = extractFloat128Sign( a );
   5382     bSign = extractFloat128Sign( b );
   5383     if ( aSign != bSign ) {
   5384         return
   5385                aSign
   5386             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5387                  != 0 );
   5388     }
   5389     return
   5390           aSign ? lt128( b.high, b.low, a.high, a.low )
   5391         : lt128( a.high, a.low, b.high, b.low );
   5392 
   5393 }
   5394 
   5395 /*
   5396 -------------------------------------------------------------------------------
   5397 Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5398 the corresponding value `b', and 0 otherwise.  The invalid exception is
   5399 raised if either operand is a NaN.  Otherwise, the comparison is performed
   5400 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5401 -------------------------------------------------------------------------------
   5402 */
   5403 flag float128_eq_signaling( float128 a, float128 b )
   5404 {
   5405 
   5406     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5407               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5408          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5409               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5410        ) {
   5411         float_raise( float_flag_invalid );
   5412         return 0;
   5413     }
   5414     return
   5415            ( a.low == b.low )
   5416         && (    ( a.high == b.high )
   5417              || (    ( a.low == 0 )
   5418                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5419            );
   5420 
   5421 }
   5422 
   5423 /*
   5424 -------------------------------------------------------------------------------
   5425 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5426 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5427 cause an exception.  Otherwise, the comparison is performed according to the
   5428 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5429 -------------------------------------------------------------------------------
   5430 */
   5431 flag float128_le_quiet( float128 a, float128 b )
   5432 {
   5433     flag aSign, bSign;
   5434 
   5435     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5436               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5437          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5438               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5439        ) {
   5440         if (    float128_is_signaling_nan( a )
   5441              || float128_is_signaling_nan( b ) ) {
   5442             float_raise( float_flag_invalid );
   5443         }
   5444         return 0;
   5445     }
   5446     aSign = extractFloat128Sign( a );
   5447     bSign = extractFloat128Sign( b );
   5448     if ( aSign != bSign ) {
   5449         return
   5450                aSign
   5451             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5452                  == 0 );
   5453     }
   5454     return
   5455           aSign ? le128( b.high, b.low, a.high, a.low )
   5456         : le128( a.high, a.low, b.high, b.low );
   5457 
   5458 }
   5459 
   5460 /*
   5461 -------------------------------------------------------------------------------
   5462 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5463 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5464 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5465 Standard for Binary Floating-Point Arithmetic.
   5466 -------------------------------------------------------------------------------
   5467 */
   5468 flag float128_lt_quiet( float128 a, float128 b )
   5469 {
   5470     flag aSign, bSign;
   5471 
   5472     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5473               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5474          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5475               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5476        ) {
   5477         if (    float128_is_signaling_nan( a )
   5478              || float128_is_signaling_nan( b ) ) {
   5479             float_raise( float_flag_invalid );
   5480         }
   5481         return 0;
   5482     }
   5483     aSign = extractFloat128Sign( a );
   5484     bSign = extractFloat128Sign( b );
   5485     if ( aSign != bSign ) {
   5486         return
   5487                aSign
   5488             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5489                  != 0 );
   5490     }
   5491     return
   5492           aSign ? lt128( b.high, b.low, a.high, a.low )
   5493         : lt128( a.high, a.low, b.high, b.low );
   5494 
   5495 }
   5496 
   5497 #endif
   5498 
   5499 
   5500 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
   5501 
   5502 /*
   5503  * These two routines are not part of the original softfloat distribution.
   5504  *
   5505  * They are based on the corresponding conversions to integer but return
   5506  * unsigned numbers instead since these functions are required by GCC.
   5507  *
   5508  * Added by Mark Brinicombe <mark (at) NetBSD.org>   27/09/97
   5509  *
   5510  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
   5511  */
   5512 
   5513 /*
   5514 -------------------------------------------------------------------------------
   5515 Returns the result of converting the double-precision floating-point value
   5516 `a' to the 32-bit unsigned integer format.  The conversion is
   5517 performed according to the IEC/IEEE Standard for Binary Floating-point
   5518 Arithmetic, except that the conversion is always rounded toward zero.  If
   5519 `a' is a NaN, the largest positive integer is returned.  If the conversion
   5520 overflows, the largest integer positive is returned.
   5521 -------------------------------------------------------------------------------
   5522 */
   5523 uint32 float64_to_uint32_round_to_zero( float64 a )
   5524 {
   5525     flag aSign;
   5526     int16 aExp, shiftCount;
   5527     bits64 aSig, savedASig;
   5528     uint32 z;
   5529 
   5530     aSig = extractFloat64Frac( a );
   5531     aExp = extractFloat64Exp( a );
   5532     aSign = extractFloat64Sign( a );
   5533 
   5534     if (aSign) {
   5535         float_raise( float_flag_invalid );
   5536         return(0);
   5537     }
   5538 
   5539     if ( 0x41E < aExp ) {
   5540         float_raise( float_flag_invalid );
   5541         return 0xffffffff;
   5542     }
   5543     else if ( aExp < 0x3FF ) {
   5544         if ( aExp || aSig ) set_float_exception_inexact_flag();
   5545         return 0;
   5546     }
   5547     aSig |= LIT64( 0x0010000000000000 );
   5548     shiftCount = 0x433 - aExp;
   5549     savedASig = aSig;
   5550     aSig >>= shiftCount;
   5551     z = (uint32)aSig;
   5552     if ( ( aSig<<shiftCount ) != savedASig ) {
   5553         set_float_exception_inexact_flag();
   5554     }
   5555     return z;
   5556 
   5557 }
   5558 
   5559 /*
   5560 -------------------------------------------------------------------------------
   5561 Returns the result of converting the single-precision floating-point value
   5562 `a' to the 32-bit unsigned integer format.  The conversion is
   5563 performed according to the IEC/IEEE Standard for Binary Floating-point
   5564 Arithmetic, except that the conversion is always rounded toward zero.  If
   5565 `a' is a NaN, the largest positive integer is returned.  If the conversion
   5566 overflows, the largest positive integer is returned.
   5567 -------------------------------------------------------------------------------
   5568 */
   5569 uint32 float32_to_uint32_round_to_zero( float32 a )
   5570 {
   5571     flag aSign;
   5572     int16 aExp, shiftCount;
   5573     bits32 aSig;
   5574     uint32 z;
   5575 
   5576     aSig = extractFloat32Frac( a );
   5577     aExp = extractFloat32Exp( a );
   5578     aSign = extractFloat32Sign( a );
   5579     shiftCount = aExp - 0x9E;
   5580 
   5581     if (aSign) {
   5582         float_raise( float_flag_invalid );
   5583         return(0);
   5584     }
   5585     if ( 0 < shiftCount ) {
   5586         float_raise( float_flag_invalid );
   5587         return 0xFFFFFFFF;
   5588     }
   5589     else if ( aExp <= 0x7E ) {
   5590         if ( aExp | aSig ) set_float_exception_inexact_flag();
   5591         return 0;
   5592     }
   5593     aSig = ( aSig | 0x800000 )<<8;
   5594     z = aSig>>( - shiftCount );
   5595     if ( aSig<<( shiftCount & 31 ) ) {
   5596         set_float_exception_inexact_flag();
   5597     }
   5598     return z;
   5599 
   5600 }
   5601 
   5602 #endif
   5603