Home | History | Annotate | Download | only in fpu
      1 /*
      2  * QEMU float support
      3  *
      4  * Derived from SoftFloat.
      5  */
      6 
      7 /*============================================================================
      8 
      9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
     10 Package, Release 2b.
     11 
     12 Written by John R. Hauser.  This work was made possible in part by the
     13 International Computer Science Institute, located at Suite 600, 1947 Center
     14 Street, Berkeley, California 94704.  Funding was partially provided by the
     15 National Science Foundation under grant MIP-9311980.  The original version
     16 of this code was written as part of a project to build a fixed-point vector
     17 processor in collaboration with the University of California at Berkeley,
     18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
     20 arithmetic/SoftFloat.html'.
     21 
     22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
     23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
     24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
     25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
     26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
     27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
     28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
     29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
     30 
     31 Derivative works are acceptable, even for commercial purposes, so long as
     32 (1) the source code for the derivative work includes prominent notice that
     33 the work is derivative, and (2) the source code includes prominent notice with
     34 these four paragraphs for those parts of this code that are retained.
     35 
     36 =============================================================================*/
     37 
     38 #include "softfloat.h"
     39 
     40 /*----------------------------------------------------------------------------
     41 | Primitive arithmetic functions, including multi-word arithmetic, and
     42 | division and square root approximations.  (Can be specialized to target if
     43 | desired.)
     44 *----------------------------------------------------------------------------*/
     45 #include "softfloat-macros.h"
     46 
     47 /*----------------------------------------------------------------------------
     48 | Functions and definitions to determine:  (1) whether tininess for underflow
     49 | is detected before or after rounding by default, (2) what (if anything)
     50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
     51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     52 | are propagated from function inputs to output.  These details are target-
     53 | specific.
     54 *----------------------------------------------------------------------------*/
     55 #include "softfloat-specialize.h"
     56 
     57 void set_float_rounding_mode(int val STATUS_PARAM)
     58 {
     59     STATUS(float_rounding_mode) = val;
     60 }
     61 
     62 void set_float_exception_flags(int val STATUS_PARAM)
     63 {
     64     STATUS(float_exception_flags) = val;
     65 }
     66 
     67 #ifdef FLOATX80
     68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
     69 {
     70     STATUS(floatx80_rounding_precision) = val;
     71 }
     72 #endif
     73 
     74 /*----------------------------------------------------------------------------
     75 | Returns the fraction bits of the half-precision floating-point value `a'.
     76 *----------------------------------------------------------------------------*/
     77 
     78 INLINE uint32_t extractFloat16Frac(float16 a)
     79 {
     80     return float16_val(a) & 0x3ff;
     81 }
     82 
     83 /*----------------------------------------------------------------------------
     84 | Returns the exponent bits of the half-precision floating-point value `a'.
     85 *----------------------------------------------------------------------------*/
     86 
     87 INLINE int16 extractFloat16Exp(float16 a)
     88 {
     89     return (float16_val(a) >> 10) & 0x1f;
     90 }
     91 
     92 /*----------------------------------------------------------------------------
     93 | Returns the sign bit of the half-precision floating-point value `a'.
     94 *----------------------------------------------------------------------------*/
     95 
     96 INLINE flag extractFloat16Sign(float16 a)
     97 {
     98     return float16_val(a)>>15;
     99 }
    100 
    101 /*----------------------------------------------------------------------------
    102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
    103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
    104 | input.  If `zSign' is 1, the input is negated before being converted to an
    105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
    106 | is simply rounded to an integer, with the inexact exception raised if the
    107 | input cannot be represented exactly as an integer.  However, if the fixed-
    108 | point input is too large, the invalid exception is raised and the largest
    109 | positive or negative integer is returned.
    110 *----------------------------------------------------------------------------*/
    111 
    112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
    113 {
    114     int8 roundingMode;
    115     flag roundNearestEven;
    116     int8 roundIncrement, roundBits;
    117     int32 z;
    118 
    119     roundingMode = STATUS(float_rounding_mode);
    120     roundNearestEven = ( roundingMode == float_round_nearest_even );
    121     roundIncrement = 0x40;
    122     if ( ! roundNearestEven ) {
    123         if ( roundingMode == float_round_to_zero ) {
    124             roundIncrement = 0;
    125         }
    126         else {
    127             roundIncrement = 0x7F;
    128             if ( zSign ) {
    129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    130             }
    131             else {
    132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    133             }
    134         }
    135     }
    136     roundBits = absZ & 0x7F;
    137     absZ = ( absZ + roundIncrement )>>7;
    138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    139     z = absZ;
    140     if ( zSign ) z = - z;
    141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    142         float_raise( float_flag_invalid STATUS_VAR);
    143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    144     }
    145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    146     return z;
    147 
    148 }
    149 
    150 /*----------------------------------------------------------------------------
    151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
    153 | and returns the properly rounded 64-bit integer corresponding to the input.
    154 | If `zSign' is 1, the input is negated before being converted to an integer.
    155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
    156 | the inexact exception raised if the input cannot be represented exactly as
    157 | an integer.  However, if the fixed-point input is too large, the invalid
    158 | exception is raised and the largest positive or negative integer is
    159 | returned.
    160 *----------------------------------------------------------------------------*/
    161 
    162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
    163 {
    164     int8 roundingMode;
    165     flag roundNearestEven, increment;
    166     int64 z;
    167 
    168     roundingMode = STATUS(float_rounding_mode);
    169     roundNearestEven = ( roundingMode == float_round_nearest_even );
    170     increment = ( (int64_t) absZ1 < 0 );
    171     if ( ! roundNearestEven ) {
    172         if ( roundingMode == float_round_to_zero ) {
    173             increment = 0;
    174         }
    175         else {
    176             if ( zSign ) {
    177                 increment = ( roundingMode == float_round_down ) && absZ1;
    178             }
    179             else {
    180                 increment = ( roundingMode == float_round_up ) && absZ1;
    181             }
    182         }
    183     }
    184     if ( increment ) {
    185         ++absZ0;
    186         if ( absZ0 == 0 ) goto overflow;
    187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    188     }
    189     z = absZ0;
    190     if ( zSign ) z = - z;
    191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    192  overflow:
    193         float_raise( float_flag_invalid STATUS_VAR);
    194         return
    195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
    196             : LIT64( 0x7FFFFFFFFFFFFFFF );
    197     }
    198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    199     return z;
    200 
    201 }
    202 
    203 /*----------------------------------------------------------------------------
    204 | Returns the fraction bits of the single-precision floating-point value `a'.
    205 *----------------------------------------------------------------------------*/
    206 
    207 INLINE uint32_t extractFloat32Frac( float32 a )
    208 {
    209 
    210     return float32_val(a) & 0x007FFFFF;
    211 
    212 }
    213 
    214 /*----------------------------------------------------------------------------
    215 | Returns the exponent bits of the single-precision floating-point value `a'.
    216 *----------------------------------------------------------------------------*/
    217 
    218 INLINE int16 extractFloat32Exp( float32 a )
    219 {
    220 
    221     return ( float32_val(a)>>23 ) & 0xFF;
    222 
    223 }
    224 
    225 /*----------------------------------------------------------------------------
    226 | Returns the sign bit of the single-precision floating-point value `a'.
    227 *----------------------------------------------------------------------------*/
    228 
    229 INLINE flag extractFloat32Sign( float32 a )
    230 {
    231 
    232     return float32_val(a)>>31;
    233 
    234 }
    235 
    236 /*----------------------------------------------------------------------------
    237 | If `a' is denormal and we are in flush-to-zero mode then set the
    238 | input-denormal exception and return zero. Otherwise just return the value.
    239 *----------------------------------------------------------------------------*/
    240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
    241 {
    242     if (STATUS(flush_inputs_to_zero)) {
    243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
    244             float_raise(float_flag_input_denormal STATUS_VAR);
    245             return make_float32(float32_val(a) & 0x80000000);
    246         }
    247     }
    248     return a;
    249 }
    250 
    251 /*----------------------------------------------------------------------------
    252 | Normalizes the subnormal single-precision floating-point value represented
    253 | by the denormalized significand `aSig'.  The normalized exponent and
    254 | significand are stored at the locations pointed to by `zExpPtr' and
    255 | `zSigPtr', respectively.
    256 *----------------------------------------------------------------------------*/
    257 
    258 static void
    259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
    260 {
    261     int8 shiftCount;
    262 
    263     shiftCount = countLeadingZeros32( aSig ) - 8;
    264     *zSigPtr = aSig<<shiftCount;
    265     *zExpPtr = 1 - shiftCount;
    266 
    267 }
    268 
    269 /*----------------------------------------------------------------------------
    270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    271 | single-precision floating-point value, returning the result.  After being
    272 | shifted into the proper positions, the three fields are simply added
    273 | together to form the result.  This means that any integer portion of `zSig'
    274 | will be added into the exponent.  Since a properly normalized significand
    275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    276 | than the desired result exponent whenever `zSig' is a complete, normalized
    277 | significand.
    278 *----------------------------------------------------------------------------*/
    279 
    280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
    281 {
    282 
    283     return make_float32(
    284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
    285 
    286 }
    287 
    288 /*----------------------------------------------------------------------------
    289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    290 | and significand `zSig', and returns the proper single-precision floating-
    291 | point value corresponding to the abstract input.  Ordinarily, the abstract
    292 | value is simply rounded and packed into the single-precision format, with
    293 | the inexact exception raised if the abstract input cannot be represented
    294 | exactly.  However, if the abstract value is too large, the overflow and
    295 | inexact exceptions are raised and an infinity or maximal finite value is
    296 | returned.  If the abstract value is too small, the input value is rounded to
    297 | a subnormal number, and the underflow and inexact exceptions are raised if
    298 | the abstract input cannot be represented exactly as a subnormal single-
    299 | precision floating-point number.
    300 |     The input significand `zSig' has its binary point between bits 30
    301 | and 29, which is 7 bits to the left of the usual location.  This shifted
    302 | significand must be normalized or smaller.  If `zSig' is not normalized,
    303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    304 | and it must not require rounding.  In the usual case that `zSig' is
    305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    307 | Binary Floating-Point Arithmetic.
    308 *----------------------------------------------------------------------------*/
    309 
    310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
    311 {
    312     int8 roundingMode;
    313     flag roundNearestEven;
    314     int8 roundIncrement, roundBits;
    315     flag isTiny;
    316 
    317     roundingMode = STATUS(float_rounding_mode);
    318     roundNearestEven = ( roundingMode == float_round_nearest_even );
    319     roundIncrement = 0x40;
    320     if ( ! roundNearestEven ) {
    321         if ( roundingMode == float_round_to_zero ) {
    322             roundIncrement = 0;
    323         }
    324         else {
    325             roundIncrement = 0x7F;
    326             if ( zSign ) {
    327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    328             }
    329             else {
    330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    331             }
    332         }
    333     }
    334     roundBits = zSig & 0x7F;
    335     if ( 0xFD <= (uint16_t) zExp ) {
    336         if (    ( 0xFD < zExp )
    337              || (    ( zExp == 0xFD )
    338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
    339            ) {
    340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
    342         }
    343         if ( zExp < 0 ) {
    344             if (STATUS(flush_to_zero)) {
    345                 float_raise(float_flag_output_denormal STATUS_VAR);
    346                 return packFloat32(zSign, 0, 0);
    347             }
    348             isTiny =
    349                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    350                 || ( zExp < -1 )
    351                 || ( zSig + roundIncrement < 0x80000000 );
    352             shift32RightJamming( zSig, - zExp, &zSig );
    353             zExp = 0;
    354             roundBits = zSig & 0x7F;
    355             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    356         }
    357     }
    358     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    359     zSig = ( zSig + roundIncrement )>>7;
    360     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    361     if ( zSig == 0 ) zExp = 0;
    362     return packFloat32( zSign, zExp, zSig );
    363 
    364 }
    365 
    366 /*----------------------------------------------------------------------------
    367 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    368 | and significand `zSig', and returns the proper single-precision floating-
    369 | point value corresponding to the abstract input.  This routine is just like
    370 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    371 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    372 | floating-point exponent.
    373 *----------------------------------------------------------------------------*/
    374 
    375 static float32
    376  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
    377 {
    378     int8 shiftCount;
    379 
    380     shiftCount = countLeadingZeros32( zSig ) - 1;
    381     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    382 
    383 }
    384 
    385 /*----------------------------------------------------------------------------
    386 | Returns the fraction bits of the double-precision floating-point value `a'.
    387 *----------------------------------------------------------------------------*/
    388 
    389 INLINE uint64_t extractFloat64Frac( float64 a )
    390 {
    391 
    392     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
    393 
    394 }
    395 
    396 /*----------------------------------------------------------------------------
    397 | Returns the exponent bits of the double-precision floating-point value `a'.
    398 *----------------------------------------------------------------------------*/
    399 
    400 INLINE int16 extractFloat64Exp( float64 a )
    401 {
    402 
    403     return ( float64_val(a)>>52 ) & 0x7FF;
    404 
    405 }
    406 
    407 /*----------------------------------------------------------------------------
    408 | Returns the sign bit of the double-precision floating-point value `a'.
    409 *----------------------------------------------------------------------------*/
    410 
    411 INLINE flag extractFloat64Sign( float64 a )
    412 {
    413 
    414     return float64_val(a)>>63;
    415 
    416 }
    417 
    418 /*----------------------------------------------------------------------------
    419 | If `a' is denormal and we are in flush-to-zero mode then set the
    420 | input-denormal exception and return zero. Otherwise just return the value.
    421 *----------------------------------------------------------------------------*/
    422 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
    423 {
    424     if (STATUS(flush_inputs_to_zero)) {
    425         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
    426             float_raise(float_flag_input_denormal STATUS_VAR);
    427             return make_float64(float64_val(a) & (1ULL << 63));
    428         }
    429     }
    430     return a;
    431 }
    432 
    433 /*----------------------------------------------------------------------------
    434 | Normalizes the subnormal double-precision floating-point value represented
    435 | by the denormalized significand `aSig'.  The normalized exponent and
    436 | significand are stored at the locations pointed to by `zExpPtr' and
    437 | `zSigPtr', respectively.
    438 *----------------------------------------------------------------------------*/
    439 
    440 static void
    441  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
    442 {
    443     int8 shiftCount;
    444 
    445     shiftCount = countLeadingZeros64( aSig ) - 11;
    446     *zSigPtr = aSig<<shiftCount;
    447     *zExpPtr = 1 - shiftCount;
    448 
    449 }
    450 
    451 /*----------------------------------------------------------------------------
    452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    453 | double-precision floating-point value, returning the result.  After being
    454 | shifted into the proper positions, the three fields are simply added
    455 | together to form the result.  This means that any integer portion of `zSig'
    456 | will be added into the exponent.  Since a properly normalized significand
    457 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    458 | than the desired result exponent whenever `zSig' is a complete, normalized
    459 | significand.
    460 *----------------------------------------------------------------------------*/
    461 
    462 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
    463 {
    464 
    465     return make_float64(
    466         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
    467 
    468 }
    469 
    470 /*----------------------------------------------------------------------------
    471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    472 | and significand `zSig', and returns the proper double-precision floating-
    473 | point value corresponding to the abstract input.  Ordinarily, the abstract
    474 | value is simply rounded and packed into the double-precision format, with
    475 | the inexact exception raised if the abstract input cannot be represented
    476 | exactly.  However, if the abstract value is too large, the overflow and
    477 | inexact exceptions are raised and an infinity or maximal finite value is
    478 | returned.  If the abstract value is too small, the input value is rounded
    479 | to a subnormal number, and the underflow and inexact exceptions are raised
    480 | if the abstract input cannot be represented exactly as a subnormal double-
    481 | precision floating-point number.
    482 |     The input significand `zSig' has its binary point between bits 62
    483 | and 61, which is 10 bits to the left of the usual location.  This shifted
    484 | significand must be normalized or smaller.  If `zSig' is not normalized,
    485 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    486 | and it must not require rounding.  In the usual case that `zSig' is
    487 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    488 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    489 | Binary Floating-Point Arithmetic.
    490 *----------------------------------------------------------------------------*/
    491 
    492 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
    493 {
    494     int8 roundingMode;
    495     flag roundNearestEven;
    496     int16 roundIncrement, roundBits;
    497     flag isTiny;
    498 
    499     roundingMode = STATUS(float_rounding_mode);
    500     roundNearestEven = ( roundingMode == float_round_nearest_even );
    501     roundIncrement = 0x200;
    502     if ( ! roundNearestEven ) {
    503         if ( roundingMode == float_round_to_zero ) {
    504             roundIncrement = 0;
    505         }
    506         else {
    507             roundIncrement = 0x3FF;
    508             if ( zSign ) {
    509                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    510             }
    511             else {
    512                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    513             }
    514         }
    515     }
    516     roundBits = zSig & 0x3FF;
    517     if ( 0x7FD <= (uint16_t) zExp ) {
    518         if (    ( 0x7FD < zExp )
    519              || (    ( zExp == 0x7FD )
    520                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
    521            ) {
    522             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    523             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
    524         }
    525         if ( zExp < 0 ) {
    526             if (STATUS(flush_to_zero)) {
    527                 float_raise(float_flag_output_denormal STATUS_VAR);
    528                 return packFloat64(zSign, 0, 0);
    529             }
    530             isTiny =
    531                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    532                 || ( zExp < -1 )
    533                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    534             shift64RightJamming( zSig, - zExp, &zSig );
    535             zExp = 0;
    536             roundBits = zSig & 0x3FF;
    537             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    538         }
    539     }
    540     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    541     zSig = ( zSig + roundIncrement )>>10;
    542     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    543     if ( zSig == 0 ) zExp = 0;
    544     return packFloat64( zSign, zExp, zSig );
    545 
    546 }
    547 
    548 /*----------------------------------------------------------------------------
    549 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    550 | and significand `zSig', and returns the proper double-precision floating-
    551 | point value corresponding to the abstract input.  This routine is just like
    552 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    553 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    554 | floating-point exponent.
    555 *----------------------------------------------------------------------------*/
    556 
    557 static float64
    558  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
    559 {
    560     int8 shiftCount;
    561 
    562     shiftCount = countLeadingZeros64( zSig ) - 1;
    563     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    564 
    565 }
    566 
    567 #ifdef FLOATX80
    568 
    569 /*----------------------------------------------------------------------------
    570 | Returns the fraction bits of the extended double-precision floating-point
    571 | value `a'.
    572 *----------------------------------------------------------------------------*/
    573 
    574 INLINE uint64_t extractFloatx80Frac( floatx80 a )
    575 {
    576 
    577     return a.low;
    578 
    579 }
    580 
    581 /*----------------------------------------------------------------------------
    582 | Returns the exponent bits of the extended double-precision floating-point
    583 | value `a'.
    584 *----------------------------------------------------------------------------*/
    585 
    586 INLINE int32 extractFloatx80Exp( floatx80 a )
    587 {
    588 
    589     return a.high & 0x7FFF;
    590 
    591 }
    592 
    593 /*----------------------------------------------------------------------------
    594 | Returns the sign bit of the extended double-precision floating-point value
    595 | `a'.
    596 *----------------------------------------------------------------------------*/
    597 
    598 INLINE flag extractFloatx80Sign( floatx80 a )
    599 {
    600 
    601     return a.high>>15;
    602 
    603 }
    604 
    605 /*----------------------------------------------------------------------------
    606 | Normalizes the subnormal extended double-precision floating-point value
    607 | represented by the denormalized significand `aSig'.  The normalized exponent
    608 | and significand are stored at the locations pointed to by `zExpPtr' and
    609 | `zSigPtr', respectively.
    610 *----------------------------------------------------------------------------*/
    611 
    612 static void
    613  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
    614 {
    615     int8 shiftCount;
    616 
    617     shiftCount = countLeadingZeros64( aSig );
    618     *zSigPtr = aSig<<shiftCount;
    619     *zExpPtr = 1 - shiftCount;
    620 
    621 }
    622 
    623 /*----------------------------------------------------------------------------
    624 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    625 | extended double-precision floating-point value, returning the result.
    626 *----------------------------------------------------------------------------*/
    627 
    628 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
    629 {
    630     floatx80 z;
    631 
    632     z.low = zSig;
    633     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
    634     return z;
    635 
    636 }
    637 
    638 /*----------------------------------------------------------------------------
    639 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    640 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
    641 | and returns the proper extended double-precision floating-point value
    642 | corresponding to the abstract input.  Ordinarily, the abstract value is
    643 | rounded and packed into the extended double-precision format, with the
    644 | inexact exception raised if the abstract input cannot be represented
    645 | exactly.  However, if the abstract value is too large, the overflow and
    646 | inexact exceptions are raised and an infinity or maximal finite value is
    647 | returned.  If the abstract value is too small, the input value is rounded to
    648 | a subnormal number, and the underflow and inexact exceptions are raised if
    649 | the abstract input cannot be represented exactly as a subnormal extended
    650 | double-precision floating-point number.
    651 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    652 | number of bits as single or double precision, respectively.  Otherwise, the
    653 | result is rounded to the full precision of the extended double-precision
    654 | format.
    655 |     The input significand must be normalized or smaller.  If the input
    656 | significand is not normalized, `zExp' must be 0; in that case, the result
    657 | returned is a subnormal number, and it must not require rounding.  The
    658 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    659 | Floating-Point Arithmetic.
    660 *----------------------------------------------------------------------------*/
    661 
    662 static floatx80
    663  roundAndPackFloatx80(
    664      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
    665  STATUS_PARAM)
    666 {
    667     int8 roundingMode;
    668     flag roundNearestEven, increment, isTiny;
    669     int64 roundIncrement, roundMask, roundBits;
    670 
    671     roundingMode = STATUS(float_rounding_mode);
    672     roundNearestEven = ( roundingMode == float_round_nearest_even );
            /* Only 64 and 32 select the reduced-precision path below; 80 and any
               other value jump to the full-precision code at `precision80'. */
    673     if ( roundingPrecision == 80 ) goto precision80;
    674     if ( roundingPrecision == 64 ) {
                /* roundMask covers the low 11 bits of zSig0 that are discarded;
                   roundIncrement is half of the lowest bit that is kept. */
    675         roundIncrement = LIT64( 0x0000000000000400 );
    676         roundMask = LIT64( 0x00000000000007FF );
    677     }
    678     else if ( roundingPrecision == 32 ) {
                /* Same scheme with the low 40 bits of zSig0 discarded. */
    679         roundIncrement = LIT64( 0x0000008000000000 );
    680         roundMask = LIT64( 0x000000FFFFFFFFFF );
    681     }
    682     else {
    683         goto precision80;
    684     }
            /* Fold any nonzero bits of zSig1 into bit 0 of zSig0 so that they are
               still seen as discarded bits by the masking below. */
    685     zSig0 |= ( zSig1 != 0 );
            /* Directed modes replace the half-unit increment: either nothing is
               added, or the whole mask is added so that any nonzero discarded bits
               carry into the retained part. */
    686     if ( ! roundNearestEven ) {
    687         if ( roundingMode == float_round_to_zero ) {
    688             roundIncrement = 0;
    689         }
    690         else {
    691             roundIncrement = roundMask;
    692             if ( zSign ) {
    693                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    694             }
    695             else {
    696                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    697             }
    698         }
    699     }
    700     roundBits = zSig0 & roundMask;
            /* The unsigned comparison is true exactly when zExp <= 0 or
               zExp >= 0x7FFE, i.e. for the possible underflow/overflow cases. */
    701     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
    702         if (    ( 0x7FFE < zExp )
    703              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    704            ) {
                    /* Shared overflow handling lives inside the precision80 code;
                       it uses the roundMask chosen above. */
    705             goto overflow;
    706         }
    707         if ( zExp <= 0 ) {
    708             if (STATUS(flush_to_zero)) {
    709                 float_raise(float_flag_output_denormal STATUS_VAR);
    710                 return packFloatx80(zSign, 0, 0);
    711             }
    712             isTiny =
    713                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    714                 || ( zExp < 0 )
    715                 || ( zSig0 <= zSig0 + roundIncrement );
    716             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    717             zExp = 0;
    718             roundBits = zSig0 & roundMask;
    719             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    720             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    721             zSig0 += roundIncrement;
                    /* If rounding set bit 63, the result is packed with exponent 1
                       instead of 0. */
    722             if ( (int64_t) zSig0 < 0 ) zExp = 1;
                    /* On an exact tie under nearest-even, widen the mask by one bit
                       so the lowest retained bit is cleared as well. */
    723             roundIncrement = roundMask + 1;
    724             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    725                 roundMask |= roundIncrement;
    726             }
    727             zSig0 &= ~ roundMask;
    728             return packFloatx80( zSign, zExp, zSig0 );
    729         }
    730     }
    731     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    732     zSig0 += roundIncrement;
            /* A sum smaller than the addend means the addition wrapped past bit 63:
               bump the exponent and set the significand to just its top bit. */
    733     if ( zSig0 < roundIncrement ) {
    734         ++zExp;
    735         zSig0 = LIT64( 0x8000000000000000 );
    736     }
            /* Same tie handling as in the subnormal branch above. */
    737     roundIncrement = roundMask + 1;
    738     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    739         roundMask |= roundIncrement;
    740     }
    741     zSig0 &= ~ roundMask;
    742     if ( zSig0 == 0 ) zExp = 0;
    743     return packFloatx80( zSign, zExp, zSig0 );
    744  precision80:
            /* Full precision: zSig1 holds the bits below the result.  The default
               (nearest-even) rounds up when the top bit of zSig1 is set. */
    745     increment = ( (int64_t) zSig1 < 0 );
    746     if ( ! roundNearestEven ) {
    747         if ( roundingMode == float_round_to_zero ) {
    748             increment = 0;
    749         }
    750         else {
    751             if ( zSign ) {
    752                 increment = ( roundingMode == float_round_down ) && zSig1;
    753             }
    754             else {
    755                 increment = ( roundingMode == float_round_up ) && zSig1;
    756             }
    757         }
    758     }
            /* True exactly when zExp <= 0 or zExp >= 0x7FFE (see above). */
    759     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
    760         if (    ( 0x7FFE < zExp )
    761              || (    ( zExp == 0x7FFE )
    762                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    763                   && increment
    764                 )
    765            ) {
                    /* With roundMask cleared, ~roundMask below is all ones. */
    766             roundMask = 0;
    767  overflow:
                    /* Also entered via `goto overflow' from the reduced-precision
                       path, in which case ~roundMask keeps only the retained bits. */
    768             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    769             if (    ( roundingMode == float_round_to_zero )
    770                  || ( zSign && ( roundingMode == float_round_up ) )
    771                  || ( ! zSign && ( roundingMode == float_round_down ) )
    772                ) {
    773                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    774             }
    775             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    776         }
    777         if ( zExp <= 0 ) {
    778             isTiny =
    779                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    780                 || ( zExp < 0 )
    781                 || ! increment
    782                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    783             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    784             zExp = 0;
    785             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
    786             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
                    /* The rounding decision is recomputed from the shifted zSig1. */
    787             if ( roundNearestEven ) {
    788                 increment = ( (int64_t) zSig1 < 0 );
    789             }
    790             else {
    791                 if ( zSign ) {
    792                     increment = ( roundingMode == float_round_down ) && zSig1;
    793                 }
    794                 else {
    795                     increment = ( roundingMode == float_round_up ) && zSig1;
    796                 }
    797             }
    798             if ( increment ) {
    799                 ++zSig0;
                        /* Under nearest-even, clear bit 0 when zSig1 has no bits set
                           below its top bit (an exact tie). */
    800                 zSig0 &=
    801                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    802                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
    803             }
    804             return packFloatx80( zSign, zExp, zSig0 );
    805         }
    806     }
    807     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    808     if ( increment ) {
    809         ++zSig0;
                /* Wrap to zero means a carry out of bit 63: bump the exponent and
                   set the significand to just its top bit. */
    810         if ( zSig0 == 0 ) {
    811             ++zExp;
    812             zSig0 = LIT64( 0x8000000000000000 );
    813         }
    814         else {
                    /* Exact tie under nearest-even: force bit 0 to zero. */
    815             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    816         }
    817     }
    818     else {
    819         if ( zSig0 == 0 ) zExp = 0;
    820     }
    821     return packFloatx80( zSign, zExp, zSig0 );
    822 
    823 }
    824 
    825 /*----------------------------------------------------------------------------
    826 | Takes an abstract floating-point value having sign `zSign', exponent
    827 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    828 | and returns the proper extended double-precision floating-point value
    829 | corresponding to the abstract input.  This routine is just like
    830 | `roundAndPackFloatx80' except that the input significand does not have to be
    831 | normalized.
    832 *----------------------------------------------------------------------------*/
    833 
    834 static floatx80
    835  normalizeRoundAndPackFloatx80(
    836      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
    837  STATUS_PARAM)
    838 {
    839     int8 shiftCount;
    840 
    841     if ( zSig0 == 0 ) {
    842         zSig0 = zSig1;
    843         zSig1 = 0;
    844         zExp -= 64;
    845     }
    846     shiftCount = countLeadingZeros64( zSig0 );
    847     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    848     zExp -= shiftCount;
    849     return
    850         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
    851 
    852 }
    853 
    854 #endif
    855 
    856 #ifdef FLOAT128
    857 
    858 /*----------------------------------------------------------------------------
    859 | Returns the least-significant 64 fraction bits of the quadruple-precision
    860 | floating-point value `a'.
    861 *----------------------------------------------------------------------------*/
    862 
    863 INLINE uint64_t extractFloat128Frac1( float128 a )
    864 {
    865 
    866     return a.low;
    867 
    868 }
    869 
    870 /*----------------------------------------------------------------------------
    871 | Returns the most-significant 48 fraction bits of the quadruple-precision
    872 | floating-point value `a'.
    873 *----------------------------------------------------------------------------*/
    874 
    875 INLINE uint64_t extractFloat128Frac0( float128 a )
    876 {
    877 
    878     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    879 
    880 }
    881 
    882 /*----------------------------------------------------------------------------
    883 | Returns the exponent bits of the quadruple-precision floating-point value
    884 | `a'.
    885 *----------------------------------------------------------------------------*/
    886 
    887 INLINE int32 extractFloat128Exp( float128 a )
    888 {
    889 
    890     return ( a.high>>48 ) & 0x7FFF;
    891 
    892 }
    893 
    894 /*----------------------------------------------------------------------------
    895 | Returns the sign bit of the quadruple-precision floating-point value `a'.
    896 *----------------------------------------------------------------------------*/
    897 
    898 INLINE flag extractFloat128Sign( float128 a )
    899 {
    900 
    901     return a.high>>63;
    902 
    903 }
    904 
    905 /*----------------------------------------------------------------------------
    906 | Normalizes the subnormal quadruple-precision floating-point value
    907 | represented by the denormalized significand formed by the concatenation of
    908 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    909 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    910 | significand are stored at the location pointed to by `zSig0Ptr', and the
    911 | least significant 64 bits of the normalized significand are stored at the
    912 | location pointed to by `zSig1Ptr'.
    913 *----------------------------------------------------------------------------*/
    914 
    915 static void
    916  normalizeFloat128Subnormal(
    917      uint64_t aSig0,
    918      uint64_t aSig1,
    919      int32 *zExpPtr,
    920      uint64_t *zSig0Ptr,
    921      uint64_t *zSig1Ptr
    922  )
    923 {
    924     int8 shiftCount;
    925 
    926     if ( aSig0 == 0 ) {
    927         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    928         if ( shiftCount < 0 ) {
    929             *zSig0Ptr = aSig1>>( - shiftCount );
    930             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    931         }
    932         else {
    933             *zSig0Ptr = aSig1<<shiftCount;
    934             *zSig1Ptr = 0;
    935         }
    936         *zExpPtr = - shiftCount - 63;
    937     }
    938     else {
    939         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    940         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    941         *zExpPtr = 1 - shiftCount;
    942     }
    943 
    944 }
    945 
    946 /*----------------------------------------------------------------------------
    947 | Packs the sign `zSign', the exponent `zExp', and the significand formed
    948 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    949 | floating-point value, returning the result.  After being shifted into the
    950 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    951 | added together to form the most significant 32 bits of the result.  This
    952 | means that any integer portion of `zSig0' will be added into the exponent.
    953 | Since a properly normalized significand will have an integer portion equal
    954 | to 1, the `zExp' input should be 1 less than the desired result exponent
    955 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    956 | significand.
    957 *----------------------------------------------------------------------------*/
    958 
    959 INLINE float128
    960  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
    961 {
    962     float128 z;
    963 
    964     z.low = zSig1;
    965     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
    966     return z;
    967 
    968 }
    969 
    970 /*----------------------------------------------------------------------------
    971 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    972 | and extended significand formed by the concatenation of `zSig0', `zSig1',
    973 | and `zSig2', and returns the proper quadruple-precision floating-point value
    974 | corresponding to the abstract input.  Ordinarily, the abstract value is
    975 | simply rounded and packed into the quadruple-precision format, with the
    976 | inexact exception raised if the abstract input cannot be represented
    977 | exactly.  However, if the abstract value is too large, the overflow and
    978 | inexact exceptions are raised and an infinity or maximal finite value is
    979 | returned.  If the abstract value is too small, the input value is rounded to
    980 | a subnormal number, and the underflow and inexact exceptions are raised if
    981 | the abstract input cannot be represented exactly as a subnormal quadruple-
    982 | precision floating-point number.
    983 |     The input significand must be normalized or smaller.  If the input
    984 | significand is not normalized, `zExp' must be 0; in that case, the result
    985 | returned is a subnormal number, and it must not require rounding.  In the
    986 | usual case that the input significand is normalized, `zExp' must be 1 less
    987 | than the ``true'' floating-point exponent.  The handling of underflow and
    988 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    989 *----------------------------------------------------------------------------*/
    990 
    991 static float128
    992  roundAndPackFloat128(
    993      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
    994 {
    995     int8 roundingMode;
    996     flag roundNearestEven, increment, isTiny;
    997 
    998     roundingMode = STATUS(float_rounding_mode);
    999     roundNearestEven = ( roundingMode == float_round_nearest_even );
            /* zSig2 holds the bits below the result.  The default (nearest-even)
               rounds up when its top bit is set; directed modes override this. */
   1000     increment = ( (int64_t) zSig2 < 0 );
   1001     if ( ! roundNearestEven ) {
   1002         if ( roundingMode == float_round_to_zero ) {
   1003             increment = 0;
   1004         }
   1005         else {
   1006             if ( zSign ) {
   1007                 increment = ( roundingMode == float_round_down ) && zSig2;
   1008             }
   1009             else {
   1010                 increment = ( roundingMode == float_round_up ) && zSig2;
   1011             }
   1012         }
   1013     }
            /* The unsigned comparison is true exactly when zExp < 0 or
               zExp >= 0x7FFD, i.e. for the possible underflow/overflow cases. */
   1014     if ( 0x7FFD <= (uint32_t) zExp ) {
   1015         if (    ( 0x7FFD < zExp )
   1016              || (    ( zExp == 0x7FFD )
   1017                   && eq128(
   1018                          LIT64( 0x0001FFFFFFFFFFFF ),
   1019                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1020                          zSig0,
   1021                          zSig1
   1022                      )
   1023                   && increment
   1024                 )
   1025            ) {
                    /* Overflow: modes that round toward zero for this sign get the
                       all-ones significand with exponent field 0x7FFE; every other
                       mode gets exponent field 0x7FFF with a zero significand. */
   1026             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
   1027             if (    ( roundingMode == float_round_to_zero )
   1028                  || ( zSign && ( roundingMode == float_round_up ) )
   1029                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1030                ) {
   1031                 return
   1032                     packFloat128(
   1033                         zSign,
   1034                         0x7FFE,
   1035                         LIT64( 0x0000FFFFFFFFFFFF ),
   1036                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1037                     );
   1038             }
   1039             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1040         }
   1041         if ( zExp < 0 ) {
   1042             if (STATUS(flush_to_zero)) {
   1043                 float_raise(float_flag_output_denormal STATUS_VAR);
   1044                 return packFloat128(zSign, 0, 0, 0);
   1045             }
   1046             isTiny =
   1047                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
   1048                 || ( zExp < -1 )
   1049                 || ! increment
   1050                 || lt128(
   1051                        zSig0,
   1052                        zSig1,
   1053                        LIT64( 0x0001FFFFFFFFFFFF ),
   1054                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1055                    );
                    /* Shift the significand right by -zExp, then recompute the
                       rounding decision from the shifted zSig2.  Control falls
                       through to the common rounding code below. */
   1056             shift128ExtraRightJamming(
   1057                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1058             zExp = 0;
   1059             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
   1060             if ( roundNearestEven ) {
   1061                 increment = ( (int64_t) zSig2 < 0 );
   1062             }
   1063             else {
   1064                 if ( zSign ) {
   1065                     increment = ( roundingMode == float_round_down ) && zSig2;
   1066                 }
   1067                 else {
   1068                     increment = ( roundingMode == float_round_up ) && zSig2;
   1069                 }
   1070             }
   1071         }
   1072     }
   1073     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
   1074     if ( increment ) {
   1075         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
                /* Under nearest-even, clear bit 0 of zSig1 when zSig2 has no bits
                   set below its top bit (an exact tie). */
   1076         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1077     }
   1078     else {
   1079         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1080     }
   1081     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1082 
   1083 }
   1084 
   1085 /*----------------------------------------------------------------------------
   1086 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1087 | and significand formed by the concatenation of `zSig0' and `zSig1', and
   1088 | returns the proper quadruple-precision floating-point value corresponding
   1089 | to the abstract input.  This routine is just like `roundAndPackFloat128'
   1090 | except that the input significand has fewer bits and does not have to be
   1091 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1092 | point exponent.
   1093 *----------------------------------------------------------------------------*/
   1094 
   1095 static float128
   1096  normalizeRoundAndPackFloat128(
   1097      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
   1098 {
   1099     int8 shiftCount;
   1100     uint64_t zSig2;
   1101 
   1102     if ( zSig0 == 0 ) {
   1103         zSig0 = zSig1;
   1104         zSig1 = 0;
   1105         zExp -= 64;
   1106     }
   1107     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1108     if ( 0 <= shiftCount ) {
   1109         zSig2 = 0;
   1110         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1111     }
   1112     else {
   1113         shift128ExtraRightJamming(
   1114             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1115     }
   1116     zExp -= shiftCount;
   1117     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
   1118 
   1119 }
   1120 
   1121 #endif
   1122 
   1123 /*----------------------------------------------------------------------------
   1124 | Returns the result of converting the 32-bit two's complement integer `a'
   1125 | to the single-precision floating-point format.  The conversion is performed
   1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1127 *----------------------------------------------------------------------------*/
   1128 
   1129 float32 int32_to_float32( int32 a STATUS_PARAM )
   1130 {
   1131     flag zSign;
   1132 
   1133     if ( a == 0 ) return float32_zero;
   1134     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1135     zSign = ( a < 0 );
   1136     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
   1137 
   1138 }
   1139 
   1140 /*----------------------------------------------------------------------------
   1141 | Returns the result of converting the 32-bit two's complement integer `a'
   1142 | to the double-precision floating-point format.  The conversion is performed
   1143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1144 *----------------------------------------------------------------------------*/
   1145 
   1146 float64 int32_to_float64( int32 a STATUS_PARAM )
   1147 {
   1148     flag zSign;
   1149     uint32 absA;
   1150     int8 shiftCount;
   1151     uint64_t zSig;
   1152 
   1153     if ( a == 0 ) return float64_zero;
   1154     zSign = ( a < 0 );
   1155     absA = zSign ? - a : a;
   1156     shiftCount = countLeadingZeros32( absA ) + 21;
   1157     zSig = absA;
   1158     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1159 
   1160 }
   1161 
   1162 #ifdef FLOATX80
   1163 
   1164 /*----------------------------------------------------------------------------
   1165 | Returns the result of converting the 32-bit two's complement integer `a'
   1166 | to the extended double-precision floating-point format.  The conversion
   1167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1168 | Arithmetic.
   1169 *----------------------------------------------------------------------------*/
   1170 
   1171 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
   1172 {
   1173     flag zSign;
   1174     uint32 absA;
   1175     int8 shiftCount;
   1176     uint64_t zSig;
   1177 
   1178     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1179     zSign = ( a < 0 );
   1180     absA = zSign ? - a : a;
   1181     shiftCount = countLeadingZeros32( absA ) + 32;
   1182     zSig = absA;
   1183     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1184 
   1185 }
   1186 
   1187 #endif
   1188 
   1189 #ifdef FLOAT128
   1190 
   1191 /*----------------------------------------------------------------------------
   1192 | Returns the result of converting the 32-bit two's complement integer `a' to
   1193 | the quadruple-precision floating-point format.  The conversion is performed
   1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1195 *----------------------------------------------------------------------------*/
   1196 
   1197 float128 int32_to_float128( int32 a STATUS_PARAM )
   1198 {
   1199     flag zSign;
   1200     uint32 absA;
   1201     int8 shiftCount;
   1202     uint64_t zSig0;
   1203 
   1204     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1205     zSign = ( a < 0 );
   1206     absA = zSign ? - a : a;
   1207     shiftCount = countLeadingZeros32( absA ) + 17;
   1208     zSig0 = absA;
   1209     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1210 
   1211 }
   1212 
   1213 #endif
   1214 
   1215 /*----------------------------------------------------------------------------
   1216 | Returns the result of converting the 64-bit two's complement integer `a'
   1217 | to the single-precision floating-point format.  The conversion is performed
   1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1219 *----------------------------------------------------------------------------*/
   1220 
   1221 float32 int64_to_float32( int64 a STATUS_PARAM )
   1222 {
   1223     flag zSign;
   1224     uint64 absA;
   1225     int8 shiftCount;
   1226 
   1227     if ( a == 0 ) return float32_zero;
   1228     zSign = ( a < 0 );
   1229     absA = zSign ? - a : a;
   1230     shiftCount = countLeadingZeros64( absA ) - 40;
   1231     if ( 0 <= shiftCount ) {
   1232         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1233     }
   1234     else {
   1235         shiftCount += 7;
   1236         if ( shiftCount < 0 ) {
   1237             shift64RightJamming( absA, - shiftCount, &absA );
   1238         }
   1239         else {
   1240             absA <<= shiftCount;
   1241         }
   1242         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
   1243     }
   1244 
   1245 }
   1246 
   1247 float32 uint64_to_float32( uint64 a STATUS_PARAM )
   1248 {
   1249     int8 shiftCount;
   1250 
   1251     if ( a == 0 ) return float32_zero;
   1252     shiftCount = countLeadingZeros64( a ) - 40;
   1253     if ( 0 <= shiftCount ) {
   1254         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
   1255     }
   1256     else {
   1257         shiftCount += 7;
   1258         if ( shiftCount < 0 ) {
   1259             shift64RightJamming( a, - shiftCount, &a );
   1260         }
   1261         else {
   1262             a <<= shiftCount;
   1263         }
   1264         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
   1265     }
   1266 }
   1267 
   1268 /*----------------------------------------------------------------------------
   1269 | Returns the result of converting the 64-bit two's complement integer `a'
   1270 | to the double-precision floating-point format.  The conversion is performed
   1271 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1272 *----------------------------------------------------------------------------*/
   1273 
   1274 float64 int64_to_float64( int64 a STATUS_PARAM )
   1275 {
   1276     flag zSign;
   1277 
   1278     if ( a == 0 ) return float64_zero;
   1279     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
   1280         return packFloat64( 1, 0x43E, 0 );
   1281     }
   1282     zSign = ( a < 0 );
   1283     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
   1284 
   1285 }
   1286 
   1287 float64 uint64_to_float64( uint64 a STATUS_PARAM )
   1288 {
   1289     if ( a == 0 ) return float64_zero;
   1290     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
   1291 
   1292 }
   1293 
   1294 #ifdef FLOATX80
   1295 
   1296 /*----------------------------------------------------------------------------
   1297 | Returns the result of converting the 64-bit two's complement integer `a'
   1298 | to the extended double-precision floating-point format.  The conversion
   1299 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1300 | Arithmetic.
   1301 *----------------------------------------------------------------------------*/
   1302 
   1303 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
   1304 {
   1305     flag zSign;
   1306     uint64 absA;
   1307     int8 shiftCount;
   1308 
   1309     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1310     zSign = ( a < 0 );
   1311     absA = zSign ? - a : a;
   1312     shiftCount = countLeadingZeros64( absA );
   1313     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1314 
   1315 }
   1316 
   1317 #endif
   1318 
   1319 #ifdef FLOAT128
   1320 
   1321 /*----------------------------------------------------------------------------
   1322 | Returns the result of converting the 64-bit two's complement integer `a' to
   1323 | the quadruple-precision floating-point format.  The conversion is performed
   1324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1325 *----------------------------------------------------------------------------*/
   1326 
   1327 float128 int64_to_float128( int64 a STATUS_PARAM )
   1328 {
   1329     flag zSign;
   1330     uint64 absA;
   1331     int8 shiftCount;
   1332     int32 zExp;
   1333     uint64_t zSig0, zSig1;
   1334 
   1335     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1336     zSign = ( a < 0 );
   1337     absA = zSign ? - a : a;
   1338     shiftCount = countLeadingZeros64( absA ) + 49;
   1339     zExp = 0x406E - shiftCount;
   1340     if ( 64 <= shiftCount ) {
   1341         zSig1 = 0;
   1342         zSig0 = absA;
   1343         shiftCount -= 64;
   1344     }
   1345     else {
   1346         zSig1 = absA;
   1347         zSig0 = 0;
   1348     }
   1349     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1350     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1351 
   1352 }
   1353 
   1354 #endif
   1355 
   1356 /*----------------------------------------------------------------------------
   1357 | Returns the result of converting the single-precision floating-point value
   1358 | `a' to the 32-bit two's complement integer format.  The conversion is
   1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1360 | Arithmetic---which means in particular that the conversion is rounded
   1361 | according to the current rounding mode.  If `a' is a NaN, the largest
   1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1363 | largest integer with the same sign as `a' is returned.
   1364 *----------------------------------------------------------------------------*/
   1365 
   1366 int32 float32_to_int32( float32 a STATUS_PARAM )
   1367 {
   1368     flag aSign;
   1369     int16 aExp, shiftCount;
   1370     uint32_t aSig;
   1371     uint64_t aSig64;
   1372 
   1373     a = float32_squash_input_denormal(a STATUS_VAR);
   1374     aSig = extractFloat32Frac( a );
   1375     aExp = extractFloat32Exp( a );
   1376     aSign = extractFloat32Sign( a );
   1377     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1378     if ( aExp ) aSig |= 0x00800000;
   1379     shiftCount = 0xAF - aExp;
   1380     aSig64 = aSig;
   1381     aSig64 <<= 32;
   1382     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1383     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
   1384 
   1385 }
   1386 
   1387 /*----------------------------------------------------------------------------
   1388 | Returns the result of converting the single-precision floating-point value
   1389 | `a' to the 32-bit two's complement integer format.  The conversion is
   1390 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1391 | Arithmetic, except that the conversion is always rounded toward zero.
   1392 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1393 | the conversion overflows, the largest integer with the same sign as `a' is
   1394 | returned.
   1395 *----------------------------------------------------------------------------*/
   1396 
   1397 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
   1398 {
   1399     flag aSign;
   1400     int16 aExp, shiftCount;
   1401     uint32_t aSig;
   1402     int32 z;
   1403     a = float32_squash_input_denormal(a STATUS_VAR);
   1404 
   1405     aSig = extractFloat32Frac( a );
   1406     aExp = extractFloat32Exp( a );
   1407     aSign = extractFloat32Sign( a );
   1408     shiftCount = aExp - 0x9E;
   1409     if ( 0 <= shiftCount ) {
   1410         if ( float32_val(a) != 0xCF000000 ) {
   1411             float_raise( float_flag_invalid STATUS_VAR);
   1412             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1413         }
   1414         return (int32_t) 0x80000000;
   1415     }
   1416     else if ( aExp <= 0x7E ) {
   1417         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1418         return 0;
   1419     }
   1420     aSig = ( aSig | 0x00800000 )<<8;
   1421     z = aSig>>( - shiftCount );
   1422     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
   1423         STATUS(float_exception_flags) |= float_flag_inexact;
   1424     }
   1425     if ( aSign ) z = - z;
   1426     return z;
   1427 
   1428 }
   1429 
   1430 /*----------------------------------------------------------------------------
   1431 | Returns the result of converting the single-precision floating-point value
   1432 | `a' to the 16-bit two's complement integer format.  The conversion is
   1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1434 | Arithmetic, except that the conversion is always rounded toward zero.
   1435 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1436 | the conversion overflows, the largest integer with the same sign as `a' is
   1437 | returned.
   1438 *----------------------------------------------------------------------------*/
   1439 
   1440 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
   1441 {
   1442     flag aSign;
   1443     int16 aExp, shiftCount;
   1444     uint32_t aSig;
   1445     int32 z;
   1446 
   1447     aSig = extractFloat32Frac( a );
   1448     aExp = extractFloat32Exp( a );
   1449     aSign = extractFloat32Sign( a );
   1450     shiftCount = aExp - 0x8E;
   1451     if ( 0 <= shiftCount ) {
   1452         if ( float32_val(a) != 0xC7000000 ) {
   1453             float_raise( float_flag_invalid STATUS_VAR);
   1454             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1455                 return 0x7FFF;
   1456             }
   1457         }
   1458         return (int32_t) 0xffff8000;
   1459     }
   1460     else if ( aExp <= 0x7E ) {
   1461         if ( aExp | aSig ) {
   1462             STATUS(float_exception_flags) |= float_flag_inexact;
   1463         }
   1464         return 0;
   1465     }
   1466     shiftCount -= 0x10;
   1467     aSig = ( aSig | 0x00800000 )<<8;
   1468     z = aSig>>( - shiftCount );
   1469     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
   1470         STATUS(float_exception_flags) |= float_flag_inexact;
   1471     }
   1472     if ( aSign ) {
   1473         z = - z;
   1474     }
   1475     return z;
   1476 
   1477 }
   1478 
   1479 /*----------------------------------------------------------------------------
   1480 | Returns the result of converting the single-precision floating-point value
   1481 | `a' to the 64-bit two's complement integer format.  The conversion is
   1482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1483 | Arithmetic---which means in particular that the conversion is rounded
   1484 | according to the current rounding mode.  If `a' is a NaN, the largest
   1485 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1486 | largest integer with the same sign as `a' is returned.
   1487 *----------------------------------------------------------------------------*/
   1488 
   1489 int64 float32_to_int64( float32 a STATUS_PARAM )
   1490 {
   1491     flag aSign;
   1492     int16 aExp, shiftCount;
   1493     uint32_t aSig;
   1494     uint64_t aSig64, aSigExtra;
   1495     a = float32_squash_input_denormal(a STATUS_VAR);
   1496 
   1497     aSig = extractFloat32Frac( a );
   1498     aExp = extractFloat32Exp( a );
   1499     aSign = extractFloat32Sign( a );
   1500     shiftCount = 0xBE - aExp;
   1501     if ( shiftCount < 0 ) {
   1502         float_raise( float_flag_invalid STATUS_VAR);
   1503         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1504             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1505         }
   1506         return (int64_t) LIT64( 0x8000000000000000 );
   1507     }
   1508     if ( aExp ) aSig |= 0x00800000;
   1509     aSig64 = aSig;
   1510     aSig64 <<= 40;
   1511     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1512     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
   1513 
   1514 }
   1515 
   1516 /*----------------------------------------------------------------------------
   1517 | Returns the result of converting the single-precision floating-point value
   1518 | `a' to the 64-bit two's complement integer format.  The conversion is
   1519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1520 | Arithmetic, except that the conversion is always rounded toward zero.  If
   1521 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1522 | conversion overflows, the largest integer with the same sign as `a' is
   1523 | returned.
   1524 *----------------------------------------------------------------------------*/
   1525 
   1526 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
   1527 {
   1528     flag aSign;
   1529     int16 aExp, shiftCount;
   1530     uint32_t aSig;
   1531     uint64_t aSig64;
   1532     int64 z;
   1533     a = float32_squash_input_denormal(a STATUS_VAR);
   1534 
   1535     aSig = extractFloat32Frac( a );
   1536     aExp = extractFloat32Exp( a );
   1537     aSign = extractFloat32Sign( a );
   1538     shiftCount = aExp - 0xBE;
   1539     if ( 0 <= shiftCount ) {
   1540         if ( float32_val(a) != 0xDF000000 ) {
   1541             float_raise( float_flag_invalid STATUS_VAR);
   1542             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1543                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1544             }
   1545         }
   1546         return (int64_t) LIT64( 0x8000000000000000 );
   1547     }
   1548     else if ( aExp <= 0x7E ) {
   1549         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1550         return 0;
   1551     }
   1552     aSig64 = aSig | 0x00800000;
   1553     aSig64 <<= 40;
   1554     z = aSig64>>( - shiftCount );
   1555     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
   1556         STATUS(float_exception_flags) |= float_flag_inexact;
   1557     }
   1558     if ( aSign ) z = - z;
   1559     return z;
   1560 
   1561 }
   1562 
   1563 /*----------------------------------------------------------------------------
   1564 | Returns the result of converting the single-precision floating-point value
   1565 | `a' to the double-precision floating-point format.  The conversion is
   1566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1567 | Arithmetic.
   1568 *----------------------------------------------------------------------------*/
   1569 
   1570 float64 float32_to_float64( float32 a STATUS_PARAM )
   1571 {
   1572     flag aSign;
   1573     int16 aExp;
   1574     uint32_t aSig;
   1575     a = float32_squash_input_denormal(a STATUS_VAR);
   1576 
   1577     aSig = extractFloat32Frac( a );
   1578     aExp = extractFloat32Exp( a );
   1579     aSign = extractFloat32Sign( a );
   1580     if ( aExp == 0xFF ) {
   1581         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   1582         return packFloat64( aSign, 0x7FF, 0 );
   1583     }
   1584     if ( aExp == 0 ) {
   1585         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1586         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1587         --aExp;
   1588     }
   1589     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
   1590 
   1591 }
   1592 
   1593 #ifdef FLOATX80
   1594 
   1595 /*----------------------------------------------------------------------------
   1596 | Returns the result of converting the single-precision floating-point value
   1597 | `a' to the extended double-precision floating-point format.  The conversion
   1598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1599 | Arithmetic.
   1600 *----------------------------------------------------------------------------*/
   1601 
   1602 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
   1603 {
   1604     flag aSign;
   1605     int16 aExp;
   1606     uint32_t aSig;
   1607 
   1608     a = float32_squash_input_denormal(a STATUS_VAR);
   1609     aSig = extractFloat32Frac( a );
   1610     aExp = extractFloat32Exp( a );
   1611     aSign = extractFloat32Sign( a );
   1612     if ( aExp == 0xFF ) {
   1613         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   1614         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1615     }
   1616     if ( aExp == 0 ) {
   1617         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1618         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1619     }
   1620     aSig |= 0x00800000;
   1621     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
   1622 
   1623 }
   1624 
   1625 #endif
   1626 
   1627 #ifdef FLOAT128
   1628 
   1629 /*----------------------------------------------------------------------------
   1630 | Returns the result of converting the single-precision floating-point value
   1631 | `a' to the double-precision floating-point format.  The conversion is
   1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1633 | Arithmetic.
   1634 *----------------------------------------------------------------------------*/
   1635 
   1636 float128 float32_to_float128( float32 a STATUS_PARAM )
   1637 {
   1638     flag aSign;
   1639     int16 aExp;
   1640     uint32_t aSig;
   1641 
   1642     a = float32_squash_input_denormal(a STATUS_VAR);
   1643     aSig = extractFloat32Frac( a );
   1644     aExp = extractFloat32Exp( a );
   1645     aSign = extractFloat32Sign( a );
   1646     if ( aExp == 0xFF ) {
   1647         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   1648         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1649     }
   1650     if ( aExp == 0 ) {
   1651         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1652         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1653         --aExp;
   1654     }
   1655     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
   1656 
   1657 }
   1658 
   1659 #endif
   1660 
   1661 /*----------------------------------------------------------------------------
   1662 | Rounds the single-precision floating-point value `a' to an integer, and
   1663 | returns the result as a single-precision floating-point value.  The
   1664 | operation is performed according to the IEC/IEEE Standard for Binary
   1665 | Floating-Point Arithmetic.
   1666 *----------------------------------------------------------------------------*/
   1667 
   1668 float32 float32_round_to_int( float32 a STATUS_PARAM)
   1669 {
   1670     flag aSign;
   1671     int16 aExp;
   1672     uint32_t lastBitMask, roundBitsMask;
   1673     int8 roundingMode;
   1674     uint32_t z;
   1675     a = float32_squash_input_denormal(a STATUS_VAR);
   1676 
   1677     aExp = extractFloat32Exp( a );
   1678     if ( 0x96 <= aExp ) {
   1679         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1680             return propagateFloat32NaN( a, a STATUS_VAR );
   1681         }
   1682         return a;
   1683     }
   1684     if ( aExp <= 0x7E ) {
   1685         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
   1686         STATUS(float_exception_flags) |= float_flag_inexact;
   1687         aSign = extractFloat32Sign( a );
   1688         switch ( STATUS(float_rounding_mode) ) {
   1689          case float_round_nearest_even:
   1690             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1691                 return packFloat32( aSign, 0x7F, 0 );
   1692             }
   1693             break;
   1694          case float_round_down:
   1695             return make_float32(aSign ? 0xBF800000 : 0);
   1696          case float_round_up:
   1697             return make_float32(aSign ? 0x80000000 : 0x3F800000);
   1698         }
   1699         return packFloat32( aSign, 0, 0 );
   1700     }
   1701     lastBitMask = 1;
   1702     lastBitMask <<= 0x96 - aExp;
   1703     roundBitsMask = lastBitMask - 1;
   1704     z = float32_val(a);
   1705     roundingMode = STATUS(float_rounding_mode);
   1706     if ( roundingMode == float_round_nearest_even ) {
   1707         z += lastBitMask>>1;
   1708         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1709     }
   1710     else if ( roundingMode != float_round_to_zero ) {
   1711         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
   1712             z += roundBitsMask;
   1713         }
   1714     }
   1715     z &= ~ roundBitsMask;
   1716     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
   1717     return make_float32(z);
   1718 
   1719 }
   1720 
   1721 /*----------------------------------------------------------------------------
   1722 | Returns the result of adding the absolute values of the single-precision
   1723 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1724 | before being returned.  `zSign' is ignored if the result is a NaN.
   1725 | The addition is performed according to the IEC/IEEE Standard for Binary
   1726 | Floating-Point Arithmetic.
   1727 *----------------------------------------------------------------------------*/
   1728 
   1729 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1730 {
   1731     int16 aExp, bExp, zExp;
   1732     uint32_t aSig, bSig, zSig;
   1733     int16 expDiff;
   1734 
   1735     aSig = extractFloat32Frac( a );
   1736     aExp = extractFloat32Exp( a );
   1737     bSig = extractFloat32Frac( b );
   1738     bExp = extractFloat32Exp( b );
   1739     expDiff = aExp - bExp;
   1740     aSig <<= 6;
   1741     bSig <<= 6;
   1742     if ( 0 < expDiff ) {
   1743         if ( aExp == 0xFF ) {
   1744             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1745             return a;
   1746         }
   1747         if ( bExp == 0 ) {
   1748             --expDiff;
   1749         }
   1750         else {
   1751             bSig |= 0x20000000;
   1752         }
   1753         shift32RightJamming( bSig, expDiff, &bSig );
   1754         zExp = aExp;
   1755     }
   1756     else if ( expDiff < 0 ) {
   1757         if ( bExp == 0xFF ) {
   1758             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1759             return packFloat32( zSign, 0xFF, 0 );
   1760         }
   1761         if ( aExp == 0 ) {
   1762             ++expDiff;
   1763         }
   1764         else {
   1765             aSig |= 0x20000000;
   1766         }
   1767         shift32RightJamming( aSig, - expDiff, &aSig );
   1768         zExp = bExp;
   1769     }
   1770     else {
   1771         if ( aExp == 0xFF ) {
   1772             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1773             return a;
   1774         }
   1775         if ( aExp == 0 ) {
   1776             if (STATUS(flush_to_zero)) {
   1777                 if (aSig | bSig) {
   1778                     float_raise(float_flag_output_denormal STATUS_VAR);
   1779                 }
   1780                 return packFloat32(zSign, 0, 0);
   1781             }
   1782             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1783         }
   1784         zSig = 0x40000000 + aSig + bSig;
   1785         zExp = aExp;
   1786         goto roundAndPack;
   1787     }
   1788     aSig |= 0x20000000;
   1789     zSig = ( aSig + bSig )<<1;
   1790     --zExp;
   1791     if ( (int32_t) zSig < 0 ) {
   1792         zSig = aSig + bSig;
   1793         ++zExp;
   1794     }
   1795  roundAndPack:
   1796     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1797 
   1798 }
   1799 
   1800 /*----------------------------------------------------------------------------
   1801 | Returns the result of subtracting the absolute values of the single-
   1802 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1803 | difference is negated before being returned.  `zSign' is ignored if the
   1804 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1805 | Standard for Binary Floating-Point Arithmetic.
   1806 *----------------------------------------------------------------------------*/
   1807 
   1808 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1809 {
   1810     int16 aExp, bExp, zExp;
   1811     uint32_t aSig, bSig, zSig;
   1812     int16 expDiff;
   1813 
   1814     aSig = extractFloat32Frac( a );
   1815     aExp = extractFloat32Exp( a );
   1816     bSig = extractFloat32Frac( b );
   1817     bExp = extractFloat32Exp( b );
   1818     expDiff = aExp - bExp;
   1819     aSig <<= 7;
   1820     bSig <<= 7;
   1821     if ( 0 < expDiff ) goto aExpBigger;
   1822     if ( expDiff < 0 ) goto bExpBigger;
   1823     if ( aExp == 0xFF ) {
   1824         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1825         float_raise( float_flag_invalid STATUS_VAR);
   1826         return float32_default_nan;
   1827     }
   1828     if ( aExp == 0 ) {
   1829         aExp = 1;
   1830         bExp = 1;
   1831     }
   1832     if ( bSig < aSig ) goto aBigger;
   1833     if ( aSig < bSig ) goto bBigger;
   1834     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   1835  bExpBigger:
   1836     if ( bExp == 0xFF ) {
   1837         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1838         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1839     }
   1840     if ( aExp == 0 ) {
   1841         ++expDiff;
   1842     }
   1843     else {
   1844         aSig |= 0x40000000;
   1845     }
   1846     shift32RightJamming( aSig, - expDiff, &aSig );
   1847     bSig |= 0x40000000;
   1848  bBigger:
   1849     zSig = bSig - aSig;
   1850     zExp = bExp;
   1851     zSign ^= 1;
   1852     goto normalizeRoundAndPack;
   1853  aExpBigger:
   1854     if ( aExp == 0xFF ) {
   1855         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1856         return a;
   1857     }
   1858     if ( bExp == 0 ) {
   1859         --expDiff;
   1860     }
   1861     else {
   1862         bSig |= 0x40000000;
   1863     }
   1864     shift32RightJamming( bSig, expDiff, &bSig );
   1865     aSig |= 0x40000000;
   1866  aBigger:
   1867     zSig = aSig - bSig;
   1868     zExp = aExp;
   1869  normalizeRoundAndPack:
   1870     --zExp;
   1871     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1872 
   1873 }
   1874 
   1875 /*----------------------------------------------------------------------------
   1876 | Returns the result of adding the single-precision floating-point values `a'
   1877 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1878 | Binary Floating-Point Arithmetic.
   1879 *----------------------------------------------------------------------------*/
   1880 
   1881 float32 float32_add( float32 a, float32 b STATUS_PARAM )
   1882 {
   1883     flag aSign, bSign;
   1884     a = float32_squash_input_denormal(a STATUS_VAR);
   1885     b = float32_squash_input_denormal(b STATUS_VAR);
   1886 
   1887     aSign = extractFloat32Sign( a );
   1888     bSign = extractFloat32Sign( b );
   1889     if ( aSign == bSign ) {
   1890         return addFloat32Sigs( a, b, aSign STATUS_VAR);
   1891     }
   1892     else {
   1893         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1894     }
   1895 
   1896 }
   1897 
   1898 /*----------------------------------------------------------------------------
   1899 | Returns the result of subtracting the single-precision floating-point values
   1900 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1901 | for Binary Floating-Point Arithmetic.
   1902 *----------------------------------------------------------------------------*/
   1903 
   1904 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
   1905 {
   1906     flag aSign, bSign;
   1907     a = float32_squash_input_denormal(a STATUS_VAR);
   1908     b = float32_squash_input_denormal(b STATUS_VAR);
   1909 
   1910     aSign = extractFloat32Sign( a );
   1911     bSign = extractFloat32Sign( b );
   1912     if ( aSign == bSign ) {
   1913         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1914     }
   1915     else {
   1916         return addFloat32Sigs( a, b, aSign STATUS_VAR );
   1917     }
   1918 
   1919 }
   1920 
   1921 /*----------------------------------------------------------------------------
   1922 | Returns the result of multiplying the single-precision floating-point values
   1923 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1924 | for Binary Floating-Point Arithmetic.
   1925 *----------------------------------------------------------------------------*/
   1926 
   1927 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
   1928 {
   1929     flag aSign, bSign, zSign;
   1930     int16 aExp, bExp, zExp;
   1931     uint32_t aSig, bSig;
   1932     uint64_t zSig64;
   1933     uint32_t zSig;
   1934 
   1935     a = float32_squash_input_denormal(a STATUS_VAR);
   1936     b = float32_squash_input_denormal(b STATUS_VAR);
   1937 
   1938     aSig = extractFloat32Frac( a );
   1939     aExp = extractFloat32Exp( a );
   1940     aSign = extractFloat32Sign( a );
   1941     bSig = extractFloat32Frac( b );
   1942     bExp = extractFloat32Exp( b );
   1943     bSign = extractFloat32Sign( b );
   1944     zSign = aSign ^ bSign;
   1945     if ( aExp == 0xFF ) {
   1946         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1947             return propagateFloat32NaN( a, b STATUS_VAR );
   1948         }
   1949         if ( ( bExp | bSig ) == 0 ) {
   1950             float_raise( float_flag_invalid STATUS_VAR);
   1951             return float32_default_nan;
   1952         }
   1953         return packFloat32( zSign, 0xFF, 0 );
   1954     }
   1955     if ( bExp == 0xFF ) {
   1956         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1957         if ( ( aExp | aSig ) == 0 ) {
   1958             float_raise( float_flag_invalid STATUS_VAR);
   1959             return float32_default_nan;
   1960         }
   1961         return packFloat32( zSign, 0xFF, 0 );
   1962     }
   1963     if ( aExp == 0 ) {
   1964         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1965         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1966     }
   1967     if ( bExp == 0 ) {
   1968         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1969         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1970     }
   1971     zExp = aExp + bExp - 0x7F;
   1972     aSig = ( aSig | 0x00800000 )<<7;
   1973     bSig = ( bSig | 0x00800000 )<<8;
   1974     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
   1975     zSig = zSig64;
   1976     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
   1977         zSig <<= 1;
   1978         --zExp;
   1979     }
   1980     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1981 
   1982 }
   1983 
   1984 /*----------------------------------------------------------------------------
   1985 | Returns the result of dividing the single-precision floating-point value `a'
   1986 | by the corresponding value `b'.  The operation is performed according to the
   1987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1988 *----------------------------------------------------------------------------*/
   1989 
   1990 float32 float32_div( float32 a, float32 b STATUS_PARAM )
   1991 {
   1992     flag aSign, bSign, zSign;
   1993     int16 aExp, bExp, zExp;
   1994     uint32_t aSig, bSig, zSig;
   1995     a = float32_squash_input_denormal(a STATUS_VAR);
   1996     b = float32_squash_input_denormal(b STATUS_VAR);
   1997 
   1998     aSig = extractFloat32Frac( a );
   1999     aExp = extractFloat32Exp( a );
   2000     aSign = extractFloat32Sign( a );
   2001     bSig = extractFloat32Frac( b );
   2002     bExp = extractFloat32Exp( b );
   2003     bSign = extractFloat32Sign( b );
   2004     zSign = aSign ^ bSign;
   2005     if ( aExp == 0xFF ) {
   2006         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   2007         if ( bExp == 0xFF ) {
   2008             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   2009             float_raise( float_flag_invalid STATUS_VAR);
   2010             return float32_default_nan;
   2011         }
   2012         return packFloat32( zSign, 0xFF, 0 );
   2013     }
   2014     if ( bExp == 0xFF ) {
   2015         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   2016         return packFloat32( zSign, 0, 0 );
   2017     }
   2018     if ( bExp == 0 ) {
   2019         if ( bSig == 0 ) {
   2020             if ( ( aExp | aSig ) == 0 ) {
   2021                 float_raise( float_flag_invalid STATUS_VAR);
   2022                 return float32_default_nan;
   2023             }
   2024             float_raise( float_flag_divbyzero STATUS_VAR);
   2025             return packFloat32( zSign, 0xFF, 0 );
   2026         }
   2027         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2028     }
   2029     if ( aExp == 0 ) {
   2030         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   2031         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2032     }
   2033     zExp = aExp - bExp + 0x7D;
   2034     aSig = ( aSig | 0x00800000 )<<7;
   2035     bSig = ( bSig | 0x00800000 )<<8;
   2036     if ( bSig <= ( aSig + aSig ) ) {
   2037         aSig >>= 1;
   2038         ++zExp;
   2039     }
   2040     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
   2041     if ( ( zSig & 0x3F ) == 0 ) {
   2042         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
   2043     }
   2044     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   2045 
   2046 }
   2047 
   2048 /*----------------------------------------------------------------------------
   2049 | Returns the remainder of the single-precision floating-point value `a'
   2050 | with respect to the corresponding value `b'.  The operation is performed
   2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2052 *----------------------------------------------------------------------------*/
   2053 
   2054 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
   2055 {
   2056     flag aSign, zSign;
   2057     int16 aExp, bExp, expDiff;
   2058     uint32_t aSig, bSig;
   2059     uint32_t q;
   2060     uint64_t aSig64, bSig64, q64;
   2061     uint32_t alternateASig;
   2062     int32_t sigMean;
   2063     a = float32_squash_input_denormal(a STATUS_VAR);
   2064     b = float32_squash_input_denormal(b STATUS_VAR);
   2065 
   2066     aSig = extractFloat32Frac( a );
   2067     aExp = extractFloat32Exp( a );
   2068     aSign = extractFloat32Sign( a );
   2069     bSig = extractFloat32Frac( b );
   2070     bExp = extractFloat32Exp( b );
   2071     if ( aExp == 0xFF ) {
   2072         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   2073             return propagateFloat32NaN( a, b STATUS_VAR );
   2074         }
   2075         float_raise( float_flag_invalid STATUS_VAR);
   2076         return float32_default_nan;
   2077     }
   2078     if ( bExp == 0xFF ) {
   2079         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   2080         return a;
   2081     }
   2082     if ( bExp == 0 ) {
   2083         if ( bSig == 0 ) {
   2084             float_raise( float_flag_invalid STATUS_VAR);
   2085             return float32_default_nan;
   2086         }
   2087         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2088     }
   2089     if ( aExp == 0 ) {
   2090         if ( aSig == 0 ) return a;
   2091         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2092     }
   2093     expDiff = aExp - bExp;
   2094     aSig |= 0x00800000;
   2095     bSig |= 0x00800000;
   2096     if ( expDiff < 32 ) {
   2097         aSig <<= 8;
   2098         bSig <<= 8;
   2099         if ( expDiff < 0 ) {
   2100             if ( expDiff < -1 ) return a;
   2101             aSig >>= 1;
   2102         }
   2103         q = ( bSig <= aSig );
   2104         if ( q ) aSig -= bSig;
   2105         if ( 0 < expDiff ) {
   2106             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
   2107             q >>= 32 - expDiff;
   2108             bSig >>= 2;
   2109             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2110         }
   2111         else {
   2112             aSig >>= 2;
   2113             bSig >>= 2;
   2114         }
   2115     }
   2116     else {
   2117         if ( bSig <= aSig ) aSig -= bSig;
   2118         aSig64 = ( (uint64_t) aSig )<<40;
   2119         bSig64 = ( (uint64_t) bSig )<<40;
   2120         expDiff -= 64;
   2121         while ( 0 < expDiff ) {
   2122             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2123             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2124             aSig64 = - ( ( bSig * q64 )<<38 );
   2125             expDiff -= 62;
   2126         }
   2127         expDiff += 64;
   2128         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2129         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2130         q = q64>>( 64 - expDiff );
   2131         bSig <<= 6;
   2132         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2133     }
   2134     do {
   2135         alternateASig = aSig;
   2136         ++q;
   2137         aSig -= bSig;
   2138     } while ( 0 <= (int32_t) aSig );
   2139     sigMean = aSig + alternateASig;
   2140     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2141         aSig = alternateASig;
   2142     }
   2143     zSign = ( (int32_t) aSig < 0 );
   2144     if ( zSign ) aSig = - aSig;
   2145     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
   2146 
   2147 }
   2148 
   2149 /*----------------------------------------------------------------------------
   2150 | Returns the square root of the single-precision floating-point value `a'.
   2151 | The operation is performed according to the IEC/IEEE Standard for Binary
   2152 | Floating-Point Arithmetic.
   2153 *----------------------------------------------------------------------------*/
   2154 
   2155 float32 float32_sqrt( float32 a STATUS_PARAM )
   2156 {
   2157     flag aSign;
   2158     int16 aExp, zExp;
   2159     uint32_t aSig, zSig;
   2160     uint64_t rem, term;
   2161     a = float32_squash_input_denormal(a STATUS_VAR);
   2162 
   2163     aSig = extractFloat32Frac( a );
   2164     aExp = extractFloat32Exp( a );
   2165     aSign = extractFloat32Sign( a );
   2166     if ( aExp == 0xFF ) {
   2167         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2168         if ( ! aSign ) return a;
   2169         float_raise( float_flag_invalid STATUS_VAR);
   2170         return float32_default_nan;
   2171     }
   2172     if ( aSign ) {
   2173         if ( ( aExp | aSig ) == 0 ) return a;
   2174         float_raise( float_flag_invalid STATUS_VAR);
   2175         return float32_default_nan;
   2176     }
   2177     if ( aExp == 0 ) {
   2178         if ( aSig == 0 ) return float32_zero;
   2179         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2180     }
   2181     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2182     aSig = ( aSig | 0x00800000 )<<8;
   2183     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2184     if ( ( zSig & 0x7F ) <= 5 ) {
   2185         if ( zSig < 2 ) {
   2186             zSig = 0x7FFFFFFF;
   2187             goto roundAndPack;
   2188         }
   2189         aSig >>= aExp & 1;
   2190         term = ( (uint64_t) zSig ) * zSig;
   2191         rem = ( ( (uint64_t) aSig )<<32 ) - term;
   2192         while ( (int64_t) rem < 0 ) {
   2193             --zSig;
   2194             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
   2195         }
   2196         zSig |= ( rem != 0 );
   2197     }
   2198     shift32RightJamming( zSig, 1, &zSig );
   2199  roundAndPack:
   2200     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
   2201 
   2202 }
   2203 
   2204 /*----------------------------------------------------------------------------
   2205 | Returns the binary exponential of the single-precision floating-point value
   2206 | `a'. The operation is performed according to the IEC/IEEE Standard for
   2207 | Binary Floating-Point Arithmetic.
   2208 |
   2209 | Uses the following identities:
   2210 |
   2211 | 1. -------------------------------------------------------------------------
   2212 |      x    x*ln(2)
   2213 |     2  = e
   2214 |
   2215 | 2. -------------------------------------------------------------------------
   2216 |                      2     3     4     5           n
   2217 |      x        x     x     x     x     x           x
   2218 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
   2219 |               1!    2!    3!    4!    5!          n!
   2220 *----------------------------------------------------------------------------*/
   2221 
   2222 static const float64 float32_exp2_coefficients[15] =
   2223 {
   2224     const_float64( 0x3ff0000000000000ll ), /*  1 */
   2225     const_float64( 0x3fe0000000000000ll ), /*  2 */
   2226     const_float64( 0x3fc5555555555555ll ), /*  3 */
   2227     const_float64( 0x3fa5555555555555ll ), /*  4 */
   2228     const_float64( 0x3f81111111111111ll ), /*  5 */
   2229     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
   2230     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
   2231     const_float64( 0x3efa01a01a01a01all ), /*  8 */
   2232     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
   2233     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
   2234     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
   2235     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
   2236     const_float64( 0x3de6124613a86d09ll ), /* 13 */
   2237     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
   2238     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
   2239 };
   2240 
   2241 float32 float32_exp2( float32 a STATUS_PARAM )
   2242 {
   2243     flag aSign;
   2244     int16 aExp;
   2245     uint32_t aSig;
   2246     float64 r, x, xn;
   2247     int i;
   2248     a = float32_squash_input_denormal(a STATUS_VAR);
   2249 
   2250     aSig = extractFloat32Frac( a );
   2251     aExp = extractFloat32Exp( a );
   2252     aSign = extractFloat32Sign( a );
   2253 
   2254     if ( aExp == 0xFF) {
   2255         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2256         return (aSign) ? float32_zero : a;
   2257     }
   2258     if (aExp == 0) {
   2259         if (aSig == 0) return float32_one;
   2260     }
   2261 
   2262     float_raise( float_flag_inexact STATUS_VAR);
   2263 
   2264     /* ******************************* */
   2265     /* using float64 for approximation */
   2266     /* ******************************* */
   2267     x = float32_to_float64(a STATUS_VAR);
   2268     x = float64_mul(x, float64_ln2 STATUS_VAR);
   2269 
   2270     xn = x;
   2271     r = float64_one;
   2272     for (i = 0 ; i < 15 ; i++) {
   2273         float64 f;
   2274 
   2275         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
   2276         r = float64_add(r, f STATUS_VAR);
   2277 
   2278         xn = float64_mul(xn, x STATUS_VAR);
   2279     }
   2280 
   2281     return float64_to_float32(r, status);
   2282 }
   2283 
   2284 /*----------------------------------------------------------------------------
   2285 | Returns the binary log of the single-precision floating-point value `a'.
   2286 | The operation is performed according to the IEC/IEEE Standard for Binary
   2287 | Floating-Point Arithmetic.
   2288 *----------------------------------------------------------------------------*/
   2289 float32 float32_log2( float32 a STATUS_PARAM )
   2290 {
   2291     flag aSign, zSign;
   2292     int16 aExp;
   2293     uint32_t aSig, zSig, i;
   2294 
   2295     a = float32_squash_input_denormal(a STATUS_VAR);
   2296     aSig = extractFloat32Frac( a );
   2297     aExp = extractFloat32Exp( a );
   2298     aSign = extractFloat32Sign( a );
   2299 
   2300     if ( aExp == 0 ) {
   2301         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
   2302         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2303     }
   2304     if ( aSign ) {
   2305         float_raise( float_flag_invalid STATUS_VAR);
   2306         return float32_default_nan;
   2307     }
   2308     if ( aExp == 0xFF ) {
   2309         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2310         return a;
   2311     }
   2312 
   2313     aExp -= 0x7F;
   2314     aSig |= 0x00800000;
   2315     zSign = aExp < 0;
   2316     zSig = aExp << 23;
   2317 
   2318     for (i = 1 << 22; i > 0; i >>= 1) {
   2319         aSig = ( (uint64_t)aSig * aSig ) >> 23;
   2320         if ( aSig & 0x01000000 ) {
   2321             aSig >>= 1;
   2322             zSig |= i;
   2323         }
   2324     }
   2325 
   2326     if ( zSign )
   2327         zSig = -zSig;
   2328 
   2329     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
   2330 }
   2331 
   2332 /*----------------------------------------------------------------------------
   2333 | Returns 1 if the single-precision floating-point value `a' is equal to
   2334 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2335 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   2336 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2337 *----------------------------------------------------------------------------*/
   2338 
   2339 int float32_eq( float32 a, float32 b STATUS_PARAM )
   2340 {
   2341     uint32_t av, bv;
   2342     a = float32_squash_input_denormal(a STATUS_VAR);
   2343     b = float32_squash_input_denormal(b STATUS_VAR);
   2344 
   2345     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2346          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2347        ) {
   2348         float_raise( float_flag_invalid STATUS_VAR);
   2349         return 0;
   2350     }
   2351     av = float32_val(a);
   2352     bv = float32_val(b);
   2353     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
   2354 }
   2355 
   2356 /*----------------------------------------------------------------------------
   2357 | Returns 1 if the single-precision floating-point value `a' is less than
   2358 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
   2359 | exception is raised if either operand is a NaN.  The comparison is performed
   2360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2361 *----------------------------------------------------------------------------*/
   2362 
   2363 int float32_le( float32 a, float32 b STATUS_PARAM )
   2364 {
   2365     flag aSign, bSign;
   2366     uint32_t av, bv;
   2367     a = float32_squash_input_denormal(a STATUS_VAR);
   2368     b = float32_squash_input_denormal(b STATUS_VAR);
   2369 
   2370     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2371          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2372        ) {
   2373         float_raise( float_flag_invalid STATUS_VAR);
   2374         return 0;
   2375     }
   2376     aSign = extractFloat32Sign( a );
   2377     bSign = extractFloat32Sign( b );
   2378     av = float32_val(a);
   2379     bv = float32_val(b);
   2380     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
   2381     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2382 
   2383 }
   2384 
   2385 /*----------------------------------------------------------------------------
   2386 | Returns 1 if the single-precision floating-point value `a' is less than
   2387 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2388 | raised if either operand is a NaN.  The comparison is performed according
   2389 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2390 *----------------------------------------------------------------------------*/
   2391 
   2392 int float32_lt( float32 a, float32 b STATUS_PARAM )
   2393 {
   2394     flag aSign, bSign;
   2395     uint32_t av, bv;
   2396     a = float32_squash_input_denormal(a STATUS_VAR);
   2397     b = float32_squash_input_denormal(b STATUS_VAR);
   2398 
   2399     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2400          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2401        ) {
   2402         float_raise( float_flag_invalid STATUS_VAR);
   2403         return 0;
   2404     }
   2405     aSign = extractFloat32Sign( a );
   2406     bSign = extractFloat32Sign( b );
   2407     av = float32_val(a);
   2408     bv = float32_val(b);
   2409     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
   2410     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2411 
   2412 }
   2413 
   2414 /*----------------------------------------------------------------------------
   2415 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
   2416 | be compared, and 0 otherwise.  The invalid exception is raised if either
   2417 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
   2418 | Standard for Binary Floating-Point Arithmetic.
   2419 *----------------------------------------------------------------------------*/
   2420 
   2421 int float32_unordered( float32 a, float32 b STATUS_PARAM )
   2422 {
   2423     a = float32_squash_input_denormal(a STATUS_VAR);
   2424     b = float32_squash_input_denormal(b STATUS_VAR);
   2425 
   2426     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2427          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2428        ) {
   2429         float_raise( float_flag_invalid STATUS_VAR);
   2430         return 1;
   2431     }
   2432     return 0;
   2433 }
   2434 
   2435 /*----------------------------------------------------------------------------
   2436 | Returns 1 if the single-precision floating-point value `a' is equal to
   2437 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2438 | exception.  The comparison is performed according to the IEC/IEEE Standard
   2439 | for Binary Floating-Point Arithmetic.
   2440 *----------------------------------------------------------------------------*/
   2441 
   2442 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
   2443 {
   2444     a = float32_squash_input_denormal(a STATUS_VAR);
   2445     b = float32_squash_input_denormal(b STATUS_VAR);
   2446 
   2447     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2448          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2449        ) {
   2450         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2451             float_raise( float_flag_invalid STATUS_VAR);
   2452         }
   2453         return 0;
   2454     }
   2455     return ( float32_val(a) == float32_val(b) ) ||
   2456             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
   2457 }
   2458 
   2459 /*----------------------------------------------------------------------------
   2460 | Returns 1 if the single-precision floating-point value `a' is less than or
   2461 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2462 | cause an exception.  Otherwise, the comparison is performed according to the
   2463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2464 *----------------------------------------------------------------------------*/
   2465 
   2466 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
   2467 {
   2468     flag aSign, bSign;
   2469     uint32_t av, bv;
   2470     a = float32_squash_input_denormal(a STATUS_VAR);
   2471     b = float32_squash_input_denormal(b STATUS_VAR);
   2472 
   2473     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2474          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2475        ) {
   2476         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2477             float_raise( float_flag_invalid STATUS_VAR);
   2478         }
   2479         return 0;
   2480     }
   2481     aSign = extractFloat32Sign( a );
   2482     bSign = extractFloat32Sign( b );
   2483     av = float32_val(a);
   2484     bv = float32_val(b);
   2485     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
   2486     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2487 
   2488 }
   2489 
   2490 /*----------------------------------------------------------------------------
   2491 | Returns 1 if the single-precision floating-point value `a' is less than
   2492 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2493 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2494 | Standard for Binary Floating-Point Arithmetic.
   2495 *----------------------------------------------------------------------------*/
   2496 
   2497 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
   2498 {
   2499     flag aSign, bSign;
   2500     uint32_t av, bv;
   2501     a = float32_squash_input_denormal(a STATUS_VAR);
   2502     b = float32_squash_input_denormal(b STATUS_VAR);
   2503 
   2504     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2505          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2506        ) {
   2507         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2508             float_raise( float_flag_invalid STATUS_VAR);
   2509         }
   2510         return 0;
   2511     }
   2512     aSign = extractFloat32Sign( a );
   2513     bSign = extractFloat32Sign( b );
   2514     av = float32_val(a);
   2515     bv = float32_val(b);
   2516     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
   2517     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2518 
   2519 }
   2520 
   2521 /*----------------------------------------------------------------------------
   2522 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
   2523 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
   2524 | comparison is performed according to the IEC/IEEE Standard for Binary
   2525 | Floating-Point Arithmetic.
   2526 *----------------------------------------------------------------------------*/
   2527 
   2528 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
   2529 {
   2530     a = float32_squash_input_denormal(a STATUS_VAR);
   2531     b = float32_squash_input_denormal(b STATUS_VAR);
   2532 
   2533     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2534          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2535        ) {
   2536         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2537             float_raise( float_flag_invalid STATUS_VAR);
   2538         }
   2539         return 1;
   2540     }
   2541     return 0;
   2542 }
   2543 
   2544 /*----------------------------------------------------------------------------
   2545 | Returns the result of converting the double-precision floating-point value
   2546 | `a' to the 32-bit two's complement integer format.  The conversion is
   2547 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2548 | Arithmetic---which means in particular that the conversion is rounded
   2549 | according to the current rounding mode.  If `a' is a NaN, the largest
   2550 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2551 | largest integer with the same sign as `a' is returned.
   2552 *----------------------------------------------------------------------------*/
   2553 
   2554 int32 float64_to_int32( float64 a STATUS_PARAM )
   2555 {
   2556     flag aSign;
   2557     int16 aExp, shiftCount;
   2558     uint64_t aSig;
   2559     a = float64_squash_input_denormal(a STATUS_VAR);
   2560 
   2561     aSig = extractFloat64Frac( a );
   2562     aExp = extractFloat64Exp( a );
   2563     aSign = extractFloat64Sign( a );
   2564     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2565     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2566     shiftCount = 0x42C - aExp;
   2567     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2568     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   2569 
   2570 }
   2571 
   2572 /*----------------------------------------------------------------------------
   2573 | Returns the result of converting the double-precision floating-point value
   2574 | `a' to the 32-bit two's complement integer format.  The conversion is
   2575 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2576 | Arithmetic, except that the conversion is always rounded toward zero.
   2577 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2578 | the conversion overflows, the largest integer with the same sign as `a' is
   2579 | returned.
   2580 *----------------------------------------------------------------------------*/
   2581 
   2582 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
   2583 {
   2584     flag aSign;
   2585     int16 aExp, shiftCount;
   2586     uint64_t aSig, savedASig;
   2587     int32 z;
   2588     a = float64_squash_input_denormal(a STATUS_VAR);
   2589 
   2590     aSig = extractFloat64Frac( a );
   2591     aExp = extractFloat64Exp( a );
   2592     aSign = extractFloat64Sign( a );
   2593     if ( 0x41E < aExp ) {
   2594         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2595         goto invalid;
   2596     }
   2597     else if ( aExp < 0x3FF ) {
   2598         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2599         return 0;
   2600     }
   2601     aSig |= LIT64( 0x0010000000000000 );
   2602     shiftCount = 0x433 - aExp;
   2603     savedASig = aSig;
   2604     aSig >>= shiftCount;
   2605     z = aSig;
   2606     if ( aSign ) z = - z;
   2607     if ( ( z < 0 ) ^ aSign ) {
   2608  invalid:
   2609         float_raise( float_flag_invalid STATUS_VAR);
   2610         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
   2611     }
   2612     if ( ( aSig<<shiftCount ) != savedASig ) {
   2613         STATUS(float_exception_flags) |= float_flag_inexact;
   2614     }
   2615     return z;
   2616 
   2617 }
   2618 
   2619 /*----------------------------------------------------------------------------
   2620 | Returns the result of converting the double-precision floating-point value
   2621 | `a' to the 16-bit two's complement integer format.  The conversion is
   2622 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2623 | Arithmetic, except that the conversion is always rounded toward zero.
   2624 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2625 | the conversion overflows, the largest integer with the same sign as `a' is
   2626 | returned.
   2627 *----------------------------------------------------------------------------*/
   2628 
   2629 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
   2630 {
   2631     flag aSign;
   2632     int16 aExp, shiftCount;
   2633     uint64_t aSig, savedASig;
   2634     int32 z;
   2635 
   2636     aSig = extractFloat64Frac( a );
   2637     aExp = extractFloat64Exp( a );
   2638     aSign = extractFloat64Sign( a );
   2639     if ( 0x40E < aExp ) {
   2640         if ( ( aExp == 0x7FF ) && aSig ) {
   2641             aSign = 0;
   2642         }
   2643         goto invalid;
   2644     }
   2645     else if ( aExp < 0x3FF ) {
   2646         if ( aExp || aSig ) {
   2647             STATUS(float_exception_flags) |= float_flag_inexact;
   2648         }
   2649         return 0;
   2650     }
   2651     aSig |= LIT64( 0x0010000000000000 );
   2652     shiftCount = 0x433 - aExp;
   2653     savedASig = aSig;
   2654     aSig >>= shiftCount;
   2655     z = aSig;
   2656     if ( aSign ) {
   2657         z = - z;
   2658     }
   2659     if ( ( (int16_t)z < 0 ) ^ aSign ) {
   2660  invalid:
   2661         float_raise( float_flag_invalid STATUS_VAR);
   2662         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
   2663     }
   2664     if ( ( aSig<<shiftCount ) != savedASig ) {
   2665         STATUS(float_exception_flags) |= float_flag_inexact;
   2666     }
   2667     return z;
   2668 }
   2669 
   2670 /*----------------------------------------------------------------------------
   2671 | Returns the result of converting the double-precision floating-point value
   2672 | `a' to the 64-bit two's complement integer format.  The conversion is
   2673 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2674 | Arithmetic---which means in particular that the conversion is rounded
   2675 | according to the current rounding mode.  If `a' is a NaN, the largest
   2676 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2677 | largest integer with the same sign as `a' is returned.
   2678 *----------------------------------------------------------------------------*/
   2679 
   2680 int64 float64_to_int64( float64 a STATUS_PARAM )
   2681 {
   2682     flag aSign;
   2683     int16 aExp, shiftCount;
   2684     uint64_t aSig, aSigExtra;
   2685     a = float64_squash_input_denormal(a STATUS_VAR);
   2686 
   2687     aSig = extractFloat64Frac( a );
   2688     aExp = extractFloat64Exp( a );
   2689     aSign = extractFloat64Sign( a );
   2690     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2691     shiftCount = 0x433 - aExp;
   2692     if ( shiftCount <= 0 ) {
   2693         if ( 0x43E < aExp ) {
   2694             float_raise( float_flag_invalid STATUS_VAR);
   2695             if (    ! aSign
   2696                  || (    ( aExp == 0x7FF )
   2697                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2698                ) {
   2699                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2700             }
   2701             return (int64_t) LIT64( 0x8000000000000000 );
   2702         }
   2703         aSigExtra = 0;
   2704         aSig <<= - shiftCount;
   2705     }
   2706     else {
   2707         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2708     }
   2709     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   2710 
   2711 }
   2712 
   2713 /*----------------------------------------------------------------------------
   2714 | Returns the result of converting the double-precision floating-point value
   2715 | `a' to the 64-bit two's complement integer format.  The conversion is
   2716 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2717 | Arithmetic, except that the conversion is always rounded toward zero.
   2718 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2719 | the conversion overflows, the largest integer with the same sign as `a' is
   2720 | returned.
   2721 *----------------------------------------------------------------------------*/
   2722 
   2723 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
   2724 {
   2725     flag aSign;
   2726     int16 aExp, shiftCount;
   2727     uint64_t aSig;
   2728     int64 z;
   2729     a = float64_squash_input_denormal(a STATUS_VAR);
   2730 
   2731     aSig = extractFloat64Frac( a );
   2732     aExp = extractFloat64Exp( a );
   2733     aSign = extractFloat64Sign( a );
   2734     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2735     shiftCount = aExp - 0x433;
   2736     if ( 0 <= shiftCount ) {
   2737         if ( 0x43E <= aExp ) {
   2738             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
   2739                 float_raise( float_flag_invalid STATUS_VAR);
   2740                 if (    ! aSign
   2741                      || (    ( aExp == 0x7FF )
   2742                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2743                    ) {
   2744                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2745                 }
   2746             }
   2747             return (int64_t) LIT64( 0x8000000000000000 );
   2748         }
   2749         z = aSig<<shiftCount;
   2750     }
   2751     else {
   2752         if ( aExp < 0x3FE ) {
   2753             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2754             return 0;
   2755         }
   2756         z = aSig>>( - shiftCount );
   2757         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
   2758             STATUS(float_exception_flags) |= float_flag_inexact;
   2759         }
   2760     }
   2761     if ( aSign ) z = - z;
   2762     return z;
   2763 
   2764 }
   2765 
   2766 /*----------------------------------------------------------------------------
   2767 | Returns the result of converting the double-precision floating-point value
   2768 | `a' to the single-precision floating-point format.  The conversion is
   2769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2770 | Arithmetic.
   2771 *----------------------------------------------------------------------------*/
   2772 
   2773 float32 float64_to_float32( float64 a STATUS_PARAM )
   2774 {
   2775     flag aSign;
   2776     int16 aExp;
   2777     uint64_t aSig;
   2778     uint32_t zSig;
   2779     a = float64_squash_input_denormal(a STATUS_VAR);
   2780 
   2781     aSig = extractFloat64Frac( a );
   2782     aExp = extractFloat64Exp( a );
   2783     aSign = extractFloat64Sign( a );
   2784     if ( aExp == 0x7FF ) {
   2785         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   2786         return packFloat32( aSign, 0xFF, 0 );
   2787     }
   2788     shift64RightJamming( aSig, 22, &aSig );
   2789     zSig = aSig;
   2790     if ( aExp || zSig ) {
   2791         zSig |= 0x40000000;
   2792         aExp -= 0x381;
   2793     }
   2794     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   2795 
   2796 }
   2797 
   2798 
   2799 /*----------------------------------------------------------------------------
   2800 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
   2801 | half-precision floating-point value, returning the result.  After being
   2802 | shifted into the proper positions, the three fields are simply added
   2803 | together to form the result.  This means that any integer portion of `zSig'
   2804 | will be added into the exponent.  Since a properly normalized significand
   2805 | will have an integer portion equal to 1, the `zExp' input should be 1 less
   2806 | than the desired result exponent whenever `zSig' is a complete, normalized
   2807 | significand.
   2808 *----------------------------------------------------------------------------*/
   2809 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
   2810 {
   2811     return make_float16(
   2812         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
   2813 }
   2814 
   2815 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
   2816    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
   2817 
   2818 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
   2819 {
   2820     flag aSign;
   2821     int16 aExp;
   2822     uint32_t aSig;
   2823 
   2824     aSign = extractFloat16Sign(a);
   2825     aExp = extractFloat16Exp(a);
   2826     aSig = extractFloat16Frac(a);
   2827 
   2828     if (aExp == 0x1f && ieee) {
   2829         if (aSig) {
   2830             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
   2831         }
   2832         return packFloat32(aSign, 0xff, aSig << 13);
   2833     }
   2834     if (aExp == 0) {
   2835         int8 shiftCount;
   2836 
   2837         if (aSig == 0) {
   2838             return packFloat32(aSign, 0, 0);
   2839         }
   2840 
   2841         shiftCount = countLeadingZeros32( aSig ) - 21;
   2842         aSig = aSig << shiftCount;
   2843         aExp = -shiftCount;
   2844     }
   2845     return packFloat32( aSign, aExp + 0x70, aSig << 13);
   2846 }
   2847 
   2848 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
   2849 {
   2850     flag aSign;
   2851     int16 aExp;
   2852     uint32_t aSig;
   2853     uint32_t mask;
   2854     uint32_t increment;
   2855     int8 roundingMode;
   2856     a = float32_squash_input_denormal(a STATUS_VAR);
   2857 
   2858     aSig = extractFloat32Frac( a );
   2859     aExp = extractFloat32Exp( a );
   2860     aSign = extractFloat32Sign( a );
   2861     if ( aExp == 0xFF ) {
   2862         if (aSig) {
   2863             /* Input is a NaN */
   2864             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   2865             if (!ieee) {
   2866                 return packFloat16(aSign, 0, 0);
   2867             }
   2868             return r;
   2869         }
   2870         /* Infinity */
   2871         if (!ieee) {
   2872             float_raise(float_flag_invalid STATUS_VAR);
   2873             return packFloat16(aSign, 0x1f, 0x3ff);
   2874         }
   2875         return packFloat16(aSign, 0x1f, 0);
   2876     }
   2877     if (aExp == 0 && aSig == 0) {
   2878         return packFloat16(aSign, 0, 0);
   2879     }
   2880     /* Decimal point between bits 22 and 23.  */
   2881     aSig |= 0x00800000;
   2882     aExp -= 0x7f;
   2883     if (aExp < -14) {
   2884         mask = 0x00ffffff;
   2885         if (aExp >= -24) {
   2886             mask >>= 25 + aExp;
   2887         }
   2888     } else {
   2889         mask = 0x00001fff;
   2890     }
   2891     if (aSig & mask) {
   2892         float_raise( float_flag_underflow STATUS_VAR );
   2893         roundingMode = STATUS(float_rounding_mode);
   2894         switch (roundingMode) {
   2895         case float_round_nearest_even:
   2896             increment = (mask + 1) >> 1;
   2897             if ((aSig & mask) == increment) {
   2898                 increment = aSig & (increment << 1);
   2899             }
   2900             break;
   2901         case float_round_up:
   2902             increment = aSign ? 0 : mask;
   2903             break;
   2904         case float_round_down:
   2905             increment = aSign ? mask : 0;
   2906             break;
   2907         default: /* round_to_zero */
   2908             increment = 0;
   2909             break;
   2910         }
   2911         aSig += increment;
   2912         if (aSig >= 0x01000000) {
   2913             aSig >>= 1;
   2914             aExp++;
   2915         }
   2916     } else if (aExp < -14
   2917           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
   2918         float_raise( float_flag_underflow STATUS_VAR);
   2919     }
   2920 
   2921     if (ieee) {
   2922         if (aExp > 15) {
   2923             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
   2924             return packFloat16(aSign, 0x1f, 0);
   2925         }
   2926     } else {
   2927         if (aExp > 16) {
   2928             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
   2929             return packFloat16(aSign, 0x1f, 0x3ff);
   2930         }
   2931     }
   2932     if (aExp < -24) {
   2933         return packFloat16(aSign, 0, 0);
   2934     }
   2935     if (aExp < -14) {
   2936         aSig >>= -14 - aExp;
   2937         aExp = -14;
   2938     }
   2939     return packFloat16(aSign, aExp + 14, aSig >> 13);
   2940 }
   2941 
   2942 #ifdef FLOATX80
   2943 
   2944 /*----------------------------------------------------------------------------
   2945 | Returns the result of converting the double-precision floating-point value
   2946 | `a' to the extended double-precision floating-point format.  The conversion
   2947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2948 | Arithmetic.
   2949 *----------------------------------------------------------------------------*/
   2950 
   2951 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
   2952 {
   2953     flag aSign;
   2954     int16 aExp;
   2955     uint64_t aSig;
   2956 
   2957     a = float64_squash_input_denormal(a STATUS_VAR);
   2958     aSig = extractFloat64Frac( a );
   2959     aExp = extractFloat64Exp( a );
   2960     aSign = extractFloat64Sign( a );
   2961     if ( aExp == 0x7FF ) {
   2962         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   2963         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2964     }
   2965     if ( aExp == 0 ) {
   2966         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2967         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2968     }
   2969     return
   2970         packFloatx80(
   2971             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2972 
   2973 }
   2974 
   2975 #endif
   2976 
   2977 #ifdef FLOAT128
   2978 
   2979 /*----------------------------------------------------------------------------
   2980 | Returns the result of converting the double-precision floating-point value
   2981 | `a' to the quadruple-precision floating-point format.  The conversion is
   2982 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2983 | Arithmetic.
   2984 *----------------------------------------------------------------------------*/
   2985 
   2986 float128 float64_to_float128( float64 a STATUS_PARAM )
   2987 {
   2988     flag aSign;
   2989     int16 aExp;
   2990     uint64_t aSig, zSig0, zSig1;
   2991 
   2992     a = float64_squash_input_denormal(a STATUS_VAR);
   2993     aSig = extractFloat64Frac( a );
   2994     aExp = extractFloat64Exp( a );
   2995     aSign = extractFloat64Sign( a );
   2996     if ( aExp == 0x7FF ) {
   2997         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   2998         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2999     }
   3000     if ( aExp == 0 ) {
   3001         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   3002         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3003         --aExp;
   3004     }
   3005     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   3006     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   3007 
   3008 }
   3009 
   3010 #endif
   3011 
   3012 /*----------------------------------------------------------------------------
   3013 | Rounds the double-precision floating-point value `a' to an integer, and
   3014 | returns the result as a double-precision floating-point value.  The
   3015 | operation is performed according to the IEC/IEEE Standard for Binary
   3016 | Floating-Point Arithmetic.
   3017 *----------------------------------------------------------------------------*/
   3018 
   3019 float64 float64_round_to_int( float64 a STATUS_PARAM )
   3020 {
   3021     flag aSign;
   3022     int16 aExp;
   3023     uint64_t lastBitMask, roundBitsMask;
   3024     int8 roundingMode;
   3025     uint64_t z;
   3026     a = float64_squash_input_denormal(a STATUS_VAR);
   3027 
   3028     aExp = extractFloat64Exp( a );
   3029     if ( 0x433 <= aExp ) {
   3030         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   3031             return propagateFloat64NaN( a, a STATUS_VAR );
   3032         }
   3033         return a;
   3034     }
   3035     if ( aExp < 0x3FF ) {
   3036         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
   3037         STATUS(float_exception_flags) |= float_flag_inexact;
   3038         aSign = extractFloat64Sign( a );
   3039         switch ( STATUS(float_rounding_mode) ) {
   3040          case float_round_nearest_even:
   3041             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   3042                 return packFloat64( aSign, 0x3FF, 0 );
   3043             }
   3044             break;
   3045          case float_round_down:
   3046             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
   3047          case float_round_up:
   3048             return make_float64(
   3049             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
   3050         }
   3051         return packFloat64( aSign, 0, 0 );
   3052     }
   3053     lastBitMask = 1;
   3054     lastBitMask <<= 0x433 - aExp;
   3055     roundBitsMask = lastBitMask - 1;
   3056     z = float64_val(a);
   3057     roundingMode = STATUS(float_rounding_mode);
   3058     if ( roundingMode == float_round_nearest_even ) {
   3059         z += lastBitMask>>1;
   3060         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   3061     }
   3062     else if ( roundingMode != float_round_to_zero ) {
   3063         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
   3064             z += roundBitsMask;
   3065         }
   3066     }
   3067     z &= ~ roundBitsMask;
   3068     if ( z != float64_val(a) )
   3069         STATUS(float_exception_flags) |= float_flag_inexact;
   3070     return make_float64(z);
   3071 
   3072 }
   3073 
   3074 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
   3075 {
   3076     int oldmode;
   3077     float64 res;
   3078     oldmode = STATUS(float_rounding_mode);
   3079     STATUS(float_rounding_mode) = float_round_to_zero;
   3080     res = float64_round_to_int(a STATUS_VAR);
   3081     STATUS(float_rounding_mode) = oldmode;
   3082     return res;
   3083 }
   3084 
   3085 /*----------------------------------------------------------------------------
   3086 | Returns the result of adding the absolute values of the double-precision
   3087 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   3088 | before being returned.  `zSign' is ignored if the result is a NaN.
   3089 | The addition is performed according to the IEC/IEEE Standard for Binary
   3090 | Floating-Point Arithmetic.
   3091 *----------------------------------------------------------------------------*/
   3092 
   3093 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   3094 {
   3095     int16 aExp, bExp, zExp;
   3096     uint64_t aSig, bSig, zSig;
   3097     int16 expDiff;
   3098 
   3099     aSig = extractFloat64Frac( a );
   3100     aExp = extractFloat64Exp( a );
   3101     bSig = extractFloat64Frac( b );
   3102     bExp = extractFloat64Exp( b );
   3103     expDiff = aExp - bExp;
   3104     aSig <<= 9;
   3105     bSig <<= 9;
   3106     if ( 0 < expDiff ) {
   3107         if ( aExp == 0x7FF ) {
   3108             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3109             return a;
   3110         }
   3111         if ( bExp == 0 ) {
   3112             --expDiff;
   3113         }
   3114         else {
   3115             bSig |= LIT64( 0x2000000000000000 );
   3116         }
   3117         shift64RightJamming( bSig, expDiff, &bSig );
   3118         zExp = aExp;
   3119     }
   3120     else if ( expDiff < 0 ) {
   3121         if ( bExp == 0x7FF ) {
   3122             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3123             return packFloat64( zSign, 0x7FF, 0 );
   3124         }
   3125         if ( aExp == 0 ) {
   3126             ++expDiff;
   3127         }
   3128         else {
   3129             aSig |= LIT64( 0x2000000000000000 );
   3130         }
   3131         shift64RightJamming( aSig, - expDiff, &aSig );
   3132         zExp = bExp;
   3133     }
   3134     else {
   3135         if ( aExp == 0x7FF ) {
   3136             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3137             return a;
   3138         }
   3139         if ( aExp == 0 ) {
   3140             if (STATUS(flush_to_zero)) {
   3141                 if (aSig | bSig) {
   3142                     float_raise(float_flag_output_denormal STATUS_VAR);
   3143                 }
   3144                 return packFloat64(zSign, 0, 0);
   3145             }
   3146             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   3147         }
   3148         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   3149         zExp = aExp;
   3150         goto roundAndPack;
   3151     }
   3152     aSig |= LIT64( 0x2000000000000000 );
   3153     zSig = ( aSig + bSig )<<1;
   3154     --zExp;
   3155     if ( (int64_t) zSig < 0 ) {
   3156         zSig = aSig + bSig;
   3157         ++zExp;
   3158     }
   3159  roundAndPack:
   3160     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3161 
   3162 }
   3163 
   3164 /*----------------------------------------------------------------------------
   3165 | Returns the result of subtracting the absolute values of the double-
   3166 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3167 | difference is negated before being returned.  `zSign' is ignored if the
   3168 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3169 | Standard for Binary Floating-Point Arithmetic.
   3170 *----------------------------------------------------------------------------*/
   3171 
   3172 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   3173 {
   3174     int16 aExp, bExp, zExp;
   3175     uint64_t aSig, bSig, zSig;
   3176     int16 expDiff;
   3177 
   3178     aSig = extractFloat64Frac( a );
   3179     aExp = extractFloat64Exp( a );
   3180     bSig = extractFloat64Frac( b );
   3181     bExp = extractFloat64Exp( b );
   3182     expDiff = aExp - bExp;
   3183     aSig <<= 10;
   3184     bSig <<= 10;
   3185     if ( 0 < expDiff ) goto aExpBigger;
   3186     if ( expDiff < 0 ) goto bExpBigger;
   3187     if ( aExp == 0x7FF ) {
   3188         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3189         float_raise( float_flag_invalid STATUS_VAR);
   3190         return float64_default_nan;
   3191     }
   3192     if ( aExp == 0 ) {
   3193         aExp = 1;
   3194         bExp = 1;
   3195     }
   3196     if ( bSig < aSig ) goto aBigger;
   3197     if ( aSig < bSig ) goto bBigger;
   3198     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   3199  bExpBigger:
   3200     if ( bExp == 0x7FF ) {
   3201         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3202         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   3203     }
   3204     if ( aExp == 0 ) {
   3205         ++expDiff;
   3206     }
   3207     else {
   3208         aSig |= LIT64( 0x4000000000000000 );
   3209     }
   3210     shift64RightJamming( aSig, - expDiff, &aSig );
   3211     bSig |= LIT64( 0x4000000000000000 );
   3212  bBigger:
   3213     zSig = bSig - aSig;
   3214     zExp = bExp;
   3215     zSign ^= 1;
   3216     goto normalizeRoundAndPack;
   3217  aExpBigger:
   3218     if ( aExp == 0x7FF ) {
   3219         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3220         return a;
   3221     }
   3222     if ( bExp == 0 ) {
   3223         --expDiff;
   3224     }
   3225     else {
   3226         bSig |= LIT64( 0x4000000000000000 );
   3227     }
   3228     shift64RightJamming( bSig, expDiff, &bSig );
   3229     aSig |= LIT64( 0x4000000000000000 );
   3230  aBigger:
   3231     zSig = aSig - bSig;
   3232     zExp = aExp;
   3233  normalizeRoundAndPack:
   3234     --zExp;
   3235     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3236 
   3237 }
   3238 
   3239 /*----------------------------------------------------------------------------
   3240 | Returns the result of adding the double-precision floating-point values `a'
   3241 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   3242 | Binary Floating-Point Arithmetic.
   3243 *----------------------------------------------------------------------------*/
   3244 
   3245 float64 float64_add( float64 a, float64 b STATUS_PARAM )
   3246 {
   3247     flag aSign, bSign;
   3248     a = float64_squash_input_denormal(a STATUS_VAR);
   3249     b = float64_squash_input_denormal(b STATUS_VAR);
   3250 
   3251     aSign = extractFloat64Sign( a );
   3252     bSign = extractFloat64Sign( b );
   3253     if ( aSign == bSign ) {
   3254         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   3255     }
   3256     else {
   3257         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   3258     }
   3259 
   3260 }
   3261 
   3262 /*----------------------------------------------------------------------------
   3263 | Returns the result of subtracting the double-precision floating-point values
   3264 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   3265 | for Binary Floating-Point Arithmetic.
   3266 *----------------------------------------------------------------------------*/
   3267 
   3268 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
   3269 {
   3270     flag aSign, bSign;
   3271     a = float64_squash_input_denormal(a STATUS_VAR);
   3272     b = float64_squash_input_denormal(b STATUS_VAR);
   3273 
   3274     aSign = extractFloat64Sign( a );
   3275     bSign = extractFloat64Sign( b );
   3276     if ( aSign == bSign ) {
   3277         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   3278     }
   3279     else {
   3280         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   3281     }
   3282 
   3283 }
   3284 
   3285 /*----------------------------------------------------------------------------
   3286 | Returns the result of multiplying the double-precision floating-point values
   3287 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   3288 | for Binary Floating-Point Arithmetic.
   3289 *----------------------------------------------------------------------------*/
   3290 
   3291 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
   3292 {
   3293     flag aSign, bSign, zSign;
   3294     int16 aExp, bExp, zExp;
   3295     uint64_t aSig, bSig, zSig0, zSig1;
   3296 
   3297     a = float64_squash_input_denormal(a STATUS_VAR);
   3298     b = float64_squash_input_denormal(b STATUS_VAR);
   3299 
   3300     aSig = extractFloat64Frac( a );
   3301     aExp = extractFloat64Exp( a );
   3302     aSign = extractFloat64Sign( a );
   3303     bSig = extractFloat64Frac( b );
   3304     bExp = extractFloat64Exp( b );
   3305     bSign = extractFloat64Sign( b );
   3306     zSign = aSign ^ bSign;
   3307     if ( aExp == 0x7FF ) {
   3308         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   3309             return propagateFloat64NaN( a, b STATUS_VAR );
   3310         }
   3311         if ( ( bExp | bSig ) == 0 ) {
   3312             float_raise( float_flag_invalid STATUS_VAR);
   3313             return float64_default_nan;
   3314         }
   3315         return packFloat64( zSign, 0x7FF, 0 );
   3316     }
   3317     if ( bExp == 0x7FF ) {
   3318         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3319         if ( ( aExp | aSig ) == 0 ) {
   3320             float_raise( float_flag_invalid STATUS_VAR);
   3321             return float64_default_nan;
   3322         }
   3323         return packFloat64( zSign, 0x7FF, 0 );
   3324     }
   3325     if ( aExp == 0 ) {
   3326         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   3327         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3328     }
   3329     if ( bExp == 0 ) {
   3330         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   3331         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3332     }
   3333     zExp = aExp + bExp - 0x3FF;
   3334     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   3335     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3336     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3337     zSig0 |= ( zSig1 != 0 );
   3338     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
   3339         zSig0 <<= 1;
   3340         --zExp;
   3341     }
   3342     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
   3343 
   3344 }
   3345 
   3346 /*----------------------------------------------------------------------------
   3347 | Returns the result of dividing the double-precision floating-point value `a'
   3348 | by the corresponding value `b'.  The operation is performed according to
   3349 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3350 *----------------------------------------------------------------------------*/
   3351 
   3352 float64 float64_div( float64 a, float64 b STATUS_PARAM )
   3353 {
   3354     flag aSign, bSign, zSign;
   3355     int16 aExp, bExp, zExp;
   3356     uint64_t aSig, bSig, zSig;
   3357     uint64_t rem0, rem1;
   3358     uint64_t term0, term1;
   3359     a = float64_squash_input_denormal(a STATUS_VAR);
   3360     b = float64_squash_input_denormal(b STATUS_VAR);
   3361 
   3362     aSig = extractFloat64Frac( a );
   3363     aExp = extractFloat64Exp( a );
   3364     aSign = extractFloat64Sign( a );
   3365     bSig = extractFloat64Frac( b );
   3366     bExp = extractFloat64Exp( b );
   3367     bSign = extractFloat64Sign( b );
   3368     zSign = aSign ^ bSign;
   3369     if ( aExp == 0x7FF ) {
   3370         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3371         if ( bExp == 0x7FF ) {
   3372             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3373             float_raise( float_flag_invalid STATUS_VAR);
   3374             return float64_default_nan;
   3375         }
   3376         return packFloat64( zSign, 0x7FF, 0 );
   3377     }
   3378     if ( bExp == 0x7FF ) {
   3379         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3380         return packFloat64( zSign, 0, 0 );
   3381     }
   3382     if ( bExp == 0 ) {
   3383         if ( bSig == 0 ) {
   3384             if ( ( aExp | aSig ) == 0 ) {
   3385                 float_raise( float_flag_invalid STATUS_VAR);
   3386                 return float64_default_nan;
   3387             }
   3388             float_raise( float_flag_divbyzero STATUS_VAR);
   3389             return packFloat64( zSign, 0x7FF, 0 );
   3390         }
   3391         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3392     }
   3393     if ( aExp == 0 ) {
   3394         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   3395         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3396     }
   3397     zExp = aExp - bExp + 0x3FD;
   3398     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   3399     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3400     if ( bSig <= ( aSig + aSig ) ) {
   3401         aSig >>= 1;
   3402         ++zExp;
   3403     }
   3404     zSig = estimateDiv128To64( aSig, 0, bSig );
   3405     if ( ( zSig & 0x1FF ) <= 2 ) {
   3406         mul64To128( bSig, zSig, &term0, &term1 );
   3407         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3408         while ( (int64_t) rem0 < 0 ) {
   3409             --zSig;
   3410             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3411         }
   3412         zSig |= ( rem1 != 0 );
   3413     }
   3414     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3415 
   3416 }
   3417 
   3418 /*----------------------------------------------------------------------------
   3419 | Returns the remainder of the double-precision floating-point value `a'
   3420 | with respect to the corresponding value `b'.  The operation is performed
   3421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3422 *----------------------------------------------------------------------------*/
   3423 
   3424 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
   3425 {
   3426     flag aSign, zSign;
   3427     int16 aExp, bExp, expDiff;
   3428     uint64_t aSig, bSig;
   3429     uint64_t q, alternateASig;
   3430     int64_t sigMean;
   3431 
   3432     a = float64_squash_input_denormal(a STATUS_VAR);
   3433     b = float64_squash_input_denormal(b STATUS_VAR);
   3434     aSig = extractFloat64Frac( a );
   3435     aExp = extractFloat64Exp( a );
   3436     aSign = extractFloat64Sign( a );
   3437     bSig = extractFloat64Frac( b );
   3438     bExp = extractFloat64Exp( b );
   3439     if ( aExp == 0x7FF ) {
   3440         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   3441             return propagateFloat64NaN( a, b STATUS_VAR );
   3442         }
   3443         float_raise( float_flag_invalid STATUS_VAR);
   3444         return float64_default_nan;
   3445     }
   3446     if ( bExp == 0x7FF ) {
   3447         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3448         return a;
   3449     }
   3450     if ( bExp == 0 ) {
   3451         if ( bSig == 0 ) {
   3452             float_raise( float_flag_invalid STATUS_VAR);
   3453             return float64_default_nan;
   3454         }
   3455         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3456     }
   3457     if ( aExp == 0 ) {
   3458         if ( aSig == 0 ) return a;
   3459         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3460     }
   3461     expDiff = aExp - bExp;
   3462     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   3463     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3464     if ( expDiff < 0 ) {
   3465         if ( expDiff < -1 ) return a;
   3466         aSig >>= 1;
   3467     }
   3468     q = ( bSig <= aSig );
   3469     if ( q ) aSig -= bSig;
   3470     expDiff -= 64;
   3471     while ( 0 < expDiff ) {
   3472         q = estimateDiv128To64( aSig, 0, bSig );
   3473         q = ( 2 < q ) ? q - 2 : 0;
   3474         aSig = - ( ( bSig>>2 ) * q );
   3475         expDiff -= 62;
   3476     }
   3477     expDiff += 64;
   3478     if ( 0 < expDiff ) {
   3479         q = estimateDiv128To64( aSig, 0, bSig );
   3480         q = ( 2 < q ) ? q - 2 : 0;
   3481         q >>= 64 - expDiff;
   3482         bSig >>= 2;
   3483         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   3484     }
   3485     else {
   3486         aSig >>= 2;
   3487         bSig >>= 2;
   3488     }
   3489     do {
   3490         alternateASig = aSig;
   3491         ++q;
   3492         aSig -= bSig;
   3493     } while ( 0 <= (int64_t) aSig );
   3494     sigMean = aSig + alternateASig;
   3495     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   3496         aSig = alternateASig;
   3497     }
   3498     zSign = ( (int64_t) aSig < 0 );
   3499     if ( zSign ) aSig = - aSig;
   3500     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
   3501 
   3502 }
   3503 
   3504 /*----------------------------------------------------------------------------
   3505 | Returns the square root of the double-precision floating-point value `a'.
   3506 | The operation is performed according to the IEC/IEEE Standard for Binary
   3507 | Floating-Point Arithmetic.
   3508 *----------------------------------------------------------------------------*/
   3509 
   3510 float64 float64_sqrt( float64 a STATUS_PARAM )
   3511 {
   3512     flag aSign;
   3513     int16 aExp, zExp;
   3514     uint64_t aSig, zSig, doubleZSig;
   3515     uint64_t rem0, rem1, term0, term1;
   3516     a = float64_squash_input_denormal(a STATUS_VAR);
   3517 
   3518     aSig = extractFloat64Frac( a );
   3519     aExp = extractFloat64Exp( a );
   3520     aSign = extractFloat64Sign( a );
   3521     if ( aExp == 0x7FF ) {
   3522         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
   3523         if ( ! aSign ) return a;
   3524         float_raise( float_flag_invalid STATUS_VAR);
   3525         return float64_default_nan;
   3526     }
   3527     if ( aSign ) {
   3528         if ( ( aExp | aSig ) == 0 ) return a;
   3529         float_raise( float_flag_invalid STATUS_VAR);
   3530         return float64_default_nan;
   3531     }
   3532     if ( aExp == 0 ) {
   3533         if ( aSig == 0 ) return float64_zero;
   3534         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3535     }
   3536     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3537     aSig |= LIT64( 0x0010000000000000 );
   3538     zSig = estimateSqrt32( aExp, aSig>>21 );
   3539     aSig <<= 9 - ( aExp & 1 );
   3540     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3541     if ( ( zSig & 0x1FF ) <= 5 ) {
   3542         doubleZSig = zSig<<1;
   3543         mul64To128( zSig, zSig, &term0, &term1 );
   3544         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3545         while ( (int64_t) rem0 < 0 ) {
   3546             --zSig;
   3547             doubleZSig -= 2;
   3548             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3549         }
   3550         zSig |= ( ( rem0 | rem1 ) != 0 );
   3551     }
   3552     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
   3553 
   3554 }
   3555 
   3556 /*----------------------------------------------------------------------------
   3557 | Returns the binary log of the double-precision floating-point value `a'.
   3558 | The operation is performed according to the IEC/IEEE Standard for Binary
   3559 | Floating-Point Arithmetic.
   3560 *----------------------------------------------------------------------------*/
   3561 float64 float64_log2( float64 a STATUS_PARAM )
   3562 {
   3563     flag aSign, zSign;
   3564     int16 aExp;
   3565     uint64_t aSig, aSig0, aSig1, zSig, i;
   3566     a = float64_squash_input_denormal(a STATUS_VAR);
   3567 
   3568     aSig = extractFloat64Frac( a );
   3569     aExp = extractFloat64Exp( a );
   3570     aSign = extractFloat64Sign( a );
   3571 
   3572     if ( aExp == 0 ) {
   3573         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
   3574         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3575     }
   3576     if ( aSign ) {
   3577         float_raise( float_flag_invalid STATUS_VAR);
   3578         return float64_default_nan;
   3579     }
   3580     if ( aExp == 0x7FF ) {
   3581         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
   3582         return a;
   3583     }
   3584 
   3585     aExp -= 0x3FF;
   3586     aSig |= LIT64( 0x0010000000000000 );
   3587     zSign = aExp < 0;
   3588     zSig = (uint64_t)aExp << 52;
   3589     for (i = 1LL << 51; i > 0; i >>= 1) {
   3590         mul64To128( aSig, aSig, &aSig0, &aSig1 );
   3591         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
   3592         if ( aSig & LIT64( 0x0020000000000000 ) ) {
   3593             aSig >>= 1;
   3594             zSig |= i;
   3595         }
   3596     }
   3597 
   3598     if ( zSign )
   3599         zSig = -zSig;
   3600     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
   3601 }
   3602 
   3603 /*----------------------------------------------------------------------------
   3604 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3605 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3606 | if either operand is a NaN.  Otherwise, the comparison is performed
   3607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3608 *----------------------------------------------------------------------------*/
   3609 
   3610 int float64_eq( float64 a, float64 b STATUS_PARAM )
   3611 {
   3612     uint64_t av, bv;
   3613     a = float64_squash_input_denormal(a STATUS_VAR);
   3614     b = float64_squash_input_denormal(b STATUS_VAR);
   3615 
   3616     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3617          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3618        ) {
   3619         float_raise( float_flag_invalid STATUS_VAR);
   3620         return 0;
   3621     }
   3622     av = float64_val(a);
   3623     bv = float64_val(b);
   3624     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
   3625 
   3626 }
   3627 
   3628 /*----------------------------------------------------------------------------
   3629 | Returns 1 if the double-precision floating-point value `a' is less than or
   3630 | equal to the corresponding value `b', and 0 otherwise.  The invalid
   3631 | exception is raised if either operand is a NaN.  The comparison is performed
   3632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3633 *----------------------------------------------------------------------------*/
   3634 
   3635 int float64_le( float64 a, float64 b STATUS_PARAM )
   3636 {
   3637     flag aSign, bSign;
   3638     uint64_t av, bv;
   3639     a = float64_squash_input_denormal(a STATUS_VAR);
   3640     b = float64_squash_input_denormal(b STATUS_VAR);
   3641 
   3642     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3643          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3644        ) {
   3645         float_raise( float_flag_invalid STATUS_VAR);
   3646         return 0;
   3647     }
   3648     aSign = extractFloat64Sign( a );
   3649     bSign = extractFloat64Sign( b );
   3650     av = float64_val(a);
   3651     bv = float64_val(b);
   3652     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
   3653     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   3654 
   3655 }
   3656 
   3657 /*----------------------------------------------------------------------------
   3658 | Returns 1 if the double-precision floating-point value `a' is less than
   3659 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   3660 | raised if either operand is a NaN.  The comparison is performed according
   3661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3662 *----------------------------------------------------------------------------*/
   3663 
   3664 int float64_lt( float64 a, float64 b STATUS_PARAM )
   3665 {
   3666     flag aSign, bSign;
   3667     uint64_t av, bv;
   3668 
   3669     a = float64_squash_input_denormal(a STATUS_VAR);
   3670     b = float64_squash_input_denormal(b STATUS_VAR);
   3671     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3672          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3673        ) {
   3674         float_raise( float_flag_invalid STATUS_VAR);
   3675         return 0;
   3676     }
   3677     aSign = extractFloat64Sign( a );
   3678     bSign = extractFloat64Sign( b );
   3679     av = float64_val(a);
   3680     bv = float64_val(b);
   3681     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
   3682     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   3683 
   3684 }
   3685 
   3686 /*----------------------------------------------------------------------------
   3687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
   3688 | be compared, and 0 otherwise.  The invalid exception is raised if either
   3689 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
   3690 | Standard for Binary Floating-Point Arithmetic.
   3691 *----------------------------------------------------------------------------*/
   3692 
   3693 int float64_unordered( float64 a, float64 b STATUS_PARAM )
   3694 {
   3695     a = float64_squash_input_denormal(a STATUS_VAR);
   3696     b = float64_squash_input_denormal(b STATUS_VAR);
   3697 
   3698     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3699          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3700        ) {
   3701         float_raise( float_flag_invalid STATUS_VAR);
   3702         return 1;
   3703     }
   3704     return 0;
   3705 }
   3706 
   3707 /*----------------------------------------------------------------------------
   3708 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3709 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3710 | exception.The comparison is performed according to the IEC/IEEE Standard
   3711 | for Binary Floating-Point Arithmetic.
   3712 *----------------------------------------------------------------------------*/
   3713 
   3714 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
   3715 {
   3716     uint64_t av, bv;
   3717     a = float64_squash_input_denormal(a STATUS_VAR);
   3718     b = float64_squash_input_denormal(b STATUS_VAR);
   3719 
   3720     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3721          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3722        ) {
   3723         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3724             float_raise( float_flag_invalid STATUS_VAR);
   3725         }
   3726         return 0;
   3727     }
   3728     av = float64_val(a);
   3729     bv = float64_val(b);
   3730     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
   3731 
   3732 }
   3733 
   3734 /*----------------------------------------------------------------------------
   3735 | Returns 1 if the double-precision floating-point value `a' is less than or
   3736 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3737 | cause an exception.  Otherwise, the comparison is performed according to the
   3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3739 *----------------------------------------------------------------------------*/
   3740 
   3741 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
   3742 {
   3743     flag aSign, bSign;
   3744     uint64_t av, bv;
   3745     a = float64_squash_input_denormal(a STATUS_VAR);
   3746     b = float64_squash_input_denormal(b STATUS_VAR);
   3747 
   3748     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3749          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3750        ) {
   3751         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3752             float_raise( float_flag_invalid STATUS_VAR);
   3753         }
   3754         return 0;
   3755     }
   3756     aSign = extractFloat64Sign( a );
   3757     bSign = extractFloat64Sign( b );
   3758     av = float64_val(a);
   3759     bv = float64_val(b);
   3760     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
   3761     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   3762 
   3763 }
   3764 
   3765 /*----------------------------------------------------------------------------
   3766 | Returns 1 if the double-precision floating-point value `a' is less than
   3767 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3768 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3769 | Standard for Binary Floating-Point Arithmetic.
   3770 *----------------------------------------------------------------------------*/
   3771 
   3772 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
   3773 {
   3774     flag aSign, bSign;
   3775     uint64_t av, bv;
   3776     a = float64_squash_input_denormal(a STATUS_VAR);
   3777     b = float64_squash_input_denormal(b STATUS_VAR);
   3778 
   3779     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3780          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3781        ) {
   3782         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3783             float_raise( float_flag_invalid STATUS_VAR);
   3784         }
   3785         return 0;
   3786     }
   3787     aSign = extractFloat64Sign( a );
   3788     bSign = extractFloat64Sign( b );
   3789     av = float64_val(a);
   3790     bv = float64_val(b);
   3791     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
   3792     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   3793 
   3794 }
   3795 
   3796 /*----------------------------------------------------------------------------
   3797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
   3798 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
   3799 | comparison is performed according to the IEC/IEEE Standard for Binary
   3800 | Floating-Point Arithmetic.
   3801 *----------------------------------------------------------------------------*/
   3802 
   3803 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
   3804 {
   3805     a = float64_squash_input_denormal(a STATUS_VAR);
   3806     b = float64_squash_input_denormal(b STATUS_VAR);
   3807 
   3808     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3809          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3810        ) {
   3811         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3812             float_raise( float_flag_invalid STATUS_VAR);
   3813         }
   3814         return 1;
   3815     }
   3816     return 0;
   3817 }
   3818 
   3819 #ifdef FLOATX80
   3820 
   3821 /*----------------------------------------------------------------------------
   3822 | Returns the result of converting the extended double-precision floating-
   3823 | point value `a' to the 32-bit two's complement integer format.  The
   3824 | conversion is performed according to the IEC/IEEE Standard for Binary
   3825 | Floating-Point Arithmetic---which means in particular that the conversion
   3826 | is rounded according to the current rounding mode.  If `a' is a NaN, the
   3827 | largest positive integer is returned.  Otherwise, if the conversion
   3828 | overflows, the largest integer with the same sign as `a' is returned.
   3829 *----------------------------------------------------------------------------*/
   3830 
   3831 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
   3832 {
   3833     flag aSign;
   3834     int32 aExp, shiftCount;
   3835     uint64_t aSig;
   3836 
   3837     aSig = extractFloatx80Frac( a );
   3838     aExp = extractFloatx80Exp( a );
   3839     aSign = extractFloatx80Sign( a );
   3840     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
   3841     shiftCount = 0x4037 - aExp;
   3842     if ( shiftCount <= 0 ) shiftCount = 1;
   3843     shift64RightJamming( aSig, shiftCount, &aSig );
   3844     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   3845 
   3846 }
   3847 
   3848 /*----------------------------------------------------------------------------
   3849 | Returns the result of converting the extended double-precision floating-
   3850 | point value `a' to the 32-bit two's complement integer format.  The
   3851 | conversion is performed according to the IEC/IEEE Standard for Binary
   3852 | Floating-Point Arithmetic, except that the conversion is always rounded
   3853 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3854 | Otherwise, if the conversion overflows, the largest integer with the same
   3855 | sign as `a' is returned.
   3856 *----------------------------------------------------------------------------*/
   3857 
   3858 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
   3859 {
   3860     flag aSign;
   3861     int32 aExp, shiftCount;
   3862     uint64_t aSig, savedASig;
   3863     int32 z;
   3864 
   3865     aSig = extractFloatx80Frac( a );
   3866     aExp = extractFloatx80Exp( a );
   3867     aSign = extractFloatx80Sign( a );
   3868     if ( 0x401E < aExp ) {
   3869         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
   3870         goto invalid;
   3871     }
   3872     else if ( aExp < 0x3FFF ) {
   3873         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   3874         return 0;
   3875     }
   3876     shiftCount = 0x403E - aExp;
   3877     savedASig = aSig;
   3878     aSig >>= shiftCount;
   3879     z = aSig;
   3880     if ( aSign ) z = - z;
   3881     if ( ( z < 0 ) ^ aSign ) {
   3882  invalid:
   3883         float_raise( float_flag_invalid STATUS_VAR);
   3884         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
   3885     }
   3886     if ( ( aSig<<shiftCount ) != savedASig ) {
   3887         STATUS(float_exception_flags) |= float_flag_inexact;
   3888     }
   3889     return z;
   3890 
   3891 }
   3892 
   3893 /*----------------------------------------------------------------------------
   3894 | Returns the result of converting the extended double-precision floating-
   3895 | point value `a' to the 64-bit two's complement integer format.  The
   3896 | conversion is performed according to the IEC/IEEE Standard for Binary
   3897 | Floating-Point Arithmetic---which means in particular that the conversion
   3898 | is rounded according to the current rounding mode.  If `a' is a NaN,
   3899 | the largest positive integer is returned.  Otherwise, if the conversion
   3900 | overflows, the largest integer with the same sign as `a' is returned.
   3901 *----------------------------------------------------------------------------*/
   3902 
   3903 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
   3904 {
   3905     flag aSign;
   3906     int32 aExp, shiftCount;
   3907     uint64_t aSig, aSigExtra;
   3908 
   3909     aSig = extractFloatx80Frac( a );
   3910     aExp = extractFloatx80Exp( a );
   3911     aSign = extractFloatx80Sign( a );
   3912     shiftCount = 0x403E - aExp;
   3913     if ( shiftCount <= 0 ) {
   3914         if ( shiftCount ) {
   3915             float_raise( float_flag_invalid STATUS_VAR);
   3916             if (    ! aSign
   3917                  || (    ( aExp == 0x7FFF )
   3918                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3919                ) {
   3920                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3921             }
   3922             return (int64_t) LIT64( 0x8000000000000000 );
   3923         }
   3924         aSigExtra = 0;
   3925     }
   3926     else {
   3927         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3928     }
   3929     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   3930 
   3931 }
   3932 
   3933 /*----------------------------------------------------------------------------
   3934 | Returns the result of converting the extended double-precision floating-
   3935 | point value `a' to the 64-bit two's complement integer format.  The
   3936 | conversion is performed according to the IEC/IEEE Standard for Binary
   3937 | Floating-Point Arithmetic, except that the conversion is always rounded
   3938 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3939 | Otherwise, if the conversion overflows, the largest integer with the same
   3940 | sign as `a' is returned.
   3941 *----------------------------------------------------------------------------*/
   3942 
   3943 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
   3944 {
   3945     flag aSign;
   3946     int32 aExp, shiftCount;
   3947     uint64_t aSig;
   3948     int64 z;
   3949 
   3950     aSig = extractFloatx80Frac( a );
   3951     aExp = extractFloatx80Exp( a );
   3952     aSign = extractFloatx80Sign( a );
   3953     shiftCount = aExp - 0x403E;
   3954     if ( 0 <= shiftCount ) {
   3955         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3956         if ( ( a.high != 0xC03E ) || aSig ) {
   3957             float_raise( float_flag_invalid STATUS_VAR);
   3958             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3959                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3960             }
   3961         }
   3962         return (int64_t) LIT64( 0x8000000000000000 );
   3963     }
   3964     else if ( aExp < 0x3FFF ) {
   3965         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   3966         return 0;
   3967     }
   3968     z = aSig>>( - shiftCount );
   3969     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
   3970         STATUS(float_exception_flags) |= float_flag_inexact;
   3971     }
   3972     if ( aSign ) z = - z;
   3973     return z;
   3974 
   3975 }
   3976 
   3977 /*----------------------------------------------------------------------------
   3978 | Returns the result of converting the extended double-precision floating-
   3979 | point value `a' to the single-precision floating-point format.  The
   3980 | conversion is performed according to the IEC/IEEE Standard for Binary
   3981 | Floating-Point Arithmetic.
   3982 *----------------------------------------------------------------------------*/
   3983 
   3984 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
   3985 {
   3986     flag aSign;
   3987     int32 aExp;
   3988     uint64_t aSig;
   3989 
   3990     aSig = extractFloatx80Frac( a );
   3991     aExp = extractFloatx80Exp( a );
   3992     aSign = extractFloatx80Sign( a );
   3993     if ( aExp == 0x7FFF ) {
   3994         if ( (uint64_t) ( aSig<<1 ) ) {
   3995             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   3996         }
   3997         return packFloat32( aSign, 0xFF, 0 );
   3998     }
   3999     shift64RightJamming( aSig, 33, &aSig );
   4000     if ( aExp || aSig ) aExp -= 0x3F81;
   4001     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   4002 
   4003 }
   4004 
   4005 /*----------------------------------------------------------------------------
   4006 | Returns the result of converting the extended double-precision floating-
   4007 | point value `a' to the double-precision floating-point format.  The
   4008 | conversion is performed according to the IEC/IEEE Standard for Binary
   4009 | Floating-Point Arithmetic.
   4010 *----------------------------------------------------------------------------*/
   4011 
   4012 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
   4013 {
   4014     flag aSign;
   4015     int32 aExp;
   4016     uint64_t aSig, zSig;
   4017 
   4018     aSig = extractFloatx80Frac( a );
   4019     aExp = extractFloatx80Exp( a );
   4020     aSign = extractFloatx80Sign( a );
   4021     if ( aExp == 0x7FFF ) {
   4022         if ( (uint64_t) ( aSig<<1 ) ) {
   4023             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   4024         }
   4025         return packFloat64( aSign, 0x7FF, 0 );
   4026     }
   4027     shift64RightJamming( aSig, 1, &zSig );
   4028     if ( aExp || aSig ) aExp -= 0x3C01;
   4029     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
   4030 
   4031 }
   4032 
   4033 #ifdef FLOAT128
   4034 
   4035 /*----------------------------------------------------------------------------
   4036 | Returns the result of converting the extended double-precision floating-
   4037 | point value `a' to the quadruple-precision floating-point format.  The
   4038 | conversion is performed according to the IEC/IEEE Standard for Binary
   4039 | Floating-Point Arithmetic.
   4040 *----------------------------------------------------------------------------*/
   4041 
   4042 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
   4043 {
   4044     flag aSign;
   4045     int16 aExp;
   4046     uint64_t aSig, zSig0, zSig1;
   4047 
   4048     aSig = extractFloatx80Frac( a );
   4049     aExp = extractFloatx80Exp( a );
   4050     aSign = extractFloatx80Sign( a );
   4051     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
   4052         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   4053     }
   4054     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   4055     return packFloat128( aSign, aExp, zSig0, zSig1 );
   4056 
   4057 }
   4058 
   4059 #endif
   4060 
   4061 /*----------------------------------------------------------------------------
   4062 | Rounds the extended double-precision floating-point value `a' to an integer,
   4063 | and returns the result as an extended quadruple-precision floating-point
   4064 | value.  The operation is performed according to the IEC/IEEE Standard for
   4065 | Binary Floating-Point Arithmetic.
   4066 *----------------------------------------------------------------------------*/
   4067 
   4068 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
   4069 {
   4070     flag aSign;
   4071     int32 aExp;
   4072     uint64_t lastBitMask, roundBitsMask;
   4073     int8 roundingMode;
   4074     floatx80 z;
   4075 
   4076     aExp = extractFloatx80Exp( a );
   4077     if ( 0x403E <= aExp ) {
   4078         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
   4079             return propagateFloatx80NaN( a, a STATUS_VAR );
   4080         }
   4081         return a;
   4082     }
   4083     if ( aExp < 0x3FFF ) {
   4084         if (    ( aExp == 0 )
   4085              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   4086             return a;
   4087         }
   4088         STATUS(float_exception_flags) |= float_flag_inexact;
   4089         aSign = extractFloatx80Sign( a );
   4090         switch ( STATUS(float_rounding_mode) ) {
   4091          case float_round_nearest_even:
   4092             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
   4093                ) {
   4094                 return
   4095                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   4096             }
   4097             break;
   4098          case float_round_down:
   4099             return
   4100                   aSign ?
   4101                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   4102                 : packFloatx80( 0, 0, 0 );
   4103          case float_round_up:
   4104             return
   4105                   aSign ? packFloatx80( 1, 0, 0 )
   4106                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   4107         }
   4108         return packFloatx80( aSign, 0, 0 );
   4109     }
   4110     lastBitMask = 1;
   4111     lastBitMask <<= 0x403E - aExp;
   4112     roundBitsMask = lastBitMask - 1;
   4113     z = a;
   4114     roundingMode = STATUS(float_rounding_mode);
   4115     if ( roundingMode == float_round_nearest_even ) {
   4116         z.low += lastBitMask>>1;
   4117         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4118     }
   4119     else if ( roundingMode != float_round_to_zero ) {
   4120         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   4121             z.low += roundBitsMask;
   4122         }
   4123     }
   4124     z.low &= ~ roundBitsMask;
   4125     if ( z.low == 0 ) {
   4126         ++z.high;
   4127         z.low = LIT64( 0x8000000000000000 );
   4128     }
   4129     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
   4130     return z;
   4131 
   4132 }
   4133 
   4134 /*----------------------------------------------------------------------------
   4135 | Returns the result of adding the absolute values of the extended double-
   4136 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   4137 | negated before being returned.  `zSign' is ignored if the result is a NaN.
   4138 | The addition is performed according to the IEC/IEEE Standard for Binary
   4139 | Floating-Point Arithmetic.
   4140 *----------------------------------------------------------------------------*/
   4141 
   4142 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
   4143 {
   4144     int32 aExp, bExp, zExp;
   4145     uint64_t aSig, bSig, zSig0, zSig1;
   4146     int32 expDiff;
   4147 
   4148     aSig = extractFloatx80Frac( a );
   4149     aExp = extractFloatx80Exp( a );
   4150     bSig = extractFloatx80Frac( b );
   4151     bExp = extractFloatx80Exp( b );
   4152     expDiff = aExp - bExp;
   4153     if ( 0 < expDiff ) {
   4154         if ( aExp == 0x7FFF ) {
   4155             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4156             return a;
   4157         }
   4158         if ( bExp == 0 ) --expDiff;
   4159         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   4160         zExp = aExp;
   4161     }
   4162     else if ( expDiff < 0 ) {
   4163         if ( bExp == 0x7FFF ) {
   4164             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4165             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4166         }
   4167         if ( aExp == 0 ) ++expDiff;
   4168         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   4169         zExp = bExp;
   4170     }
   4171     else {
   4172         if ( aExp == 0x7FFF ) {
   4173             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
   4174                 return propagateFloatx80NaN( a, b STATUS_VAR );
   4175             }
   4176             return a;
   4177         }
   4178         zSig1 = 0;
   4179         zSig0 = aSig + bSig;
   4180         if ( aExp == 0 ) {
   4181             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   4182             goto roundAndPack;
   4183         }
   4184         zExp = aExp;
   4185         goto shiftRight1;
   4186     }
   4187     zSig0 = aSig + bSig;
   4188     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
   4189  shiftRight1:
   4190     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   4191     zSig0 |= LIT64( 0x8000000000000000 );
   4192     ++zExp;
   4193  roundAndPack:
   4194     return
   4195         roundAndPackFloatx80(
   4196             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4197 
   4198 }
   4199 
   4200 /*----------------------------------------------------------------------------
   4201 | Returns the result of subtracting the absolute values of the extended
   4202 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4203 | difference is negated before being returned.  `zSign' is ignored if the
   4204 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4205 | Standard for Binary Floating-Point Arithmetic.
   4206 *----------------------------------------------------------------------------*/
   4207 
   4208 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
   4209 {
   4210     int32 aExp, bExp, zExp;
   4211     uint64_t aSig, bSig, zSig0, zSig1;
   4212     int32 expDiff;
   4213     floatx80 z;
   4214 
   4215     aSig = extractFloatx80Frac( a );
   4216     aExp = extractFloatx80Exp( a );
   4217     bSig = extractFloatx80Frac( b );
   4218     bExp = extractFloatx80Exp( b );
   4219     expDiff = aExp - bExp;
   4220     if ( 0 < expDiff ) goto aExpBigger;
   4221     if ( expDiff < 0 ) goto bExpBigger;
   4222     if ( aExp == 0x7FFF ) {
   4223         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
   4224             return propagateFloatx80NaN( a, b STATUS_VAR );
   4225         }
   4226         float_raise( float_flag_invalid STATUS_VAR);
   4227         z.low = floatx80_default_nan_low;
   4228         z.high = floatx80_default_nan_high;
   4229         return z;
   4230     }
   4231     if ( aExp == 0 ) {
   4232         aExp = 1;
   4233         bExp = 1;
   4234     }
   4235     zSig1 = 0;
   4236     if ( bSig < aSig ) goto aBigger;
   4237     if ( aSig < bSig ) goto bBigger;
   4238     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   4239  bExpBigger:
   4240     if ( bExp == 0x7FFF ) {
   4241         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4242         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4243     }
   4244     if ( aExp == 0 ) ++expDiff;
   4245     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   4246  bBigger:
   4247     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   4248     zExp = bExp;
   4249     zSign ^= 1;
   4250     goto normalizeRoundAndPack;
   4251  aExpBigger:
   4252     if ( aExp == 0x7FFF ) {
   4253         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4254         return a;
   4255     }
   4256     if ( bExp == 0 ) --expDiff;
   4257     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   4258  aBigger:
   4259     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   4260     zExp = aExp;
   4261  normalizeRoundAndPack:
   4262     return
   4263         normalizeRoundAndPackFloatx80(
   4264             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4265 
   4266 }
   4267 
   4268 /*----------------------------------------------------------------------------
   4269 | Returns the result of adding the extended double-precision floating-point
   4270 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4271 | Standard for Binary Floating-Point Arithmetic.
   4272 *----------------------------------------------------------------------------*/
   4273 
   4274 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
   4275 {
   4276     flag aSign, bSign;
   4277 
   4278     aSign = extractFloatx80Sign( a );
   4279     bSign = extractFloatx80Sign( b );
   4280     if ( aSign == bSign ) {
   4281         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   4282     }
   4283     else {
   4284         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   4285     }
   4286 
   4287 }
   4288 
   4289 /*----------------------------------------------------------------------------
   4290 | Returns the result of subtracting the extended double-precision floating-
   4291 | point values `a' and `b'.  The operation is performed according to the
   4292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4293 *----------------------------------------------------------------------------*/
   4294 
   4295 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
   4296 {
   4297     flag aSign, bSign;
   4298 
   4299     aSign = extractFloatx80Sign( a );
   4300     bSign = extractFloatx80Sign( b );
   4301     if ( aSign == bSign ) {
   4302         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   4303     }
   4304     else {
   4305         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   4306     }
   4307 
   4308 }
   4309 
   4310 /*----------------------------------------------------------------------------
   4311 | Returns the result of multiplying the extended double-precision floating-
   4312 | point values `a' and `b'.  The operation is performed according to the
   4313 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4314 *----------------------------------------------------------------------------*/
   4315 
   4316 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
   4317 {
   4318     flag aSign, bSign, zSign;
   4319     int32 aExp, bExp, zExp;
   4320     uint64_t aSig, bSig, zSig0, zSig1;
   4321     floatx80 z;
   4322 
   4323     aSig = extractFloatx80Frac( a );
   4324     aExp = extractFloatx80Exp( a );
   4325     aSign = extractFloatx80Sign( a );
   4326     bSig = extractFloatx80Frac( b );
   4327     bExp = extractFloatx80Exp( b );
   4328     bSign = extractFloatx80Sign( b );
   4329     zSign = aSign ^ bSign;
   4330     if ( aExp == 0x7FFF ) {
   4331         if (    (uint64_t) ( aSig<<1 )
   4332              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
   4333             return propagateFloatx80NaN( a, b STATUS_VAR );
   4334         }
   4335         if ( ( bExp | bSig ) == 0 ) goto invalid;
   4336         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4337     }
   4338     if ( bExp == 0x7FFF ) {
   4339         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4340         if ( ( aExp | aSig ) == 0 ) {
   4341  invalid:
   4342             float_raise( float_flag_invalid STATUS_VAR);
   4343             z.low = floatx80_default_nan_low;
   4344             z.high = floatx80_default_nan_high;
   4345             return z;
   4346         }
   4347         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4348     }
   4349     if ( aExp == 0 ) {
   4350         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   4351         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   4352     }
   4353     if ( bExp == 0 ) {
   4354         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   4355         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4356     }
   4357     zExp = aExp + bExp - 0x3FFE;
   4358     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   4359     if ( 0 < (int64_t) zSig0 ) {
   4360         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   4361         --zExp;
   4362     }
   4363     return
   4364         roundAndPackFloatx80(
   4365             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4366 
   4367 }
   4368 
   4369 /*----------------------------------------------------------------------------
   4370 | Returns the result of dividing the extended double-precision floating-point
   4371 | value `a' by the corresponding value `b'.  The operation is performed
   4372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4373 *----------------------------------------------------------------------------*/
   4374 
   4375 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
   4376 {
   4377     flag aSign, bSign, zSign;
   4378     int32 aExp, bExp, zExp;
   4379     uint64_t aSig, bSig, zSig0, zSig1;
   4380     uint64_t rem0, rem1, rem2, term0, term1, term2;
   4381     floatx80 z;
   4382 
   4383     aSig = extractFloatx80Frac( a );
   4384     aExp = extractFloatx80Exp( a );
   4385     aSign = extractFloatx80Sign( a );
   4386     bSig = extractFloatx80Frac( b );
   4387     bExp = extractFloatx80Exp( b );
   4388     bSign = extractFloatx80Sign( b );
   4389     zSign = aSign ^ bSign;
   4390     if ( aExp == 0x7FFF ) {
   4391         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4392         if ( bExp == 0x7FFF ) {
   4393             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4394             goto invalid;
   4395         }
   4396         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4397     }
   4398     if ( bExp == 0x7FFF ) {
   4399         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4400         return packFloatx80( zSign, 0, 0 );
   4401     }
   4402     if ( bExp == 0 ) {
   4403         if ( bSig == 0 ) {
   4404             if ( ( aExp | aSig ) == 0 ) {
   4405  invalid:
   4406                 float_raise( float_flag_invalid STATUS_VAR);
   4407                 z.low = floatx80_default_nan_low;
   4408                 z.high = floatx80_default_nan_high;
   4409                 return z;
   4410             }
   4411             float_raise( float_flag_divbyzero STATUS_VAR);
   4412             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4413         }
   4414         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4415     }
   4416     if ( aExp == 0 ) {
   4417         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   4418         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   4419     }
   4420     zExp = aExp - bExp + 0x3FFE;
   4421     rem1 = 0;
   4422     if ( bSig <= aSig ) {
   4423         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   4424         ++zExp;
   4425     }
   4426     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   4427     mul64To128( bSig, zSig0, &term0, &term1 );
   4428     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   4429     while ( (int64_t) rem0 < 0 ) {
   4430         --zSig0;
   4431         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   4432     }
   4433     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   4434     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
   4435         mul64To128( bSig, zSig1, &term1, &term2 );
   4436         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4437         while ( (int64_t) rem1 < 0 ) {
   4438             --zSig1;
   4439             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   4440         }
   4441         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   4442     }
   4443     return
   4444         roundAndPackFloatx80(
   4445             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4446 
   4447 }
   4448 
   4449 /*----------------------------------------------------------------------------
   4450 | Returns the remainder of the extended double-precision floating-point value
   4451 | `a' with respect to the corresponding value `b'.  The operation is performed
   4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4453 *----------------------------------------------------------------------------*/
   4454 
   4455 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
   4456 {
   4457     flag aSign, zSign;
   4458     int32 aExp, bExp, expDiff;
   4459     uint64_t aSig0, aSig1, bSig;
   4460     uint64_t q, term0, term1, alternateASig0, alternateASig1;
   4461     floatx80 z;
   4462 
   4463     aSig0 = extractFloatx80Frac( a );
   4464     aExp = extractFloatx80Exp( a );
   4465     aSign = extractFloatx80Sign( a );
   4466     bSig = extractFloatx80Frac( b );
   4467     bExp = extractFloatx80Exp( b );
   4468     if ( aExp == 0x7FFF ) {
   4469         if (    (uint64_t) ( aSig0<<1 )
   4470              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
   4471             return propagateFloatx80NaN( a, b STATUS_VAR );
   4472         }
   4473         goto invalid;
   4474     }
   4475     if ( bExp == 0x7FFF ) {
   4476         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4477         return a;
   4478     }
   4479     if ( bExp == 0 ) {
   4480         if ( bSig == 0 ) {
   4481  invalid:
   4482             float_raise( float_flag_invalid STATUS_VAR);
   4483             z.low = floatx80_default_nan_low;
   4484             z.high = floatx80_default_nan_high;
   4485             return z;
   4486         }
   4487         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4488     }
   4489     if ( aExp == 0 ) {
   4490         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
   4491         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4492     }
   4493     bSig |= LIT64( 0x8000000000000000 );
   4494     zSign = aSign;
   4495     expDiff = aExp - bExp;
   4496     aSig1 = 0;
   4497     if ( expDiff < 0 ) {
   4498         if ( expDiff < -1 ) return a;
   4499         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   4500         expDiff = 0;
   4501     }
   4502     q = ( bSig <= aSig0 );
   4503     if ( q ) aSig0 -= bSig;
   4504     expDiff -= 64;
   4505     while ( 0 < expDiff ) {
   4506         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4507         q = ( 2 < q ) ? q - 2 : 0;
   4508         mul64To128( bSig, q, &term0, &term1 );
   4509         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4510         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   4511         expDiff -= 62;
   4512     }
   4513     expDiff += 64;
   4514     if ( 0 < expDiff ) {
   4515         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4516         q = ( 2 < q ) ? q - 2 : 0;
   4517         q >>= 64 - expDiff;
   4518         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   4519         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4520         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   4521         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   4522             ++q;
   4523             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4524         }
   4525     }
   4526     else {
   4527         term1 = 0;
   4528         term0 = bSig;
   4529     }
   4530     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   4531     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4532          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4533               && ( q & 1 ) )
   4534        ) {
   4535         aSig0 = alternateASig0;
   4536         aSig1 = alternateASig1;
   4537         zSign = ! zSign;
   4538     }
   4539     return
   4540         normalizeRoundAndPackFloatx80(
   4541             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
   4542 
   4543 }
   4544 
   4545 /*----------------------------------------------------------------------------
   4546 | Returns the square root of the extended double-precision floating-point
   4547 | value `a'.  The operation is performed according to the IEC/IEEE Standard
   4548 | for Binary Floating-Point Arithmetic.
   4549 *----------------------------------------------------------------------------*/
   4550 
   4551 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
   4552 {
   4553     flag aSign;
   4554     int32 aExp, zExp;
   4555     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   4556     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4557     floatx80 z;
   4558 
   4559     aSig0 = extractFloatx80Frac( a );
   4560     aExp = extractFloatx80Exp( a );
   4561     aSign = extractFloatx80Sign( a );
   4562     if ( aExp == 0x7FFF ) {
   4563         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
   4564         if ( ! aSign ) return a;
   4565         goto invalid;
   4566     }
   4567     if ( aSign ) {
   4568         if ( ( aExp | aSig0 ) == 0 ) return a;
   4569  invalid:
   4570         float_raise( float_flag_invalid STATUS_VAR);
   4571         z.low = floatx80_default_nan_low;
   4572         z.high = floatx80_default_nan_high;
   4573         return z;
   4574     }
   4575     if ( aExp == 0 ) {
   4576         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4577         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4578     }
   4579     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   4580     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4581     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4582     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4583     doubleZSig0 = zSig0<<1;
   4584     mul64To128( zSig0, zSig0, &term0, &term1 );
   4585     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4586     while ( (int64_t) rem0 < 0 ) {
   4587         --zSig0;
   4588         doubleZSig0 -= 2;
   4589         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4590     }
   4591     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   4592     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4593         if ( zSig1 == 0 ) zSig1 = 1;
   4594         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   4595         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4596         mul64To128( zSig1, zSig1, &term2, &term3 );
   4597         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   4598         while ( (int64_t) rem1 < 0 ) {
   4599             --zSig1;
   4600             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4601             term3 |= 1;
   4602             term2 |= doubleZSig0;
   4603             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4604         }
   4605         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4606     }
   4607     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4608     zSig0 |= doubleZSig0;
   4609     return
   4610         roundAndPackFloatx80(
   4611             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
   4612 
   4613 }
   4614 
   4615 /*----------------------------------------------------------------------------
   4616 | Returns 1 if the extended double-precision floating-point value `a' is equal
   4617 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4618 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4620 *----------------------------------------------------------------------------*/
   4621 
   4622 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
   4623 {
   4624 
   4625     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4626               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4627          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4628               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4629        ) {
   4630         float_raise( float_flag_invalid STATUS_VAR);
   4631         return 0;
   4632     }
   4633     return
   4634            ( a.low == b.low )
   4635         && (    ( a.high == b.high )
   4636              || (    ( a.low == 0 )
   4637                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   4638            );
   4639 
   4640 }
   4641 
   4642 /*----------------------------------------------------------------------------
   4643 | Returns 1 if the extended double-precision floating-point value `a' is
   4644 | less than or equal to the corresponding value `b', and 0 otherwise.  The
   4645 | invalid exception is raised if either operand is a NaN.  The comparison is
   4646 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   4647 | Arithmetic.
   4648 *----------------------------------------------------------------------------*/
   4649 
   4650 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
   4651 {
   4652     flag aSign, bSign;
   4653 
   4654     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4655               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4656          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4657               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4658        ) {
   4659         float_raise( float_flag_invalid STATUS_VAR);
   4660         return 0;
   4661     }
   4662     aSign = extractFloatx80Sign( a );
   4663     bSign = extractFloatx80Sign( b );
   4664     if ( aSign != bSign ) {
   4665         return
   4666                aSign
   4667             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4668                  == 0 );
   4669     }
   4670     return
   4671           aSign ? le128( b.high, b.low, a.high, a.low )
   4672         : le128( a.high, a.low, b.high, b.low );
   4673 
   4674 }
   4675 
   4676 /*----------------------------------------------------------------------------
   4677 | Returns 1 if the extended double-precision floating-point value `a' is
   4678 | less than the corresponding value `b', and 0 otherwise.  The invalid
   4679 | exception is raised if either operand is a NaN.  The comparison is performed
   4680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4681 *----------------------------------------------------------------------------*/
   4682 
   4683 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
   4684 {
   4685     flag aSign, bSign;
   4686 
   4687     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4688               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4689          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4690               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4691        ) {
   4692         float_raise( float_flag_invalid STATUS_VAR);
   4693         return 0;
   4694     }
   4695     aSign = extractFloatx80Sign( a );
   4696     bSign = extractFloatx80Sign( b );
   4697     if ( aSign != bSign ) {
   4698         return
   4699                aSign
   4700             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4701                  != 0 );
   4702     }
   4703     return
   4704           aSign ? lt128( b.high, b.low, a.high, a.low )
   4705         : lt128( a.high, a.low, b.high, b.low );
   4706 
   4707 }
   4708 
   4709 /*----------------------------------------------------------------------------
   4710 | Returns 1 if the extended double-precision floating-point values `a' and `b'
   4711 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
   4712 | either operand is a NaN.   The comparison is performed according to the
   4713 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4714 *----------------------------------------------------------------------------*/
   4715 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
   4716 {
   4717     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4718               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4719          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4720               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4721        ) {
   4722         float_raise( float_flag_invalid STATUS_VAR);
   4723         return 1;
   4724     }
   4725     return 0;
   4726 }
   4727 
   4728 /*----------------------------------------------------------------------------
   4729 | Returns 1 if the extended double-precision floating-point value `a' is
   4730 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   4731 | cause an exception.  The comparison is performed according to the IEC/IEEE
   4732 | Standard for Binary Floating-Point Arithmetic.
   4733 *----------------------------------------------------------------------------*/
   4734 
   4735 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4736 {
   4737 
   4738     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4739               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4740          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4741               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4742        ) {
   4743         if (    floatx80_is_signaling_nan( a )
   4744              || floatx80_is_signaling_nan( b ) ) {
   4745             float_raise( float_flag_invalid STATUS_VAR);
   4746         }
   4747         return 0;
   4748     }
   4749     return
   4750            ( a.low == b.low )
   4751         && (    ( a.high == b.high )
   4752              || (    ( a.low == 0 )
   4753                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   4754            );
   4755 
   4756 }
   4757 
   4758 /*----------------------------------------------------------------------------
   4759 | Returns 1 if the extended double-precision floating-point value `a' is less
   4760 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4761 | do not cause an exception.  Otherwise, the comparison is performed according
   4762 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4763 *----------------------------------------------------------------------------*/
   4764 
   4765 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4766 {
   4767     flag aSign, bSign;
   4768 
   4769     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4770               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4771          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4772               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4773        ) {
   4774         if (    floatx80_is_signaling_nan( a )
   4775              || floatx80_is_signaling_nan( b ) ) {
   4776             float_raise( float_flag_invalid STATUS_VAR);
   4777         }
   4778         return 0;
   4779     }
   4780     aSign = extractFloatx80Sign( a );
   4781     bSign = extractFloatx80Sign( b );
   4782     if ( aSign != bSign ) {
   4783         return
   4784                aSign
   4785             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4786                  == 0 );
   4787     }
   4788     return
   4789           aSign ? le128( b.high, b.low, a.high, a.low )
   4790         : le128( a.high, a.low, b.high, b.low );
   4791 
   4792 }
   4793 
   4794 /*----------------------------------------------------------------------------
   4795 | Returns 1 if the extended double-precision floating-point value `a' is less
   4796 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4797 | an exception.  Otherwise, the comparison is performed according to the
   4798 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4799 *----------------------------------------------------------------------------*/
   4800 
   4801 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4802 {
   4803     flag aSign, bSign;
   4804 
   4805     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4806               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4807          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4808               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4809        ) {
   4810         if (    floatx80_is_signaling_nan( a )
   4811              || floatx80_is_signaling_nan( b ) ) {
   4812             float_raise( float_flag_invalid STATUS_VAR);
   4813         }
   4814         return 0;
   4815     }
   4816     aSign = extractFloatx80Sign( a );
   4817     bSign = extractFloatx80Sign( b );
   4818     if ( aSign != bSign ) {
   4819         return
   4820                aSign
   4821             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4822                  != 0 );
   4823     }
   4824     return
   4825           aSign ? lt128( b.high, b.low, a.high, a.low )
   4826         : lt128( a.high, a.low, b.high, b.low );
   4827 
   4828 }
   4829 
   4830 /*----------------------------------------------------------------------------
   4831 | Returns 1 if the extended double-precision floating-point values `a' and `b'
   4832 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
   4833 | The comparison is performed according to the IEC/IEEE Standard for Binary
   4834 | Floating-Point Arithmetic.
   4835 *----------------------------------------------------------------------------*/
   4836 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4837 {
   4838     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4839               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   4840          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4841               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   4842        ) {
   4843         if (    floatx80_is_signaling_nan( a )
   4844              || floatx80_is_signaling_nan( b ) ) {
   4845             float_raise( float_flag_invalid STATUS_VAR);
   4846         }
   4847         return 1;
   4848     }
   4849     return 0;
   4850 }
   4851 
   4852 #endif
   4853 
   4854 #ifdef FLOAT128
   4855 
   4856 /*----------------------------------------------------------------------------
   4857 | Returns the result of converting the quadruple-precision floating-point
   4858 | value `a' to the 32-bit two's complement integer format.  The conversion
   4859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4860 | Arithmetic---which means in particular that the conversion is rounded
   4861 | according to the current rounding mode.  If `a' is a NaN, the largest
   4862 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4863 | largest integer with the same sign as `a' is returned.
   4864 *----------------------------------------------------------------------------*/
   4865 
   4866 int32 float128_to_int32( float128 a STATUS_PARAM )
   4867 {
   4868     flag aSign;
   4869     int32 aExp, shiftCount;
   4870     uint64_t aSig0, aSig1;
   4871 
   4872     aSig1 = extractFloat128Frac1( a );
   4873     aSig0 = extractFloat128Frac0( a );
   4874     aExp = extractFloat128Exp( a );
   4875     aSign = extractFloat128Sign( a );
   4876     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4877     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4878     aSig0 |= ( aSig1 != 0 );
   4879     shiftCount = 0x4028 - aExp;
   4880     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4881     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
   4882 
   4883 }
   4884 
   4885 /*----------------------------------------------------------------------------
   4886 | Returns the result of converting the quadruple-precision floating-point
   4887 | value `a' to the 32-bit two's complement integer format.  The conversion
   4888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4889 | Arithmetic, except that the conversion is always rounded toward zero.  If
   4890 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4891 | conversion overflows, the largest integer with the same sign as `a' is
   4892 | returned.
   4893 *----------------------------------------------------------------------------*/
   4894 
   4895 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
   4896 {
   4897     flag aSign;
   4898     int32 aExp, shiftCount;
   4899     uint64_t aSig0, aSig1, savedASig;
   4900     int32 z;
   4901 
   4902     aSig1 = extractFloat128Frac1( a );
   4903     aSig0 = extractFloat128Frac0( a );
   4904     aExp = extractFloat128Exp( a );
   4905     aSign = extractFloat128Sign( a );
   4906     aSig0 |= ( aSig1 != 0 );
   4907     if ( 0x401E < aExp ) {
   4908         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4909         goto invalid;
   4910     }
   4911     else if ( aExp < 0x3FFF ) {
   4912         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
   4913         return 0;
   4914     }
   4915     aSig0 |= LIT64( 0x0001000000000000 );
   4916     shiftCount = 0x402F - aExp;
   4917     savedASig = aSig0;
   4918     aSig0 >>= shiftCount;
   4919     z = aSig0;
   4920     if ( aSign ) z = - z;
   4921     if ( ( z < 0 ) ^ aSign ) {
   4922  invalid:
   4923         float_raise( float_flag_invalid STATUS_VAR);
   4924         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
   4925     }
   4926     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4927         STATUS(float_exception_flags) |= float_flag_inexact;
   4928     }
   4929     return z;
   4930 
   4931 }
   4932 
   4933 /*----------------------------------------------------------------------------
   4934 | Returns the result of converting the quadruple-precision floating-point
   4935 | value `a' to the 64-bit two's complement integer format.  The conversion
   4936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4937 | Arithmetic---which means in particular that the conversion is rounded
   4938 | according to the current rounding mode.  If `a' is a NaN, the largest
   4939 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4940 | largest integer with the same sign as `a' is returned.
   4941 *----------------------------------------------------------------------------*/
   4942 
   4943 int64 float128_to_int64( float128 a STATUS_PARAM )
   4944 {
   4945     flag aSign;
   4946     int32 aExp, shiftCount;
   4947     uint64_t aSig0, aSig1;
   4948 
   4949     aSig1 = extractFloat128Frac1( a );
   4950     aSig0 = extractFloat128Frac0( a );
   4951     aExp = extractFloat128Exp( a );
   4952     aSign = extractFloat128Sign( a );
   4953     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4954     shiftCount = 0x402F - aExp;
   4955     if ( shiftCount <= 0 ) {
   4956         if ( 0x403E < aExp ) {
   4957             float_raise( float_flag_invalid STATUS_VAR);
   4958             if (    ! aSign
   4959                  || (    ( aExp == 0x7FFF )
   4960                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4961                     )
   4962                ) {
   4963                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4964             }
   4965             return (int64_t) LIT64( 0x8000000000000000 );
   4966         }
   4967         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4968     }
   4969     else {
   4970         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4971     }
   4972     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
   4973 
   4974 }
   4975 
   4976 /*----------------------------------------------------------------------------
   4977 | Returns the result of converting the quadruple-precision floating-point
   4978 | value `a' to the 64-bit two's complement integer format.  The conversion
   4979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4980 | Arithmetic, except that the conversion is always rounded toward zero.
   4981 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4982 | the conversion overflows, the largest integer with the same sign as `a' is
   4983 | returned.
   4984 *----------------------------------------------------------------------------*/
   4985 
   4986 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
   4987 {
   4988     flag aSign;
   4989     int32 aExp, shiftCount;
   4990     uint64_t aSig0, aSig1;
   4991     int64 z;
   4992 
   4993     aSig1 = extractFloat128Frac1( a );
   4994     aSig0 = extractFloat128Frac0( a );
   4995     aExp = extractFloat128Exp( a );
   4996     aSign = extractFloat128Sign( a );
   4997     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4998     shiftCount = aExp - 0x402F;
   4999     if ( 0 < shiftCount ) {
   5000         if ( 0x403E <= aExp ) {
   5001             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   5002             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   5003                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   5004                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
   5005             }
   5006             else {
   5007                 float_raise( float_flag_invalid STATUS_VAR);
   5008                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   5009                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   5010                 }
   5011             }
   5012             return (int64_t) LIT64( 0x8000000000000000 );
   5013         }
   5014         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   5015         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
   5016             STATUS(float_exception_flags) |= float_flag_inexact;
   5017         }
   5018     }
   5019     else {
   5020         if ( aExp < 0x3FFF ) {
   5021             if ( aExp | aSig0 | aSig1 ) {
   5022                 STATUS(float_exception_flags) |= float_flag_inexact;
   5023             }
   5024             return 0;
   5025         }
   5026         z = aSig0>>( - shiftCount );
   5027         if (    aSig1
   5028              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   5029             STATUS(float_exception_flags) |= float_flag_inexact;
   5030         }
   5031     }
   5032     if ( aSign ) z = - z;
   5033     return z;
   5034 
   5035 }
   5036 
   5037 /*----------------------------------------------------------------------------
   5038 | Returns the result of converting the quadruple-precision floating-point
   5039 | value `a' to the single-precision floating-point format.  The conversion
   5040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5041 | Arithmetic.
   5042 *----------------------------------------------------------------------------*/
   5043 
   5044 float32 float128_to_float32( float128 a STATUS_PARAM )
   5045 {
   5046     flag aSign;
   5047     int32 aExp;
   5048     uint64_t aSig0, aSig1;
   5049     uint32_t zSig;
   5050 
   5051     aSig1 = extractFloat128Frac1( a );
   5052     aSig0 = extractFloat128Frac0( a );
   5053     aExp = extractFloat128Exp( a );
   5054     aSign = extractFloat128Sign( a );
   5055     if ( aExp == 0x7FFF ) {
   5056         if ( aSig0 | aSig1 ) {
   5057             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   5058         }
   5059         return packFloat32( aSign, 0xFF, 0 );
   5060     }
   5061     aSig0 |= ( aSig1 != 0 );
   5062     shift64RightJamming( aSig0, 18, &aSig0 );
   5063     zSig = aSig0;
   5064     if ( aExp || zSig ) {
   5065         zSig |= 0x40000000;
   5066         aExp -= 0x3F81;
   5067     }
   5068     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   5069 
   5070 }
   5071 
   5072 /*----------------------------------------------------------------------------
   5073 | Returns the result of converting the quadruple-precision floating-point
   5074 | value `a' to the double-precision floating-point format.  The conversion
   5075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5076 | Arithmetic.
   5077 *----------------------------------------------------------------------------*/
   5078 
   5079 float64 float128_to_float64( float128 a STATUS_PARAM )
   5080 {
   5081     flag aSign;
   5082     int32 aExp;
   5083     uint64_t aSig0, aSig1;
   5084 
   5085     aSig1 = extractFloat128Frac1( a );
   5086     aSig0 = extractFloat128Frac0( a );
   5087     aExp = extractFloat128Exp( a );
   5088     aSign = extractFloat128Sign( a );
   5089     if ( aExp == 0x7FFF ) {
   5090         if ( aSig0 | aSig1 ) {
   5091             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   5092         }
   5093         return packFloat64( aSign, 0x7FF, 0 );
   5094     }
   5095     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   5096     aSig0 |= ( aSig1 != 0 );
   5097     if ( aExp || aSig0 ) {
   5098         aSig0 |= LIT64( 0x4000000000000000 );
   5099         aExp -= 0x3C01;
   5100     }
   5101     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
   5102 
   5103 }
   5104 
   5105 #ifdef FLOATX80
   5106 
   5107 /*----------------------------------------------------------------------------
   5108 | Returns the result of converting the quadruple-precision floating-point
   5109 | value `a' to the extended double-precision floating-point format.  The
   5110 | conversion is performed according to the IEC/IEEE Standard for Binary
   5111 | Floating-Point Arithmetic.
   5112 *----------------------------------------------------------------------------*/
   5113 
   5114 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
   5115 {
   5116     flag aSign;
   5117     int32 aExp;
   5118     uint64_t aSig0, aSig1;
   5119 
   5120     aSig1 = extractFloat128Frac1( a );
   5121     aSig0 = extractFloat128Frac0( a );
   5122     aExp = extractFloat128Exp( a );
   5123     aSign = extractFloat128Sign( a );
   5124     if ( aExp == 0x7FFF ) {
   5125         if ( aSig0 | aSig1 ) {
   5126             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   5127         }
   5128         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   5129     }
   5130     if ( aExp == 0 ) {
   5131         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   5132         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5133     }
   5134     else {
   5135         aSig0 |= LIT64( 0x0001000000000000 );
   5136     }
   5137     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   5138     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
   5139 
   5140 }
   5141 
   5142 #endif
   5143 
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The rounding direction is read from STATUS(float_rounding_mode).  The
| inexact flag is set whenever the returned value differs from `a'.  A NaN
| input is passed through propagateFloat128NaN; any other input whose
| exponent field is 0x406F or larger is returned unchanged.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    uint64_t lastBitMask, roundBitsMask;
    int8 roundingMode;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* In this range only bits of the low word are ever masked off;
         * carries from rounding reach the high word through add128. */
        if ( 0x406F <= aExp ) {
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN( a, a STATUS_VAR );
            }
            return a;
        }
        /* The shift is split in two so that aExp == 0x402F (a total shift
         * of 64) produces lastBitMask == 0 instead of an out-of-range
         * shift count; that case is handled separately below. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            if ( lastBitMask ) {
                /* Add half of the last kept bit.  If the bits to be
                 * discarded are then all zero the input was exactly
                 * halfway, so the last kept bit is cleared. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the whole low word is discarded, so
                 * its top bit decides whether the high word is bumped and
                 * the remaining low bits detect the exact-half case. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            /* True for a negative value in a mode other than round_up, or
             * a non-negative value in round_up: add all discarded-bit
             * positions so any nonzero discarded bit carries upward. */
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
            }
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Exponent field below 0x3FFF: the result is built directly
             * as a zero or a value with exponent field 0x3FFF and zero
             * fraction.  An input with every non-sign bit clear is
             * returned as is. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            STATUS(float_exception_flags) |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch ( STATUS(float_rounding_mode) ) {
             case float_round_nearest_even:
                /* Only exponent 0x3FFE with a nonzero fraction rounds
                 * away from zero; everything else falls to the signed
                 * zero returned after the switch. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 0x3FFF <= aExp < 0x402F: the last kept bit lies in the high
         * word, and the result's low word is always zero. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            z.high += lastBitMask>>1;
            /* Exact-half test must also see the discarded low word. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                /* Fold a nonzero low word into bit 0 so that it takes
                 * part in the carry produced by adding roundBitsMask. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the bit pattern means rounding discarded something. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;

}
   5246 
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent field is shifted right by the
| exponent difference, with the shifted-out part collected in `zSig2', before
| the two significands are added.  When both exponent fields are zero the sum
| is packed directly, or replaced by a signed zero if STATUS(flush_to_zero)
| is set.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32 expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent field. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return a;
        }
        /* A zero exponent field on `b' shortens the shift by one instead
         * of setting the 0x0001000000000000 bit. */
        if ( bExp == 0 ) {
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent field; mirror image of the above. */
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponent fields: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN( a, b STATUS_VAR );
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: the raw fraction sum is the
             * result.  With flush_to_zero set it is replaced by a signed
             * zero, raising float_flag_output_denormal if it was nonzero. */
            if (STATUS(flush_to_zero)) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal STATUS_VAR);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* NOTE(review): 0x0002000000000000 is twice the single-operand
         * bit used above - presumably the sum of both operands' implicit
         * leading bits; the sum is then always shifted right by one. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Reached only from the two unequal-exponent branches.
     * NOTE(review): presumably this supplies the implicit leading bit of
     * the larger-exponent operand, which was not set above; it is OR-ed
     * into aSig0 in both cases because only the sum is used afterwards. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /* Tentatively lower the exponent; it is restored below when the sum
     * reaches 0x0002000000000000 and has to be shifted right by one. */
    --zExp;
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
   5332 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Both fractions are first shifted left by 14 bits; that offset is removed
| again from the exponent passed to normalizeRoundAndPackFloat128.  The
| smaller magnitude is always subtracted from the larger one, and `zSign' is
| inverted when `b' is the larger.  Subtracting two infinities raises the
| invalid exception and returns the default NaN; an exactly zero difference
| is negative only when the rounding mode is float_round_down.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32 expDiff;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponent fields from here until the bExpBigger label. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* Both operands are infinities. */
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Compare the significands word by word to find the larger one. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Magnitudes are identical: the difference is a zero whose sign bit
     * is set only in float_round_down mode. */
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    /* A zero exponent field on `a' shortens the alignment shift by one
     * instead of setting the 0x4000000000000000 bit. */
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* `b' dominates, so the sign of the result is inverted. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The extra -14 removes the 14-bit left shift applied on entry. */
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

}
   5416 
   5417 /*----------------------------------------------------------------------------
   5418 | Returns the result of adding the quadruple-precision floating-point values
   5419 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   5420 | for Binary Floating-Point Arithmetic.
   5421 *----------------------------------------------------------------------------*/
   5422 
   5423 float128 float128_add( float128 a, float128 b STATUS_PARAM )
   5424 {
   5425     flag aSign, bSign;
   5426 
   5427     aSign = extractFloat128Sign( a );
   5428     bSign = extractFloat128Sign( b );
   5429     if ( aSign == bSign ) {
   5430         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   5431     }
   5432     else {
   5433         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   5434     }
   5435 
   5436 }
   5437 
   5438 /*----------------------------------------------------------------------------
   5439 | Returns the result of subtracting the quadruple-precision floating-point
   5440 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   5441 | Standard for Binary Floating-Point Arithmetic.
   5442 *----------------------------------------------------------------------------*/
   5443 
   5444 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
   5445 {
   5446     flag aSign, bSign;
   5447 
   5448     aSign = extractFloat128Sign( a );
   5449     bSign = extractFloat128Sign( b );
   5450     if ( aSign == bSign ) {
   5451         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   5452     }
   5453     else {
   5454         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   5455     }
   5456 
   5457 }
   5458 
   5459 /*----------------------------------------------------------------------------
   5460 | Returns the result of multiplying the quadruple-precision floating-point
   5461 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   5462 | Standard for Binary Floating-Point Arithmetic.
   5463 *----------------------------------------------------------------------------*/
   5464 
   5465 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
   5466 {
   5467     flag aSign, bSign, zSign;
   5468     int32 aExp, bExp, zExp;
   5469     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   5470     float128 z;
   5471 
   5472     aSig1 = extractFloat128Frac1( a );
   5473     aSig0 = extractFloat128Frac0( a );
   5474     aExp = extractFloat128Exp( a );
   5475     aSign = extractFloat128Sign( a );
   5476     bSig1 = extractFloat128Frac1( b );
   5477     bSig0 = extractFloat128Frac0( b );
   5478     bExp = extractFloat128Exp( b );
   5479     bSign = extractFloat128Sign( b );
   5480     zSign = aSign ^ bSign;
   5481     if ( aExp == 0x7FFF ) {
   5482         if (    ( aSig0 | aSig1 )
   5483              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5484             return propagateFloat128NaN( a, b STATUS_VAR );
   5485         }
   5486         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   5487         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5488     }
   5489     if ( bExp == 0x7FFF ) {
   5490         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5491         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   5492  invalid:
   5493             float_raise( float_flag_invalid STATUS_VAR);
   5494             z.low = float128_default_nan_low;
   5495             z.high = float128_default_nan_high;
   5496             return z;
   5497         }
   5498         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5499     }
   5500     if ( aExp == 0 ) {
   5501         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5502         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5503     }
   5504     if ( bExp == 0 ) {
   5505         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5506         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5507     }
   5508     zExp = aExp + bExp - 0x4000;
   5509     aSig0 |= LIT64( 0x0001000000000000 );
   5510     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   5511     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   5512     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
   5513     zSig2 |= ( zSig3 != 0 );
   5514     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   5515         shift128ExtraRightJamming(
   5516             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   5517         ++zExp;
   5518     }
   5519     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   5520 
   5521 }
   5522 
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the division proper: a NaN operand is
| propagated; infinity / infinity and zero / zero raise the invalid exception
| and return the default NaN; a nonzero finite value divided by zero raises
| the divide-by-zero exception and returns a signed infinity; a finite value
| divided by infinity, or a zero dividend, returns a signed zero.
*----------------------------------------------------------------------------*/

float128 float128_div( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        /* finite / infinity */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero: zero / zero is invalid, anything else is
             * a divide-by-zero producing a signed infinity. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise( float_flag_invalid STATUS_VAR);
                z.low = float128_default_nan_low;
                z.high = float128_default_nan_high;
                return z;
            }
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the 0x0001000000000000 bit on both significands and move them
     * up by 15 bits. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* If the divisor significand does not exceed the dividend's, halve
     * the dividend and compensate in the exponent. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: start from an estimate, then step it down
     * until the remainder is no longer negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits.  The exact remainder is only worked out
     * when the low 14 bits of the estimate are 4 or less.
     * NOTE(review): presumably those are the only estimates close enough
     * to a rounding boundary for the estimate's error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* A nonzero final remainder is recorded in bit 0. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
   5606 
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the reduction: a NaN operand is propagated; an
| infinite `a' or a zero `b' raises the invalid exception and returns the
| default NaN; an infinite `b' or a zero `a' returns `a' unchanged, as does
| an `a' whose exponent is at least two below that of `b'.
*----------------------------------------------------------------------------*/

float128 float128_rem( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, zSign;
    int32 aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* `a' is an infinity and `b' is not a NaN. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise( float_flag_invalid STATUS_VAR);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /* Set the 0x0001000000000000 bit on both significands and shift them
     * left by 15; `a' is shifted by only 14 when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract `b' once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Each pass reduces the remaining exponent difference by 61.  The
     * quotient estimate is lowered by 4 (clamped at 0).
     * NOTE(review): presumably so that it never exceeds the true quotient
     * digit - confirm against estimateDiv128To64's error bound. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits that are still
         * needed, then rescale both significands before subtracting. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Keep subtracting `b' until the remainder turns negative; the last
     * non-negative remainder is remembered in alternateASig0/1 and q
     * counts the subtractions. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the negative remainder and the last
     * non-negative one.  The non-negative one is chosen when that sum is
     * negative, or when it is zero and q is odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and its sign folded into the
     * result's sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

}
   5715 
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled before the root extraction: a NaN is propagated;
| positive infinity and zeros of either sign are returned unchanged (a
| positive zero is repacked); any other negative input, including negative
| infinity, raises the invalid exception and returns the default NaN.
*----------------------------------------------------------------------------*/

float128 float128_sqrt( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
        if ( ! aSign ) return a;
        /* negative infinity */
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; any other negative value is
         * invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the exponent with the 0x3FFF offset removed, then re-offset
     * by 0x3FFE. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Build the upper 64 root bits from a 32-bit estimate refined by one
     * division step.  The significand shift is 13, or 12 when the
     * exponent field is odd. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    /* Step the root down until a - zSig0^2 is no longer negative; the
     * amount added back each time is 2*zSig0 + 1 for the new zSig0. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 root bits.  The exact remainder is only worked out when
     * the low 13 bits of the estimate are 5 or less.
     * NOTE(review): presumably those are the only estimates close enough
     * to a rounding boundary for the estimate's error to matter. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A nonzero final remainder is recorded in bit 0. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
   5784 
   5785 /*----------------------------------------------------------------------------
   5786 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5787 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   5788 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5790 *----------------------------------------------------------------------------*/
   5791 
   5792 int float128_eq( float128 a, float128 b STATUS_PARAM )
   5793 {
   5794 
   5795     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5796               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5797          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5798               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5799        ) {
   5800         float_raise( float_flag_invalid STATUS_VAR);
   5801         return 0;
   5802     }
   5803     return
   5804            ( a.low == b.low )
   5805         && (    ( a.high == b.high )
   5806              || (    ( a.low == 0 )
   5807                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   5808            );
   5809 
   5810 }
   5811 
   5812 /*----------------------------------------------------------------------------
   5813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5814 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
   5815 | exception is raised if either operand is a NaN.  The comparison is performed
   5816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5817 *----------------------------------------------------------------------------*/
   5818 
   5819 int float128_le( float128 a, float128 b STATUS_PARAM )
   5820 {
   5821     flag aSign, bSign;
   5822 
   5823     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5824               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5825          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5826               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5827        ) {
   5828         float_raise( float_flag_invalid STATUS_VAR);
   5829         return 0;
   5830     }
   5831     aSign = extractFloat128Sign( a );
   5832     bSign = extractFloat128Sign( b );
   5833     if ( aSign != bSign ) {
   5834         return
   5835                aSign
   5836             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5837                  == 0 );
   5838     }
   5839     return
   5840           aSign ? le128( b.high, b.low, a.high, a.low )
   5841         : le128( a.high, a.low, b.high, b.low );
   5842 
   5843 }
   5844 
   5845 /*----------------------------------------------------------------------------
   5846 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5847 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   5848 | raised if either operand is a NaN.  The comparison is performed according
   5849 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5850 *----------------------------------------------------------------------------*/
   5851 
   5852 int float128_lt( float128 a, float128 b STATUS_PARAM )
   5853 {
   5854     flag aSign, bSign;
   5855 
   5856     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5857               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5858          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5859               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5860        ) {
   5861         float_raise( float_flag_invalid STATUS_VAR);
   5862         return 0;
   5863     }
   5864     aSign = extractFloat128Sign( a );
   5865     bSign = extractFloat128Sign( b );
   5866     if ( aSign != bSign ) {
   5867         return
   5868                aSign
   5869             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5870                  != 0 );
   5871     }
   5872     return
   5873           aSign ? lt128( b.high, b.low, a.high, a.low )
   5874         : lt128( a.high, a.low, b.high, b.low );
   5875 
   5876 }
   5877 
   5878 /*----------------------------------------------------------------------------
   5879 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
   5880 | be compared, and 0 otherwise.  The invalid exception is raised if either
   5881 | operand is a NaN. The comparison is performed according to the IEC/IEEE
   5882 | Standard for Binary Floating-Point Arithmetic.
   5883 *----------------------------------------------------------------------------*/
   5884 
   5885 int float128_unordered( float128 a, float128 b STATUS_PARAM )
   5886 {
   5887     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5888               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5889          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5890               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5891        ) {
   5892         float_raise( float_flag_invalid STATUS_VAR);
   5893         return 1;
   5894     }
   5895     return 0;
   5896 }
   5897 
   5898 /*----------------------------------------------------------------------------
   5899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5900 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5901 | exception.  The comparison is performed according to the IEC/IEEE Standard
   5902 | for Binary Floating-Point Arithmetic.
   5903 *----------------------------------------------------------------------------*/
   5904 
   5905 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
   5906 {
   5907 
   5908     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5909               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5910          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5911               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5912        ) {
   5913         if (    float128_is_signaling_nan( a )
   5914              || float128_is_signaling_nan( b ) ) {
   5915             float_raise( float_flag_invalid STATUS_VAR);
   5916         }
   5917         return 0;
   5918     }
   5919     return
   5920            ( a.low == b.low )
   5921         && (    ( a.high == b.high )
   5922              || (    ( a.low == 0 )
   5923                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   5924            );
   5925 
   5926 }
   5927 
   5928 /*----------------------------------------------------------------------------
   5929 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5930 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5931 | cause an exception.  Otherwise, the comparison is performed according to the
   5932 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5933 *----------------------------------------------------------------------------*/
   5934 
   5935 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
   5936 {
   5937     flag aSign, bSign;
   5938 
   5939     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5940               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5941          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5942               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5943        ) {
   5944         if (    float128_is_signaling_nan( a )
   5945              || float128_is_signaling_nan( b ) ) {
   5946             float_raise( float_flag_invalid STATUS_VAR);
   5947         }
   5948         return 0;
   5949     }
   5950     aSign = extractFloat128Sign( a );
   5951     bSign = extractFloat128Sign( b );
   5952     if ( aSign != bSign ) {
   5953         return
   5954                aSign
   5955             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5956                  == 0 );
   5957     }
   5958     return
   5959           aSign ? le128( b.high, b.low, a.high, a.low )
   5960         : le128( a.high, a.low, b.high, b.low );
   5961 
   5962 }
   5963 
   5964 /*----------------------------------------------------------------------------
   5965 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5966 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5967 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5968 | Standard for Binary Floating-Point Arithmetic.
   5969 *----------------------------------------------------------------------------*/
   5970 
   5971 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
   5972 {
   5973     flag aSign, bSign;
   5974 
   5975     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5976               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5977          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5978               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5979        ) {
   5980         if (    float128_is_signaling_nan( a )
   5981              || float128_is_signaling_nan( b ) ) {
   5982             float_raise( float_flag_invalid STATUS_VAR);
   5983         }
   5984         return 0;
   5985     }
   5986     aSign = extractFloat128Sign( a );
   5987     bSign = extractFloat128Sign( b );
   5988     if ( aSign != bSign ) {
   5989         return
   5990                aSign
   5991             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5992                  != 0 );
   5993     }
   5994     return
   5995           aSign ? lt128( b.high, b.low, a.high, a.low )
   5996         : lt128( a.high, a.low, b.high, b.low );
   5997 
   5998 }
   5999 
   6000 /*----------------------------------------------------------------------------
   6001 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
   6002 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
   6003 | comparison is performed according to the IEC/IEEE Standard for Binary
   6004 | Floating-Point Arithmetic.
   6005 *----------------------------------------------------------------------------*/
   6006 
   6007 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
   6008 {
   6009     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6010               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6011          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6012               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6013        ) {
   6014         if (    float128_is_signaling_nan( a )
   6015              || float128_is_signaling_nan( b ) ) {
   6016             float_raise( float_flag_invalid STATUS_VAR);
   6017         }
   6018         return 1;
   6019     }
   6020     return 0;
   6021 }
   6022 
   6023 #endif
   6024 
   6025 /* misc functions */
   6026 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
   6027 {
   6028     return int64_to_float32(a STATUS_VAR);
   6029 }
   6030 
   6031 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
   6032 {
   6033     return int64_to_float64(a STATUS_VAR);
   6034 }
   6035 
   6036 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
   6037 {
   6038     int64_t v;
   6039     unsigned int res;
   6040 
   6041     v = float32_to_int64(a STATUS_VAR);
   6042     if (v < 0) {
   6043         res = 0;
   6044         float_raise( float_flag_invalid STATUS_VAR);
   6045     } else if (v > 0xffffffff) {
   6046         res = 0xffffffff;
   6047         float_raise( float_flag_invalid STATUS_VAR);
   6048     } else {
   6049         res = v;
   6050     }
   6051     return res;
   6052 }
   6053 
   6054 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
   6055 {
   6056     int64_t v;
   6057     unsigned int res;
   6058 
   6059     v = float32_to_int64_round_to_zero(a STATUS_VAR);
   6060     if (v < 0) {
   6061         res = 0;
   6062         float_raise( float_flag_invalid STATUS_VAR);
   6063     } else if (v > 0xffffffff) {
   6064         res = 0xffffffff;
   6065         float_raise( float_flag_invalid STATUS_VAR);
   6066     } else {
   6067         res = v;
   6068     }
   6069     return res;
   6070 }
   6071 
   6072 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
   6073 {
   6074     int64_t v;
   6075     unsigned int res;
   6076 
   6077     v = float32_to_int64_round_to_zero(a STATUS_VAR);
   6078     if (v < 0) {
   6079         res = 0;
   6080         float_raise( float_flag_invalid STATUS_VAR);
   6081     } else if (v > 0xffff) {
   6082         res = 0xffff;
   6083         float_raise( float_flag_invalid STATUS_VAR);
   6084     } else {
   6085         res = v;
   6086     }
   6087     return res;
   6088 }
   6089 
   6090 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
   6091 {
   6092     int64_t v;
   6093     unsigned int res;
   6094 
   6095     v = float64_to_int64(a STATUS_VAR);
   6096     if (v < 0) {
   6097         res = 0;
   6098         float_raise( float_flag_invalid STATUS_VAR);
   6099     } else if (v > 0xffffffff) {
   6100         res = 0xffffffff;
   6101         float_raise( float_flag_invalid STATUS_VAR);
   6102     } else {
   6103         res = v;
   6104     }
   6105     return res;
   6106 }
   6107 
   6108 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
   6109 {
   6110     int64_t v;
   6111     unsigned int res;
   6112 
   6113     v = float64_to_int64_round_to_zero(a STATUS_VAR);
   6114     if (v < 0) {
   6115         res = 0;
   6116         float_raise( float_flag_invalid STATUS_VAR);
   6117     } else if (v > 0xffffffff) {
   6118         res = 0xffffffff;
   6119         float_raise( float_flag_invalid STATUS_VAR);
   6120     } else {
   6121         res = v;
   6122     }
   6123     return res;
   6124 }
   6125 
   6126 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
   6127 {
   6128     int64_t v;
   6129     unsigned int res;
   6130 
   6131     v = float64_to_int64_round_to_zero(a STATUS_VAR);
   6132     if (v < 0) {
   6133         res = 0;
   6134         float_raise( float_flag_invalid STATUS_VAR);
   6135     } else if (v > 0xffff) {
   6136         res = 0xffff;
   6137         float_raise( float_flag_invalid STATUS_VAR);
   6138     } else {
   6139         res = v;
   6140     }
   6141     return res;
   6142 }
   6143 
   6144 /* FIXME: This looks broken.  */
   6145 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
   6146 {
   6147     int64_t v;
   6148 
   6149     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   6150     v += float64_val(a);
   6151     v = float64_to_int64(make_float64(v) STATUS_VAR);
   6152 
   6153     return v - INT64_MIN;
   6154 }
   6155 
   6156 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
   6157 {
   6158     int64_t v;
   6159 
   6160     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   6161     v += float64_val(a);
   6162     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
   6163 
   6164     return v - INT64_MIN;
   6165 }
   6166 
   6167 #define COMPARE(s, nan_exp)                                                  \
   6168 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
   6169                                       int is_quiet STATUS_PARAM )            \
   6170 {                                                                            \
   6171     flag aSign, bSign;                                                       \
   6172     uint ## s ## _t av, bv;                                                  \
   6173     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
   6174     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
   6175                                                                              \
   6176     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
   6177          extractFloat ## s ## Frac( a ) ) ||                                 \
   6178         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
   6179           extractFloat ## s ## Frac( b ) )) {                                \
   6180         if (!is_quiet ||                                                     \
   6181             float ## s ## _is_signaling_nan( a ) ||                          \
   6182             float ## s ## _is_signaling_nan( b ) ) {                         \
   6183             float_raise( float_flag_invalid STATUS_VAR);                     \
   6184         }                                                                    \
   6185         return float_relation_unordered;                                     \
   6186     }                                                                        \
   6187     aSign = extractFloat ## s ## Sign( a );                                  \
   6188     bSign = extractFloat ## s ## Sign( b );                                  \
   6189     av = float ## s ## _val(a);                                              \
   6190     bv = float ## s ## _val(b);                                              \
   6191     if ( aSign != bSign ) {                                                  \
   6192         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
   6193             /* zero case */                                                  \
   6194             return float_relation_equal;                                     \
   6195         } else {                                                             \
   6196             return 1 - (2 * aSign);                                          \
   6197         }                                                                    \
   6198     } else {                                                                 \
   6199         if (av == bv) {                                                      \
   6200             return float_relation_equal;                                     \
   6201         } else {                                                             \
   6202             return 1 - 2 * (aSign ^ ( av < bv ));                            \
   6203         }                                                                    \
   6204     }                                                                        \
   6205 }                                                                            \
   6206                                                                              \
   6207 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
   6208 {                                                                            \
   6209     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
   6210 }                                                                            \
   6211                                                                              \
   6212 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
   6213 {                                                                            \
   6214     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
   6215 }
   6216 
   6217 COMPARE(32, 0xff)
   6218 COMPARE(64, 0x7ff)
   6219 
   6220 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
   6221                                       int is_quiet STATUS_PARAM )
   6222 {
   6223     flag aSign, bSign;
   6224 
   6225     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
   6226           ( extractFloatx80Frac( a )<<1 ) ) ||
   6227         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
   6228           ( extractFloatx80Frac( b )<<1 ) )) {
   6229         if (!is_quiet ||
   6230             floatx80_is_signaling_nan( a ) ||
   6231             floatx80_is_signaling_nan( b ) ) {
   6232             float_raise( float_flag_invalid STATUS_VAR);
   6233         }
   6234         return float_relation_unordered;
   6235     }
   6236     aSign = extractFloatx80Sign( a );
   6237     bSign = extractFloatx80Sign( b );
   6238     if ( aSign != bSign ) {
   6239 
   6240         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
   6241              ( ( a.low | b.low ) == 0 ) ) {
   6242             /* zero case */
   6243             return float_relation_equal;
   6244         } else {
   6245             return 1 - (2 * aSign);
   6246         }
   6247     } else {
   6248         if (a.low == b.low && a.high == b.high) {
   6249             return float_relation_equal;
   6250         } else {
   6251             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
   6252         }
   6253     }
   6254 }
   6255 
   6256 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
   6257 {
   6258     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
   6259 }
   6260 
   6261 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   6262 {
   6263     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
   6264 }
   6265 
   6266 INLINE int float128_compare_internal( float128 a, float128 b,
   6267                                       int is_quiet STATUS_PARAM )
   6268 {
   6269     flag aSign, bSign;
   6270 
   6271     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
   6272           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
   6273         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
   6274           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
   6275         if (!is_quiet ||
   6276             float128_is_signaling_nan( a ) ||
   6277             float128_is_signaling_nan( b ) ) {
   6278             float_raise( float_flag_invalid STATUS_VAR);
   6279         }
   6280         return float_relation_unordered;
   6281     }
   6282     aSign = extractFloat128Sign( a );
   6283     bSign = extractFloat128Sign( b );
   6284     if ( aSign != bSign ) {
   6285         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
   6286             /* zero case */
   6287             return float_relation_equal;
   6288         } else {
   6289             return 1 - (2 * aSign);
   6290         }
   6291     } else {
   6292         if (a.low == b.low && a.high == b.high) {
   6293             return float_relation_equal;
   6294         } else {
   6295             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
   6296         }
   6297     }
   6298 }
   6299 
   6300 int float128_compare( float128 a, float128 b STATUS_PARAM )
   6301 {
   6302     return float128_compare_internal(a, b, 0 STATUS_VAR);
   6303 }
   6304 
   6305 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
   6306 {
   6307     return float128_compare_internal(a, b, 1 STATUS_VAR);
   6308 }
   6309 
   6310 /* min() and max() functions. These can't be implemented as
   6311  * 'compare and pick one input' because that would mishandle
   6312  * NaNs and +0 vs -0.
   6313  */
   6314 #define MINMAX(s, nan_exp)                                              \
   6315 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
   6316                                         int ismin STATUS_PARAM )        \
   6317 {                                                                       \
   6318     flag aSign, bSign;                                                  \
   6319     uint ## s ## _t av, bv;                                             \
   6320     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
   6321     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
   6322     if (float ## s ## _is_any_nan(a) ||                                 \
   6323         float ## s ## _is_any_nan(b)) {                                 \
   6324         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
   6325     }                                                                   \
   6326     aSign = extractFloat ## s ## Sign(a);                               \
   6327     bSign = extractFloat ## s ## Sign(b);                               \
   6328     av = float ## s ## _val(a);                                         \
   6329     bv = float ## s ## _val(b);                                         \
   6330     if (aSign != bSign) {                                               \
   6331         if (ismin) {                                                    \
   6332             return aSign ? a : b;                                       \
   6333         } else {                                                        \
   6334             return aSign ? b : a;                                       \
   6335         }                                                               \
   6336     } else {                                                            \
   6337         if (ismin) {                                                    \
   6338             return (aSign ^ (av < bv)) ? a : b;                         \
   6339         } else {                                                        \
   6340             return (aSign ^ (av < bv)) ? b : a;                         \
   6341         }                                                               \
   6342     }                                                                   \
   6343 }                                                                       \
   6344                                                                         \
   6345 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
   6346 {                                                                       \
   6347     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
   6348 }                                                                       \
   6349                                                                         \
   6350 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
   6351 {                                                                       \
   6352     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
   6353 }
   6354 
   6355 MINMAX(32, 0xff)
   6356 MINMAX(64, 0x7ff)
   6357 
   6358 
   6359 /* Multiply A by 2 raised to the power N.  */
   6360 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
   6361 {
   6362     flag aSign;
   6363     int16_t aExp;
   6364     uint32_t aSig;
   6365 
   6366     a = float32_squash_input_denormal(a STATUS_VAR);
   6367     aSig = extractFloat32Frac( a );
   6368     aExp = extractFloat32Exp( a );
   6369     aSign = extractFloat32Sign( a );
   6370 
   6371     if ( aExp == 0xFF ) {
   6372         if ( aSig ) {
   6373             return propagateFloat32NaN( a, a STATUS_VAR );
   6374         }
   6375         return a;
   6376     }
   6377     if ( aExp != 0 )
   6378         aSig |= 0x00800000;
   6379     else if ( aSig == 0 )
   6380         return a;
   6381 
   6382     if (n > 0x200) {
   6383         n = 0x200;
   6384     } else if (n < -0x200) {
   6385         n = -0x200;
   6386     }
   6387 
   6388     aExp += n - 1;
   6389     aSig <<= 7;
   6390     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   6391 }
   6392 
   6393 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
   6394 {
   6395     flag aSign;
   6396     int16_t aExp;
   6397     uint64_t aSig;
   6398 
   6399     a = float64_squash_input_denormal(a STATUS_VAR);
   6400     aSig = extractFloat64Frac( a );
   6401     aExp = extractFloat64Exp( a );
   6402     aSign = extractFloat64Sign( a );
   6403 
   6404     if ( aExp == 0x7FF ) {
   6405         if ( aSig ) {
   6406             return propagateFloat64NaN( a, a STATUS_VAR );
   6407         }
   6408         return a;
   6409     }
   6410     if ( aExp != 0 )
   6411         aSig |= LIT64( 0x0010000000000000 );
   6412     else if ( aSig == 0 )
   6413         return a;
   6414 
   6415     if (n > 0x1000) {
   6416         n = 0x1000;
   6417     } else if (n < -0x1000) {
   6418         n = -0x1000;
   6419     }
   6420 
   6421     aExp += n - 1;
   6422     aSig <<= 10;
   6423     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
   6424 }
   6425 
   6426 #ifdef FLOATX80
   6427 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
   6428 {
   6429     flag aSign;
   6430     int32_t aExp;
   6431     uint64_t aSig;
   6432 
   6433     aSig = extractFloatx80Frac( a );
   6434     aExp = extractFloatx80Exp( a );
   6435     aSign = extractFloatx80Sign( a );
   6436 
   6437     if ( aExp == 0x7FFF ) {
   6438         if ( aSig<<1 ) {
   6439             return propagateFloatx80NaN( a, a STATUS_VAR );
   6440         }
   6441         return a;
   6442     }
   6443 
   6444     if (aExp == 0 && aSig == 0)
   6445         return a;
   6446 
   6447     if (n > 0x10000) {
   6448         n = 0x10000;
   6449     } else if (n < -0x10000) {
   6450         n = -0x10000;
   6451     }
   6452 
   6453     aExp += n;
   6454     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
   6455                                           aSign, aExp, aSig, 0 STATUS_VAR );
   6456 }
   6457 #endif
   6458 
   6459 #ifdef FLOAT128
   6460 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
   6461 {
   6462     flag aSign;
   6463     int32_t aExp;
   6464     uint64_t aSig0, aSig1;
   6465 
   6466     aSig1 = extractFloat128Frac1( a );
   6467     aSig0 = extractFloat128Frac0( a );
   6468     aExp = extractFloat128Exp( a );
   6469     aSign = extractFloat128Sign( a );
   6470     if ( aExp == 0x7FFF ) {
   6471         if ( aSig0 | aSig1 ) {
   6472             return propagateFloat128NaN( a, a STATUS_VAR );
   6473         }
   6474         return a;
   6475     }
   6476     if ( aExp != 0 )
   6477         aSig0 |= LIT64( 0x0001000000000000 );
   6478     else if ( aSig0 == 0 && aSig1 == 0 )
   6479         return a;
   6480 
   6481     if (n > 0x10000) {
   6482         n = 0x10000;
   6483     } else if (n < -0x10000) {
   6484         n = -0x10000;
   6485     }
   6486 
   6487     aExp += n - 1;
   6488     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
   6489                                           STATUS_VAR );
   6490 
   6491 }
   6492 #endif
   6493