Home | History | Annotate | Download | only in fpu
      1 /*
      2  * QEMU float support
      3  *
      4  * Derived from SoftFloat.
      5  */
      6 
      7 /*============================================================================
      8 
      9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
     10 Package, Release 2b.
     11 
     12 Written by John R. Hauser.  This work was made possible in part by the
     13 International Computer Science Institute, located at Suite 600, 1947 Center
     14 Street, Berkeley, California 94704.  Funding was partially provided by the
     15 National Science Foundation under grant MIP-9311980.  The original version
     16 of this code was written as part of a project to build a fixed-point vector
     17 processor in collaboration with the University of California at Berkeley,
     18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
     20 arithmetic/SoftFloat.html'.
     21 
     22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
     23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
     24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
     25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
     26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
     27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
     28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
     29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
     30 
     31 Derivative works are acceptable, even for commercial purposes, so long as
     32 (1) the source code for the derivative work includes prominent notice that
     33 the work is derivative, and (2) the source code includes prominent notice with
     34 these four paragraphs for those parts of this code that are retained.
     35 
     36 =============================================================================*/
     37 
     38 /* softfloat (and in particular the code in softfloat-specialize.h) is
     39  * target-dependent and needs the TARGET_* macros.
     40  */
     41 #include "config.h"
     42 
     43 #include "fpu/softfloat.h"
     44 
     45 /*----------------------------------------------------------------------------
     46 | Primitive arithmetic functions, including multi-word arithmetic, and
     47 | division and square root approximations.  (Can be specialized to target if
     48 | desired.)
     49 *----------------------------------------------------------------------------*/
     50 #include "softfloat-macros.h"
     51 
     52 /*----------------------------------------------------------------------------
     53 | Functions and definitions to determine:  (1) whether tininess for underflow
     54 | is detected before or after rounding by default, (2) what (if anything)
     55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
     56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     57 | are propagated from function inputs to output.  These details are target-
     58 | specific.
     59 *----------------------------------------------------------------------------*/
     60 #include "softfloat-specialize.h"
     61 
void set_float_rounding_mode(int val STATUS_PARAM)
{
    /* Store `val' as the rounding mode in the status selected by STATUS(). */
    STATUS(float_rounding_mode) = val;
}
     66 
void set_float_exception_flags(int val STATUS_PARAM)
{
    /* Overwrite (not OR into) the accumulated exception flags with `val'. */
    STATUS(float_exception_flags) = val;
}
     71 
void set_floatx80_rounding_precision(int val STATUS_PARAM)
{
    /* Store `val' as the rounding precision used for floatx80 results. */
    STATUS(floatx80_rounding_precision) = val;
}
     76 
     77 /*----------------------------------------------------------------------------
     78 | Returns the fraction bits of the half-precision floating-point value `a'.
     79 *----------------------------------------------------------------------------*/
     80 
     81 INLINE uint32_t extractFloat16Frac(float16 a)
     82 {
     83     return float16_val(a) & 0x3ff;
     84 }
     85 
     86 /*----------------------------------------------------------------------------
     87 | Returns the exponent bits of the half-precision floating-point value `a'.
     88 *----------------------------------------------------------------------------*/
     89 
     90 INLINE int_fast16_t extractFloat16Exp(float16 a)
     91 {
     92     return (float16_val(a) >> 10) & 0x1f;
     93 }
     94 
     95 /*----------------------------------------------------------------------------
| Returns the sign bit of the half-precision floating-point value `a'.
     97 *----------------------------------------------------------------------------*/
     98 
     99 INLINE flag extractFloat16Sign(float16 a)
    100 {
    101     return float16_val(a)>>15;
    102 }
    103 
    104 /*----------------------------------------------------------------------------
    105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
    106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
    107 | input.  If `zSign' is 1, the input is negated before being converted to an
    108 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
    109 | is simply rounded to an integer, with the inexact exception raised if the
    110 | input cannot be represented exactly as an integer.  However, if the fixed-
    111 | point input is too large, the invalid exception is raised and the largest
    112 | positive or negative integer is returned.
    113 *----------------------------------------------------------------------------*/
    114 
    115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
    116 {
    117     int8 roundingMode;
    118     flag roundNearestEven;
    119     int8 roundIncrement, roundBits;
    120     int32_t z;
    121 
    122     roundingMode = STATUS(float_rounding_mode);
    123     roundNearestEven = ( roundingMode == float_round_nearest_even );
    124     roundIncrement = 0x40;
    125     if ( ! roundNearestEven ) {
    126         if ( roundingMode == float_round_to_zero ) {
    127             roundIncrement = 0;
    128         }
    129         else {
    130             roundIncrement = 0x7F;
    131             if ( zSign ) {
    132                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    133             }
    134             else {
    135                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    136             }
    137         }
    138     }
    139     roundBits = absZ & 0x7F;
    140     absZ = ( absZ + roundIncrement )>>7;
    141     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    142     z = absZ;
    143     if ( zSign ) z = - z;
    144     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    145         float_raise( float_flag_invalid STATUS_VAR);
    146         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    147     }
    148     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    149     return z;
    150 
    151 }
    152 
    153 /*----------------------------------------------------------------------------
    154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
    156 | and returns the properly rounded 64-bit integer corresponding to the input.
    157 | If `zSign' is 1, the input is negated before being converted to an integer.
    158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
    159 | the inexact exception raised if the input cannot be represented exactly as
    160 | an integer.  However, if the fixed-point input is too large, the invalid
    161 | exception is raised and the largest positive or negative integer is
    162 | returned.
    163 *----------------------------------------------------------------------------*/
    164 
    165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
    166 {
    167     int8 roundingMode;
    168     flag roundNearestEven, increment;
    169     int64_t z;
    170 
    171     roundingMode = STATUS(float_rounding_mode);
    172     roundNearestEven = ( roundingMode == float_round_nearest_even );
    173     increment = ( (int64_t) absZ1 < 0 );
    174     if ( ! roundNearestEven ) {
    175         if ( roundingMode == float_round_to_zero ) {
    176             increment = 0;
    177         }
    178         else {
    179             if ( zSign ) {
    180                 increment = ( roundingMode == float_round_down ) && absZ1;
    181             }
    182             else {
    183                 increment = ( roundingMode == float_round_up ) && absZ1;
    184             }
    185         }
    186     }
    187     if ( increment ) {
    188         ++absZ0;
    189         if ( absZ0 == 0 ) goto overflow;
    190         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    191     }
    192     z = absZ0;
    193     if ( zSign ) z = - z;
    194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    195  overflow:
    196         float_raise( float_flag_invalid STATUS_VAR);
    197         return
    198               zSign ? (int64_t) LIT64( 0x8000000000000000 )
    199             : LIT64( 0x7FFFFFFFFFFFFFFF );
    200     }
    201     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    202     return z;
    203 
    204 }
    205 
    206 /*----------------------------------------------------------------------------
    207 | Returns the fraction bits of the single-precision floating-point value `a'.
    208 *----------------------------------------------------------------------------*/
    209 
    210 INLINE uint32_t extractFloat32Frac( float32 a )
    211 {
    212 
    213     return float32_val(a) & 0x007FFFFF;
    214 
    215 }
    216 
    217 /*----------------------------------------------------------------------------
    218 | Returns the exponent bits of the single-precision floating-point value `a'.
    219 *----------------------------------------------------------------------------*/
    220 
    221 INLINE int_fast16_t extractFloat32Exp(float32 a)
    222 {
    223 
    224     return ( float32_val(a)>>23 ) & 0xFF;
    225 
    226 }
    227 
    228 /*----------------------------------------------------------------------------
    229 | Returns the sign bit of the single-precision floating-point value `a'.
    230 *----------------------------------------------------------------------------*/
    231 
    232 INLINE flag extractFloat32Sign( float32 a )
    233 {
    234 
    235     return float32_val(a)>>31;
    236 
    237 }
    238 
    239 /*----------------------------------------------------------------------------
    240 | If `a' is denormal and we are in flush-to-zero mode then set the
    241 | input-denormal exception and return zero. Otherwise just return the value.
    242 *----------------------------------------------------------------------------*/
    243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
    244 {
    245     if (STATUS(flush_inputs_to_zero)) {
    246         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
    247             float_raise(float_flag_input_denormal STATUS_VAR);
    248             return make_float32(float32_val(a) & 0x80000000);
    249         }
    250     }
    251     return a;
    252 }
    253 
    254 /*----------------------------------------------------------------------------
    255 | Normalizes the subnormal single-precision floating-point value represented
    256 | by the denormalized significand `aSig'.  The normalized exponent and
    257 | significand are stored at the locations pointed to by `zExpPtr' and
    258 | `zSigPtr', respectively.
    259 *----------------------------------------------------------------------------*/
    260 
    261 static void
    262  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
    263 {
    264     int8 shiftCount;
    265 
    266     shiftCount = countLeadingZeros32( aSig ) - 8;
    267     *zSigPtr = aSig<<shiftCount;
    268     *zExpPtr = 1 - shiftCount;
    269 
    270 }
    271 
    272 /*----------------------------------------------------------------------------
    273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    274 | single-precision floating-point value, returning the result.  After being
    275 | shifted into the proper positions, the three fields are simply added
    276 | together to form the result.  This means that any integer portion of `zSig'
    277 | will be added into the exponent.  Since a properly normalized significand
    278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    279 | than the desired result exponent whenever `zSig' is a complete, normalized
    280 | significand.
    281 *----------------------------------------------------------------------------*/
    282 
    283 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
    284 {
    285 
    286     return make_float32(
    287           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
    288 
    289 }
    290 
    291 /*----------------------------------------------------------------------------
    292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    293 | and significand `zSig', and returns the proper single-precision floating-
    294 | point value corresponding to the abstract input.  Ordinarily, the abstract
    295 | value is simply rounded and packed into the single-precision format, with
    296 | the inexact exception raised if the abstract input cannot be represented
    297 | exactly.  However, if the abstract value is too large, the overflow and
    298 | inexact exceptions are raised and an infinity or maximal finite value is
    299 | returned.  If the abstract value is too small, the input value is rounded to
    300 | a subnormal number, and the underflow and inexact exceptions are raised if
    301 | the abstract input cannot be represented exactly as a subnormal single-
    302 | precision floating-point number.
    303 |     The input significand `zSig' has its binary point between bits 30
    304 | and 29, which is 7 bits to the left of the usual location.  This shifted
    305 | significand must be normalized or smaller.  If `zSig' is not normalized,
    306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    307 | and it must not require rounding.  In the usual case that `zSig' is
    308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    310 | Binary Floating-Point Arithmetic.
    311 *----------------------------------------------------------------------------*/
    312 
    313 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
    314 {
    315     int8 roundingMode;
    316     flag roundNearestEven;
    317     int8 roundIncrement, roundBits;
    318     flag isTiny;
    319 
    320     roundingMode = STATUS(float_rounding_mode);
    321     roundNearestEven = ( roundingMode == float_round_nearest_even );
    322     roundIncrement = 0x40;
    323     if ( ! roundNearestEven ) {
    324         if ( roundingMode == float_round_to_zero ) {
    325             roundIncrement = 0;
    326         }
    327         else {
    328             roundIncrement = 0x7F;
    329             if ( zSign ) {
    330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    331             }
    332             else {
    333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    334             }
    335         }
    336     }
    337     roundBits = zSig & 0x7F;
    338     if ( 0xFD <= (uint16_t) zExp ) {
    339         if (    ( 0xFD < zExp )
    340              || (    ( zExp == 0xFD )
    341                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
    342            ) {
    343             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    344             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
    345         }
    346         if ( zExp < 0 ) {
    347             if (STATUS(flush_to_zero)) {
    348                 float_raise(float_flag_output_denormal STATUS_VAR);
    349                 return packFloat32(zSign, 0, 0);
    350             }
    351             isTiny =
    352                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    353                 || ( zExp < -1 )
    354                 || ( zSig + roundIncrement < 0x80000000 );
    355             shift32RightJamming( zSig, - zExp, &zSig );
    356             zExp = 0;
    357             roundBits = zSig & 0x7F;
    358             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    359         }
    360     }
    361     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    362     zSig = ( zSig + roundIncrement )>>7;
    363     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    364     if ( zSig == 0 ) zExp = 0;
    365     return packFloat32( zSign, zExp, zSig );
    366 
    367 }
    368 
    369 /*----------------------------------------------------------------------------
    370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    371 | and significand `zSig', and returns the proper single-precision floating-
    372 | point value corresponding to the abstract input.  This routine is just like
    373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    375 | floating-point exponent.
    376 *----------------------------------------------------------------------------*/
    377 
    378 static float32
    379  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
    380 {
    381     int8 shiftCount;
    382 
    383     shiftCount = countLeadingZeros32( zSig ) - 1;
    384     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    385 
    386 }
    387 
    388 /*----------------------------------------------------------------------------
    389 | Returns the fraction bits of the double-precision floating-point value `a'.
    390 *----------------------------------------------------------------------------*/
    391 
    392 INLINE uint64_t extractFloat64Frac( float64 a )
    393 {
    394 
    395     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
    396 
    397 }
    398 
    399 /*----------------------------------------------------------------------------
    400 | Returns the exponent bits of the double-precision floating-point value `a'.
    401 *----------------------------------------------------------------------------*/
    402 
    403 INLINE int_fast16_t extractFloat64Exp(float64 a)
    404 {
    405 
    406     return ( float64_val(a)>>52 ) & 0x7FF;
    407 
    408 }
    409 
    410 /*----------------------------------------------------------------------------
    411 | Returns the sign bit of the double-precision floating-point value `a'.
    412 *----------------------------------------------------------------------------*/
    413 
    414 INLINE flag extractFloat64Sign( float64 a )
    415 {
    416 
    417     return float64_val(a)>>63;
    418 
    419 }
    420 
    421 /*----------------------------------------------------------------------------
    422 | If `a' is denormal and we are in flush-to-zero mode then set the
    423 | input-denormal exception and return zero. Otherwise just return the value.
    424 *----------------------------------------------------------------------------*/
    425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
    426 {
    427     if (STATUS(flush_inputs_to_zero)) {
    428         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
    429             float_raise(float_flag_input_denormal STATUS_VAR);
    430             return make_float64(float64_val(a) & (1ULL << 63));
    431         }
    432     }
    433     return a;
    434 }
    435 
    436 /*----------------------------------------------------------------------------
    437 | Normalizes the subnormal double-precision floating-point value represented
    438 | by the denormalized significand `aSig'.  The normalized exponent and
    439 | significand are stored at the locations pointed to by `zExpPtr' and
    440 | `zSigPtr', respectively.
    441 *----------------------------------------------------------------------------*/
    442 
    443 static void
    444  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
    445 {
    446     int8 shiftCount;
    447 
    448     shiftCount = countLeadingZeros64( aSig ) - 11;
    449     *zSigPtr = aSig<<shiftCount;
    450     *zExpPtr = 1 - shiftCount;
    451 
    452 }
    453 
    454 /*----------------------------------------------------------------------------
    455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    456 | double-precision floating-point value, returning the result.  After being
    457 | shifted into the proper positions, the three fields are simply added
    458 | together to form the result.  This means that any integer portion of `zSig'
    459 | will be added into the exponent.  Since a properly normalized significand
    460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    461 | than the desired result exponent whenever `zSig' is a complete, normalized
    462 | significand.
    463 *----------------------------------------------------------------------------*/
    464 
    465 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
    466 {
    467 
    468     return make_float64(
    469         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
    470 
    471 }
    472 
    473 /*----------------------------------------------------------------------------
    474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    475 | and significand `zSig', and returns the proper double-precision floating-
    476 | point value corresponding to the abstract input.  Ordinarily, the abstract
    477 | value is simply rounded and packed into the double-precision format, with
    478 | the inexact exception raised if the abstract input cannot be represented
    479 | exactly.  However, if the abstract value is too large, the overflow and
    480 | inexact exceptions are raised and an infinity or maximal finite value is
    481 | returned.  If the abstract value is too small, the input value is rounded
    482 | to a subnormal number, and the underflow and inexact exceptions are raised
    483 | if the abstract input cannot be represented exactly as a subnormal double-
    484 | precision floating-point number.
    485 |     The input significand `zSig' has its binary point between bits 62
    486 | and 61, which is 10 bits to the left of the usual location.  This shifted
    487 | significand must be normalized or smaller.  If `zSig' is not normalized,
    488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    489 | and it must not require rounding.  In the usual case that `zSig' is
    490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    492 | Binary Floating-Point Arithmetic.
    493 *----------------------------------------------------------------------------*/
    494 
    495 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
    496 {
    497     int8 roundingMode;
    498     flag roundNearestEven;
    499     int_fast16_t roundIncrement, roundBits;
    500     flag isTiny;
    501 
    502     roundingMode = STATUS(float_rounding_mode);
    503     roundNearestEven = ( roundingMode == float_round_nearest_even );
    504     roundIncrement = 0x200;
    505     if ( ! roundNearestEven ) {
    506         if ( roundingMode == float_round_to_zero ) {
    507             roundIncrement = 0;
    508         }
    509         else {
    510             roundIncrement = 0x3FF;
    511             if ( zSign ) {
    512                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    513             }
    514             else {
    515                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    516             }
    517         }
    518     }
    519     roundBits = zSig & 0x3FF;
    520     if ( 0x7FD <= (uint16_t) zExp ) {
    521         if (    ( 0x7FD < zExp )
    522              || (    ( zExp == 0x7FD )
    523                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
    524            ) {
    525             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    526             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
    527         }
    528         if ( zExp < 0 ) {
    529             if (STATUS(flush_to_zero)) {
    530                 float_raise(float_flag_output_denormal STATUS_VAR);
    531                 return packFloat64(zSign, 0, 0);
    532             }
    533             isTiny =
    534                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    535                 || ( zExp < -1 )
    536                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    537             shift64RightJamming( zSig, - zExp, &zSig );
    538             zExp = 0;
    539             roundBits = zSig & 0x3FF;
    540             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    541         }
    542     }
    543     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    544     zSig = ( zSig + roundIncrement )>>10;
    545     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    546     if ( zSig == 0 ) zExp = 0;
    547     return packFloat64( zSign, zExp, zSig );
    548 
    549 }
    550 
    551 /*----------------------------------------------------------------------------
    552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    553 | and significand `zSig', and returns the proper double-precision floating-
    554 | point value corresponding to the abstract input.  This routine is just like
    555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    557 | floating-point exponent.
    558 *----------------------------------------------------------------------------*/
    559 
    560 static float64
    561  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
    562 {
    563     int8 shiftCount;
    564 
    565     shiftCount = countLeadingZeros64( zSig ) - 1;
    566     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    567 
    568 }
    569 
    570 /*----------------------------------------------------------------------------
    571 | Returns the fraction bits of the extended double-precision floating-point
    572 | value `a'.
    573 *----------------------------------------------------------------------------*/
    574 
    575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
    576 {
    577 
    578     return a.low;
    579 
    580 }
    581 
    582 /*----------------------------------------------------------------------------
    583 | Returns the exponent bits of the extended double-precision floating-point
    584 | value `a'.
    585 *----------------------------------------------------------------------------*/
    586 
    587 INLINE int32 extractFloatx80Exp( floatx80 a )
    588 {
    589 
    590     return a.high & 0x7FFF;
    591 
    592 }
    593 
    594 /*----------------------------------------------------------------------------
    595 | Returns the sign bit of the extended double-precision floating-point value
    596 | `a'.
    597 *----------------------------------------------------------------------------*/
    598 
    599 INLINE flag extractFloatx80Sign( floatx80 a )
    600 {
    601 
    602     return a.high>>15;
    603 
    604 }
    605 
    606 /*----------------------------------------------------------------------------
    607 | Normalizes the subnormal extended double-precision floating-point value
    608 | represented by the denormalized significand `aSig'.  The normalized exponent
    609 | and significand are stored at the locations pointed to by `zExpPtr' and
    610 | `zSigPtr', respectively.
    611 *----------------------------------------------------------------------------*/
    612 
    613 static void
    614  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
    615 {
    616     int8 shiftCount;
    617 
    618     shiftCount = countLeadingZeros64( aSig );
    619     *zSigPtr = aSig<<shiftCount;
    620     *zExpPtr = 1 - shiftCount;
    621 
    622 }
    623 
    624 /*----------------------------------------------------------------------------
    625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    626 | extended double-precision floating-point value, returning the result.
    627 *----------------------------------------------------------------------------*/
    628 
    629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
    630 {
    631     floatx80 z;
    632 
    633     z.low = zSig;
    634     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
    635     return z;
    636 
    637 }
    638 
    639 /*----------------------------------------------------------------------------
    640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
    642 | and returns the proper extended double-precision floating-point value
    643 | corresponding to the abstract input.  Ordinarily, the abstract value is
    644 | rounded and packed into the extended double-precision format, with the
    645 | inexact exception raised if the abstract input cannot be represented
    646 | exactly.  However, if the abstract value is too large, the overflow and
    647 | inexact exceptions are raised and an infinity or maximal finite value is
    648 | returned.  If the abstract value is too small, the input value is rounded to
    649 | a subnormal number, and the underflow and inexact exceptions are raised if
    650 | the abstract input cannot be represented exactly as a subnormal extended
    651 | double-precision floating-point number.
    652 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    653 | number of bits as single or double precision, respectively.  Otherwise, the
    654 | result is rounded to the full precision of the extended double-precision
    655 | format.
    656 |     The input significand must be normalized or smaller.  If the input
    657 | significand is not normalized, `zExp' must be 0; in that case, the result
    658 | returned is a subnormal number, and it must not require rounding.  The
    659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    660 | Floating-Point Arithmetic.
    661 *----------------------------------------------------------------------------*/
    662 
    663 static floatx80
    664  roundAndPackFloatx80(
    665      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
    666  STATUS_PARAM)
    667 {
    668     int8 roundingMode;
    669     flag roundNearestEven, increment, isTiny;
    670     int64 roundIncrement, roundMask, roundBits;
    671 
    672     roundingMode = STATUS(float_rounding_mode);
    673     roundNearestEven = ( roundingMode == float_round_nearest_even );
    674     if ( roundingPrecision == 80 ) goto precision80;
    675     if ( roundingPrecision == 64 ) {
    676         roundIncrement = LIT64( 0x0000000000000400 );
    677         roundMask = LIT64( 0x00000000000007FF );
    678     }
    679     else if ( roundingPrecision == 32 ) {
    680         roundIncrement = LIT64( 0x0000008000000000 );
    681         roundMask = LIT64( 0x000000FFFFFFFFFF );
    682     }
    683     else {
    684         goto precision80;
    685     }
    686     zSig0 |= ( zSig1 != 0 );
    687     if ( ! roundNearestEven ) {
    688         if ( roundingMode == float_round_to_zero ) {
    689             roundIncrement = 0;
    690         }
    691         else {
    692             roundIncrement = roundMask;
    693             if ( zSign ) {
    694                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    695             }
    696             else {
    697                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    698             }
    699         }
    700     }
    701     roundBits = zSig0 & roundMask;
    702     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
    703         if (    ( 0x7FFE < zExp )
    704              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    705            ) {
    706             goto overflow;
    707         }
    708         if ( zExp <= 0 ) {
    709             if (STATUS(flush_to_zero)) {
    710                 float_raise(float_flag_output_denormal STATUS_VAR);
    711                 return packFloatx80(zSign, 0, 0);
    712             }
    713             isTiny =
    714                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    715                 || ( zExp < 0 )
    716                 || ( zSig0 <= zSig0 + roundIncrement );
    717             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    718             zExp = 0;
    719             roundBits = zSig0 & roundMask;
    720             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    721             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    722             zSig0 += roundIncrement;
    723             if ( (int64_t) zSig0 < 0 ) zExp = 1;
    724             roundIncrement = roundMask + 1;
    725             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    726                 roundMask |= roundIncrement;
    727             }
    728             zSig0 &= ~ roundMask;
    729             return packFloatx80( zSign, zExp, zSig0 );
    730         }
    731     }
    732     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    733     zSig0 += roundIncrement;
    734     if ( zSig0 < roundIncrement ) {
    735         ++zExp;
    736         zSig0 = LIT64( 0x8000000000000000 );
    737     }
    738     roundIncrement = roundMask + 1;
    739     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    740         roundMask |= roundIncrement;
    741     }
    742     zSig0 &= ~ roundMask;
    743     if ( zSig0 == 0 ) zExp = 0;
    744     return packFloatx80( zSign, zExp, zSig0 );
    745  precision80:
    746     increment = ( (int64_t) zSig1 < 0 );
    747     if ( ! roundNearestEven ) {
    748         if ( roundingMode == float_round_to_zero ) {
    749             increment = 0;
    750         }
    751         else {
    752             if ( zSign ) {
    753                 increment = ( roundingMode == float_round_down ) && zSig1;
    754             }
    755             else {
    756                 increment = ( roundingMode == float_round_up ) && zSig1;
    757             }
    758         }
    759     }
    760     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
    761         if (    ( 0x7FFE < zExp )
    762              || (    ( zExp == 0x7FFE )
    763                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    764                   && increment
    765                 )
    766            ) {
    767             roundMask = 0;
    768  overflow:
    769             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    770             if (    ( roundingMode == float_round_to_zero )
    771                  || ( zSign && ( roundingMode == float_round_up ) )
    772                  || ( ! zSign && ( roundingMode == float_round_down ) )
    773                ) {
    774                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    775             }
    776             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    777         }
    778         if ( zExp <= 0 ) {
    779             isTiny =
    780                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    781                 || ( zExp < 0 )
    782                 || ! increment
    783                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    784             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    785             zExp = 0;
    786             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
    787             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    788             if ( roundNearestEven ) {
    789                 increment = ( (int64_t) zSig1 < 0 );
    790             }
    791             else {
    792                 if ( zSign ) {
    793                     increment = ( roundingMode == float_round_down ) && zSig1;
    794                 }
    795                 else {
    796                     increment = ( roundingMode == float_round_up ) && zSig1;
    797                 }
    798             }
    799             if ( increment ) {
    800                 ++zSig0;
    801                 zSig0 &=
    802                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    803                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
    804             }
    805             return packFloatx80( zSign, zExp, zSig0 );
    806         }
    807     }
    808     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    809     if ( increment ) {
    810         ++zSig0;
    811         if ( zSig0 == 0 ) {
    812             ++zExp;
    813             zSig0 = LIT64( 0x8000000000000000 );
    814         }
    815         else {
    816             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    817         }
    818     }
    819     else {
    820         if ( zSig0 == 0 ) zExp = 0;
    821     }
    822     return packFloatx80( zSign, zExp, zSig0 );
    823 
    824 }
    825 
    826 /*----------------------------------------------------------------------------
    827 | Takes an abstract floating-point value having sign `zSign', exponent
    828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    829 | and returns the proper extended double-precision floating-point value
    830 | corresponding to the abstract input.  This routine is just like
    831 | `roundAndPackFloatx80' except that the input significand does not have to be
    832 | normalized.
    833 *----------------------------------------------------------------------------*/
    834 
    835 static floatx80
    836  normalizeRoundAndPackFloatx80(
    837      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
    838  STATUS_PARAM)
    839 {
    840     int8 shiftCount;
    841 
    842     if ( zSig0 == 0 ) {
    843         zSig0 = zSig1;
    844         zSig1 = 0;
    845         zExp -= 64;
    846     }
    847     shiftCount = countLeadingZeros64( zSig0 );
    848     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    849     zExp -= shiftCount;
    850     return
    851         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
    852 
    853 }
    854 
    855 /*----------------------------------------------------------------------------
    856 | Returns the least-significant 64 fraction bits of the quadruple-precision
    857 | floating-point value `a'.
    858 *----------------------------------------------------------------------------*/
    859 
    860 INLINE uint64_t extractFloat128Frac1( float128 a )
    861 {
    862 
    863     return a.low;
    864 
    865 }
    866 
    867 /*----------------------------------------------------------------------------
    868 | Returns the most-significant 48 fraction bits of the quadruple-precision
    869 | floating-point value `a'.
    870 *----------------------------------------------------------------------------*/
    871 
    872 INLINE uint64_t extractFloat128Frac0( float128 a )
    873 {
    874 
    875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    876 
    877 }
    878 
    879 /*----------------------------------------------------------------------------
    880 | Returns the exponent bits of the quadruple-precision floating-point value
    881 | `a'.
    882 *----------------------------------------------------------------------------*/
    883 
    884 INLINE int32 extractFloat128Exp( float128 a )
    885 {
    886 
    887     return ( a.high>>48 ) & 0x7FFF;
    888 
    889 }
    890 
    891 /*----------------------------------------------------------------------------
    892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
    893 *----------------------------------------------------------------------------*/
    894 
    895 INLINE flag extractFloat128Sign( float128 a )
    896 {
    897 
    898     return a.high>>63;
    899 
    900 }
    901 
    902 /*----------------------------------------------------------------------------
    903 | Normalizes the subnormal quadruple-precision floating-point value
    904 | represented by the denormalized significand formed by the concatenation of
    905 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    906 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    907 | significand are stored at the location pointed to by `zSig0Ptr', and the
    908 | least significant 64 bits of the normalized significand are stored at the
    909 | location pointed to by `zSig1Ptr'.
    910 *----------------------------------------------------------------------------*/
    911 
    912 static void
    913  normalizeFloat128Subnormal(
    914      uint64_t aSig0,
    915      uint64_t aSig1,
    916      int32 *zExpPtr,
    917      uint64_t *zSig0Ptr,
    918      uint64_t *zSig1Ptr
    919  )
    920 {
    921     int8 shiftCount;
    922 
    923     if ( aSig0 == 0 ) {
    924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    925         if ( shiftCount < 0 ) {
    926             *zSig0Ptr = aSig1>>( - shiftCount );
    927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    928         }
    929         else {
    930             *zSig0Ptr = aSig1<<shiftCount;
    931             *zSig1Ptr = 0;
    932         }
    933         *zExpPtr = - shiftCount - 63;
    934     }
    935     else {
    936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    938         *zExpPtr = 1 - shiftCount;
    939     }
    940 
    941 }
    942 
    943 /*----------------------------------------------------------------------------
    944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
    945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    946 | floating-point value, returning the result.  After being shifted into the
    947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    948 | added together to form the most significant 32 bits of the result.  This
    949 | means that any integer portion of `zSig0' will be added into the exponent.
    950 | Since a properly normalized significand will have an integer portion equal
    951 | to 1, the `zExp' input should be 1 less than the desired result exponent
    952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    953 | significand.
    954 *----------------------------------------------------------------------------*/
    955 
    956 INLINE float128
    957  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
    958 {
    959     float128 z;
    960 
    961     z.low = zSig1;
    962     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
    963     return z;
    964 
    965 }
    966 
    967 /*----------------------------------------------------------------------------
    968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
    970 | and `zSig2', and returns the proper quadruple-precision floating-point value
    971 | corresponding to the abstract input.  Ordinarily, the abstract value is
    972 | simply rounded and packed into the quadruple-precision format, with the
    973 | inexact exception raised if the abstract input cannot be represented
    974 | exactly.  However, if the abstract value is too large, the overflow and
    975 | inexact exceptions are raised and an infinity or maximal finite value is
    976 | returned.  If the abstract value is too small, the input value is rounded to
    977 | a subnormal number, and the underflow and inexact exceptions are raised if
    978 | the abstract input cannot be represented exactly as a subnormal quadruple-
    979 | precision floating-point number.
    980 |     The input significand must be normalized or smaller.  If the input
    981 | significand is not normalized, `zExp' must be 0; in that case, the result
    982 | returned is a subnormal number, and it must not require rounding.  In the
    983 | usual case that the input significand is normalized, `zExp' must be 1 less
    984 | than the ``true'' floating-point exponent.  The handling of underflow and
    985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    986 *----------------------------------------------------------------------------*/
    987 
    988 static float128
    989  roundAndPackFloat128(
    990      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
    991 {
    992     int8 roundingMode;
    993     flag roundNearestEven, increment, isTiny;
    994 
    995     roundingMode = STATUS(float_rounding_mode);
    996     roundNearestEven = ( roundingMode == float_round_nearest_even );
    997     increment = ( (int64_t) zSig2 < 0 );
    998     if ( ! roundNearestEven ) {
    999         if ( roundingMode == float_round_to_zero ) {
   1000             increment = 0;
   1001         }
   1002         else {
   1003             if ( zSign ) {
   1004                 increment = ( roundingMode == float_round_down ) && zSig2;
   1005             }
   1006             else {
   1007                 increment = ( roundingMode == float_round_up ) && zSig2;
   1008             }
   1009         }
   1010     }
   1011     if ( 0x7FFD <= (uint32_t) zExp ) {
   1012         if (    ( 0x7FFD < zExp )
   1013              || (    ( zExp == 0x7FFD )
   1014                   && eq128(
   1015                          LIT64( 0x0001FFFFFFFFFFFF ),
   1016                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1017                          zSig0,
   1018                          zSig1
   1019                      )
   1020                   && increment
   1021                 )
   1022            ) {
   1023             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
   1024             if (    ( roundingMode == float_round_to_zero )
   1025                  || ( zSign && ( roundingMode == float_round_up ) )
   1026                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1027                ) {
   1028                 return
   1029                     packFloat128(
   1030                         zSign,
   1031                         0x7FFE,
   1032                         LIT64( 0x0000FFFFFFFFFFFF ),
   1033                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1034                     );
   1035             }
   1036             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1037         }
   1038         if ( zExp < 0 ) {
   1039             if (STATUS(flush_to_zero)) {
   1040                 float_raise(float_flag_output_denormal STATUS_VAR);
   1041                 return packFloat128(zSign, 0, 0, 0);
   1042             }
   1043             isTiny =
   1044                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
   1045                 || ( zExp < -1 )
   1046                 || ! increment
   1047                 || lt128(
   1048                        zSig0,
   1049                        zSig1,
   1050                        LIT64( 0x0001FFFFFFFFFFFF ),
   1051                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1052                    );
   1053             shift128ExtraRightJamming(
   1054                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1055             zExp = 0;
   1056             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
   1057             if ( roundNearestEven ) {
   1058                 increment = ( (int64_t) zSig2 < 0 );
   1059             }
   1060             else {
   1061                 if ( zSign ) {
   1062                     increment = ( roundingMode == float_round_down ) && zSig2;
   1063                 }
   1064                 else {
   1065                     increment = ( roundingMode == float_round_up ) && zSig2;
   1066                 }
   1067             }
   1068         }
   1069     }
   1070     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
   1071     if ( increment ) {
   1072         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1073         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1074     }
   1075     else {
   1076         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1077     }
   1078     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1079 
   1080 }
   1081 
   1082 /*----------------------------------------------------------------------------
   1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
   1085 | returns the proper quadruple-precision floating-point value corresponding
   1086 | to the abstract input.  This routine is just like `roundAndPackFloat128'
   1087 | except that the input significand has fewer bits and does not have to be
   1088 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1089 | point exponent.
   1090 *----------------------------------------------------------------------------*/
   1091 
   1092 static float128
   1093  normalizeRoundAndPackFloat128(
   1094      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
   1095 {
   1096     int8 shiftCount;
   1097     uint64_t zSig2;
   1098 
   1099     if ( zSig0 == 0 ) {
   1100         zSig0 = zSig1;
   1101         zSig1 = 0;
   1102         zExp -= 64;
   1103     }
   1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1105     if ( 0 <= shiftCount ) {
   1106         zSig2 = 0;
   1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1108     }
   1109     else {
   1110         shift128ExtraRightJamming(
   1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1112     }
   1113     zExp -= shiftCount;
   1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
   1115 
   1116 }
   1117 
   1118 /*----------------------------------------------------------------------------
   1119 | Returns the result of converting the 32-bit two's complement integer `a'
   1120 | to the single-precision floating-point format.  The conversion is performed
   1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1122 *----------------------------------------------------------------------------*/
   1123 
   1124 float32 int32_to_float32( int32 a STATUS_PARAM )
   1125 {
   1126     flag zSign;
   1127 
   1128     if ( a == 0 ) return float32_zero;
   1129     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1130     zSign = ( a < 0 );
   1131     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
   1132 
   1133 }
   1134 
   1135 /*----------------------------------------------------------------------------
   1136 | Returns the result of converting the 32-bit two's complement integer `a'
   1137 | to the double-precision floating-point format.  The conversion is performed
   1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1139 *----------------------------------------------------------------------------*/
   1140 
   1141 float64 int32_to_float64( int32 a STATUS_PARAM )
   1142 {
   1143     flag zSign;
   1144     uint32 absA;
   1145     int8 shiftCount;
   1146     uint64_t zSig;
   1147 
   1148     if ( a == 0 ) return float64_zero;
   1149     zSign = ( a < 0 );
   1150     absA = zSign ? - a : a;
   1151     shiftCount = countLeadingZeros32( absA ) + 21;
   1152     zSig = absA;
   1153     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1154 
   1155 }
   1156 
   1157 /*----------------------------------------------------------------------------
   1158 | Returns the result of converting the 32-bit two's complement integer `a'
   1159 | to the extended double-precision floating-point format.  The conversion
   1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1161 | Arithmetic.
   1162 *----------------------------------------------------------------------------*/
   1163 
   1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
   1165 {
   1166     flag zSign;
   1167     uint32 absA;
   1168     int8 shiftCount;
   1169     uint64_t zSig;
   1170 
   1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1172     zSign = ( a < 0 );
   1173     absA = zSign ? - a : a;
   1174     shiftCount = countLeadingZeros32( absA ) + 32;
   1175     zSig = absA;
   1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1177 
   1178 }
   1179 
   1180 /*----------------------------------------------------------------------------
   1181 | Returns the result of converting the 32-bit two's complement integer `a' to
   1182 | the quadruple-precision floating-point format.  The conversion is performed
   1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1184 *----------------------------------------------------------------------------*/
   1185 
   1186 float128 int32_to_float128( int32 a STATUS_PARAM )
   1187 {
   1188     flag zSign;
   1189     uint32 absA;
   1190     int8 shiftCount;
   1191     uint64_t zSig0;
   1192 
   1193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1194     zSign = ( a < 0 );
   1195     absA = zSign ? - a : a;
   1196     shiftCount = countLeadingZeros32( absA ) + 17;
   1197     zSig0 = absA;
   1198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1199 
   1200 }
   1201 
   1202 /*----------------------------------------------------------------------------
   1203 | Returns the result of converting the 64-bit two's complement integer `a'
   1204 | to the single-precision floating-point format.  The conversion is performed
   1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1206 *----------------------------------------------------------------------------*/
   1207 
   1208 float32 int64_to_float32( int64 a STATUS_PARAM )
   1209 {
   1210     flag zSign;
   1211     uint64 absA;
   1212     int8 shiftCount;
   1213 
   1214     if ( a == 0 ) return float32_zero;
   1215     zSign = ( a < 0 );
   1216     absA = zSign ? - a : a;
   1217     shiftCount = countLeadingZeros64( absA ) - 40;
   1218     if ( 0 <= shiftCount ) {
   1219         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1220     }
   1221     else {
   1222         shiftCount += 7;
   1223         if ( shiftCount < 0 ) {
   1224             shift64RightJamming( absA, - shiftCount, &absA );
   1225         }
   1226         else {
   1227             absA <<= shiftCount;
   1228         }
   1229         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
   1230     }
   1231 
   1232 }
   1233 
   1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
   1235 {
   1236     int8 shiftCount;
   1237 
   1238     if ( a == 0 ) return float32_zero;
   1239     shiftCount = countLeadingZeros64( a ) - 40;
   1240     if ( 0 <= shiftCount ) {
   1241         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
   1242     }
   1243     else {
   1244         shiftCount += 7;
   1245         if ( shiftCount < 0 ) {
   1246             shift64RightJamming( a, - shiftCount, &a );
   1247         }
   1248         else {
   1249             a <<= shiftCount;
   1250         }
   1251         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
   1252     }
   1253 }
   1254 
   1255 /*----------------------------------------------------------------------------
   1256 | Returns the result of converting the 64-bit two's complement integer `a'
   1257 | to the double-precision floating-point format.  The conversion is performed
   1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1259 *----------------------------------------------------------------------------*/
   1260 
   1261 float64 int64_to_float64( int64 a STATUS_PARAM )
   1262 {
   1263     flag zSign;
   1264 
   1265     if ( a == 0 ) return float64_zero;
   1266     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
   1267         return packFloat64( 1, 0x43E, 0 );
   1268     }
   1269     zSign = ( a < 0 );
   1270     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
   1271 
   1272 }
   1273 
   1274 float64 uint64_to_float64(uint64 a STATUS_PARAM)
   1275 {
   1276     int exp =  0x43C;
   1277 
   1278     if (a == 0) {
   1279         return float64_zero;
   1280     }
   1281     if ((int64_t)a < 0) {
   1282         shift64RightJamming(a, 1, &a);
   1283         exp += 1;
   1284     }
   1285     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
   1286 }
   1287 
   1288 /*----------------------------------------------------------------------------
   1289 | Returns the result of converting the 64-bit two's complement integer `a'
   1290 | to the extended double-precision floating-point format.  The conversion
   1291 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1292 | Arithmetic.
   1293 *----------------------------------------------------------------------------*/
   1294 
   1295 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
   1296 {
   1297     flag zSign;
   1298     uint64 absA;
   1299     int8 shiftCount;
   1300 
   1301     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1302     zSign = ( a < 0 );
   1303     absA = zSign ? - a : a;
   1304     shiftCount = countLeadingZeros64( absA );
   1305     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1306 
   1307 }
   1308 
   1309 /*----------------------------------------------------------------------------
   1310 | Returns the result of converting the 64-bit two's complement integer `a' to
   1311 | the quadruple-precision floating-point format.  The conversion is performed
   1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1313 *----------------------------------------------------------------------------*/
   1314 
   1315 float128 int64_to_float128( int64 a STATUS_PARAM )
   1316 {
   1317     flag zSign;
   1318     uint64 absA;
   1319     int8 shiftCount;
   1320     int32 zExp;
   1321     uint64_t zSig0, zSig1;
   1322 
   1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1324     zSign = ( a < 0 );
   1325     absA = zSign ? - a : a;
   1326     shiftCount = countLeadingZeros64( absA ) + 49;
   1327     zExp = 0x406E - shiftCount;
   1328     if ( 64 <= shiftCount ) {
   1329         zSig1 = 0;
   1330         zSig0 = absA;
   1331         shiftCount -= 64;
   1332     }
   1333     else {
   1334         zSig1 = absA;
   1335         zSig0 = 0;
   1336     }
   1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1339 
   1340 }
   1341 
   1342 float128 uint64_to_float128(uint64 a STATUS_PARAM)
   1343 {
   1344     if (a == 0) {
   1345         return float128_zero;
   1346     }
   1347     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
   1348 }
   1349 
   1350 /*----------------------------------------------------------------------------
   1351 | Returns the result of converting the single-precision floating-point value
   1352 | `a' to the 32-bit two's complement integer format.  The conversion is
   1353 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1354 | Arithmetic---which means in particular that the conversion is rounded
   1355 | according to the current rounding mode.  If `a' is a NaN, the largest
   1356 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1357 | largest integer with the same sign as `a' is returned.
   1358 *----------------------------------------------------------------------------*/
   1359 
   1360 int32 float32_to_int32( float32 a STATUS_PARAM )
   1361 {
   1362     flag aSign;
   1363     int_fast16_t aExp, shiftCount;
   1364     uint32_t aSig;
   1365     uint64_t aSig64;
   1366 
   1367     a = float32_squash_input_denormal(a STATUS_VAR);
   1368     aSig = extractFloat32Frac( a );
   1369     aExp = extractFloat32Exp( a );
   1370     aSign = extractFloat32Sign( a );
   1371     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1372     if ( aExp ) aSig |= 0x00800000;
   1373     shiftCount = 0xAF - aExp;
   1374     aSig64 = aSig;
   1375     aSig64 <<= 32;
   1376     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1377     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
   1378 
   1379 }
   1380 
   1381 /*----------------------------------------------------------------------------
   1382 | Returns the result of converting the single-precision floating-point value
   1383 | `a' to the 32-bit two's complement integer format.  The conversion is
   1384 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1385 | Arithmetic, except that the conversion is always rounded toward zero.
   1386 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1387 | the conversion overflows, the largest integer with the same sign as `a' is
   1388 | returned.
   1389 *----------------------------------------------------------------------------*/
   1390 
   1391 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
   1392 {
   1393     flag aSign;
   1394     int_fast16_t aExp, shiftCount;
   1395     uint32_t aSig;
   1396     int32_t z;
   1397     a = float32_squash_input_denormal(a STATUS_VAR);
   1398 
   1399     aSig = extractFloat32Frac( a );
   1400     aExp = extractFloat32Exp( a );
   1401     aSign = extractFloat32Sign( a );
   1402     shiftCount = aExp - 0x9E;
   1403     if ( 0 <= shiftCount ) {
   1404         if ( float32_val(a) != 0xCF000000 ) {
   1405             float_raise( float_flag_invalid STATUS_VAR);
   1406             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1407         }
   1408         return (int32_t) 0x80000000;
   1409     }
   1410     else if ( aExp <= 0x7E ) {
   1411         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1412         return 0;
   1413     }
   1414     aSig = ( aSig | 0x00800000 )<<8;
   1415     z = aSig>>( - shiftCount );
   1416     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
   1417         STATUS(float_exception_flags) |= float_flag_inexact;
   1418     }
   1419     if ( aSign ) z = - z;
   1420     return z;
   1421 
   1422 }
   1423 
   1424 /*----------------------------------------------------------------------------
   1425 | Returns the result of converting the single-precision floating-point value
   1426 | `a' to the 16-bit two's complement integer format.  The conversion is
   1427 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1428 | Arithmetic, except that the conversion is always rounded toward zero.
   1429 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1430 | the conversion overflows, the largest integer with the same sign as `a' is
   1431 | returned.
   1432 *----------------------------------------------------------------------------*/
   1433 
   1434 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
   1435 {
   1436     flag aSign;
   1437     int_fast16_t aExp, shiftCount;
   1438     uint32_t aSig;
   1439     int32 z;
   1440 
   1441     aSig = extractFloat32Frac( a );
   1442     aExp = extractFloat32Exp( a );
   1443     aSign = extractFloat32Sign( a );
   1444     shiftCount = aExp - 0x8E;
   1445     if ( 0 <= shiftCount ) {
   1446         if ( float32_val(a) != 0xC7000000 ) {
   1447             float_raise( float_flag_invalid STATUS_VAR);
   1448             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1449                 return 0x7FFF;
   1450             }
   1451         }
   1452         return (int32_t) 0xffff8000;
   1453     }
   1454     else if ( aExp <= 0x7E ) {
   1455         if ( aExp | aSig ) {
   1456             STATUS(float_exception_flags) |= float_flag_inexact;
   1457         }
   1458         return 0;
   1459     }
   1460     shiftCount -= 0x10;
   1461     aSig = ( aSig | 0x00800000 )<<8;
   1462     z = aSig>>( - shiftCount );
   1463     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
   1464         STATUS(float_exception_flags) |= float_flag_inexact;
   1465     }
   1466     if ( aSign ) {
   1467         z = - z;
   1468     }
   1469     return z;
   1470 
   1471 }
   1472 
   1473 /*----------------------------------------------------------------------------
   1474 | Returns the result of converting the single-precision floating-point value
   1475 | `a' to the 64-bit two's complement integer format.  The conversion is
   1476 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1477 | Arithmetic---which means in particular that the conversion is rounded
   1478 | according to the current rounding mode.  If `a' is a NaN, the largest
   1479 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1480 | largest integer with the same sign as `a' is returned.
   1481 *----------------------------------------------------------------------------*/
   1482 
   1483 int64 float32_to_int64( float32 a STATUS_PARAM )
   1484 {
   1485     flag aSign;
   1486     int_fast16_t aExp, shiftCount;
   1487     uint32_t aSig;
   1488     uint64_t aSig64, aSigExtra;
   1489     a = float32_squash_input_denormal(a STATUS_VAR);
   1490 
   1491     aSig = extractFloat32Frac( a );
   1492     aExp = extractFloat32Exp( a );
   1493     aSign = extractFloat32Sign( a );
   1494     shiftCount = 0xBE - aExp;
   1495     if ( shiftCount < 0 ) {
   1496         float_raise( float_flag_invalid STATUS_VAR);
   1497         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1498             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1499         }
   1500         return (int64_t) LIT64( 0x8000000000000000 );
   1501     }
   1502     if ( aExp ) aSig |= 0x00800000;
   1503     aSig64 = aSig;
   1504     aSig64 <<= 40;
   1505     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1506     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
   1507 
   1508 }
   1509 
   1510 /*----------------------------------------------------------------------------
   1511 | Returns the result of converting the single-precision floating-point value
   1512 | `a' to the 64-bit two's complement integer format.  The conversion is
   1513 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1514 | Arithmetic, except that the conversion is always rounded toward zero.  If
   1515 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1516 | conversion overflows, the largest integer with the same sign as `a' is
   1517 | returned.
   1518 *----------------------------------------------------------------------------*/
   1519 
   1520 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
   1521 {
   1522     flag aSign;
   1523     int_fast16_t aExp, shiftCount;
   1524     uint32_t aSig;
   1525     uint64_t aSig64;
   1526     int64 z;
   1527     a = float32_squash_input_denormal(a STATUS_VAR);
   1528 
   1529     aSig = extractFloat32Frac( a );
   1530     aExp = extractFloat32Exp( a );
   1531     aSign = extractFloat32Sign( a );
   1532     shiftCount = aExp - 0xBE;
   1533     if ( 0 <= shiftCount ) {
   1534         if ( float32_val(a) != 0xDF000000 ) {
   1535             float_raise( float_flag_invalid STATUS_VAR);
   1536             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1537                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1538             }
   1539         }
   1540         return (int64_t) LIT64( 0x8000000000000000 );
   1541     }
   1542     else if ( aExp <= 0x7E ) {
   1543         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1544         return 0;
   1545     }
   1546     aSig64 = aSig | 0x00800000;
   1547     aSig64 <<= 40;
   1548     z = aSig64>>( - shiftCount );
   1549     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
   1550         STATUS(float_exception_flags) |= float_flag_inexact;
   1551     }
   1552     if ( aSign ) z = - z;
   1553     return z;
   1554 
   1555 }
   1556 
   1557 /*----------------------------------------------------------------------------
   1558 | Returns the result of converting the single-precision floating-point value
   1559 | `a' to the double-precision floating-point format.  The conversion is
   1560 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1561 | Arithmetic.
   1562 *----------------------------------------------------------------------------*/
   1563 
   1564 float64 float32_to_float64( float32 a STATUS_PARAM )
   1565 {
   1566     flag aSign;
   1567     int_fast16_t aExp;
   1568     uint32_t aSig;
   1569     a = float32_squash_input_denormal(a STATUS_VAR);
   1570 
   1571     aSig = extractFloat32Frac( a );
   1572     aExp = extractFloat32Exp( a );
   1573     aSign = extractFloat32Sign( a );
   1574     if ( aExp == 0xFF ) {
   1575         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   1576         return packFloat64( aSign, 0x7FF, 0 );
   1577     }
   1578     if ( aExp == 0 ) {
   1579         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1580         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1581         --aExp;
   1582     }
   1583     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
   1584 
   1585 }
   1586 
   1587 /*----------------------------------------------------------------------------
   1588 | Returns the result of converting the single-precision floating-point value
   1589 | `a' to the extended double-precision floating-point format.  The conversion
   1590 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1591 | Arithmetic.
   1592 *----------------------------------------------------------------------------*/
   1593 
   1594 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
   1595 {
   1596     flag aSign;
   1597     int_fast16_t aExp;
   1598     uint32_t aSig;
   1599 
   1600     a = float32_squash_input_denormal(a STATUS_VAR);
   1601     aSig = extractFloat32Frac( a );
   1602     aExp = extractFloat32Exp( a );
   1603     aSign = extractFloat32Sign( a );
   1604     if ( aExp == 0xFF ) {
   1605         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   1606         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1607     }
   1608     if ( aExp == 0 ) {
   1609         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1610         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1611     }
   1612     aSig |= 0x00800000;
   1613     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
   1614 
   1615 }
   1616 
   1617 /*----------------------------------------------------------------------------
   1618 | Returns the result of converting the single-precision floating-point value
   1619 | `a' to the quadruple-precision floating-point format.  The conversion is
   1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1621 | Arithmetic.
   1622 *----------------------------------------------------------------------------*/
   1623 
   1624 float128 float32_to_float128( float32 a STATUS_PARAM )
   1625 {
   1626     flag aSign;
   1627     int_fast16_t aExp;
   1628     uint32_t aSig;
   1629 
   1630     a = float32_squash_input_denormal(a STATUS_VAR);
   1631     aSig = extractFloat32Frac( a );
   1632     aExp = extractFloat32Exp( a );
   1633     aSign = extractFloat32Sign( a );
   1634     if ( aExp == 0xFF ) {
   1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1637     }
   1638     if ( aExp == 0 ) {
   1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1641         --aExp;
   1642     }
   1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
   1644 
   1645 }
   1646 
   1647 /*----------------------------------------------------------------------------
   1648 | Rounds the single-precision floating-point value `a' to an integer, and
   1649 | returns the result as a single-precision floating-point value.  The
   1650 | operation is performed according to the IEC/IEEE Standard for Binary
   1651 | Floating-Point Arithmetic.
   1652 *----------------------------------------------------------------------------*/
   1653 
   1654 float32 float32_round_to_int( float32 a STATUS_PARAM)
   1655 {
   1656     flag aSign;
   1657     int_fast16_t aExp;
   1658     uint32_t lastBitMask, roundBitsMask;
   1659     int8 roundingMode;
   1660     uint32_t z;
   1661     a = float32_squash_input_denormal(a STATUS_VAR);
   1662 
   1663     aExp = extractFloat32Exp( a );
   1664     if ( 0x96 <= aExp ) {
   1665         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1666             return propagateFloat32NaN( a, a STATUS_VAR );
   1667         }
   1668         return a;
   1669     }
   1670     if ( aExp <= 0x7E ) {
   1671         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
   1672         STATUS(float_exception_flags) |= float_flag_inexact;
   1673         aSign = extractFloat32Sign( a );
   1674         switch ( STATUS(float_rounding_mode) ) {
   1675          case float_round_nearest_even:
   1676             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1677                 return packFloat32( aSign, 0x7F, 0 );
   1678             }
   1679             break;
   1680          case float_round_down:
   1681             return make_float32(aSign ? 0xBF800000 : 0);
   1682          case float_round_up:
   1683             return make_float32(aSign ? 0x80000000 : 0x3F800000);
   1684         }
   1685         return packFloat32( aSign, 0, 0 );
   1686     }
   1687     lastBitMask = 1;
   1688     lastBitMask <<= 0x96 - aExp;
   1689     roundBitsMask = lastBitMask - 1;
   1690     z = float32_val(a);
   1691     roundingMode = STATUS(float_rounding_mode);
   1692     if ( roundingMode == float_round_nearest_even ) {
   1693         z += lastBitMask>>1;
   1694         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1695     }
   1696     else if ( roundingMode != float_round_to_zero ) {
   1697         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
   1698             z += roundBitsMask;
   1699         }
   1700     }
   1701     z &= ~ roundBitsMask;
   1702     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
   1703     return make_float32(z);
   1704 
   1705 }
   1706 
   1707 /*----------------------------------------------------------------------------
   1708 | Returns the result of adding the absolute values of the single-precision
   1709 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1710 | before being returned.  `zSign' is ignored if the result is a NaN.
   1711 | The addition is performed according to the IEC/IEEE Standard for Binary
   1712 | Floating-Point Arithmetic.
   1713 *----------------------------------------------------------------------------*/
   1714 
   1715 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1716 {
   1717     int_fast16_t aExp, bExp, zExp;
   1718     uint32_t aSig, bSig, zSig;
   1719     int_fast16_t expDiff;
   1720 
   1721     aSig = extractFloat32Frac( a );
   1722     aExp = extractFloat32Exp( a );
   1723     bSig = extractFloat32Frac( b );
   1724     bExp = extractFloat32Exp( b );
   1725     expDiff = aExp - bExp;
   1726     aSig <<= 6;
   1727     bSig <<= 6;
   1728     if ( 0 < expDiff ) {
   1729         if ( aExp == 0xFF ) {
   1730             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1731             return a;
   1732         }
   1733         if ( bExp == 0 ) {
   1734             --expDiff;
   1735         }
   1736         else {
   1737             bSig |= 0x20000000;
   1738         }
   1739         shift32RightJamming( bSig, expDiff, &bSig );
   1740         zExp = aExp;
   1741     }
   1742     else if ( expDiff < 0 ) {
   1743         if ( bExp == 0xFF ) {
   1744             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1745             return packFloat32( zSign, 0xFF, 0 );
   1746         }
   1747         if ( aExp == 0 ) {
   1748             ++expDiff;
   1749         }
   1750         else {
   1751             aSig |= 0x20000000;
   1752         }
   1753         shift32RightJamming( aSig, - expDiff, &aSig );
   1754         zExp = bExp;
   1755     }
   1756     else {
   1757         if ( aExp == 0xFF ) {
   1758             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1759             return a;
   1760         }
   1761         if ( aExp == 0 ) {
   1762             if (STATUS(flush_to_zero)) {
   1763                 if (aSig | bSig) {
   1764                     float_raise(float_flag_output_denormal STATUS_VAR);
   1765                 }
   1766                 return packFloat32(zSign, 0, 0);
   1767             }
   1768             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1769         }
   1770         zSig = 0x40000000 + aSig + bSig;
   1771         zExp = aExp;
   1772         goto roundAndPack;
   1773     }
   1774     aSig |= 0x20000000;
   1775     zSig = ( aSig + bSig )<<1;
   1776     --zExp;
   1777     if ( (int32_t) zSig < 0 ) {
   1778         zSig = aSig + bSig;
   1779         ++zExp;
   1780     }
   1781  roundAndPack:
   1782     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1783 
   1784 }
   1785 
   1786 /*----------------------------------------------------------------------------
   1787 | Returns the result of subtracting the absolute values of the single-
   1788 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1789 | difference is negated before being returned.  `zSign' is ignored if the
   1790 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1791 | Standard for Binary Floating-Point Arithmetic.
   1792 *----------------------------------------------------------------------------*/
   1793 
   1794 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1795 {
   1796     int_fast16_t aExp, bExp, zExp;
   1797     uint32_t aSig, bSig, zSig;
   1798     int_fast16_t expDiff;
   1799 
   1800     aSig = extractFloat32Frac( a );
   1801     aExp = extractFloat32Exp( a );
   1802     bSig = extractFloat32Frac( b );
   1803     bExp = extractFloat32Exp( b );
   1804     expDiff = aExp - bExp;
   1805     aSig <<= 7;
   1806     bSig <<= 7;
   1807     if ( 0 < expDiff ) goto aExpBigger;
   1808     if ( expDiff < 0 ) goto bExpBigger;
   1809     if ( aExp == 0xFF ) {
   1810         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1811         float_raise( float_flag_invalid STATUS_VAR);
   1812         return float32_default_nan;
   1813     }
   1814     if ( aExp == 0 ) {
   1815         aExp = 1;
   1816         bExp = 1;
   1817     }
   1818     if ( bSig < aSig ) goto aBigger;
   1819     if ( aSig < bSig ) goto bBigger;
   1820     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   1821  bExpBigger:
   1822     if ( bExp == 0xFF ) {
   1823         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1824         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1825     }
   1826     if ( aExp == 0 ) {
   1827         ++expDiff;
   1828     }
   1829     else {
   1830         aSig |= 0x40000000;
   1831     }
   1832     shift32RightJamming( aSig, - expDiff, &aSig );
   1833     bSig |= 0x40000000;
   1834  bBigger:
   1835     zSig = bSig - aSig;
   1836     zExp = bExp;
   1837     zSign ^= 1;
   1838     goto normalizeRoundAndPack;
   1839  aExpBigger:
   1840     if ( aExp == 0xFF ) {
   1841         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1842         return a;
   1843     }
   1844     if ( bExp == 0 ) {
   1845         --expDiff;
   1846     }
   1847     else {
   1848         bSig |= 0x40000000;
   1849     }
   1850     shift32RightJamming( bSig, expDiff, &bSig );
   1851     aSig |= 0x40000000;
   1852  aBigger:
   1853     zSig = aSig - bSig;
   1854     zExp = aExp;
   1855  normalizeRoundAndPack:
   1856     --zExp;
   1857     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1858 
   1859 }
   1860 
   1861 /*----------------------------------------------------------------------------
   1862 | Returns the result of adding the single-precision floating-point values `a'
   1863 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1864 | Binary Floating-Point Arithmetic.
   1865 *----------------------------------------------------------------------------*/
   1866 
   1867 float32 float32_add( float32 a, float32 b STATUS_PARAM )
   1868 {
   1869     flag aSign, bSign;
   1870     a = float32_squash_input_denormal(a STATUS_VAR);
   1871     b = float32_squash_input_denormal(b STATUS_VAR);
   1872 
   1873     aSign = extractFloat32Sign( a );
   1874     bSign = extractFloat32Sign( b );
   1875     if ( aSign == bSign ) {
   1876         return addFloat32Sigs( a, b, aSign STATUS_VAR);
   1877     }
   1878     else {
   1879         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1880     }
   1881 
   1882 }
   1883 
   1884 /*----------------------------------------------------------------------------
   1885 | Returns the result of subtracting the single-precision floating-point values
   1886 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1887 | for Binary Floating-Point Arithmetic.
   1888 *----------------------------------------------------------------------------*/
   1889 
   1890 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
   1891 {
   1892     flag aSign, bSign;
   1893     a = float32_squash_input_denormal(a STATUS_VAR);
   1894     b = float32_squash_input_denormal(b STATUS_VAR);
   1895 
   1896     aSign = extractFloat32Sign( a );
   1897     bSign = extractFloat32Sign( b );
   1898     if ( aSign == bSign ) {
   1899         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1900     }
   1901     else {
   1902         return addFloat32Sigs( a, b, aSign STATUS_VAR );
   1903     }
   1904 
   1905 }
   1906 
   1907 /*----------------------------------------------------------------------------
   1908 | Returns the result of multiplying the single-precision floating-point values
   1909 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1910 | for Binary Floating-Point Arithmetic.
   1911 *----------------------------------------------------------------------------*/
   1912 
   1913 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
   1914 {
   1915     flag aSign, bSign, zSign;
   1916     int_fast16_t aExp, bExp, zExp;
   1917     uint32_t aSig, bSig;
   1918     uint64_t zSig64;
   1919     uint32_t zSig;
   1920 
   1921     a = float32_squash_input_denormal(a STATUS_VAR);
   1922     b = float32_squash_input_denormal(b STATUS_VAR);
   1923 
   1924     aSig = extractFloat32Frac( a );
   1925     aExp = extractFloat32Exp( a );
   1926     aSign = extractFloat32Sign( a );
   1927     bSig = extractFloat32Frac( b );
   1928     bExp = extractFloat32Exp( b );
   1929     bSign = extractFloat32Sign( b );
   1930     zSign = aSign ^ bSign;
   1931     if ( aExp == 0xFF ) {
   1932         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1933             return propagateFloat32NaN( a, b STATUS_VAR );
   1934         }
   1935         if ( ( bExp | bSig ) == 0 ) {
   1936             float_raise( float_flag_invalid STATUS_VAR);
   1937             return float32_default_nan;
   1938         }
   1939         return packFloat32( zSign, 0xFF, 0 );
   1940     }
   1941     if ( bExp == 0xFF ) {
   1942         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1943         if ( ( aExp | aSig ) == 0 ) {
   1944             float_raise( float_flag_invalid STATUS_VAR);
   1945             return float32_default_nan;
   1946         }
   1947         return packFloat32( zSign, 0xFF, 0 );
   1948     }
   1949     if ( aExp == 0 ) {
   1950         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1951         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1952     }
   1953     if ( bExp == 0 ) {
   1954         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1955         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1956     }
   1957     zExp = aExp + bExp - 0x7F;
   1958     aSig = ( aSig | 0x00800000 )<<7;
   1959     bSig = ( bSig | 0x00800000 )<<8;
   1960     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
   1961     zSig = zSig64;
   1962     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
   1963         zSig <<= 1;
   1964         --zExp;
   1965     }
   1966     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1967 
   1968 }
   1969 
   1970 /*----------------------------------------------------------------------------
   1971 | Returns the result of dividing the single-precision floating-point value `a'
   1972 | by the corresponding value `b'.  The operation is performed according to the
   1973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1974 *----------------------------------------------------------------------------*/
   1975 
   1976 float32 float32_div( float32 a, float32 b STATUS_PARAM )
   1977 {
   1978     flag aSign, bSign, zSign;
   1979     int_fast16_t aExp, bExp, zExp;
   1980     uint32_t aSig, bSig, zSig;
   1981     a = float32_squash_input_denormal(a STATUS_VAR);
   1982     b = float32_squash_input_denormal(b STATUS_VAR);
   1983 
   1984     aSig = extractFloat32Frac( a );
   1985     aExp = extractFloat32Exp( a );
   1986     aSign = extractFloat32Sign( a );
   1987     bSig = extractFloat32Frac( b );
   1988     bExp = extractFloat32Exp( b );
   1989     bSign = extractFloat32Sign( b );
   1990     zSign = aSign ^ bSign;
   1991     if ( aExp == 0xFF ) {
   1992         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1993         if ( bExp == 0xFF ) {
   1994             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1995             float_raise( float_flag_invalid STATUS_VAR);
   1996             return float32_default_nan;
   1997         }
   1998         return packFloat32( zSign, 0xFF, 0 );
   1999     }
   2000     if ( bExp == 0xFF ) {
   2001         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   2002         return packFloat32( zSign, 0, 0 );
   2003     }
   2004     if ( bExp == 0 ) {
   2005         if ( bSig == 0 ) {
   2006             if ( ( aExp | aSig ) == 0 ) {
   2007                 float_raise( float_flag_invalid STATUS_VAR);
   2008                 return float32_default_nan;
   2009             }
   2010             float_raise( float_flag_divbyzero STATUS_VAR);
   2011             return packFloat32( zSign, 0xFF, 0 );
   2012         }
   2013         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2014     }
   2015     if ( aExp == 0 ) {
   2016         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   2017         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2018     }
   2019     zExp = aExp - bExp + 0x7D;
   2020     aSig = ( aSig | 0x00800000 )<<7;
   2021     bSig = ( bSig | 0x00800000 )<<8;
   2022     if ( bSig <= ( aSig + aSig ) ) {
   2023         aSig >>= 1;
   2024         ++zExp;
   2025     }
   2026     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
   2027     if ( ( zSig & 0x3F ) == 0 ) {
   2028         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
   2029     }
   2030     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   2031 
   2032 }
   2033 
   2034 /*----------------------------------------------------------------------------
   2035 | Returns the remainder of the single-precision floating-point value `a'
   2036 | with respect to the corresponding value `b'.  The operation is performed
   2037 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2038 *----------------------------------------------------------------------------*/
   2039 
   2040 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
   2041 {
   2042     flag aSign, zSign;
   2043     int_fast16_t aExp, bExp, expDiff;
   2044     uint32_t aSig, bSig;
   2045     uint32_t q;
   2046     uint64_t aSig64, bSig64, q64;
   2047     uint32_t alternateASig;
   2048     int32_t sigMean;
   2049     a = float32_squash_input_denormal(a STATUS_VAR);
   2050     b = float32_squash_input_denormal(b STATUS_VAR);
   2051 
   2052     aSig = extractFloat32Frac( a );
   2053     aExp = extractFloat32Exp( a );
   2054     aSign = extractFloat32Sign( a );
   2055     bSig = extractFloat32Frac( b );
   2056     bExp = extractFloat32Exp( b );
   2057     if ( aExp == 0xFF ) {
   2058         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   2059             return propagateFloat32NaN( a, b STATUS_VAR );
   2060         }
   2061         float_raise( float_flag_invalid STATUS_VAR);
   2062         return float32_default_nan;
   2063     }
   2064     if ( bExp == 0xFF ) {
   2065         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   2066         return a;
   2067     }
   2068     if ( bExp == 0 ) {
   2069         if ( bSig == 0 ) {
   2070             float_raise( float_flag_invalid STATUS_VAR);
   2071             return float32_default_nan;
   2072         }
   2073         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2074     }
   2075     if ( aExp == 0 ) {
   2076         if ( aSig == 0 ) return a;
   2077         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2078     }
   2079     expDiff = aExp - bExp;
   2080     aSig |= 0x00800000;
   2081     bSig |= 0x00800000;
   2082     if ( expDiff < 32 ) {
   2083         aSig <<= 8;
   2084         bSig <<= 8;
   2085         if ( expDiff < 0 ) {
   2086             if ( expDiff < -1 ) return a;
   2087             aSig >>= 1;
   2088         }
   2089         q = ( bSig <= aSig );
   2090         if ( q ) aSig -= bSig;
   2091         if ( 0 < expDiff ) {
   2092             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
   2093             q >>= 32 - expDiff;
   2094             bSig >>= 2;
   2095             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2096         }
   2097         else {
   2098             aSig >>= 2;
   2099             bSig >>= 2;
   2100         }
   2101     }
   2102     else {
   2103         if ( bSig <= aSig ) aSig -= bSig;
   2104         aSig64 = ( (uint64_t) aSig )<<40;
   2105         bSig64 = ( (uint64_t) bSig )<<40;
   2106         expDiff -= 64;
   2107         while ( 0 < expDiff ) {
   2108             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2109             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2110             aSig64 = - ( ( bSig * q64 )<<38 );
   2111             expDiff -= 62;
   2112         }
   2113         expDiff += 64;
   2114         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2115         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2116         q = q64>>( 64 - expDiff );
   2117         bSig <<= 6;
   2118         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2119     }
   2120     do {
   2121         alternateASig = aSig;
   2122         ++q;
   2123         aSig -= bSig;
   2124     } while ( 0 <= (int32_t) aSig );
   2125     sigMean = aSig + alternateASig;
   2126     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2127         aSig = alternateASig;
   2128     }
   2129     zSign = ( (int32_t) aSig < 0 );
   2130     if ( zSign ) aSig = - aSig;
   2131     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
   2132 
   2133 }
   2134 
   2135 /*----------------------------------------------------------------------------
   2136 | Returns the result of multiplying the single-precision floating-point values
   2137 | `a' and `b' then adding 'c', with no intermediate rounding step after the
   2138 | multiplication.  The operation is performed according to the IEC/IEEE
   2139 | Standard for Binary Floating-Point Arithmetic 754-2008.
   2140 | The flags argument allows the caller to select negation of the
   2141 | addend, the intermediate product, or the final result. (The difference
   2142 | between this and having the caller do a separate negation is that negating
   2143 | externally will flip the sign bit on NaNs.)
   2144 *----------------------------------------------------------------------------*/
   2145 
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
{
    /* Fused multiply-add: computes (a * b) + c with a single rounding at the
     * end.  `flags' may contain float_muladd_negate_c,
     * float_muladd_negate_product and/or float_muladd_negate_result.
     */
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    /* Pass every operand through the input-denormal squashing helper, then
     * split each one into fraction, biased exponent and sign.
     */
    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);
    c = float32_squash_input_denormal(c STATUS_VAR);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* infzero is set when the product is (zero * infinity) in either order */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
    }

    /* No operand is a NaN from here on; 0 * inf is an invalid operation */
    if (infzero) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float32_default_nan;
    }

    /* Apply the requested negations to the working signs only; the operand
     * encodings themselves are left untouched.
     */
    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    /* NaNs were rejected above, so exponent 0xff here means infinity */
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    /* c is an infinity */
    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    /* Infinite product plus finite c: the product's infinity wins */
    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (STATUS(float_rounding_mode) == float_round_down) {
                    /* zeroes of opposite sign sum to -0 when rounding down */
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    /* Both factors are finite and non-zero; normalize any subnormal factor */
    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    /* Set the implicit leading one and left-justify: aSig has its top bit at
     * bit 30 and bSig at bit 31, so the 64-bit product has its leading one
     * at bit 61 or bit 62.
     */
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    /* Bit 62 clear: shift up by one so the leading one is always at bit 62 */
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            /* Reduce the product to 32 bits, keeping lost bits as sticky */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig STATUS_VAR);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Place c's 23 fraction bits in bits 39..61 and its explicit leading one
     * at bit 62, matching the layout of pSig64.
     */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            /* carry into bit 63: shift back down, exponent stays as is */
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            /* no carry: step the exponent down by one instead */
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            /* product has the larger exponent: result keeps zSign */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c has the larger exponent: result takes the opposite sign */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            zSign ^= 1;
        } else {
            /* equal exponents: subtract the smaller significand from the
             * larger, flipping the sign when c is the larger one */
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    /* Reduce to a 32-bit significand (lost bits jammed into the LSB) and let
     * roundAndPackFloat32() perform the single final rounding.
     */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
}
   2340 
   2341 
   2342 /*----------------------------------------------------------------------------
   2343 | Returns the square root of the single-precision floating-point value `a'.
   2344 | The operation is performed according to the IEC/IEEE Standard for Binary
   2345 | Floating-Point Arithmetic.
   2346 *----------------------------------------------------------------------------*/
   2347 
   2348 float32 float32_sqrt( float32 a STATUS_PARAM )
   2349 {
   2350     flag aSign;
   2351     int_fast16_t aExp, zExp;
   2352     uint32_t aSig, zSig;
   2353     uint64_t rem, term;
   2354     a = float32_squash_input_denormal(a STATUS_VAR);
   2355 
   2356     aSig = extractFloat32Frac( a );
   2357     aExp = extractFloat32Exp( a );
   2358     aSign = extractFloat32Sign( a );
   2359     if ( aExp == 0xFF ) {
   2360         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2361         if ( ! aSign ) return a;
   2362         float_raise( float_flag_invalid STATUS_VAR);
   2363         return float32_default_nan;
   2364     }
   2365     if ( aSign ) {
   2366         if ( ( aExp | aSig ) == 0 ) return a;
   2367         float_raise( float_flag_invalid STATUS_VAR);
   2368         return float32_default_nan;
   2369     }
   2370     if ( aExp == 0 ) {
   2371         if ( aSig == 0 ) return float32_zero;
   2372         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2373     }
   2374     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2375     aSig = ( aSig | 0x00800000 )<<8;
   2376     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2377     if ( ( zSig & 0x7F ) <= 5 ) {
   2378         if ( zSig < 2 ) {
   2379             zSig = 0x7FFFFFFF;
   2380             goto roundAndPack;
   2381         }
   2382         aSig >>= aExp & 1;
   2383         term = ( (uint64_t) zSig ) * zSig;
   2384         rem = ( ( (uint64_t) aSig )<<32 ) - term;
   2385         while ( (int64_t) rem < 0 ) {
   2386             --zSig;
   2387             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
   2388         }
   2389         zSig |= ( rem != 0 );
   2390     }
   2391     shift32RightJamming( zSig, 1, &zSig );
   2392  roundAndPack:
   2393     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
   2394 
   2395 }
   2396 
   2397 /*----------------------------------------------------------------------------
   2398 | Returns the binary exponential of the single-precision floating-point value
   2399 | `a'. The operation is performed according to the IEC/IEEE Standard for
   2400 | Binary Floating-Point Arithmetic.
   2401 |
   2402 | Uses the following identities:
   2403 |
   2404 | 1. -------------------------------------------------------------------------
   2405 |      x    x*ln(2)
   2406 |     2  = e
   2407 |
   2408 | 2. -------------------------------------------------------------------------
   2409 |                      2     3     4     5           n
   2410 |      x        x     x     x     x     x           x
   2411 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
   2412 |               1!    2!    3!    4!    5!          n!
   2413 *----------------------------------------------------------------------------*/
   2414 
/* Taylor-series coefficients 1/n! for n = 1..15, stored as double-precision
 * bit patterns.  float32_exp2() multiplies entry [i] by x^(i+1).
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1/1!  */
    const_float64( 0x3fe0000000000000ll ), /*  1/2!  */
    const_float64( 0x3fc5555555555555ll ), /*  1/3!  */
    const_float64( 0x3fa5555555555555ll ), /*  1/4!  */
    const_float64( 0x3f81111111111111ll ), /*  1/5!  */
    const_float64( 0x3f56c16c16c16c17ll ), /*  1/6!  */
    const_float64( 0x3f2a01a01a01a01all ), /*  1/7!  */
    const_float64( 0x3efa01a01a01a01all ), /*  1/8!  */
    const_float64( 0x3ec71de3a556c734ll ), /*  1/9!  */
    const_float64( 0x3e927e4fb7789f5cll ), /*  1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /*  1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /*  1/12! */
    const_float64( 0x3de6124613a86d09ll ), /*  1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /*  1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /*  1/15! */
};
   2433 
   2434 float32 float32_exp2( float32 a STATUS_PARAM )
   2435 {
   2436     flag aSign;
   2437     int_fast16_t aExp;
   2438     uint32_t aSig;
   2439     float64 r, x, xn;
   2440     int i;
   2441     a = float32_squash_input_denormal(a STATUS_VAR);
   2442 
   2443     aSig = extractFloat32Frac( a );
   2444     aExp = extractFloat32Exp( a );
   2445     aSign = extractFloat32Sign( a );
   2446 
   2447     if ( aExp == 0xFF) {
   2448         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2449         return (aSign) ? float32_zero : a;
   2450     }
   2451     if (aExp == 0) {
   2452         if (aSig == 0) return float32_one;
   2453     }
   2454 
   2455     float_raise( float_flag_inexact STATUS_VAR);
   2456 
   2457     /* ******************************* */
   2458     /* using float64 for approximation */
   2459     /* ******************************* */
   2460     x = float32_to_float64(a STATUS_VAR);
   2461     x = float64_mul(x, float64_ln2 STATUS_VAR);
   2462 
   2463     xn = x;
   2464     r = float64_one;
   2465     for (i = 0 ; i < 15 ; i++) {
   2466         float64 f;
   2467 
   2468         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
   2469         r = float64_add(r, f STATUS_VAR);
   2470 
   2471         xn = float64_mul(xn, x STATUS_VAR);
   2472     }
   2473 
   2474     return float64_to_float32(r, status);
   2475 }
   2476 
   2477 /*----------------------------------------------------------------------------
   2478 | Returns the binary log of the single-precision floating-point value `a'.
   2479 | The operation is performed according to the IEC/IEEE Standard for Binary
   2480 | Floating-Point Arithmetic.
   2481 *----------------------------------------------------------------------------*/
   2482 float32 float32_log2( float32 a STATUS_PARAM )
   2483 {
   2484     flag aSign, zSign;
   2485     int_fast16_t aExp;
   2486     uint32_t aSig, zSig, i;
   2487 
   2488     a = float32_squash_input_denormal(a STATUS_VAR);
   2489     aSig = extractFloat32Frac( a );
   2490     aExp = extractFloat32Exp( a );
   2491     aSign = extractFloat32Sign( a );
   2492 
   2493     if ( aExp == 0 ) {
   2494         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
   2495         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2496     }
   2497     if ( aSign ) {
   2498         float_raise( float_flag_invalid STATUS_VAR);
   2499         return float32_default_nan;
   2500     }
   2501     if ( aExp == 0xFF ) {
   2502         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2503         return a;
   2504     }
   2505 
   2506     aExp -= 0x7F;
   2507     aSig |= 0x00800000;
   2508     zSign = aExp < 0;
   2509     zSig = aExp << 23;
   2510 
   2511     for (i = 1 << 22; i > 0; i >>= 1) {
   2512         aSig = ( (uint64_t)aSig * aSig ) >> 23;
   2513         if ( aSig & 0x01000000 ) {
   2514             aSig >>= 1;
   2515             zSig |= i;
   2516         }
   2517     }
   2518 
   2519     if ( zSign )
   2520         zSig = -zSig;
   2521 
   2522     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
   2523 }
   2524 
   2525 /*----------------------------------------------------------------------------
   2526 | Returns 1 if the single-precision floating-point value `a' is equal to
   2527 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2528 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   2529 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2530 *----------------------------------------------------------------------------*/
   2531 
   2532 int float32_eq( float32 a, float32 b STATUS_PARAM )
   2533 {
   2534     uint32_t av, bv;
   2535     a = float32_squash_input_denormal(a STATUS_VAR);
   2536     b = float32_squash_input_denormal(b STATUS_VAR);
   2537 
   2538     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2539          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2540        ) {
   2541         float_raise( float_flag_invalid STATUS_VAR);
   2542         return 0;
   2543     }
   2544     av = float32_val(a);
   2545     bv = float32_val(b);
   2546     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
   2547 }
   2548 
   2549 /*----------------------------------------------------------------------------
   2550 | Returns 1 if the single-precision floating-point value `a' is less than
   2551 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
   2552 | exception is raised if either operand is a NaN.  The comparison is performed
   2553 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2554 *----------------------------------------------------------------------------*/
   2555 
   2556 int float32_le( float32 a, float32 b STATUS_PARAM )
   2557 {
   2558     flag aSign, bSign;
   2559     uint32_t av, bv;
   2560     a = float32_squash_input_denormal(a STATUS_VAR);
   2561     b = float32_squash_input_denormal(b STATUS_VAR);
   2562 
   2563     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2564          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2565        ) {
   2566         float_raise( float_flag_invalid STATUS_VAR);
   2567         return 0;
   2568     }
   2569     aSign = extractFloat32Sign( a );
   2570     bSign = extractFloat32Sign( b );
   2571     av = float32_val(a);
   2572     bv = float32_val(b);
   2573     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
   2574     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2575 
   2576 }
   2577 
   2578 /*----------------------------------------------------------------------------
   2579 | Returns 1 if the single-precision floating-point value `a' is less than
   2580 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2581 | raised if either operand is a NaN.  The comparison is performed according
   2582 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2583 *----------------------------------------------------------------------------*/
   2584 
   2585 int float32_lt( float32 a, float32 b STATUS_PARAM )
   2586 {
   2587     flag aSign, bSign;
   2588     uint32_t av, bv;
   2589     a = float32_squash_input_denormal(a STATUS_VAR);
   2590     b = float32_squash_input_denormal(b STATUS_VAR);
   2591 
   2592     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2593          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2594        ) {
   2595         float_raise( float_flag_invalid STATUS_VAR);
   2596         return 0;
   2597     }
   2598     aSign = extractFloat32Sign( a );
   2599     bSign = extractFloat32Sign( b );
   2600     av = float32_val(a);
   2601     bv = float32_val(b);
   2602     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
   2603     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2604 
   2605 }
   2606 
   2607 /*----------------------------------------------------------------------------
   2608 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
   2609 | be compared, and 0 otherwise.  The invalid exception is raised if either
   2610 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
   2611 | Standard for Binary Floating-Point Arithmetic.
   2612 *----------------------------------------------------------------------------*/
   2613 
   2614 int float32_unordered( float32 a, float32 b STATUS_PARAM )
   2615 {
   2616     a = float32_squash_input_denormal(a STATUS_VAR);
   2617     b = float32_squash_input_denormal(b STATUS_VAR);
   2618 
   2619     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2620          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2621        ) {
   2622         float_raise( float_flag_invalid STATUS_VAR);
   2623         return 1;
   2624     }
   2625     return 0;
   2626 }
   2627 
   2628 /*----------------------------------------------------------------------------
   2629 | Returns 1 if the single-precision floating-point value `a' is equal to
   2630 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2631 | exception.  The comparison is performed according to the IEC/IEEE Standard
   2632 | for Binary Floating-Point Arithmetic.
   2633 *----------------------------------------------------------------------------*/
   2634 
   2635 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
   2636 {
   2637     a = float32_squash_input_denormal(a STATUS_VAR);
   2638     b = float32_squash_input_denormal(b STATUS_VAR);
   2639 
   2640     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2641          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2642        ) {
   2643         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2644             float_raise( float_flag_invalid STATUS_VAR);
   2645         }
   2646         return 0;
   2647     }
   2648     return ( float32_val(a) == float32_val(b) ) ||
   2649             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
   2650 }
   2651 
   2652 /*----------------------------------------------------------------------------
   2653 | Returns 1 if the single-precision floating-point value `a' is less than or
   2654 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2655 | cause an exception.  Otherwise, the comparison is performed according to the
   2656 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2657 *----------------------------------------------------------------------------*/
   2658 
   2659 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
   2660 {
   2661     flag aSign, bSign;
   2662     uint32_t av, bv;
   2663     a = float32_squash_input_denormal(a STATUS_VAR);
   2664     b = float32_squash_input_denormal(b STATUS_VAR);
   2665 
   2666     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2667          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2668        ) {
   2669         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2670             float_raise( float_flag_invalid STATUS_VAR);
   2671         }
   2672         return 0;
   2673     }
   2674     aSign = extractFloat32Sign( a );
   2675     bSign = extractFloat32Sign( b );
   2676     av = float32_val(a);
   2677     bv = float32_val(b);
   2678     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
   2679     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2680 
   2681 }
   2682 
   2683 /*----------------------------------------------------------------------------
   2684 | Returns 1 if the single-precision floating-point value `a' is less than
   2685 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2686 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2687 | Standard for Binary Floating-Point Arithmetic.
   2688 *----------------------------------------------------------------------------*/
   2689 
   2690 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
   2691 {
   2692     flag aSign, bSign;
   2693     uint32_t av, bv;
   2694     a = float32_squash_input_denormal(a STATUS_VAR);
   2695     b = float32_squash_input_denormal(b STATUS_VAR);
   2696 
   2697     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2698          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2699        ) {
   2700         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2701             float_raise( float_flag_invalid STATUS_VAR);
   2702         }
   2703         return 0;
   2704     }
   2705     aSign = extractFloat32Sign( a );
   2706     bSign = extractFloat32Sign( b );
   2707     av = float32_val(a);
   2708     bv = float32_val(b);
   2709     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
   2710     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2711 
   2712 }
   2713 
   2714 /*----------------------------------------------------------------------------
   2715 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
   2716 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
   2717 | comparison is performed according to the IEC/IEEE Standard for Binary
   2718 | Floating-Point Arithmetic.
   2719 *----------------------------------------------------------------------------*/
   2720 
   2721 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
   2722 {
   2723     a = float32_squash_input_denormal(a STATUS_VAR);
   2724     b = float32_squash_input_denormal(b STATUS_VAR);
   2725 
   2726     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2727          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2728        ) {
   2729         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2730             float_raise( float_flag_invalid STATUS_VAR);
   2731         }
   2732         return 1;
   2733     }
   2734     return 0;
   2735 }
   2736 
   2737 /*----------------------------------------------------------------------------
   2738 | Returns the result of converting the double-precision floating-point value
   2739 | `a' to the 32-bit two's complement integer format.  The conversion is
   2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2741 | Arithmetic---which means in particular that the conversion is rounded
   2742 | according to the current rounding mode.  If `a' is a NaN, the largest
   2743 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2744 | largest integer with the same sign as `a' is returned.
   2745 *----------------------------------------------------------------------------*/
   2746 
   2747 int32 float64_to_int32( float64 a STATUS_PARAM )
   2748 {
   2749     flag aSign;
   2750     int_fast16_t aExp, shiftCount;
   2751     uint64_t aSig;
   2752     a = float64_squash_input_denormal(a STATUS_VAR);
   2753 
   2754     aSig = extractFloat64Frac( a );
   2755     aExp = extractFloat64Exp( a );
   2756     aSign = extractFloat64Sign( a );
   2757     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2758     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2759     shiftCount = 0x42C - aExp;
   2760     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2761     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   2762 
   2763 }
   2764 
   2765 /*----------------------------------------------------------------------------
   2766 | Returns the result of converting the double-precision floating-point value
   2767 | `a' to the 32-bit two's complement integer format.  The conversion is
   2768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2769 | Arithmetic, except that the conversion is always rounded toward zero.
   2770 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2771 | the conversion overflows, the largest integer with the same sign as `a' is
   2772 | returned.
   2773 *----------------------------------------------------------------------------*/
   2774 
   2775 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
   2776 {
   2777     flag aSign;
   2778     int_fast16_t aExp, shiftCount;
   2779     uint64_t aSig, savedASig;
   2780     int32_t z;
   2781     a = float64_squash_input_denormal(a STATUS_VAR);
   2782 
   2783     aSig = extractFloat64Frac( a );
   2784     aExp = extractFloat64Exp( a );
   2785     aSign = extractFloat64Sign( a );
   2786     if ( 0x41E < aExp ) {
   2787         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2788         goto invalid;
   2789     }
   2790     else if ( aExp < 0x3FF ) {
   2791         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2792         return 0;
   2793     }
   2794     aSig |= LIT64( 0x0010000000000000 );
   2795     shiftCount = 0x433 - aExp;
   2796     savedASig = aSig;
   2797     aSig >>= shiftCount;
   2798     z = aSig;
   2799     if ( aSign ) z = - z;
   2800     if ( ( z < 0 ) ^ aSign ) {
   2801  invalid:
   2802         float_raise( float_flag_invalid STATUS_VAR);
   2803         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
   2804     }
   2805     if ( ( aSig<<shiftCount ) != savedASig ) {
   2806         STATUS(float_exception_flags) |= float_flag_inexact;
   2807     }
   2808     return z;
   2809 
   2810 }
   2811 
   2812 /*----------------------------------------------------------------------------
   2813 | Returns the result of converting the double-precision floating-point value
   2814 | `a' to the 16-bit two's complement integer format.  The conversion is
   2815 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2816 | Arithmetic, except that the conversion is always rounded toward zero.
   2817 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2818 | the conversion overflows, the largest integer with the same sign as `a' is
   2819 | returned.
   2820 *----------------------------------------------------------------------------*/
   2821 
   2822 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
   2823 {
   2824     flag aSign;
   2825     int_fast16_t aExp, shiftCount;
   2826     uint64_t aSig, savedASig;
   2827     int32 z;
   2828 
   2829     aSig = extractFloat64Frac( a );
   2830     aExp = extractFloat64Exp( a );
   2831     aSign = extractFloat64Sign( a );
   2832     if ( 0x40E < aExp ) {
   2833         if ( ( aExp == 0x7FF ) && aSig ) {
   2834             aSign = 0;
   2835         }
   2836         goto invalid;
   2837     }
   2838     else if ( aExp < 0x3FF ) {
   2839         if ( aExp || aSig ) {
   2840             STATUS(float_exception_flags) |= float_flag_inexact;
   2841         }
   2842         return 0;
   2843     }
   2844     aSig |= LIT64( 0x0010000000000000 );
   2845     shiftCount = 0x433 - aExp;
   2846     savedASig = aSig;
   2847     aSig >>= shiftCount;
   2848     z = aSig;
   2849     if ( aSign ) {
   2850         z = - z;
   2851     }
   2852     if ( ( (int16_t)z < 0 ) ^ aSign ) {
   2853  invalid:
   2854         float_raise( float_flag_invalid STATUS_VAR);
   2855         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
   2856     }
   2857     if ( ( aSig<<shiftCount ) != savedASig ) {
   2858         STATUS(float_exception_flags) |= float_flag_inexact;
   2859     }
   2860     return z;
   2861 }
   2862 
   2863 /*----------------------------------------------------------------------------
   2864 | Returns the result of converting the double-precision floating-point value
   2865 | `a' to the 64-bit two's complement integer format.  The conversion is
   2866 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2867 | Arithmetic---which means in particular that the conversion is rounded
   2868 | according to the current rounding mode.  If `a' is a NaN, the largest
   2869 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2870 | largest integer with the same sign as `a' is returned.
   2871 *----------------------------------------------------------------------------*/
   2872 
   2873 int64 float64_to_int64( float64 a STATUS_PARAM )
   2874 {
   2875     flag aSign;
   2876     int_fast16_t aExp, shiftCount;
   2877     uint64_t aSig, aSigExtra;
   2878     a = float64_squash_input_denormal(a STATUS_VAR);
   2879 
   2880     aSig = extractFloat64Frac( a );
   2881     aExp = extractFloat64Exp( a );
   2882     aSign = extractFloat64Sign( a );
   2883     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2884     shiftCount = 0x433 - aExp;
   2885     if ( shiftCount <= 0 ) {
   2886         if ( 0x43E < aExp ) {
   2887             float_raise( float_flag_invalid STATUS_VAR);
   2888             if (    ! aSign
   2889                  || (    ( aExp == 0x7FF )
   2890                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2891                ) {
   2892                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2893             }
   2894             return (int64_t) LIT64( 0x8000000000000000 );
   2895         }
   2896         aSigExtra = 0;
   2897         aSig <<= - shiftCount;
   2898     }
   2899     else {
   2900         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2901     }
   2902     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   2903 
   2904 }
   2905 
   2906 /*----------------------------------------------------------------------------
   2907 | Returns the result of converting the double-precision floating-point value
   2908 | `a' to the 64-bit two's complement integer format.  The conversion is
   2909 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2910 | Arithmetic, except that the conversion is always rounded toward zero.
   2911 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2912 | the conversion overflows, the largest integer with the same sign as `a' is
   2913 | returned.
   2914 *----------------------------------------------------------------------------*/
   2915 
   2916 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
   2917 {
   2918     flag aSign;
   2919     int_fast16_t aExp, shiftCount;
   2920     uint64_t aSig;
   2921     int64 z;
   2922     a = float64_squash_input_denormal(a STATUS_VAR);
   2923 
   2924     aSig = extractFloat64Frac( a );
   2925     aExp = extractFloat64Exp( a );
   2926     aSign = extractFloat64Sign( a );
   2927     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2928     shiftCount = aExp - 0x433;
   2929     if ( 0 <= shiftCount ) {
   2930         if ( 0x43E <= aExp ) {
   2931             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
   2932                 float_raise( float_flag_invalid STATUS_VAR);
   2933                 if (    ! aSign
   2934                      || (    ( aExp == 0x7FF )
   2935                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2936                    ) {
   2937                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2938                 }
   2939             }
   2940             return (int64_t) LIT64( 0x8000000000000000 );
   2941         }
   2942         z = aSig<<shiftCount;
   2943     }
   2944     else {
   2945         if ( aExp < 0x3FE ) {
   2946             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2947             return 0;
   2948         }
   2949         z = aSig>>( - shiftCount );
   2950         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
   2951             STATUS(float_exception_flags) |= float_flag_inexact;
   2952         }
   2953     }
   2954     if ( aSign ) z = - z;
   2955     return z;
   2956 
   2957 }
   2958 
   2959 /*----------------------------------------------------------------------------
   2960 | Returns the result of converting the double-precision floating-point value
   2961 | `a' to the single-precision floating-point format.  The conversion is
   2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2963 | Arithmetic.
   2964 *----------------------------------------------------------------------------*/
   2965 
   2966 float32 float64_to_float32( float64 a STATUS_PARAM )
   2967 {
   2968     flag aSign;
   2969     int_fast16_t aExp;
   2970     uint64_t aSig;
   2971     uint32_t zSig;
   2972     a = float64_squash_input_denormal(a STATUS_VAR);
   2973 
   2974     aSig = extractFloat64Frac( a );
   2975     aExp = extractFloat64Exp( a );
   2976     aSign = extractFloat64Sign( a );
   2977     if ( aExp == 0x7FF ) {
   2978         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   2979         return packFloat32( aSign, 0xFF, 0 );
   2980     }
   2981     shift64RightJamming( aSig, 22, &aSig );
   2982     zSig = aSig;
   2983     if ( aExp || zSig ) {
   2984         zSig |= 0x40000000;
   2985         aExp -= 0x381;
   2986     }
   2987     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   2988 
   2989 }
   2990 
   2991 
   2992 /*----------------------------------------------------------------------------
   2993 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
   2994 | half-precision floating-point value, returning the result.  After being
   2995 | shifted into the proper positions, the three fields are simply added
   2996 | together to form the result.  This means that any integer portion of `zSig'
   2997 | will be added into the exponent.  Since a properly normalized significand
   2998 | will have an integer portion equal to 1, the `zExp' input should be 1 less
   2999 | than the desired result exponent whenever `zSig' is a complete, normalized
   3000 | significand.
   3001 *----------------------------------------------------------------------------*/
   3002 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
   3003 {
   3004     return make_float16(
   3005         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
   3006 }
   3007 
   3008 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
   3009    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
   3010 
   3011 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
   3012 {
   3013     flag aSign;
   3014     int_fast16_t aExp;
   3015     uint32_t aSig;
   3016 
   3017     aSign = extractFloat16Sign(a);
   3018     aExp = extractFloat16Exp(a);
   3019     aSig = extractFloat16Frac(a);
   3020 
   3021     if (aExp == 0x1f && ieee) {
   3022         if (aSig) {
   3023             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
   3024         }
   3025         return packFloat32(aSign, 0xff, 0);
   3026     }
   3027     if (aExp == 0) {
   3028         int8 shiftCount;
   3029 
   3030         if (aSig == 0) {
   3031             return packFloat32(aSign, 0, 0);
   3032         }
   3033 
   3034         shiftCount = countLeadingZeros32( aSig ) - 21;
   3035         aSig = aSig << shiftCount;
   3036         aExp = -shiftCount;
   3037     }
   3038     return packFloat32( aSign, aExp + 0x70, aSig << 13);
   3039 }
   3040 
   3041 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
   3042 {
   3043     flag aSign;
   3044     int_fast16_t aExp;
   3045     uint32_t aSig;
   3046     uint32_t mask;
   3047     uint32_t increment;
   3048     int8 roundingMode;
   3049     a = float32_squash_input_denormal(a STATUS_VAR);
   3050 
   3051     aSig = extractFloat32Frac( a );
   3052     aExp = extractFloat32Exp( a );
   3053     aSign = extractFloat32Sign( a );
   3054     if ( aExp == 0xFF ) {
   3055         if (aSig) {
   3056             /* Input is a NaN */
   3057             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   3058             if (!ieee) {
   3059                 return packFloat16(aSign, 0, 0);
   3060             }
   3061             return r;
   3062         }
   3063         /* Infinity */
   3064         if (!ieee) {
   3065             float_raise(float_flag_invalid STATUS_VAR);
   3066             return packFloat16(aSign, 0x1f, 0x3ff);
   3067         }
   3068         return packFloat16(aSign, 0x1f, 0);
   3069     }
   3070     if (aExp == 0 && aSig == 0) {
   3071         return packFloat16(aSign, 0, 0);
   3072     }
   3073     /* Decimal point between bits 22 and 23.  */
   3074     aSig |= 0x00800000;
   3075     aExp -= 0x7f;
   3076     if (aExp < -14) {
   3077         mask = 0x00ffffff;
   3078         if (aExp >= -24) {
   3079             mask >>= 25 + aExp;
   3080         }
   3081     } else {
   3082         mask = 0x00001fff;
   3083     }
   3084     if (aSig & mask) {
   3085         float_raise( float_flag_underflow STATUS_VAR );
   3086         roundingMode = STATUS(float_rounding_mode);
   3087         switch (roundingMode) {
   3088         case float_round_nearest_even:
   3089             increment = (mask + 1) >> 1;
   3090             if ((aSig & mask) == increment) {
   3091                 increment = aSig & (increment << 1);
   3092             }
   3093             break;
   3094         case float_round_up:
   3095             increment = aSign ? 0 : mask;
   3096             break;
   3097         case float_round_down:
   3098             increment = aSign ? mask : 0;
   3099             break;
   3100         default: /* round_to_zero */
   3101             increment = 0;
   3102             break;
   3103         }
   3104         aSig += increment;
   3105         if (aSig >= 0x01000000) {
   3106             aSig >>= 1;
   3107             aExp++;
   3108         }
   3109     } else if (aExp < -14
   3110           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
   3111         float_raise( float_flag_underflow STATUS_VAR);
   3112     }
   3113 
   3114     if (ieee) {
   3115         if (aExp > 15) {
   3116             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
   3117             return packFloat16(aSign, 0x1f, 0);
   3118         }
   3119     } else {
   3120         if (aExp > 16) {
   3121             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
   3122             return packFloat16(aSign, 0x1f, 0x3ff);
   3123         }
   3124     }
   3125     if (aExp < -24) {
   3126         return packFloat16(aSign, 0, 0);
   3127     }
   3128     if (aExp < -14) {
   3129         aSig >>= -14 - aExp;
   3130         aExp = -14;
   3131     }
   3132     return packFloat16(aSign, aExp + 14, aSig >> 13);
   3133 }
   3134 
   3135 /*----------------------------------------------------------------------------
   3136 | Returns the result of converting the double-precision floating-point value
   3137 | `a' to the extended double-precision floating-point format.  The conversion
   3138 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   3139 | Arithmetic.
   3140 *----------------------------------------------------------------------------*/
   3141 
   3142 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
   3143 {
   3144     flag aSign;
   3145     int_fast16_t aExp;
   3146     uint64_t aSig;
   3147 
   3148     a = float64_squash_input_denormal(a STATUS_VAR);
   3149     aSig = extractFloat64Frac( a );
   3150     aExp = extractFloat64Exp( a );
   3151     aSign = extractFloat64Sign( a );
   3152     if ( aExp == 0x7FF ) {
   3153         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   3154         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3155     }
   3156     if ( aExp == 0 ) {
   3157         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   3158         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3159     }
   3160     return
   3161         packFloatx80(
   3162             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   3163 
   3164 }
   3165 
   3166 /*----------------------------------------------------------------------------
   3167 | Returns the result of converting the double-precision floating-point value
   3168 | `a' to the quadruple-precision floating-point format.  The conversion is
   3169 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   3170 | Arithmetic.
   3171 *----------------------------------------------------------------------------*/
   3172 
   3173 float128 float64_to_float128( float64 a STATUS_PARAM )
   3174 {
   3175     flag aSign;
   3176     int_fast16_t aExp;
   3177     uint64_t aSig, zSig0, zSig1;
   3178 
   3179     a = float64_squash_input_denormal(a STATUS_VAR);
   3180     aSig = extractFloat64Frac( a );
   3181     aExp = extractFloat64Exp( a );
   3182     aSign = extractFloat64Sign( a );
   3183     if ( aExp == 0x7FF ) {
   3184         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   3185         return packFloat128( aSign, 0x7FFF, 0, 0 );
   3186     }
   3187     if ( aExp == 0 ) {
   3188         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   3189         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3190         --aExp;
   3191     }
   3192     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   3193     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   3194 
   3195 }
   3196 
   3197 /*----------------------------------------------------------------------------
   3198 | Rounds the double-precision floating-point value `a' to an integer, and
   3199 | returns the result as a double-precision floating-point value.  The
   3200 | operation is performed according to the IEC/IEEE Standard for Binary
   3201 | Floating-Point Arithmetic.
   3202 *----------------------------------------------------------------------------*/
   3203 
   3204 float64 float64_round_to_int( float64 a STATUS_PARAM )
   3205 {
   3206     flag aSign;
   3207     int_fast16_t aExp;
   3208     uint64_t lastBitMask, roundBitsMask;
   3209     int8 roundingMode;
   3210     uint64_t z;
   3211     a = float64_squash_input_denormal(a STATUS_VAR);
   3212 
   3213     aExp = extractFloat64Exp( a );
   3214     if ( 0x433 <= aExp ) {
   3215         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   3216             return propagateFloat64NaN( a, a STATUS_VAR );
   3217         }
   3218         return a;
   3219     }
   3220     if ( aExp < 0x3FF ) {
   3221         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
   3222         STATUS(float_exception_flags) |= float_flag_inexact;
   3223         aSign = extractFloat64Sign( a );
   3224         switch ( STATUS(float_rounding_mode) ) {
   3225          case float_round_nearest_even:
   3226             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   3227                 return packFloat64( aSign, 0x3FF, 0 );
   3228             }
   3229             break;
   3230          case float_round_down:
   3231             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
   3232          case float_round_up:
   3233             return make_float64(
   3234             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
   3235         }
   3236         return packFloat64( aSign, 0, 0 );
   3237     }
   3238     lastBitMask = 1;
   3239     lastBitMask <<= 0x433 - aExp;
   3240     roundBitsMask = lastBitMask - 1;
   3241     z = float64_val(a);
   3242     roundingMode = STATUS(float_rounding_mode);
   3243     if ( roundingMode == float_round_nearest_even ) {
   3244         z += lastBitMask>>1;
   3245         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   3246     }
   3247     else if ( roundingMode != float_round_to_zero ) {
   3248         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
   3249             z += roundBitsMask;
   3250         }
   3251     }
   3252     z &= ~ roundBitsMask;
   3253     if ( z != float64_val(a) )
   3254         STATUS(float_exception_flags) |= float_flag_inexact;
   3255     return make_float64(z);
   3256 
   3257 }
   3258 
   3259 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
   3260 {
   3261     int oldmode;
   3262     float64 res;
   3263     oldmode = STATUS(float_rounding_mode);
   3264     STATUS(float_rounding_mode) = float_round_to_zero;
   3265     res = float64_round_to_int(a STATUS_VAR);
   3266     STATUS(float_rounding_mode) = oldmode;
   3267     return res;
   3268 }
   3269 
   3270 /*----------------------------------------------------------------------------
   3271 | Returns the result of adding the absolute values of the double-precision
   3272 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   3273 | before being returned.  `zSign' is ignored if the result is a NaN.
   3274 | The addition is performed according to the IEC/IEEE Standard for Binary
   3275 | Floating-Point Arithmetic.
   3276 *----------------------------------------------------------------------------*/
   3277 
   3278 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   3279 {
   3280     int_fast16_t aExp, bExp, zExp;
   3281     uint64_t aSig, bSig, zSig;
   3282     int_fast16_t expDiff;
   3283 
   3284     aSig = extractFloat64Frac( a );
   3285     aExp = extractFloat64Exp( a );
   3286     bSig = extractFloat64Frac( b );
   3287     bExp = extractFloat64Exp( b );
   3288     expDiff = aExp - bExp;
   3289     aSig <<= 9;
   3290     bSig <<= 9;
   3291     if ( 0 < expDiff ) {
   3292         if ( aExp == 0x7FF ) {
   3293             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3294             return a;
   3295         }
   3296         if ( bExp == 0 ) {
   3297             --expDiff;
   3298         }
   3299         else {
   3300             bSig |= LIT64( 0x2000000000000000 );
   3301         }
   3302         shift64RightJamming( bSig, expDiff, &bSig );
   3303         zExp = aExp;
   3304     }
   3305     else if ( expDiff < 0 ) {
   3306         if ( bExp == 0x7FF ) {
   3307             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3308             return packFloat64( zSign, 0x7FF, 0 );
   3309         }
   3310         if ( aExp == 0 ) {
   3311             ++expDiff;
   3312         }
   3313         else {
   3314             aSig |= LIT64( 0x2000000000000000 );
   3315         }
   3316         shift64RightJamming( aSig, - expDiff, &aSig );
   3317         zExp = bExp;
   3318     }
   3319     else {
   3320         if ( aExp == 0x7FF ) {
   3321             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3322             return a;
   3323         }
   3324         if ( aExp == 0 ) {
   3325             if (STATUS(flush_to_zero)) {
   3326                 if (aSig | bSig) {
   3327                     float_raise(float_flag_output_denormal STATUS_VAR);
   3328                 }
   3329                 return packFloat64(zSign, 0, 0);
   3330             }
   3331             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   3332         }
   3333         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   3334         zExp = aExp;
   3335         goto roundAndPack;
   3336     }
   3337     aSig |= LIT64( 0x2000000000000000 );
   3338     zSig = ( aSig + bSig )<<1;
   3339     --zExp;
   3340     if ( (int64_t) zSig < 0 ) {
   3341         zSig = aSig + bSig;
   3342         ++zExp;
   3343     }
   3344  roundAndPack:
   3345     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3346 
   3347 }
   3348 
   3349 /*----------------------------------------------------------------------------
   3350 | Returns the result of subtracting the absolute values of the double-
   3351 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3352 | difference is negated before being returned.  `zSign' is ignored if the
   3353 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3354 | Standard for Binary Floating-Point Arithmetic.
   3355 *----------------------------------------------------------------------------*/
   3356 
   3357 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   3358 {
   3359     int_fast16_t aExp, bExp, zExp;
   3360     uint64_t aSig, bSig, zSig;
   3361     int_fast16_t expDiff;
   3362 
   3363     aSig = extractFloat64Frac( a );
   3364     aExp = extractFloat64Exp( a );
   3365     bSig = extractFloat64Frac( b );
   3366     bExp = extractFloat64Exp( b );
   3367     expDiff = aExp - bExp;
   3368     aSig <<= 10;
   3369     bSig <<= 10;
   3370     if ( 0 < expDiff ) goto aExpBigger;
   3371     if ( expDiff < 0 ) goto bExpBigger;
   3372     if ( aExp == 0x7FF ) {
   3373         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3374         float_raise( float_flag_invalid STATUS_VAR);
   3375         return float64_default_nan;
   3376     }
   3377     if ( aExp == 0 ) {
   3378         aExp = 1;
   3379         bExp = 1;
   3380     }
   3381     if ( bSig < aSig ) goto aBigger;
   3382     if ( aSig < bSig ) goto bBigger;
   3383     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   3384  bExpBigger:
   3385     if ( bExp == 0x7FF ) {
   3386         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3387         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   3388     }
   3389     if ( aExp == 0 ) {
   3390         ++expDiff;
   3391     }
   3392     else {
   3393         aSig |= LIT64( 0x4000000000000000 );
   3394     }
   3395     shift64RightJamming( aSig, - expDiff, &aSig );
   3396     bSig |= LIT64( 0x4000000000000000 );
   3397  bBigger:
   3398     zSig = bSig - aSig;
   3399     zExp = bExp;
   3400     zSign ^= 1;
   3401     goto normalizeRoundAndPack;
   3402  aExpBigger:
   3403     if ( aExp == 0x7FF ) {
   3404         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3405         return a;
   3406     }
   3407     if ( bExp == 0 ) {
   3408         --expDiff;
   3409     }
   3410     else {
   3411         bSig |= LIT64( 0x4000000000000000 );
   3412     }
   3413     shift64RightJamming( bSig, expDiff, &bSig );
   3414     aSig |= LIT64( 0x4000000000000000 );
   3415  aBigger:
   3416     zSig = aSig - bSig;
   3417     zExp = aExp;
   3418  normalizeRoundAndPack:
   3419     --zExp;
   3420     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3421 
   3422 }
   3423 
   3424 /*----------------------------------------------------------------------------
   3425 | Returns the result of adding the double-precision floating-point values `a'
   3426 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   3427 | Binary Floating-Point Arithmetic.
   3428 *----------------------------------------------------------------------------*/
   3429 
   3430 float64 float64_add( float64 a, float64 b STATUS_PARAM )
   3431 {
   3432     flag aSign, bSign;
   3433     a = float64_squash_input_denormal(a STATUS_VAR);
   3434     b = float64_squash_input_denormal(b STATUS_VAR);
   3435 
   3436     aSign = extractFloat64Sign( a );
   3437     bSign = extractFloat64Sign( b );
   3438     if ( aSign == bSign ) {
   3439         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   3440     }
   3441     else {
   3442         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   3443     }
   3444 
   3445 }
   3446 
   3447 /*----------------------------------------------------------------------------
   3448 | Returns the result of subtracting the double-precision floating-point values
   3449 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   3450 | for Binary Floating-Point Arithmetic.
   3451 *----------------------------------------------------------------------------*/
   3452 
   3453 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
   3454 {
   3455     flag aSign, bSign;
   3456     a = float64_squash_input_denormal(a STATUS_VAR);
   3457     b = float64_squash_input_denormal(b STATUS_VAR);
   3458 
   3459     aSign = extractFloat64Sign( a );
   3460     bSign = extractFloat64Sign( b );
   3461     if ( aSign == bSign ) {
   3462         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   3463     }
   3464     else {
   3465         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   3466     }
   3467 
   3468 }
   3469 
   3470 /*----------------------------------------------------------------------------
   3471 | Returns the result of multiplying the double-precision floating-point values
   3472 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   3473 | for Binary Floating-Point Arithmetic.
   3474 *----------------------------------------------------------------------------*/
   3475 
   3476 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
   3477 {
   3478     flag aSign, bSign, zSign;
   3479     int_fast16_t aExp, bExp, zExp;
   3480     uint64_t aSig, bSig, zSig0, zSig1;
   3481 
   3482     a = float64_squash_input_denormal(a STATUS_VAR);
   3483     b = float64_squash_input_denormal(b STATUS_VAR);
   3484 
   3485     aSig = extractFloat64Frac( a );
   3486     aExp = extractFloat64Exp( a );
   3487     aSign = extractFloat64Sign( a );
   3488     bSig = extractFloat64Frac( b );
   3489     bExp = extractFloat64Exp( b );
   3490     bSign = extractFloat64Sign( b );
   3491     zSign = aSign ^ bSign;
   3492     if ( aExp == 0x7FF ) {
   3493         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   3494             return propagateFloat64NaN( a, b STATUS_VAR );
   3495         }
   3496         if ( ( bExp | bSig ) == 0 ) {
   3497             float_raise( float_flag_invalid STATUS_VAR);
   3498             return float64_default_nan;
   3499         }
   3500         return packFloat64( zSign, 0x7FF, 0 );
   3501     }
   3502     if ( bExp == 0x7FF ) {
   3503         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3504         if ( ( aExp | aSig ) == 0 ) {
   3505             float_raise( float_flag_invalid STATUS_VAR);
   3506             return float64_default_nan;
   3507         }
   3508         return packFloat64( zSign, 0x7FF, 0 );
   3509     }
   3510     if ( aExp == 0 ) {
   3511         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   3512         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3513     }
   3514     if ( bExp == 0 ) {
   3515         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   3516         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3517     }
   3518     zExp = aExp + bExp - 0x3FF;
   3519     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   3520     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3521     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3522     zSig0 |= ( zSig1 != 0 );
   3523     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
   3524         zSig0 <<= 1;
   3525         --zExp;
   3526     }
   3527     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
   3528 
   3529 }
   3530 
   3531 /*----------------------------------------------------------------------------
   3532 | Returns the result of dividing the double-precision floating-point value `a'
   3533 | by the corresponding value `b'.  The operation is performed according to
   3534 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3535 *----------------------------------------------------------------------------*/
   3536 
   3537 float64 float64_div( float64 a, float64 b STATUS_PARAM )
   3538 {
   3539     flag aSign, bSign, zSign;
   3540     int_fast16_t aExp, bExp, zExp;
   3541     uint64_t aSig, bSig, zSig;
   3542     uint64_t rem0, rem1;
   3543     uint64_t term0, term1;
   3544     a = float64_squash_input_denormal(a STATUS_VAR);
   3545     b = float64_squash_input_denormal(b STATUS_VAR);
   3546 
   3547     aSig = extractFloat64Frac( a );
   3548     aExp = extractFloat64Exp( a );
   3549     aSign = extractFloat64Sign( a );
   3550     bSig = extractFloat64Frac( b );
   3551     bExp = extractFloat64Exp( b );
   3552     bSign = extractFloat64Sign( b );
   3553     zSign = aSign ^ bSign;
   3554     if ( aExp == 0x7FF ) {
   3555         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3556         if ( bExp == 0x7FF ) {
   3557             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3558             float_raise( float_flag_invalid STATUS_VAR);
   3559             return float64_default_nan;
   3560         }
   3561         return packFloat64( zSign, 0x7FF, 0 );
   3562     }
   3563     if ( bExp == 0x7FF ) {
   3564         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3565         return packFloat64( zSign, 0, 0 );
   3566     }
   3567     if ( bExp == 0 ) {
   3568         if ( bSig == 0 ) {
   3569             if ( ( aExp | aSig ) == 0 ) {
   3570                 float_raise( float_flag_invalid STATUS_VAR);
   3571                 return float64_default_nan;
   3572             }
   3573             float_raise( float_flag_divbyzero STATUS_VAR);
   3574             return packFloat64( zSign, 0x7FF, 0 );
   3575         }
   3576         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3577     }
   3578     if ( aExp == 0 ) {
   3579         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   3580         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3581     }
   3582     zExp = aExp - bExp + 0x3FD;
   3583     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   3584     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3585     if ( bSig <= ( aSig + aSig ) ) {
   3586         aSig >>= 1;
   3587         ++zExp;
   3588     }
   3589     zSig = estimateDiv128To64( aSig, 0, bSig );
   3590     if ( ( zSig & 0x1FF ) <= 2 ) {
   3591         mul64To128( bSig, zSig, &term0, &term1 );
   3592         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3593         while ( (int64_t) rem0 < 0 ) {
   3594             --zSig;
   3595             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3596         }
   3597         zSig |= ( rem1 != 0 );
   3598     }
   3599     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3600 
   3601 }
   3602 
   3603 /*----------------------------------------------------------------------------
   3604 | Returns the remainder of the double-precision floating-point value `a'
   3605 | with respect to the corresponding value `b'.  The operation is performed
   3606 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3607 *----------------------------------------------------------------------------*/
   3608 
   3609 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
   3610 {
            /* Works on the raw sign/exponent/fraction fields of `a' and `b':
             * special operands are dealt with first, then the quotient is
             * estimated in chunks and the final candidate remainder is chosen. */
   3611     flag aSign, zSign;
   3612     int_fast16_t aExp, bExp, expDiff;
   3613     uint64_t aSig, bSig;
   3614     uint64_t q, alternateASig;
   3615     int64_t sigMean;
   3616 
   3617     a = float64_squash_input_denormal(a STATUS_VAR);
   3618     b = float64_squash_input_denormal(b STATUS_VAR);
   3619     aSig = extractFloat64Frac( a );
   3620     aExp = extractFloat64Exp( a );
   3621     aSign = extractFloat64Sign( a );
   3622     bSig = extractFloat64Frac( b );
   3623     bExp = extractFloat64Exp( b );
            /* `a' has the all-ones exponent (infinity or NaN): propagate a NaN
             * operand if there is one, otherwise the operation is invalid. */
   3624     if ( aExp == 0x7FF ) {
   3625         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   3626             return propagateFloat64NaN( a, b STATUS_VAR );
   3627         }
   3628         float_raise( float_flag_invalid STATUS_VAR);
   3629         return float64_default_nan;
   3630     }
            /* `b' is a NaN (propagate) or an infinity (`a' is returned unchanged). */
   3631     if ( bExp == 0x7FF ) {
   3632         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3633         return a;
   3634     }
            /* `b' is zero (invalid) or subnormal (normalize it). */
   3635     if ( bExp == 0 ) {
   3636         if ( bSig == 0 ) {
   3637             float_raise( float_flag_invalid STATUS_VAR);
   3638             return float64_default_nan;
   3639         }
   3640         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3641     }
            /* `a' is zero (returned as is) or subnormal (normalize it). */
   3642     if ( aExp == 0 ) {
   3643         if ( aSig == 0 ) return a;
   3644         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3645     }
   3646     expDiff = aExp - bExp;
            /* Set the implicit leading bit (bit 52) and move it up to bit 63. */
   3647     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   3648     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
            /* a's exponent is below b's: more than one step below, `a' is returned
             * unchanged; exactly one step below, aSig is halved. */
   3649     if ( expDiff < 0 ) {
   3650         if ( expDiff < -1 ) return a;
   3651         aSig >>= 1;
   3652     }
            /* First quotient bit. */
   3653     q = ( bSig <= aSig );
   3654     if ( q ) aSig -= bSig;
   3655     expDiff -= 64;
            /* Consume the exponent gap 62 bits per pass: take an estimated
             * quotient, back it off by 2 (clamped at 0), and replace aSig by the
             * negated product of that quotient with bSig>>2. */
   3656     while ( 0 < expDiff ) {
   3657         q = estimateDiv128To64( aSig, 0, bSig );
   3658         q = ( 2 < q ) ? q - 2 : 0;
   3659         aSig = - ( ( bSig>>2 ) * q );
   3660         expDiff -= 62;
   3661     }
   3662     expDiff += 64;
            /* Handle the remaining expDiff bits, if any, scaling q and aSig to
             * match; otherwise just bring both significands to the >>2 scale. */
   3663     if ( 0 < expDiff ) {
   3664         q = estimateDiv128To64( aSig, 0, bSig );
   3665         q = ( 2 < q ) ? q - 2 : 0;
   3666         q >>= 64 - expDiff;
   3667         bSig >>= 2;
   3668         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   3669     }
   3670     else {
   3671         aSig >>= 2;
   3672         bSig >>= 2;
   3673     }
            /* Keep subtracting bSig until aSig turns negative; alternateASig holds
             * the last non-negative value and q counts the subtractions. */
   3674     do {
   3675         alternateASig = aSig;
   3676         ++q;
   3677         aSig -= bSig;
   3678     } while ( 0 <= (int64_t) aSig );
            /* Choose between the first negative value and the last non-negative
             * one: the non-negative one is kept when their sum is negative, or
             * when the sum is zero and q is odd. */
   3679     sigMean = aSig + alternateASig;
   3680     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   3681         aSig = alternateASig;
   3682     }
            /* A negative remainder is passed on as a magnitude plus a flipped
             * result sign. */
   3683     zSign = ( (int64_t) aSig < 0 );
   3684     if ( zSign ) aSig = - aSig;
   3685     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
   3686 
   3687 }
   3688 
   3689 /*----------------------------------------------------------------------------
   3690 | Returns the result of multiplying the double-precision floating-point values
   3691 | `a' and `b' then adding 'c', with no intermediate rounding step after the
   3692 | multiplication.  The operation is performed according to the IEC/IEEE
   3693 | Standard for Binary Floating-Point Arithmetic 754-2008.
   3694 | The flags argument allows the caller to select negation of the
   3695 | addend, the intermediate product, or the final result. (The difference
   3696 | between this and having the caller do a separate negation is that negating
   3697 | externally will flip the sign bit on NaNs.)
   3698 *----------------------------------------------------------------------------*/
   3699 
   3700 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
   3701 {
            /* Outline: unpack the three operands, handle NaN / infinity / zero
             * special cases, form the exact 128-bit product significand, then add
             * or subtract the aligned addend and round once at the end. */
   3702     flag aSign, bSign, cSign, zSign;
   3703     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
   3704     uint64_t aSig, bSig, cSig;
   3705     flag pInf, pZero, pSign;
   3706     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
   3707     int shiftcount;
   3708     flag signflip, infzero;
   3709 
   3710     a = float64_squash_input_denormal(a STATUS_VAR);
   3711     b = float64_squash_input_denormal(b STATUS_VAR);
   3712     c = float64_squash_input_denormal(c STATUS_VAR);
   3713     aSig = extractFloat64Frac(a);
   3714     aExp = extractFloat64Exp(a);
   3715     aSign = extractFloat64Sign(a);
   3716     bSig = extractFloat64Frac(b);
   3717     bExp = extractFloat64Exp(b);
   3718     bSign = extractFloat64Sign(b);
   3719     cSig = extractFloat64Frac(c);
   3720     cExp = extractFloat64Exp(c);
   3721     cSign = extractFloat64Sign(c);
   3722 
            /* True when the product is zero times infinity (in either order). */
   3723     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
   3724                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
   3725 
   3726     /* It is implementation-defined whether the cases of (0,inf,qnan)
   3727      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
   3728      * they return if they do), so we have to hand this information
   3729      * off to the target-specific pick-a-NaN routine.
   3730      */
   3731     if (((aExp == 0x7ff) && aSig) ||
   3732         ((bExp == 0x7ff) && bSig) ||
   3733         ((cExp == 0x7ff) && cSig)) {
   3734         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
   3735     }
   3736 
   3737     if (infzero) {
   3738         float_raise(float_flag_invalid STATUS_VAR);
   3739         return float64_default_nan;
   3740     }
   3741 
   3742     if (flags & float_muladd_negate_c) {
   3743         cSign ^= 1;
   3744     }
   3745 
            /* signflip is XORed into the sign of every result returned below. */
   3746     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
   3747 
   3748     /* Work out the sign and type of the product */
   3749     pSign = aSign ^ bSign;
   3750     if (flags & float_muladd_negate_product) {
   3751         pSign ^= 1;
   3752     }
   3753     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
   3754     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
   3755 
            /* NaNs were returned above, so cExp == 0x7ff means c is infinite. */
   3756     if (cExp == 0x7ff) {
   3757         if (pInf && (pSign ^ cSign)) {
   3758             /* addition of opposite-signed infinities => InvalidOperation */
   3759             float_raise(float_flag_invalid STATUS_VAR);
   3760             return float64_default_nan;
   3761         }
   3762         /* Otherwise generate an infinity of the same sign */
   3763         return packFloat64(cSign ^ signflip, 0x7ff, 0);
   3764     }
   3765 
            /* Infinite product with a finite addend. */
   3766     if (pInf) {
   3767         return packFloat64(pSign ^ signflip, 0x7ff, 0);
   3768     }
   3769 
   3770     if (pZero) {
   3771         if (cExp == 0) {
   3772             if (cSig == 0) {
   3773                 /* Adding two exact zeroes */
   3774                 if (pSign == cSign) {
   3775                     zSign = pSign;
   3776                 } else if (STATUS(float_rounding_mode) == float_round_down) {
   3777                     zSign = 1;
   3778                 } else {
   3779                     zSign = 0;
   3780                 }
   3781                 return packFloat64(zSign ^ signflip, 0, 0);
   3782             }
   3783             /* Exact zero plus a denorm */
   3784             if (STATUS(flush_to_zero)) {
   3785                 float_raise(float_flag_output_denormal STATUS_VAR);
   3786                 return packFloat64(cSign ^ signflip, 0, 0);
   3787             }
   3788         }
   3789         /* Zero plus something non-zero : just return the something */
   3790         return packFloat64(cSign ^ signflip, cExp, cSig);
   3791     }
   3792 
            /* Both factors are finite and non-zero from here on; normalize any
             * subnormal factor. */
   3793     if (aExp == 0) {
   3794         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
   3795     }
   3796     if (bExp == 0) {
   3797         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
   3798     }
   3799 
   3800     /* Calculate the actual result a * b + c */
   3801 
   3802     /* Multiply first; this is easy. */
   3803     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
   3804      * because we want the true exponent, not the "one-less-than"
   3805      * flavour that roundAndPackFloat64() takes.
   3806      */
   3807     pExp = aExp + bExp - 0x3fe;
            /* Set the implicit bits; a's leading bit goes to bit 62, b's to
             * bit 63. */
   3808     aSig = (aSig | LIT64(0x0010000000000000))<<10;
   3809     bSig = (bSig | LIT64(0x0010000000000000))<<11;
   3810     mul64To128(aSig, bSig, &pSig0, &pSig1);
            /* If bit 126 of the product is clear, shift left one place and
             * adjust the exponent to compensate. */
   3811     if ((int64_t)(pSig0 << 1) >= 0) {
   3812         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
   3813         pExp--;
   3814     }
   3815 
   3816     zSign = pSign ^ signflip;
   3817 
   3818     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
   3819      * bit in position 126.
   3820      */
   3821     if (cExp == 0) {
   3822         if (!cSig) {
   3823             /* Throw out the special case of c being an exact zero now */
   3824             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
   3825             return roundAndPackFloat64(zSign, pExp - 1,
   3826                                        pSig1 STATUS_VAR);
   3827         }
   3828         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
   3829     }
   3830 
   3831     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
   3832      * significand of the addend, with the explicit bit in position 126.
   3833      */
   3834     cSig0 = cSig << (126 - 64 - 52);
   3835     cSig1 = 0;
   3836     cSig0 |= LIT64(0x4000000000000000);
   3837     expDiff = pExp - cExp;
   3838 
   3839     if (pSign == cSign) {
   3840         /* Addition */
   3841         if (expDiff > 0) {
   3842             /* scale c to match p */
   3843             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
   3844             zExp = pExp;
   3845         } else if (expDiff < 0) {
   3846             /* scale p to match c */
   3847             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
   3848             zExp = cExp;
   3849         } else {
   3850             /* no scaling needed */
   3851             zExp = cExp;
   3852         }
   3853         /* Add significands and make sure explicit bit ends up in posn 126 */
   3854         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
   3855         if ((int64_t)zSig0 < 0) {
   3856             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
   3857         } else {
   3858             zExp--;
   3859         }
                /* Fold the 128-bit sum into one 64-bit word (low bits jammed)
                 * for rounding. */
   3860         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
   3861         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
   3862     } else {
   3863         /* Subtraction */
                /* Always subtract the smaller magnitude from the larger; when
                 * the addend is the larger one the result sign is flipped. */
   3864         if (expDiff > 0) {
   3865             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
   3866             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
   3867             zExp = pExp;
   3868         } else if (expDiff < 0) {
   3869             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
   3870             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
   3871             zExp = cExp;
   3872             zSign ^= 1;
   3873         } else {
   3874             zExp = pExp;
   3875             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
   3876                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
   3877             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
   3878                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
   3879                 zSign ^= 1;
   3880             } else {
   3881                 /* Exact zero */
   3882                 zSign = signflip;
   3883                 if (STATUS(float_rounding_mode) == float_round_down) {
   3884                     zSign ^= 1;
   3885                 }
   3886                 return packFloat64(zSign, 0, 0);
   3887             }
   3888         }
   3889         --zExp;
   3890         /* Do the equivalent of normalizeRoundAndPackFloat64() but
   3891          * starting with the significand in a pair of uint64_t.
   3892          */
   3893         if (zSig0) {
                    /* Leading bit is in the high word: move it to bit 62 and
                     * fold any remaining low-word bits into a sticky bit. */
   3894             shiftcount = countLeadingZeros64(zSig0) - 1;
   3895             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
   3896             if (zSig1) {
   3897                 zSig0 |= 1;
   3898             }
   3899             zExp -= shiftcount;
   3900         } else {
                    /* High word is empty: normalize from the low word alone. */
   3901             shiftcount = countLeadingZeros64(zSig1);
   3902             if (shiftcount == 0) {
   3903                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
   3904                 zExp -= 63;
   3905             } else {
   3906                 shiftcount--;
   3907                 zSig0 = zSig1 << shiftcount;
   3908                 zExp -= (shiftcount + 64);
   3909             }
   3910         }
   3911         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
   3912     }
   3913 }
   3914 
   3915 /*----------------------------------------------------------------------------
   3916 | Returns the square root of the double-precision floating-point value `a'.
   3917 | The operation is performed according to the IEC/IEEE Standard for Binary
   3918 | Floating-Point Arithmetic.
   3919 *----------------------------------------------------------------------------*/
   3920 
   3921 float64 float64_sqrt( float64 a STATUS_PARAM )
   3922 {
            /* Special operands are handled first; otherwise an estimate of the
             * root is formed and, when it is close to a rounding boundary, it is
             * corrected using the exact remainder. */
   3923     flag aSign;
   3924     int_fast16_t aExp, zExp;
   3925     uint64_t aSig, zSig, doubleZSig;
   3926     uint64_t rem0, rem1, term0, term1;
   3927     a = float64_squash_input_denormal(a STATUS_VAR);
   3928 
   3929     aSig = extractFloat64Frac( a );
   3930     aExp = extractFloat64Exp( a );
   3931     aSign = extractFloat64Sign( a );
            /* All-ones exponent: a NaN is propagated, a positive infinity is
             * returned as is, a negative infinity is invalid. */
   3932     if ( aExp == 0x7FF ) {
   3933         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
   3934         if ( ! aSign ) return a;
   3935         float_raise( float_flag_invalid STATUS_VAR);
   3936         return float64_default_nan;
   3937     }
            /* Negative input: a negative zero is returned unchanged, anything
             * else is invalid. */
   3938     if ( aSign ) {
   3939         if ( ( aExp | aSig ) == 0 ) return a;
   3940         float_raise( float_flag_invalid STATUS_VAR);
   3941         return float64_default_nan;
   3942     }
            /* Positive zero returns zero; a subnormal is normalized first. */
   3943     if ( aExp == 0 ) {
   3944         if ( aSig == 0 ) return float64_zero;
   3945         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3946     }
            /* Halve the unbiased exponent and re-bias it with 0x3FE. */
   3947     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3948     aSig |= LIT64( 0x0010000000000000 );
   3949     zSig = estimateSqrt32( aExp, aSig>>21 );
            /* The significand shift depends on the parity of the exponent. */
   3950     aSig <<= 9 - ( aExp & 1 );
   3951     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
            /* Only when the low 9 bits of the estimate are at most 5 is the exact
             * remainder aSig - zSig*zSig computed: the estimate is stepped down
             * while that remainder is negative, and a sticky bit is ORed in when
             * the final remainder is non-zero. */
   3952     if ( ( zSig & 0x1FF ) <= 5 ) {
   3953         doubleZSig = zSig<<1;
   3954         mul64To128( zSig, zSig, &term0, &term1 );
   3955         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3956         while ( (int64_t) rem0 < 0 ) {
   3957             --zSig;
   3958             doubleZSig -= 2;
   3959             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3960         }
   3961         zSig |= ( ( rem0 | rem1 ) != 0 );
   3962     }
   3963     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
   3964 
   3965 }
   3966 
   3967 /*----------------------------------------------------------------------------
   3968 | Returns the binary log of the double-precision floating-point value `a'.
   3969 | The operation is performed according to the IEC/IEEE Standard for Binary
   3970 | Floating-Point Arithmetic.
   3971 *----------------------------------------------------------------------------*/
   3972 float64 float64_log2( float64 a STATUS_PARAM )
   3973 {
            /* The unbiased exponent supplies the integer part of the result and
             * the fraction bits are produced one at a time by repeated squaring
             * of the significand. */
   3974     flag aSign, zSign;
   3975     int_fast16_t aExp;
   3976     uint64_t aSig, aSig0, aSig1, zSig, i;
   3977     a = float64_squash_input_denormal(a STATUS_VAR);
   3978 
   3979     aSig = extractFloat64Frac( a );
   3980     aExp = extractFloat64Exp( a );
   3981     aSign = extractFloat64Sign( a );
   3982 
            /* A zero of either sign yields sign=1, exp=0x7FF, frac=0 (negative
             * infinity); a subnormal is normalized before continuing. */
   3983     if ( aExp == 0 ) {
   3984         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
   3985         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3986     }
            /* NOTE(review): the sign is tested before the NaN check below, so a
             * NaN with its sign bit set takes this invalid/default-NaN path
             * instead of propagateFloat64NaN() -- confirm this is intended. */
   3987     if ( aSign ) {
   3988         float_raise( float_flag_invalid STATUS_VAR);
   3989         return float64_default_nan;
   3990     }
            /* Positive NaN is propagated; positive infinity is returned as is. */
   3991     if ( aExp == 0x7FF ) {
   3992         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
   3993         return a;
   3994     }
   3995 
   3996     aExp -= 0x3FF;
   3997     aSig |= LIT64( 0x0010000000000000 );
   3998     zSign = aExp < 0;
            /* Integer part of the result sits above bit 52 of zSig. */
   3999     zSig = (uint64_t)aExp << 52;
            /* One pass per fraction bit, from bit 51 down to bit 0: square the
             * significand; if the square reaches bit 53, halve it and set the
             * current result bit. */
   4000     for (i = 1LL << 51; i > 0; i >>= 1) {
   4001         mul64To128( aSig, aSig, &aSig0, &aSig1 );
   4002         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
   4003         if ( aSig & LIT64( 0x0020000000000000 ) ) {
   4004             aSig >>= 1;
   4005             zSig |= i;
   4006         }
   4007     }
   4008 
            /* Negative results are handed on as a magnitude plus zSign. */
   4009     if ( zSign )
   4010         zSig = -zSig;
   4011     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
   4012 }
   4013 
   4014 /*----------------------------------------------------------------------------
   4015 | Returns 1 if the double-precision floating-point value `a' is equal to the
   4016 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
   4017 | if either operand is a NaN.  Otherwise, the comparison is performed
   4018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4019 *----------------------------------------------------------------------------*/
   4020 
   4021 int float64_eq( float64 a, float64 b STATUS_PARAM )
   4022 {
            /* Signaling equality test: any NaN operand raises invalid and the
             * result is 0. */
   4023     uint64_t av, bv;
   4024     a = float64_squash_input_denormal(a STATUS_VAR);
   4025     b = float64_squash_input_denormal(b STATUS_VAR);
   4026 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4027     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4028          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4029        ) {
   4030         float_raise( float_flag_invalid STATUS_VAR);
   4031         return 0;
   4032     }
   4033     av = float64_val(a);
   4034     bv = float64_val(b);
            /* Equal when the raw values match, or when both are zero once the top
             * bit (the sign) is shifted out, so +0 and -0 compare equal. */
   4035     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
   4036 
   4037 }
   4038 
   4039 /*----------------------------------------------------------------------------
   4040 | Returns 1 if the double-precision floating-point value `a' is less than or
   4041 | equal to the corresponding value `b', and 0 otherwise.  The invalid
   4042 | exception is raised if either operand is a NaN.  The comparison is performed
   4043 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4044 *----------------------------------------------------------------------------*/
   4045 
   4046 int float64_le( float64 a, float64 b STATUS_PARAM )
   4047 {
            /* Signaling a <= b: any NaN operand raises invalid and the result
             * is 0. */
   4048     flag aSign, bSign;
   4049     uint64_t av, bv;
   4050     a = float64_squash_input_denormal(a STATUS_VAR);
   4051     b = float64_squash_input_denormal(b STATUS_VAR);
   4052 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4053     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4054          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4055        ) {
   4056         float_raise( float_flag_invalid STATUS_VAR);
   4057         return 0;
   4058     }
   4059     aSign = extractFloat64Sign( a );
   4060     bSign = extractFloat64Sign( b );
   4061     av = float64_val(a);
   4062     bv = float64_val(b);
            /* Signs differ: true when `a' is the negative one, or when both are
             * zero (all bits below the top one clear). */
   4063     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
            /* Same sign: compare the raw values, with the ordering inverted when
             * both operands are negative. */
   4064     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   4065 
   4066 }
   4067 
   4068 /*----------------------------------------------------------------------------
   4069 | Returns 1 if the double-precision floating-point value `a' is less than
   4070 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   4071 | raised if either operand is a NaN.  The comparison is performed according
   4072 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4073 *----------------------------------------------------------------------------*/
   4074 
   4075 int float64_lt( float64 a, float64 b STATUS_PARAM )
   4076 {
            /* Signaling a < b: any NaN operand raises invalid and the result
             * is 0. */
   4077     flag aSign, bSign;
   4078     uint64_t av, bv;
   4079 
   4080     a = float64_squash_input_denormal(a STATUS_VAR);
   4081     b = float64_squash_input_denormal(b STATUS_VAR);
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4082     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4083          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4084        ) {
   4085         float_raise( float_flag_invalid STATUS_VAR);
   4086         return 0;
   4087     }
   4088     aSign = extractFloat64Sign( a );
   4089     bSign = extractFloat64Sign( b );
   4090     av = float64_val(a);
   4091     bv = float64_val(b);
            /* Signs differ: true only when `a' is the negative one and the
             * operands are not both zero. */
   4092     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
            /* Same sign: unequal raw values, with the ordering inverted when both
             * operands are negative. */
   4093     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   4094 
   4095 }
   4096 
   4097 /*----------------------------------------------------------------------------
   4098 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
   4099 | be compared, and 0 otherwise.  The invalid exception is raised if either
   4100 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
   4101 | Standard for Binary Floating-Point Arithmetic.
   4102 *----------------------------------------------------------------------------*/
   4103 
   4104 int float64_unordered( float64 a, float64 b STATUS_PARAM )
   4105 {
            /* Signaling unordered test: returns 1 and raises invalid when either
             * operand is a NaN, 0 otherwise. */
   4106     a = float64_squash_input_denormal(a STATUS_VAR);
   4107     b = float64_squash_input_denormal(b STATUS_VAR);
   4108 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4109     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4110          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4111        ) {
   4112         float_raise( float_flag_invalid STATUS_VAR);
   4113         return 1;
   4114     }
   4115     return 0;
   4116 }
   4117 
   4118 /*----------------------------------------------------------------------------
   4119 | Returns 1 if the double-precision floating-point value `a' is equal to the
   4120 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   4121 | exception.  The comparison is performed according to the IEC/IEEE Standard
   4122 | for Binary Floating-Point Arithmetic.
   4123 *----------------------------------------------------------------------------*/
   4124 
   4125 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
   4126 {
            /* Quiet equality test: a NaN operand makes the result 0, and invalid
             * is raised only when one of the operands is a signaling NaN. */
   4127     uint64_t av, bv;
   4128     a = float64_squash_input_denormal(a STATUS_VAR);
   4129     b = float64_squash_input_denormal(b STATUS_VAR);
   4130 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4131     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4132          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4133        ) {
   4134         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   4135             float_raise( float_flag_invalid STATUS_VAR);
   4136         }
   4137         return 0;
   4138     }
   4139     av = float64_val(a);
   4140     bv = float64_val(b);
            /* Equal when the raw values match, or when both are zero once the top
             * bit (the sign) is shifted out, so +0 and -0 compare equal. */
   4141     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
   4142 
   4143 }
   4144 
   4145 /*----------------------------------------------------------------------------
   4146 | Returns 1 if the double-precision floating-point value `a' is less than or
   4147 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   4148 | cause an exception.  Otherwise, the comparison is performed according to the
   4149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4150 *----------------------------------------------------------------------------*/
   4151 
   4152 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
   4153 {
            /* Quiet a <= b: a NaN operand makes the result 0, and invalid is
             * raised only when one of the operands is a signaling NaN. */
   4154     flag aSign, bSign;
   4155     uint64_t av, bv;
   4156     a = float64_squash_input_denormal(a STATUS_VAR);
   4157     b = float64_squash_input_denormal(b STATUS_VAR);
   4158 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4159     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4160          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4161        ) {
   4162         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   4163             float_raise( float_flag_invalid STATUS_VAR);
   4164         }
   4165         return 0;
   4166     }
   4167     aSign = extractFloat64Sign( a );
   4168     bSign = extractFloat64Sign( b );
   4169     av = float64_val(a);
   4170     bv = float64_val(b);
            /* Signs differ: true when `a' is the negative one, or when both are
             * zero (all bits below the top one clear). */
   4171     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
            /* Same sign: compare the raw values, with the ordering inverted when
             * both operands are negative. */
   4172     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   4173 
   4174 }
   4175 
   4176 /*----------------------------------------------------------------------------
   4177 | Returns 1 if the double-precision floating-point value `a' is less than
   4178 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   4179 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   4180 | Standard for Binary Floating-Point Arithmetic.
   4181 *----------------------------------------------------------------------------*/
   4182 
   4183 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
   4184 {
            /* Quiet a < b: a NaN operand makes the result 0, and invalid is
             * raised only when one of the operands is a signaling NaN. */
   4185     flag aSign, bSign;
   4186     uint64_t av, bv;
   4187     a = float64_squash_input_denormal(a STATUS_VAR);
   4188     b = float64_squash_input_denormal(b STATUS_VAR);
   4189 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4190     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4191          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4192        ) {
   4193         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   4194             float_raise( float_flag_invalid STATUS_VAR);
   4195         }
   4196         return 0;
   4197     }
   4198     aSign = extractFloat64Sign( a );
   4199     bSign = extractFloat64Sign( b );
   4200     av = float64_val(a);
   4201     bv = float64_val(b);
            /* Signs differ: true only when `a' is the negative one and the
             * operands are not both zero. */
   4202     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
            /* Same sign: unequal raw values, with the ordering inverted when both
             * operands are negative. */
   4203     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   4204 
   4205 }
   4206 
   4207 /*----------------------------------------------------------------------------
   4208 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
   4209 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
   4210 | comparison is performed according to the IEC/IEEE Standard for Binary
   4211 | Floating-Point Arithmetic.
   4212 *----------------------------------------------------------------------------*/
   4213 
   4214 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
   4215 {
            /* Quiet unordered test: returns 1 when either operand is a NaN, 0
             * otherwise; invalid is raised only for a signaling NaN. */
   4216     a = float64_squash_input_denormal(a STATUS_VAR);
   4217     b = float64_squash_input_denormal(b STATUS_VAR);
   4218 
            /* NaN test: all-ones exponent with a non-zero fraction. */
   4219     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   4220          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   4221        ) {
   4222         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   4223             float_raise( float_flag_invalid STATUS_VAR);
   4224         }
   4225         return 1;
   4226     }
   4227     return 0;
   4228 }
   4229 
   4230 /*----------------------------------------------------------------------------
   4231 | Returns the result of converting the extended double-precision floating-
   4232 | point value `a' to the 32-bit two's complement integer format.  The
   4233 | conversion is performed according to the IEC/IEEE Standard for Binary
   4234 | Floating-Point Arithmetic---which means in particular that the conversion
   4235 | is rounded according to the current rounding mode.  If `a' is a NaN, the
   4236 | largest positive integer is returned.  Otherwise, if the conversion
   4237 | overflows, the largest integer with the same sign as `a' is returned.
   4238 *----------------------------------------------------------------------------*/
   4239 
   4240 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
   4241 {
            /* Shifts the 64-bit significand right according to the exponent and
             * hands the sign and shifted value to roundAndPackInt32(). */
   4242     flag aSign;
   4243     int32 aExp, shiftCount;
   4244     uint64_t aSig;
   4245 
   4246     aSig = extractFloatx80Frac( a );
   4247     aExp = extractFloatx80Exp( a );
   4248     aSign = extractFloatx80Sign( a );
            /* Maximum exponent with any significand bit below the top one set
             * (a NaN): clear the sign so the value is treated as positive. */
   4249     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
   4250     shiftCount = 0x4037 - aExp;
            /* Non-positive shift counts are clamped to 1. */
   4251     if ( shiftCount <= 0 ) shiftCount = 1;
   4252     shift64RightJamming( aSig, shiftCount, &aSig );
   4253     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   4254 
   4255 }
   4256 
   4257 /*----------------------------------------------------------------------------
   4258 | Returns the result of converting the extended double-precision floating-
   4259 | point value `a' to the 32-bit two's complement integer format.  The
   4260 | conversion is performed according to the IEC/IEEE Standard for Binary
   4261 | Floating-Point Arithmetic, except that the conversion is always rounded
   4262 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   4263 | Otherwise, if the conversion overflows, the largest integer with the same
   4264 | sign as `a' is returned.
   4265 *----------------------------------------------------------------------------*/
   4266 
   4267 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
   4268 {
            /* Truncating conversion: out-of-range inputs raise invalid and
             * saturate, and the inexact flag is set when non-zero bits are
             * discarded. */
   4269     flag aSign;
   4270     int32 aExp, shiftCount;
   4271     uint64_t aSig, savedASig;
   4272     int32_t z;
   4273 
   4274     aSig = extractFloatx80Frac( a );
   4275     aExp = extractFloatx80Exp( a );
   4276     aSign = extractFloatx80Sign( a );
            /* Exponent above 0x401E (this includes the maximum exponent used by
             * infinities and NaNs): invalid.  A NaN has its sign cleared first so
             * that it saturates to the positive limit. */
   4277     if ( 0x401E < aExp ) {
   4278         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
   4279         goto invalid;
   4280     }
            /* Exponent below 0x3FFF: the result is 0, flagged inexact unless the
             * input is exactly zero. */
   4281     else if ( aExp < 0x3FFF ) {
   4282         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   4283         return 0;
   4284     }
   4285     shiftCount = 0x403E - aExp;
   4286     savedASig = aSig;
   4287     aSig >>= shiftCount;
   4288     z = aSig;
   4289     if ( aSign ) z = - z;
            /* The sign of the 32-bit result disagrees with the input sign:
             * report invalid and return the limit matching aSign. */
   4290     if ( ( z < 0 ) ^ aSign ) {
   4291  invalid:
   4292         float_raise( float_flag_invalid STATUS_VAR);
   4293         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
   4294     }
            /* Shifting back does not reproduce the original significand, so
             * non-zero bits were discarded. */
   4295     if ( ( aSig<<shiftCount ) != savedASig ) {
   4296         STATUS(float_exception_flags) |= float_flag_inexact;
   4297     }
   4298     return z;
   4299 
   4300 }
   4301 
   4302 /*----------------------------------------------------------------------------
   4303 | Returns the result of converting the extended double-precision floating-
   4304 | point value `a' to the 64-bit two's complement integer format.  The
   4305 | conversion is performed according to the IEC/IEEE Standard for Binary
   4306 | Floating-Point Arithmetic---which means in particular that the conversion
   4307 | is rounded according to the current rounding mode.  If `a' is a NaN,
   4308 | the largest positive integer is returned.  Otherwise, if the conversion
   4309 | overflows, the largest integer with the same sign as `a' is returned.
   4310 *----------------------------------------------------------------------------*/
   4311 
   4312 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
   4313 {
            /* Shifts the significand right according to the exponent, keeping
             * the shifted-out bits in aSigExtra, and hands the result to
             * roundAndPackInt64(). */
   4314     flag aSign;
   4315     int32 aExp, shiftCount;
   4316     uint64_t aSig, aSigExtra;
   4317 
   4318     aSig = extractFloatx80Frac( a );
   4319     aExp = extractFloatx80Exp( a );
   4320     aSign = extractFloatx80Sign( a );
   4321     shiftCount = 0x403E - aExp;
   4322     if ( shiftCount <= 0 ) {
                /* Negative shift count: the exponent is above 0x403E, so raise
                 * invalid.  Positive inputs, and maximum-exponent inputs whose
                 * significand is not exactly 0x8000000000000000, return the
                 * largest positive integer; the rest return the most negative. */
   4323         if ( shiftCount ) {
   4324             float_raise( float_flag_invalid STATUS_VAR);
   4325             if (    ! aSign
   4326                  || (    ( aExp == 0x7FFF )
   4327                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   4328                ) {
   4329                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4330             }
   4331             return (int64_t) LIT64( 0x8000000000000000 );
   4332         }
                /* Shift count of exactly 0: no bits are shifted out. */
   4333         aSigExtra = 0;
   4334     }
   4335     else {
   4336         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   4337     }
   4338     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   4339 
   4340 }
   4341 
   4342 /*----------------------------------------------------------------------------
   4343 | Returns the result of converting the extended double-precision floating-
   4344 | point value `a' to the 64-bit two's complement integer format.  The
   4345 | conversion is performed according to the IEC/IEEE Standard for Binary
   4346 | Floating-Point Arithmetic, except that the conversion is always rounded
   4347 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   4348 | Otherwise, if the conversion overflows, the largest integer with the same
   4349 | sign as `a' is returned.
   4350 *----------------------------------------------------------------------------*/
   4351 
   4352 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
   4353 {
   4354     flag aSign;
   4355     int32 aExp, shiftCount;
   4356     uint64_t aSig;
   4357     int64 z;
   4358 
   4359     aSig = extractFloatx80Frac( a );
   4360     aExp = extractFloatx80Exp( a );
   4361     aSign = extractFloatx80Sign( a );
   4362     shiftCount = aExp - 0x403E;
   4363     if ( 0 <= shiftCount ) {
   4364         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   4365         if ( ( a.high != 0xC03E ) || aSig ) {
   4366             float_raise( float_flag_invalid STATUS_VAR);
   4367             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   4368                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4369             }
   4370         }
   4371         return (int64_t) LIT64( 0x8000000000000000 );
   4372     }
   4373     else if ( aExp < 0x3FFF ) {
   4374         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   4375         return 0;
   4376     }
   4377     z = aSig>>( - shiftCount );
   4378     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
   4379         STATUS(float_exception_flags) |= float_flag_inexact;
   4380     }
   4381     if ( aSign ) z = - z;
   4382     return z;
   4383 
   4384 }
   4385 
   4386 /*----------------------------------------------------------------------------
   4387 | Returns the result of converting the extended double-precision floating-
   4388 | point value `a' to the single-precision floating-point format.  The
   4389 | conversion is performed according to the IEC/IEEE Standard for Binary
   4390 | Floating-Point Arithmetic.
   4391 *----------------------------------------------------------------------------*/
   4392 
   4393 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
   4394 {
   4395     flag aSign;
   4396     int32 aExp;
   4397     uint64_t aSig;
   4398 
   4399     aSig = extractFloatx80Frac( a );
   4400     aExp = extractFloatx80Exp( a );
   4401     aSign = extractFloatx80Sign( a );
   4402     if ( aExp == 0x7FFF ) {
   4403         if ( (uint64_t) ( aSig<<1 ) ) {
   4404             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   4405         }
   4406         return packFloat32( aSign, 0xFF, 0 );
   4407     }
   4408     shift64RightJamming( aSig, 33, &aSig );
   4409     if ( aExp || aSig ) aExp -= 0x3F81;
   4410     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   4411 
   4412 }
   4413 
   4414 /*----------------------------------------------------------------------------
   4415 | Returns the result of converting the extended double-precision floating-
   4416 | point value `a' to the double-precision floating-point format.  The
   4417 | conversion is performed according to the IEC/IEEE Standard for Binary
   4418 | Floating-Point Arithmetic.
   4419 *----------------------------------------------------------------------------*/
   4420 
   4421 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
   4422 {
   4423     flag aSign;
   4424     int32 aExp;
   4425     uint64_t aSig, zSig;
   4426 
   4427     aSig = extractFloatx80Frac( a );
   4428     aExp = extractFloatx80Exp( a );
   4429     aSign = extractFloatx80Sign( a );
   4430     if ( aExp == 0x7FFF ) {
   4431         if ( (uint64_t) ( aSig<<1 ) ) {
   4432             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   4433         }
   4434         return packFloat64( aSign, 0x7FF, 0 );
   4435     }
   4436     shift64RightJamming( aSig, 1, &zSig );
   4437     if ( aExp || aSig ) aExp -= 0x3C01;
   4438     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
   4439 
   4440 }
   4441 
   4442 /*----------------------------------------------------------------------------
   4443 | Returns the result of converting the extended double-precision floating-
   4444 | point value `a' to the quadruple-precision floating-point format.  The
   4445 | conversion is performed according to the IEC/IEEE Standard for Binary
   4446 | Floating-Point Arithmetic.
   4447 *----------------------------------------------------------------------------*/
   4448 
   4449 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
   4450 {
   4451     flag aSign;
   4452     int_fast16_t aExp;
   4453     uint64_t aSig, zSig0, zSig1;
   4454 
   4455     aSig = extractFloatx80Frac( a );
   4456     aExp = extractFloatx80Exp( a );
   4457     aSign = extractFloatx80Sign( a );
   4458     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
   4459         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   4460     }
   4461     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   4462     return packFloat128( aSign, aExp, zSig0, zSig1 );
   4463 
   4464 }
   4465 
   4466 /*----------------------------------------------------------------------------
   4467 | Rounds the extended double-precision floating-point value `a' to an integer,
   4468 | and returns the result as an extended quadruple-precision floating-point
   4469 | value.  The operation is performed according to the IEC/IEEE Standard for
   4470 | Binary Floating-Point Arithmetic.
   4471 *----------------------------------------------------------------------------*/
   4472 
   4473 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
   4474 {
   4475     flag aSign;
   4476     int32 aExp;
   4477     uint64_t lastBitMask, roundBitsMask;
   4478     int8 roundingMode;
   4479     floatx80 z;
   4480 
   4481     aExp = extractFloatx80Exp( a );
   4482     if ( 0x403E <= aExp ) {
   4483         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
   4484             return propagateFloatx80NaN( a, a STATUS_VAR );
   4485         }
   4486         return a;
   4487     }
   4488     if ( aExp < 0x3FFF ) {
   4489         if (    ( aExp == 0 )
   4490              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   4491             return a;
   4492         }
   4493         STATUS(float_exception_flags) |= float_flag_inexact;
   4494         aSign = extractFloatx80Sign( a );
   4495         switch ( STATUS(float_rounding_mode) ) {
   4496          case float_round_nearest_even:
   4497             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
   4498                ) {
   4499                 return
   4500                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   4501             }
   4502             break;
   4503          case float_round_down:
   4504             return
   4505                   aSign ?
   4506                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   4507                 : packFloatx80( 0, 0, 0 );
   4508          case float_round_up:
   4509             return
   4510                   aSign ? packFloatx80( 1, 0, 0 )
   4511                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   4512         }
   4513         return packFloatx80( aSign, 0, 0 );
   4514     }
   4515     lastBitMask = 1;
   4516     lastBitMask <<= 0x403E - aExp;
   4517     roundBitsMask = lastBitMask - 1;
   4518     z = a;
   4519     roundingMode = STATUS(float_rounding_mode);
   4520     if ( roundingMode == float_round_nearest_even ) {
   4521         z.low += lastBitMask>>1;
   4522         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4523     }
   4524     else if ( roundingMode != float_round_to_zero ) {
   4525         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   4526             z.low += roundBitsMask;
   4527         }
   4528     }
   4529     z.low &= ~ roundBitsMask;
   4530     if ( z.low == 0 ) {
   4531         ++z.high;
   4532         z.low = LIT64( 0x8000000000000000 );
   4533     }
   4534     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
   4535     return z;
   4536 
   4537 }
   4538 
   4539 /*----------------------------------------------------------------------------
   4540 | Returns the result of adding the absolute values of the extended double-
   4541 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   4542 | negated before being returned.  `zSign' is ignored if the result is a NaN.
   4543 | The addition is performed according to the IEC/IEEE Standard for Binary
   4544 | Floating-Point Arithmetic.
   4545 *----------------------------------------------------------------------------*/
   4546 
   4547 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
   4548 {
   4549     int32 aExp, bExp, zExp;
   4550     uint64_t aSig, bSig, zSig0, zSig1;
   4551     int32 expDiff;
   4552 
   4553     aSig = extractFloatx80Frac( a );
   4554     aExp = extractFloatx80Exp( a );
   4555     bSig = extractFloatx80Frac( b );
   4556     bExp = extractFloatx80Exp( b );
   4557     expDiff = aExp - bExp;
   4558     if ( 0 < expDiff ) {
   4559         if ( aExp == 0x7FFF ) {
   4560             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4561             return a;
   4562         }
   4563         if ( bExp == 0 ) --expDiff;
   4564         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   4565         zExp = aExp;
   4566     }
   4567     else if ( expDiff < 0 ) {
   4568         if ( bExp == 0x7FFF ) {
   4569             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4570             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4571         }
   4572         if ( aExp == 0 ) ++expDiff;
   4573         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   4574         zExp = bExp;
   4575     }
   4576     else {
   4577         if ( aExp == 0x7FFF ) {
   4578             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
   4579                 return propagateFloatx80NaN( a, b STATUS_VAR );
   4580             }
   4581             return a;
   4582         }
   4583         zSig1 = 0;
   4584         zSig0 = aSig + bSig;
   4585         if ( aExp == 0 ) {
   4586             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   4587             goto roundAndPack;
   4588         }
   4589         zExp = aExp;
   4590         goto shiftRight1;
   4591     }
   4592     zSig0 = aSig + bSig;
   4593     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
   4594  shiftRight1:
   4595     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   4596     zSig0 |= LIT64( 0x8000000000000000 );
   4597     ++zExp;
   4598  roundAndPack:
   4599     return
   4600         roundAndPackFloatx80(
   4601             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4602 
   4603 }
   4604 
   4605 /*----------------------------------------------------------------------------
   4606 | Returns the result of subtracting the absolute values of the extended
   4607 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4608 | difference is negated before being returned.  `zSign' is ignored if the
   4609 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4610 | Standard for Binary Floating-Point Arithmetic.
   4611 *----------------------------------------------------------------------------*/
   4612 
   4613 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
   4614 {
   4615     int32 aExp, bExp, zExp;
   4616     uint64_t aSig, bSig, zSig0, zSig1;
   4617     int32 expDiff;
   4618     floatx80 z;
   4619 
   4620     aSig = extractFloatx80Frac( a );
   4621     aExp = extractFloatx80Exp( a );
   4622     bSig = extractFloatx80Frac( b );
   4623     bExp = extractFloatx80Exp( b );
   4624     expDiff = aExp - bExp;
   4625     if ( 0 < expDiff ) goto aExpBigger;
   4626     if ( expDiff < 0 ) goto bExpBigger;
   4627     if ( aExp == 0x7FFF ) {
   4628         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
   4629             return propagateFloatx80NaN( a, b STATUS_VAR );
   4630         }
   4631         float_raise( float_flag_invalid STATUS_VAR);
   4632         z.low = floatx80_default_nan_low;
   4633         z.high = floatx80_default_nan_high;
   4634         return z;
   4635     }
   4636     if ( aExp == 0 ) {
   4637         aExp = 1;
   4638         bExp = 1;
   4639     }
   4640     zSig1 = 0;
   4641     if ( bSig < aSig ) goto aBigger;
   4642     if ( aSig < bSig ) goto bBigger;
   4643     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   4644  bExpBigger:
   4645     if ( bExp == 0x7FFF ) {
   4646         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4647         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4648     }
   4649     if ( aExp == 0 ) ++expDiff;
   4650     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   4651  bBigger:
   4652     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   4653     zExp = bExp;
   4654     zSign ^= 1;
   4655     goto normalizeRoundAndPack;
   4656  aExpBigger:
   4657     if ( aExp == 0x7FFF ) {
   4658         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4659         return a;
   4660     }
   4661     if ( bExp == 0 ) --expDiff;
   4662     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   4663  aBigger:
   4664     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   4665     zExp = aExp;
   4666  normalizeRoundAndPack:
   4667     return
   4668         normalizeRoundAndPackFloatx80(
   4669             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4670 
   4671 }
   4672 
   4673 /*----------------------------------------------------------------------------
   4674 | Returns the result of adding the extended double-precision floating-point
   4675 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4676 | Standard for Binary Floating-Point Arithmetic.
   4677 *----------------------------------------------------------------------------*/
   4678 
   4679 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
   4680 {
   4681     flag aSign, bSign;
   4682 
   4683     aSign = extractFloatx80Sign( a );
   4684     bSign = extractFloatx80Sign( b );
   4685     if ( aSign == bSign ) {
   4686         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   4687     }
   4688     else {
   4689         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   4690     }
   4691 
   4692 }
   4693 
   4694 /*----------------------------------------------------------------------------
   4695 | Returns the result of subtracting the extended double-precision floating-
   4696 | point values `a' and `b'.  The operation is performed according to the
   4697 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4698 *----------------------------------------------------------------------------*/
   4699 
   4700 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
   4701 {
   4702     flag aSign, bSign;
   4703 
   4704     aSign = extractFloatx80Sign( a );
   4705     bSign = extractFloatx80Sign( b );
   4706     if ( aSign == bSign ) {
   4707         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   4708     }
   4709     else {
   4710         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   4711     }
   4712 
   4713 }
   4714 
   4715 /*----------------------------------------------------------------------------
   4716 | Returns the result of multiplying the extended double-precision floating-
   4717 | point values `a' and `b'.  The operation is performed according to the
   4718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4719 *----------------------------------------------------------------------------*/
   4720 
   4721 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
   4722 {
   4723     flag aSign, bSign, zSign;
   4724     int32 aExp, bExp, zExp;
   4725     uint64_t aSig, bSig, zSig0, zSig1;
   4726     floatx80 z;
   4727 
   4728     aSig = extractFloatx80Frac( a );
   4729     aExp = extractFloatx80Exp( a );
   4730     aSign = extractFloatx80Sign( a );
   4731     bSig = extractFloatx80Frac( b );
   4732     bExp = extractFloatx80Exp( b );
   4733     bSign = extractFloatx80Sign( b );
   4734     zSign = aSign ^ bSign;
   4735     if ( aExp == 0x7FFF ) {
   4736         if (    (uint64_t) ( aSig<<1 )
   4737              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
   4738             return propagateFloatx80NaN( a, b STATUS_VAR );
   4739         }
   4740         if ( ( bExp | bSig ) == 0 ) goto invalid;
   4741         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4742     }
   4743     if ( bExp == 0x7FFF ) {
   4744         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4745         if ( ( aExp | aSig ) == 0 ) {
   4746  invalid:
   4747             float_raise( float_flag_invalid STATUS_VAR);
   4748             z.low = floatx80_default_nan_low;
   4749             z.high = floatx80_default_nan_high;
   4750             return z;
   4751         }
   4752         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4753     }
   4754     if ( aExp == 0 ) {
   4755         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   4756         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   4757     }
   4758     if ( bExp == 0 ) {
   4759         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   4760         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4761     }
   4762     zExp = aExp + bExp - 0x3FFE;
   4763     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   4764     if ( 0 < (int64_t) zSig0 ) {
   4765         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   4766         --zExp;
   4767     }
   4768     return
   4769         roundAndPackFloatx80(
   4770             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4771 
   4772 }
   4773 
   4774 /*----------------------------------------------------------------------------
   4775 | Returns the result of dividing the extended double-precision floating-point
   4776 | value `a' by the corresponding value `b'.  The operation is performed
   4777 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4778 *----------------------------------------------------------------------------*/
   4779 
   4780 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
   4781 {
   4782     flag aSign, bSign, zSign;
   4783     int32 aExp, bExp, zExp;
   4784     uint64_t aSig, bSig, zSig0, zSig1;
   4785     uint64_t rem0, rem1, rem2, term0, term1, term2;
   4786     floatx80 z;
   4787 
   4788     aSig = extractFloatx80Frac( a );
   4789     aExp = extractFloatx80Exp( a );
   4790     aSign = extractFloatx80Sign( a );
   4791     bSig = extractFloatx80Frac( b );
   4792     bExp = extractFloatx80Exp( b );
   4793     bSign = extractFloatx80Sign( b );
   4794     zSign = aSign ^ bSign;
   4795     if ( aExp == 0x7FFF ) {
   4796         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4797         if ( bExp == 0x7FFF ) {
   4798             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4799             goto invalid;
   4800         }
   4801         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4802     }
   4803     if ( bExp == 0x7FFF ) {
   4804         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4805         return packFloatx80( zSign, 0, 0 );
   4806     }
   4807     if ( bExp == 0 ) {
   4808         if ( bSig == 0 ) {
   4809             if ( ( aExp | aSig ) == 0 ) {
   4810  invalid:
   4811                 float_raise( float_flag_invalid STATUS_VAR);
   4812                 z.low = floatx80_default_nan_low;
   4813                 z.high = floatx80_default_nan_high;
   4814                 return z;
   4815             }
   4816             float_raise( float_flag_divbyzero STATUS_VAR);
   4817             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4818         }
   4819         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4820     }
   4821     if ( aExp == 0 ) {
   4822         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   4823         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   4824     }
   4825     zExp = aExp - bExp + 0x3FFE;
   4826     rem1 = 0;
   4827     if ( bSig <= aSig ) {
   4828         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   4829         ++zExp;
   4830     }
   4831     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   4832     mul64To128( bSig, zSig0, &term0, &term1 );
   4833     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   4834     while ( (int64_t) rem0 < 0 ) {
   4835         --zSig0;
   4836         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   4837     }
   4838     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   4839     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
   4840         mul64To128( bSig, zSig1, &term1, &term2 );
   4841         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4842         while ( (int64_t) rem1 < 0 ) {
   4843             --zSig1;
   4844             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   4845         }
   4846         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   4847     }
   4848     return
   4849         roundAndPackFloatx80(
   4850             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4851 
   4852 }
   4853 
   4854 /*----------------------------------------------------------------------------
   4855 | Returns the remainder of the extended double-precision floating-point value
   4856 | `a' with respect to the corresponding value `b'.  The operation is performed
   4857 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4858 *----------------------------------------------------------------------------*/
   4859 
   4860 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
   4861 {
   4862     flag aSign, zSign;
   4863     int32 aExp, bExp, expDiff;
   4864     uint64_t aSig0, aSig1, bSig;
   4865     uint64_t q, term0, term1, alternateASig0, alternateASig1;
   4866     floatx80 z;
   4867 
   4868     aSig0 = extractFloatx80Frac( a );
   4869     aExp = extractFloatx80Exp( a );
   4870     aSign = extractFloatx80Sign( a );
   4871     bSig = extractFloatx80Frac( b );
   4872     bExp = extractFloatx80Exp( b );
   4873     if ( aExp == 0x7FFF ) {
   4874         if (    (uint64_t) ( aSig0<<1 )
   4875              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
   4876             return propagateFloatx80NaN( a, b STATUS_VAR );
   4877         }
   4878         goto invalid;
   4879     }
   4880     if ( bExp == 0x7FFF ) {
   4881         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4882         return a;
   4883     }
   4884     if ( bExp == 0 ) {
   4885         if ( bSig == 0 ) {
   4886  invalid:
   4887             float_raise( float_flag_invalid STATUS_VAR);
   4888             z.low = floatx80_default_nan_low;
   4889             z.high = floatx80_default_nan_high;
   4890             return z;
   4891         }
   4892         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4893     }
   4894     if ( aExp == 0 ) {
   4895         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
   4896         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4897     }
   4898     bSig |= LIT64( 0x8000000000000000 );
   4899     zSign = aSign;
   4900     expDiff = aExp - bExp;
   4901     aSig1 = 0;
   4902     if ( expDiff < 0 ) {
   4903         if ( expDiff < -1 ) return a;
   4904         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   4905         expDiff = 0;
   4906     }
   4907     q = ( bSig <= aSig0 );
   4908     if ( q ) aSig0 -= bSig;
   4909     expDiff -= 64;
   4910     while ( 0 < expDiff ) {
   4911         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4912         q = ( 2 < q ) ? q - 2 : 0;
   4913         mul64To128( bSig, q, &term0, &term1 );
   4914         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4915         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   4916         expDiff -= 62;
   4917     }
   4918     expDiff += 64;
   4919     if ( 0 < expDiff ) {
   4920         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4921         q = ( 2 < q ) ? q - 2 : 0;
   4922         q >>= 64 - expDiff;
   4923         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   4924         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4925         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   4926         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   4927             ++q;
   4928             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4929         }
   4930     }
   4931     else {
   4932         term1 = 0;
   4933         term0 = bSig;
   4934     }
   4935     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   4936     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4937          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4938               && ( q & 1 ) )
   4939        ) {
   4940         aSig0 = alternateASig0;
   4941         aSig1 = alternateASig1;
   4942         zSign = ! zSign;
   4943     }
   4944     return
   4945         normalizeRoundAndPackFloatx80(
   4946             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
   4947 
   4948 }
   4949 
   4950 /*----------------------------------------------------------------------------
   4951 | Returns the square root of the extended double-precision floating-point
   4952 | value `a'.  The operation is performed according to the IEC/IEEE Standard
   4953 | for Binary Floating-Point Arithmetic.
   4954 *----------------------------------------------------------------------------*/
   4955 
   4956 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
   4957 {
   4958     flag aSign;
   4959     int32 aExp, zExp;
   4960     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   4961     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4962     floatx80 z;
   4963 
   4964     aSig0 = extractFloatx80Frac( a );
   4965     aExp = extractFloatx80Exp( a );
   4966     aSign = extractFloatx80Sign( a );
   4967     if ( aExp == 0x7FFF ) {
   4968         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
   4969         if ( ! aSign ) return a;
   4970         goto invalid;
   4971     }
   4972     if ( aSign ) {
   4973         if ( ( aExp | aSig0 ) == 0 ) return a;
   4974  invalid:
   4975         float_raise( float_flag_invalid STATUS_VAR);
   4976         z.low = floatx80_default_nan_low;
   4977         z.high = floatx80_default_nan_high;
   4978         return z;
   4979     }
   4980     if ( aExp == 0 ) {
   4981         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4982         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4983     }
   4984     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   4985     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4986     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4987     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4988     doubleZSig0 = zSig0<<1;
   4989     mul64To128( zSig0, zSig0, &term0, &term1 );
   4990     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4991     while ( (int64_t) rem0 < 0 ) {
   4992         --zSig0;
   4993         doubleZSig0 -= 2;
   4994         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4995     }
   4996     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   4997     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4998         if ( zSig1 == 0 ) zSig1 = 1;
   4999         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5000         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5001         mul64To128( zSig1, zSig1, &term2, &term3 );
   5002         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5003         while ( (int64_t) rem1 < 0 ) {
   5004             --zSig1;
   5005             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5006             term3 |= 1;
   5007             term2 |= doubleZSig0;
   5008             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5009         }
   5010         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5011     }
   5012     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   5013     zSig0 |= doubleZSig0;
   5014     return
   5015         roundAndPackFloatx80(
   5016             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
   5017 
   5018 }
   5019 
   5020 /*----------------------------------------------------------------------------
   5021 | Returns 1 if the extended double-precision floating-point value `a' is equal
   5022 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
   5023 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   5024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5025 *----------------------------------------------------------------------------*/
   5026 
   5027 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
   5028 {
   5029 
   5030     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5031               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5032          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5033               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5034        ) {
   5035         float_raise( float_flag_invalid STATUS_VAR);
   5036         return 0;
   5037     }
   5038     return
   5039            ( a.low == b.low )
   5040         && (    ( a.high == b.high )
   5041              || (    ( a.low == 0 )
   5042                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   5043            );
   5044 
   5045 }
   5046 
   5047 /*----------------------------------------------------------------------------
   5048 | Returns 1 if the extended double-precision floating-point value `a' is
   5049 | less than or equal to the corresponding value `b', and 0 otherwise.  The
   5050 | invalid exception is raised if either operand is a NaN.  The comparison is
   5051 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   5052 | Arithmetic.
   5053 *----------------------------------------------------------------------------*/
   5054 
   5055 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
   5056 {
   5057     flag aSign, bSign;
   5058 
   5059     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5060               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5061          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5062               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5063        ) {
   5064         float_raise( float_flag_invalid STATUS_VAR);
   5065         return 0;
   5066     }
   5067     aSign = extractFloatx80Sign( a );
   5068     bSign = extractFloatx80Sign( b );
   5069     if ( aSign != bSign ) {
   5070         return
   5071                aSign
   5072             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5073                  == 0 );
   5074     }
   5075     return
   5076           aSign ? le128( b.high, b.low, a.high, a.low )
   5077         : le128( a.high, a.low, b.high, b.low );
   5078 
   5079 }
   5080 
   5081 /*----------------------------------------------------------------------------
   5082 | Returns 1 if the extended double-precision floating-point value `a' is
   5083 | less than the corresponding value `b', and 0 otherwise.  The invalid
   5084 | exception is raised if either operand is a NaN.  The comparison is performed
   5085 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5086 *----------------------------------------------------------------------------*/
   5087 
   5088 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
   5089 {
   5090     flag aSign, bSign;
   5091 
   5092     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5093               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5094          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5095               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5096        ) {
   5097         float_raise( float_flag_invalid STATUS_VAR);
   5098         return 0;
   5099     }
   5100     aSign = extractFloatx80Sign( a );
   5101     bSign = extractFloatx80Sign( b );
   5102     if ( aSign != bSign ) {
   5103         return
   5104                aSign
   5105             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5106                  != 0 );
   5107     }
   5108     return
   5109           aSign ? lt128( b.high, b.low, a.high, a.low )
   5110         : lt128( a.high, a.low, b.high, b.low );
   5111 
   5112 }
   5113 
   5114 /*----------------------------------------------------------------------------
   5115 | Returns 1 if the extended double-precision floating-point values `a' and `b'
   5116 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
   5117 | either operand is a NaN.   The comparison is performed according to the
   5118 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5119 *----------------------------------------------------------------------------*/
   5120 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
   5121 {
   5122     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5123               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5124          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5125               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5126        ) {
   5127         float_raise( float_flag_invalid STATUS_VAR);
   5128         return 1;
   5129     }
   5130     return 0;
   5131 }
   5132 
   5133 /*----------------------------------------------------------------------------
   5134 | Returns 1 if the extended double-precision floating-point value `a' is
   5135 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5136 | cause an exception.  The comparison is performed according to the IEC/IEEE
   5137 | Standard for Binary Floating-Point Arithmetic.
   5138 *----------------------------------------------------------------------------*/
   5139 
   5140 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   5141 {
   5142 
   5143     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5144               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5145          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5146               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5147        ) {
   5148         if (    floatx80_is_signaling_nan( a )
   5149              || floatx80_is_signaling_nan( b ) ) {
   5150             float_raise( float_flag_invalid STATUS_VAR);
   5151         }
   5152         return 0;
   5153     }
   5154     return
   5155            ( a.low == b.low )
   5156         && (    ( a.high == b.high )
   5157              || (    ( a.low == 0 )
   5158                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   5159            );
   5160 
   5161 }
   5162 
   5163 /*----------------------------------------------------------------------------
   5164 | Returns 1 if the extended double-precision floating-point value `a' is less
   5165 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   5166 | do not cause an exception.  Otherwise, the comparison is performed according
   5167 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5168 *----------------------------------------------------------------------------*/
   5169 
   5170 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   5171 {
   5172     flag aSign, bSign;
   5173 
   5174     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5175               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5176          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5177               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5178        ) {
   5179         if (    floatx80_is_signaling_nan( a )
   5180              || floatx80_is_signaling_nan( b ) ) {
   5181             float_raise( float_flag_invalid STATUS_VAR);
   5182         }
   5183         return 0;
   5184     }
   5185     aSign = extractFloatx80Sign( a );
   5186     bSign = extractFloatx80Sign( b );
   5187     if ( aSign != bSign ) {
   5188         return
   5189                aSign
   5190             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5191                  == 0 );
   5192     }
   5193     return
   5194           aSign ? le128( b.high, b.low, a.high, a.low )
   5195         : le128( a.high, a.low, b.high, b.low );
   5196 
   5197 }
   5198 
   5199 /*----------------------------------------------------------------------------
   5200 | Returns 1 if the extended double-precision floating-point value `a' is less
   5201 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   5202 | an exception.  Otherwise, the comparison is performed according to the
   5203 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5204 *----------------------------------------------------------------------------*/
   5205 
   5206 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   5207 {
   5208     flag aSign, bSign;
   5209 
   5210     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5211               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5212          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5213               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5214        ) {
   5215         if (    floatx80_is_signaling_nan( a )
   5216              || floatx80_is_signaling_nan( b ) ) {
   5217             float_raise( float_flag_invalid STATUS_VAR);
   5218         }
   5219         return 0;
   5220     }
   5221     aSign = extractFloatx80Sign( a );
   5222     bSign = extractFloatx80Sign( b );
   5223     if ( aSign != bSign ) {
   5224         return
   5225                aSign
   5226             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5227                  != 0 );
   5228     }
   5229     return
   5230           aSign ? lt128( b.high, b.low, a.high, a.low )
   5231         : lt128( a.high, a.low, b.high, b.low );
   5232 
   5233 }
   5234 
   5235 /*----------------------------------------------------------------------------
   5236 | Returns 1 if the extended double-precision floating-point values `a' and `b'
   5237 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
   5238 | The comparison is performed according to the IEC/IEEE Standard for Binary
   5239 | Floating-Point Arithmetic.
   5240 *----------------------------------------------------------------------------*/
   5241 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   5242 {
   5243     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   5244               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
   5245          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   5246               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
   5247        ) {
   5248         if (    floatx80_is_signaling_nan( a )
   5249              || floatx80_is_signaling_nan( b ) ) {
   5250             float_raise( float_flag_invalid STATUS_VAR);
   5251         }
   5252         return 1;
   5253     }
   5254     return 0;
   5255 }
   5256 
   5257 /*----------------------------------------------------------------------------
   5258 | Returns the result of converting the quadruple-precision floating-point
   5259 | value `a' to the 32-bit two's complement integer format.  The conversion
   5260 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5261 | Arithmetic---which means in particular that the conversion is rounded
   5262 | according to the current rounding mode.  If `a' is a NaN, the largest
   5263 | positive integer is returned.  Otherwise, if the conversion overflows, the
   5264 | largest integer with the same sign as `a' is returned.
   5265 *----------------------------------------------------------------------------*/
   5266 
   5267 int32 float128_to_int32( float128 a STATUS_PARAM )
   5268 {
   5269     flag aSign;
   5270     int32 aExp, shiftCount;
   5271     uint64_t aSig0, aSig1;
   5272 
   5273     aSig1 = extractFloat128Frac1( a );
   5274     aSig0 = extractFloat128Frac0( a );
   5275     aExp = extractFloat128Exp( a );
   5276     aSign = extractFloat128Sign( a );
   5277     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   5278     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   5279     aSig0 |= ( aSig1 != 0 );
   5280     shiftCount = 0x4028 - aExp;
   5281     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   5282     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
   5283 
   5284 }
   5285 
   5286 /*----------------------------------------------------------------------------
   5287 | Returns the result of converting the quadruple-precision floating-point
   5288 | value `a' to the 32-bit two's complement integer format.  The conversion
   5289 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5290 | Arithmetic, except that the conversion is always rounded toward zero.  If
   5291 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   5292 | conversion overflows, the largest integer with the same sign as `a' is
   5293 | returned.
   5294 *----------------------------------------------------------------------------*/
   5295 
   5296 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
   5297 {
   5298     flag aSign;
   5299     int32 aExp, shiftCount;
   5300     uint64_t aSig0, aSig1, savedASig;
   5301     int32_t z;
   5302 
   5303     aSig1 = extractFloat128Frac1( a );
   5304     aSig0 = extractFloat128Frac0( a );
   5305     aExp = extractFloat128Exp( a );
   5306     aSign = extractFloat128Sign( a );
   5307     aSig0 |= ( aSig1 != 0 );
   5308     if ( 0x401E < aExp ) {
   5309         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   5310         goto invalid;
   5311     }
   5312     else if ( aExp < 0x3FFF ) {
   5313         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
   5314         return 0;
   5315     }
   5316     aSig0 |= LIT64( 0x0001000000000000 );
   5317     shiftCount = 0x402F - aExp;
   5318     savedASig = aSig0;
   5319     aSig0 >>= shiftCount;
   5320     z = aSig0;
   5321     if ( aSign ) z = - z;
   5322     if ( ( z < 0 ) ^ aSign ) {
   5323  invalid:
   5324         float_raise( float_flag_invalid STATUS_VAR);
   5325         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
   5326     }
   5327     if ( ( aSig0<<shiftCount ) != savedASig ) {
   5328         STATUS(float_exception_flags) |= float_flag_inexact;
   5329     }
   5330     return z;
   5331 
   5332 }
   5333 
   5334 /*----------------------------------------------------------------------------
   5335 | Returns the result of converting the quadruple-precision floating-point
   5336 | value `a' to the 64-bit two's complement integer format.  The conversion
   5337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5338 | Arithmetic---which means in particular that the conversion is rounded
   5339 | according to the current rounding mode.  If `a' is a NaN, the largest
   5340 | positive integer is returned.  Otherwise, if the conversion overflows, the
   5341 | largest integer with the same sign as `a' is returned.
   5342 *----------------------------------------------------------------------------*/
   5343 
   5344 int64 float128_to_int64( float128 a STATUS_PARAM )
   5345 {
   5346     flag aSign;
   5347     int32 aExp, shiftCount;
   5348     uint64_t aSig0, aSig1;
   5349 
   5350     aSig1 = extractFloat128Frac1( a );
   5351     aSig0 = extractFloat128Frac0( a );
   5352     aExp = extractFloat128Exp( a );
   5353     aSign = extractFloat128Sign( a );
   5354     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   5355     shiftCount = 0x402F - aExp;
   5356     if ( shiftCount <= 0 ) {
   5357         if ( 0x403E < aExp ) {
   5358             float_raise( float_flag_invalid STATUS_VAR);
   5359             if (    ! aSign
   5360                  || (    ( aExp == 0x7FFF )
   5361                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   5362                     )
   5363                ) {
   5364                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   5365             }
   5366             return (int64_t) LIT64( 0x8000000000000000 );
   5367         }
   5368         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   5369     }
   5370     else {
   5371         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   5372     }
   5373     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
   5374 
   5375 }
   5376 
   5377 /*----------------------------------------------------------------------------
   5378 | Returns the result of converting the quadruple-precision floating-point
   5379 | value `a' to the 64-bit two's complement integer format.  The conversion
   5380 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5381 | Arithmetic, except that the conversion is always rounded toward zero.
   5382 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   5383 | the conversion overflows, the largest integer with the same sign as `a' is
   5384 | returned.
        |
        | The invalid exception is raised for NaN and overflowing inputs.  The
        | inexact flag is set whenever nonzero bits of `a' are discarded.
   5385 *----------------------------------------------------------------------------*/
   5386 
   5387 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
   5388 {
   5389     flag aSign;
   5390     int32 aExp, shiftCount;
   5391     uint64_t aSig0, aSig1;
   5392     int64 z;
   5393 
   5394     aSig1 = extractFloat128Frac1( a );
   5395     aSig0 = extractFloat128Frac0( a );
   5396     aExp = extractFloat128Exp( a );
   5397     aSign = extractFloat128Sign( a );
   5398     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );  /* explicit leading bit */
            /* Positive shiftCount: aSig0 must move left, pulling bits in from
             * aSig1.  Zero or negative: aSig0 moves right by -shiftCount. */
   5399     shiftCount = aExp - 0x402F;
   5400     if ( 0 < shiftCount ) {
   5401         if ( 0x403E <= aExp ) {
                    /* Magnitude is at least 2^63 (or `a' is infinite/NaN). */
   5402             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );  /* drop the leading bit again */
                    /* Only a negative value with exponent 0x403E, zero high
                     * fraction and aSig1 below 2^49 still fits: the result is
                     * the most negative integer, inexact if aSig1 is nonzero. */
   5403             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   5404                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   5405                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
   5406             }
   5407             else {
   5408                 float_raise( float_flag_invalid STATUS_VAR);
                        /* Positive overflow and NaN give the largest positive
                         * integer; negative overflow falls through below. */
   5409                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   5410                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   5411                 }
   5412             }
   5413             return (int64_t) LIT64( 0x8000000000000000 );
   5414         }
                /* ( - shiftCount ) & 63 equals 64 - shiftCount for 1..63, so
                 * the top shiftCount bits of aSig1 land below the shifted aSig0. */
   5415         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
                /* Bits of aSig1 that were not moved into z are lost. */
   5416         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
   5417             STATUS(float_exception_flags) |= float_flag_inexact;
   5418         }
   5419     }
   5420     else {
   5421         if ( aExp < 0x3FFF ) {
                    /* Magnitude below one: result 0, inexact unless `a' is zero. */
   5422             if ( aExp | aSig0 | aSig1 ) {
   5423                 STATUS(float_exception_flags) |= float_flag_inexact;
   5424             }
   5425             return 0;
   5426         }
   5427         z = aSig0>>( - shiftCount );
                /* Inexact if aSig1 is nonzero or if the right shift dropped
                 * set bits of aSig0 (checked by shifting them to the top). */
   5428         if (    aSig1
   5429              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   5430             STATUS(float_exception_flags) |= float_flag_inexact;
   5431         }
   5432     }
   5433     if ( aSign ) z = - z;
   5434     return z;
   5435 
   5436 }
   5437 
   5438 /*----------------------------------------------------------------------------
   5439 | Returns the result of converting the quadruple-precision floating-point
   5440 | value `a' to the single-precision floating-point format.  The conversion
   5441 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5442 | Arithmetic.
   5443 *----------------------------------------------------------------------------*/
   5444 
   5445 float32 float128_to_float32( float128 a STATUS_PARAM )
   5446 {
   5447     flag aSign;
   5448     int32 aExp;
   5449     uint64_t aSig0, aSig1;
   5450     uint32_t zSig;
   5451 
   5452     aSig1 = extractFloat128Frac1( a );
   5453     aSig0 = extractFloat128Frac0( a );
   5454     aExp = extractFloat128Exp( a );
   5455     aSign = extractFloat128Sign( a );
   5456     if ( aExp == 0x7FFF ) {
   5457         if ( aSig0 | aSig1 ) {
   5458             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   5459         }
   5460         return packFloat32( aSign, 0xFF, 0 );
   5461     }
   5462     aSig0 |= ( aSig1 != 0 );
   5463     shift64RightJamming( aSig0, 18, &aSig0 );
   5464     zSig = aSig0;
   5465     if ( aExp || zSig ) {
   5466         zSig |= 0x40000000;
   5467         aExp -= 0x3F81;
   5468     }
   5469     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   5470 
   5471 }
   5472 
   5473 /*----------------------------------------------------------------------------
   5474 | Returns the result of converting the quadruple-precision floating-point
   5475 | value `a' to the double-precision floating-point format.  The conversion
   5476 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5477 | Arithmetic.
   5478 *----------------------------------------------------------------------------*/
   5479 
   5480 float64 float128_to_float64( float128 a STATUS_PARAM )
   5481 {
   5482     flag aSign;
   5483     int32 aExp;
   5484     uint64_t aSig0, aSig1;
   5485 
   5486     aSig1 = extractFloat128Frac1( a );
   5487     aSig0 = extractFloat128Frac0( a );
   5488     aExp = extractFloat128Exp( a );
   5489     aSign = extractFloat128Sign( a );
   5490     if ( aExp == 0x7FFF ) {
   5491         if ( aSig0 | aSig1 ) {
   5492             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   5493         }
   5494         return packFloat64( aSign, 0x7FF, 0 );
   5495     }
   5496     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   5497     aSig0 |= ( aSig1 != 0 );
   5498     if ( aExp || aSig0 ) {
   5499         aSig0 |= LIT64( 0x4000000000000000 );
   5500         aExp -= 0x3C01;
   5501     }
   5502     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
   5503 
   5504 }
   5505 
   5506 /*----------------------------------------------------------------------------
   5507 | Returns the result of converting the quadruple-precision floating-point
   5508 | value `a' to the extended double-precision floating-point format.  The
   5509 | conversion is performed according to the IEC/IEEE Standard for Binary
   5510 | Floating-Point Arithmetic.
   5511 *----------------------------------------------------------------------------*/
   5512 
   5513 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
   5514 {
   5515     flag aSign;
   5516     int32 aExp;
   5517     uint64_t aSig0, aSig1;
   5518 
   5519     aSig1 = extractFloat128Frac1( a );
   5520     aSig0 = extractFloat128Frac0( a );
   5521     aExp = extractFloat128Exp( a );
   5522     aSign = extractFloat128Sign( a );
   5523     if ( aExp == 0x7FFF ) {
   5524         if ( aSig0 | aSig1 ) {
   5525             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
   5526         }
   5527         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   5528     }
   5529     if ( aExp == 0 ) {
   5530         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   5531         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5532     }
   5533     else {
   5534         aSig0 |= LIT64( 0x0001000000000000 );
   5535     }
   5536     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   5537     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
   5538 
   5539 }
   5540 
   5541 /*----------------------------------------------------------------------------
   5542 | Rounds the quadruple-precision floating-point value `a' to an integer, and
   5543 | returns the result as a quadruple-precision floating-point value.  The
   5544 | operation is performed according to the IEC/IEEE Standard for Binary
   5545 | Floating-Point Arithmetic.
        |
        | The rounding direction is taken from the current rounding mode.  The
        | inexact flag is set whenever the returned value differs from `a'.
        | Three exponent ranges are handled separately: the units bit lies in the
        | low word (0x402F..0x406E), the magnitude is below one (< 0x3FFF), or the
        | units bit lies in the high word (0x3FFF..0x402E).
   5546 *----------------------------------------------------------------------------*/
   5547 
   5548 float128 float128_round_to_int( float128 a STATUS_PARAM )
   5549 {
   5550     flag aSign;
   5551     int32 aExp;
   5552     uint64_t lastBitMask, roundBitsMask;
   5553     int8 roundingMode;
   5554     float128 z;
   5555 
   5556     aExp = extractFloat128Exp( a );
   5557     if ( 0x402F <= aExp ) {
   5558         if ( 0x406F <= aExp ) {
                    /* No fractional bits at this exponent: return `a' as is,
                     * or the propagated NaN when `a' is a NaN. */
   5559             if (    ( aExp == 0x7FFF )
   5560                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   5561                ) {
   5562                 return propagateFloat128NaN( a, a STATUS_VAR );
   5563             }
   5564             return a;
   5565         }
                /* lastBitMask selects the units bit within z.low.  The shift is
                 * split in two so that aExp == 0x402F never shifts by 64; in
                 * that case lastBitMask wraps to 0 (units bit is bit 0 of
                 * z.high) and roundBitsMask becomes all ones. */
   5566         lastBitMask = 1;
   5567         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
   5568         roundBitsMask = lastBitMask - 1;
   5569         z = a;
   5570         roundingMode = STATUS(float_rounding_mode);
   5571         if ( roundingMode == float_round_nearest_even ) {
   5572             if ( lastBitMask ) {
                        /* Add one half; on an exact tie clear the units bit so
                         * the result is even. */
   5573                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   5574                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   5575             }
   5576             else {
                        /* Whole low word is fraction: its top bit is the half
                         * bit.  Round up when set; on an exact tie force even. */
   5577                 if ( (int64_t) z.low < 0 ) {
   5578                     ++z.high;
   5579                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
   5580                 }
   5581             }
   5582         }
   5583         else if ( roundingMode != float_round_to_zero ) {
                    /* Directed rounding: add just under one unit when the sign
                     * bit and the round-up test differ. */
   5584             if (   extractFloat128Sign( z )
   5585                  ^ ( roundingMode == float_round_up ) ) {
   5586                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   5587             }
   5588         }
   5589         z.low &= ~ roundBitsMask;  /* discard the fraction bits */
   5590     }
   5591     else {
   5592         if ( aExp < 0x3FFF ) {
                    /* Magnitude below one.  Zeros are returned unchanged. */
   5593             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   5594             STATUS(float_exception_flags) |= float_flag_inexact;
   5595             aSign = extractFloat128Sign( a );
   5596             switch ( STATUS(float_rounding_mode) ) {
   5597              case float_round_nearest_even:
                        /* Exponent 0x3FFE with a nonzero fraction (more than
                         * one half) rounds to one; otherwise fall out to zero. */
   5598                 if (    ( aExp == 0x3FFE )
   5599                      && (   extractFloat128Frac0( a )
   5600                           | extractFloat128Frac1( a ) )
   5601                    ) {
   5602                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   5603                 }
   5604                 break;
   5605              case float_round_down:
   5606                 return
   5607                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   5608                     : packFloat128( 0, 0, 0, 0 );
   5609              case float_round_up:
   5610                 return
   5611                       aSign ? packFloat128( 1, 0, 0, 0 )
   5612                     : packFloat128( 0, 0x3FFF, 0, 0 );
   5613             }
                    /* Remaining modes and the nearest-even fall-out: signed zero. */
   5614             return packFloat128( aSign, 0, 0, 0 );
   5615         }
                /* Units bit lies in the high word; the whole low word is
                 * fraction and is cleared in the result. */
   5616         lastBitMask = 1;
   5617         lastBitMask <<= 0x402F - aExp;
   5618         roundBitsMask = lastBitMask - 1;
   5619         z.low = 0;
   5620         z.high = a.high;
   5621         roundingMode = STATUS(float_rounding_mode);
   5622         if ( roundingMode == float_round_nearest_even ) {
                    /* Add one half; an exact tie requires a.low == 0 as well. */
   5623             z.high += lastBitMask>>1;
   5624             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   5625                 z.high &= ~ lastBitMask;
   5626             }
   5627         }
   5628         else if ( roundingMode != float_round_to_zero ) {
   5629             if (   extractFloat128Sign( z )
   5630                  ^ ( roundingMode == float_round_up ) ) {
                        /* Fold a nonzero low word into bit 0 so that it takes
                         * part in the carry produced by adding roundBitsMask. */
   5631                 z.high |= ( a.low != 0 );
   5632                 z.high += roundBitsMask;
   5633             }
   5634         }
   5635         z.high &= ~ roundBitsMask;  /* discard the fraction bits */
   5636     }
   5637     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   5638         STATUS(float_exception_flags) |= float_flag_inexact;
   5639     }
   5640     return z;
   5641 
   5642 }
   5643 
   5644 /*----------------------------------------------------------------------------
   5645 | Returns the result of adding the absolute values of the quadruple-precision
   5646 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   5647 | before being returned.  `zSign' is ignored if the result is a NaN.
   5648 | The addition is performed according to the IEC/IEEE Standard for Binary
   5649 | Floating-Point Arithmetic.
        |
        | The operand with the smaller exponent is shifted right to align it with
        | the other one; zSig2 receives the third output word of that shift and is
        | passed on to the final rounding step.
   5650 *----------------------------------------------------------------------------*/
   5651 
   5652 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
   5653 {
   5654     int32 aExp, bExp, zExp;
   5655     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   5656     int32 expDiff;
   5657 
   5658     aSig1 = extractFloat128Frac1( a );
   5659     aSig0 = extractFloat128Frac0( a );
   5660     aExp = extractFloat128Exp( a );
   5661     bSig1 = extractFloat128Frac1( b );
   5662     bSig0 = extractFloat128Frac0( b );
   5663     bExp = extractFloat128Exp( b );
   5664     expDiff = aExp - bExp;
   5665     if ( 0 < expDiff ) {
                /* `a' has the larger exponent. */
   5666         if ( aExp == 0x7FFF ) {
                    /* `a' is a NaN (nonzero fraction) or an infinity. */
   5667             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5668             return a;
   5669         }
   5670         if ( bExp == 0 ) {
                    /* `b' has a zero exponent field: no implicit bit, and one
                     * less position to shift. */
   5671             --expDiff;
   5672         }
   5673         else {
   5674             bSig0 |= LIT64( 0x0001000000000000 );
   5675         }
   5676         shift128ExtraRightJamming(
   5677             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   5678         zExp = aExp;
   5679     }
   5680     else if ( expDiff < 0 ) {
                /* `b' has the larger exponent; mirror image of the case above. */
   5681         if ( bExp == 0x7FFF ) {
   5682             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5683             return packFloat128( zSign, 0x7FFF, 0, 0 );
   5684         }
   5685         if ( aExp == 0 ) {
   5686             ++expDiff;
   5687         }
   5688         else {
   5689             aSig0 |= LIT64( 0x0001000000000000 );
   5690         }
   5691         shift128ExtraRightJamming(
   5692             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   5693         zExp = bExp;
   5694     }
   5695     else {
                /* Equal exponents: no alignment shift is needed. */
   5696         if ( aExp == 0x7FFF ) {
   5697             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   5698                 return propagateFloat128NaN( a, b STATUS_VAR );
   5699             }
   5700             return a;
   5701         }
   5702         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   5703         if ( aExp == 0 ) {
                    /* Both exponent fields are zero: the raw sum is packed
                     * directly, unless flush-to-zero is enabled, in which case
                     * a signed zero is returned and a nonzero sum raises the
                     * output-denormal flag. */
   5704             if (STATUS(flush_to_zero)) {
   5705                 if (zSig0 | zSig1) {
   5706                     float_raise(float_flag_output_denormal STATUS_VAR);
   5707                 }
   5708                 return packFloat128(zSign, 0, 0, 0);
   5709             }
   5710             return packFloat128( zSign, 0, zSig0, zSig1 );
   5711         }
   5712         zSig2 = 0;
                /* The two implicit leading bits add up to bit 49. */
   5713         zSig0 |= LIT64( 0x0002000000000000 );
   5714         zExp = aExp;
   5715         goto shiftRight1;
   5716     }
            /* Unequal exponents: one leading bit is still missing from the sum.
             * NOTE(review): when `b' has the larger exponent this OR lands on the
             * already-shifted aSig0; it appears to stand in for b's implicit bit,
             * which was never set -- confirm bit 48 is always clear there. */
   5717     aSig0 |= LIT64( 0x0001000000000000 );
   5718     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   5719     --zExp;
            /* No carry out of bit 48: the sum goes to rounding with the
             * decremented exponent. */
   5720     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   5721     ++zExp;
   5722  shiftRight1:
            /* Carry into bit 49: shift the sum right by one, keeping the lost bit
             * in zSig2. */
   5723     shift128ExtraRightJamming(
   5724         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   5725  roundAndPack:
   5726     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   5727 
   5728 }
   5729 
   5730 /*----------------------------------------------------------------------------
   5731 | Returns the result of subtracting the absolute values of the quadruple-
   5732 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   5733 | difference is negated before being returned.  `zSign' is ignored if the
   5734 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   5735 | Standard for Binary Floating-Point Arithmetic.
        |
        | Both fractions are first shifted left by 14 bits, so the leading bit sits
        | at 0x4000000000000000 of the high word; the final call compensates by
        | passing zExp - 14.  The smaller operand is always subtracted from the
        | larger, flipping `zSign' when `b' is the larger one.
   5736 *----------------------------------------------------------------------------*/
   5737 
   5738 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
   5739 {
   5740     int32 aExp, bExp, zExp;
   5741     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   5742     int32 expDiff;
   5743     float128 z;
   5744 
   5745     aSig1 = extractFloat128Frac1( a );
   5746     aSig0 = extractFloat128Frac0( a );
   5747     aExp = extractFloat128Exp( a );
   5748     bSig1 = extractFloat128Frac1( b );
   5749     bSig0 = extractFloat128Frac0( b );
   5750     bExp = extractFloat128Exp( b );
   5751     expDiff = aExp - bExp;
   5752     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   5753     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   5754     if ( 0 < expDiff ) goto aExpBigger;
   5755     if ( expDiff < 0 ) goto bExpBigger;
            /* Equal exponents from here on. */
   5756     if ( aExp == 0x7FFF ) {
   5757         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   5758             return propagateFloat128NaN( a, b STATUS_VAR );
   5759         }
                /* Both fractions are zero (infinity minus infinity): raise
                 * invalid and return the default NaN. */
   5760         float_raise( float_flag_invalid STATUS_VAR);
   5761         z.low = float128_default_nan_low;
   5762         z.high = float128_default_nan_high;
   5763         return z;
   5764     }
   5765     if ( aExp == 0 ) {
                /* Zero exponent fields are treated as exponent 1. */
   5766         aExp = 1;
   5767         bExp = 1;
   5768     }
            /* Decide which magnitude is larger, high word first. */
   5769     if ( bSig0 < aSig0 ) goto aBigger;
   5770     if ( aSig0 < bSig0 ) goto bBigger;
   5771     if ( bSig1 < aSig1 ) goto aBigger;
   5772     if ( aSig1 < bSig1 ) goto bBigger;
            /* Equal magnitudes: the result is zero, with the sign bit set only
             * when rounding down. */
   5773     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
   5774  bExpBigger:
   5775     if ( bExp == 0x7FFF ) {
   5776         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
                /* `b' is infinite: the result is infinity with the opposite sign. */
   5777         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   5778     }
   5779     if ( aExp == 0 ) {
                /* `a' has a zero exponent field: no leading bit, and one less
                 * position to shift. */
   5780         ++expDiff;
   5781     }
   5782     else {
   5783         aSig0 |= LIT64( 0x4000000000000000 );
   5784     }
   5785     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5786     bSig0 |= LIT64( 0x4000000000000000 );
   5787  bBigger:
   5788     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   5789     zExp = bExp;
   5790     zSign ^= 1;  /* operands were swapped, so the sign flips */
   5791     goto normalizeRoundAndPack;
   5792  aExpBigger:
   5793     if ( aExp == 0x7FFF ) {
   5794         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5795         return a;
   5796     }
   5797     if ( bExp == 0 ) {
                /* Same adjustment as above, with the roles of `a' and `b' swapped. */
   5798         --expDiff;
   5799     }
   5800     else {
   5801         bSig0 |= LIT64( 0x4000000000000000 );
   5802     }
   5803     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   5804     aSig0 |= LIT64( 0x4000000000000000 );
   5805  aBigger:
   5806     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   5807     zExp = aExp;
   5808  normalizeRoundAndPack:
   5809     --zExp;
            /* zExp - 14 compensates for the 14-bit left shift applied on entry. */
   5810     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
   5811 
   5812 }
   5813 
   5814 /*----------------------------------------------------------------------------
   5815 | Returns the result of adding the quadruple-precision floating-point values
   5816 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   5817 | for Binary Floating-Point Arithmetic.
   5818 *----------------------------------------------------------------------------*/
   5819 
   5820 float128 float128_add( float128 a, float128 b STATUS_PARAM )
   5821 {
   5822     flag aSign, bSign;
   5823 
   5824     aSign = extractFloat128Sign( a );
   5825     bSign = extractFloat128Sign( b );
   5826     if ( aSign == bSign ) {
   5827         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   5828     }
   5829     else {
   5830         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   5831     }
   5832 
   5833 }
   5834 
   5835 /*----------------------------------------------------------------------------
   5836 | Returns the result of subtracting the quadruple-precision floating-point
   5837 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   5838 | Standard for Binary Floating-Point Arithmetic.
   5839 *----------------------------------------------------------------------------*/
   5840 
   5841 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
   5842 {
   5843     flag aSign, bSign;
   5844 
   5845     aSign = extractFloat128Sign( a );
   5846     bSign = extractFloat128Sign( b );
   5847     if ( aSign == bSign ) {
   5848         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   5849     }
   5850     else {
   5851         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   5852     }
   5853 
   5854 }
   5855 
/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code: a NaN operand is handed to
| propagateFloat128NaN(); infinity times zero raises the invalid exception and
| returns the default NaN; infinity times anything else returns an infinity;
| a zero operand returns a zero.  The result sign is the XOR of the operand
| signs in every non-NaN case.
*----------------------------------------------------------------------------*/

float128 float128_mul( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN, or `a' is an infinity and `b' is a NaN. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* `a' is an infinity; multiplying it by zero is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            /* Shared exit for zero times infinity; also reached by the goto
             * in the aExp == 0x7FFF branch above. */
 invalid:
            float_raise( float_flag_invalid STATUS_VAR);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero exponent field: either a zero (signed zero result) or a
         * subnormal that is normalized before multiplying. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    /* Set bit 48 of a's high word (the leading one above the fraction). */
    aSig0 |= LIT64( 0x0001000000000000 );
    /* b's fraction is shifted up by 16 bits WITHOUT its leading one, which
     * would land on bit 64, just outside bSig0. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* Account for b's omitted leading one by adding a's significand into
     * the upper 128 bits of the product. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Collapse the lowest product word into a single sticky bit of zSig2. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* The product reached bit 49: shift right by one and bump the
         * exponent so the leading one is back at bit 48. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
   5919 
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code: NaN operands go through
| propagateFloat128NaN(); infinity / infinity and zero / zero raise the
| invalid exception and return the default NaN; a non-zero finite value
| divided by zero raises the divide-by-zero exception and returns an
| infinity; a finite value divided by an infinity, and zero divided by a
| non-zero value, return a zero.
*----------------------------------------------------------------------------*/

float128 float128_div( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            /* infinity / infinity */
            goto invalid;
        }
        /* infinity / finite */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        /* finite / infinity */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* zero / zero; also the target of the goto above. */
 invalid:
                float_raise( float_flag_invalid STATUS_VAR);
                z.low = float128_default_nan_low;
                z.high = float128_default_nan_high;
                return z;
            }
            /* non-zero / zero */
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set bit 48 in each high word and move both significands up by 15
     * bits, which puts the leading one at bit 63. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Dividend significand >= divisor significand: halve the dividend
         * and compensate in the exponent, so the dividend is strictly
         * smaller than the divisor from here on. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: take an estimate, form the remainder
     * a - b * estimate, and step the estimate down while that remainder is
     * negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        /* The exact correction is only performed when the low 14 bits of
         * the estimate are <= 4; in that case the lowest bit of zSig1 is
         * also set when the final remainder is non-zero. */
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Move the quotient back down by the 15 bits added above; the bits
     * shifted out are collected in zSig2. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
   6003 
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code: NaN operands go through
| propagateFloat128NaN(); an infinite `a' or a zero `b' raises the invalid
| exception and returns the default NaN; an infinite `b' or a zero `a'
| returns `a' unchanged, as does any `a' whose exponent is more than one
| below that of `b'.
*----------------------------------------------------------------------------*/

float128 float128_rem( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, zSign;
    int32 aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;
    float128 z;

    /* The sign of `b' is never extracted; only a's sign is used below. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* `a' is an infinity. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        /* `b' is an infinity and `a' is finite. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* `b' is zero; also the target of the goto above. */
 invalid:
            float_raise( float_flag_invalid STATUS_VAR);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /* Set bit 48 in each high word and shift the significands up: `b' by 15
     * bits, `a' by 15 bits, or by 14 when a's exponent is one below b's. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: 1 when b <= a, in which case b is subtracted. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Main reduction loop: each pass lowers expDiff by 61.  The quotient
     * estimate is reduced by 4 (clamped at 0) before use; presumably this
     * keeps it from exceeding the true quotient -- confirm against
     * estimateDiv128To64(). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: the estimate is shifted right by -expDiff and
         * both significands are rescaled before q * b is subtracted. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the remainder turns negative, counting each
     * subtraction in q.  alternateASig* keeps the value from just before the
     * last subtraction (the last non-negative remainder). */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the negative and the non-negative candidate; its
     * sign tells which of the two is smaller in magnitude.  The high word is
     * stored through a uint64_t pointer into the signed sigMean0. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    /* Keep the non-negative candidate when the sum is negative, or when the
     * two are equally far from zero and q is odd. */
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated, and its sign is folded into the
     * result sign together with the sign of `a'. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

}
   6112 
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases visible in the code: a NaN goes through
| propagateFloat128NaN(); positive infinity and negative zero are returned
| unchanged; any other negative input (including negative infinity) raises
| the invalid exception and returns the default NaN; positive zero returns
| positive zero.
*----------------------------------------------------------------------------*/

float128 float128_sqrt( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* `a' is passed as both operands because there is only one input. */
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
        if ( ! aSign ) return a;
        /* negative infinity */
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; everything else is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias it. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial estimate from the exponent and the high significand bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Shift the significand up by 13 bits, or by 12 when aExp is odd. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    /* Refine the estimate to 64 bits. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step zSig0 down while that remainder is negative,
     * keeping doubleZSig0 in step with it. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 result bits, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        /* The exact correction is only performed when the low 13 bits of
         * the estimate are <= 5; a zero estimate is bumped to 1 first, and
         * the lowest bit of zSig1 is set when the final remainder is
         * non-zero. */
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the result down by 14 bits; the bits shifted out are collected
     * in zSig2.  The result sign is always 0. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
   6181 
   6182 /*----------------------------------------------------------------------------
   6183 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   6184 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   6185 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   6186 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   6187 *----------------------------------------------------------------------------*/
   6188 
   6189 int float128_eq( float128 a, float128 b STATUS_PARAM )
   6190 {
   6191 
   6192     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6193               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6194          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6195               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6196        ) {
   6197         float_raise( float_flag_invalid STATUS_VAR);
   6198         return 0;
   6199     }
   6200     return
   6201            ( a.low == b.low )
   6202         && (    ( a.high == b.high )
   6203              || (    ( a.low == 0 )
   6204                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   6205            );
   6206 
   6207 }
   6208 
   6209 /*----------------------------------------------------------------------------
   6210 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   6211 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
   6212 | exception is raised if either operand is a NaN.  The comparison is performed
   6213 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   6214 *----------------------------------------------------------------------------*/
   6215 
   6216 int float128_le( float128 a, float128 b STATUS_PARAM )
   6217 {
   6218     flag aSign, bSign;
   6219 
   6220     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6221               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6222          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6223               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6224        ) {
   6225         float_raise( float_flag_invalid STATUS_VAR);
   6226         return 0;
   6227     }
   6228     aSign = extractFloat128Sign( a );
   6229     bSign = extractFloat128Sign( b );
   6230     if ( aSign != bSign ) {
   6231         return
   6232                aSign
   6233             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   6234                  == 0 );
   6235     }
   6236     return
   6237           aSign ? le128( b.high, b.low, a.high, a.low )
   6238         : le128( a.high, a.low, b.high, b.low );
   6239 
   6240 }
   6241 
   6242 /*----------------------------------------------------------------------------
   6243 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   6244 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   6245 | raised if either operand is a NaN.  The comparison is performed according
   6246 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   6247 *----------------------------------------------------------------------------*/
   6248 
   6249 int float128_lt( float128 a, float128 b STATUS_PARAM )
   6250 {
   6251     flag aSign, bSign;
   6252 
   6253     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6254               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6255          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6256               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6257        ) {
   6258         float_raise( float_flag_invalid STATUS_VAR);
   6259         return 0;
   6260     }
   6261     aSign = extractFloat128Sign( a );
   6262     bSign = extractFloat128Sign( b );
   6263     if ( aSign != bSign ) {
   6264         return
   6265                aSign
   6266             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   6267                  != 0 );
   6268     }
   6269     return
   6270           aSign ? lt128( b.high, b.low, a.high, a.low )
   6271         : lt128( a.high, a.low, b.high, b.low );
   6272 
   6273 }
   6274 
   6275 /*----------------------------------------------------------------------------
   6276 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
   6277 | be compared, and 0 otherwise.  The invalid exception is raised if either
   6278 | operand is a NaN. The comparison is performed according to the IEC/IEEE
   6279 | Standard for Binary Floating-Point Arithmetic.
   6280 *----------------------------------------------------------------------------*/
   6281 
   6282 int float128_unordered( float128 a, float128 b STATUS_PARAM )
   6283 {
   6284     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6285               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6286          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6287               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6288        ) {
   6289         float_raise( float_flag_invalid STATUS_VAR);
   6290         return 1;
   6291     }
   6292     return 0;
   6293 }
   6294 
   6295 /*----------------------------------------------------------------------------
   6296 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   6297 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   6298 | exception.  The comparison is performed according to the IEC/IEEE Standard
   6299 | for Binary Floating-Point Arithmetic.
   6300 *----------------------------------------------------------------------------*/
   6301 
   6302 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
   6303 {
   6304 
   6305     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6306               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6307          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6308               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6309        ) {
   6310         if (    float128_is_signaling_nan( a )
   6311              || float128_is_signaling_nan( b ) ) {
   6312             float_raise( float_flag_invalid STATUS_VAR);
   6313         }
   6314         return 0;
   6315     }
   6316     return
   6317            ( a.low == b.low )
   6318         && (    ( a.high == b.high )
   6319              || (    ( a.low == 0 )
   6320                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
   6321            );
   6322 
   6323 }
   6324 
   6325 /*----------------------------------------------------------------------------
   6326 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   6327 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   6328 | cause an exception.  Otherwise, the comparison is performed according to the
   6329 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   6330 *----------------------------------------------------------------------------*/
   6331 
   6332 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
   6333 {
   6334     flag aSign, bSign;
   6335 
   6336     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6337               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6338          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6339               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6340        ) {
   6341         if (    float128_is_signaling_nan( a )
   6342              || float128_is_signaling_nan( b ) ) {
   6343             float_raise( float_flag_invalid STATUS_VAR);
   6344         }
   6345         return 0;
   6346     }
   6347     aSign = extractFloat128Sign( a );
   6348     bSign = extractFloat128Sign( b );
   6349     if ( aSign != bSign ) {
   6350         return
   6351                aSign
   6352             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   6353                  == 0 );
   6354     }
   6355     return
   6356           aSign ? le128( b.high, b.low, a.high, a.low )
   6357         : le128( a.high, a.low, b.high, b.low );
   6358 
   6359 }
   6360 
   6361 /*----------------------------------------------------------------------------
   6362 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   6363 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   6364 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   6365 | Standard for Binary Floating-Point Arithmetic.
   6366 *----------------------------------------------------------------------------*/
   6367 
   6368 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
   6369 {
   6370     flag aSign, bSign;
   6371 
   6372     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6373               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6374          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6375               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6376        ) {
   6377         if (    float128_is_signaling_nan( a )
   6378              || float128_is_signaling_nan( b ) ) {
   6379             float_raise( float_flag_invalid STATUS_VAR);
   6380         }
   6381         return 0;
   6382     }
   6383     aSign = extractFloat128Sign( a );
   6384     bSign = extractFloat128Sign( b );
   6385     if ( aSign != bSign ) {
   6386         return
   6387                aSign
   6388             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   6389                  != 0 );
   6390     }
   6391     return
   6392           aSign ? lt128( b.high, b.low, a.high, a.low )
   6393         : lt128( a.high, a.low, b.high, b.low );
   6394 
   6395 }
   6396 
   6397 /*----------------------------------------------------------------------------
   6398 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
   6399 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
   6400 | comparison is performed according to the IEC/IEEE Standard for Binary
   6401 | Floating-Point Arithmetic.
   6402 *----------------------------------------------------------------------------*/
   6403 
   6404 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
   6405 {
   6406     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   6407               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   6408          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   6409               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   6410        ) {
   6411         if (    float128_is_signaling_nan( a )
   6412              || float128_is_signaling_nan( b ) ) {
   6413             float_raise( float_flag_invalid STATUS_VAR);
   6414         }
   6415         return 1;
   6416     }
   6417     return 0;
   6418 }
   6419 
   6420 /* misc functions */
   6421 float32 uint32_to_float32( uint32 a STATUS_PARAM )
   6422 {
   6423     return int64_to_float32(a STATUS_VAR);
   6424 }
   6425 
   6426 float64 uint32_to_float64( uint32 a STATUS_PARAM )
   6427 {
   6428     return int64_to_float64(a STATUS_VAR);
   6429 }
   6430 
   6431 uint32 float32_to_uint32( float32 a STATUS_PARAM )
   6432 {
   6433     int64_t v;
   6434     uint32 res;
   6435 
   6436     v = float32_to_int64(a STATUS_VAR);
   6437     if (v < 0) {
   6438         res = 0;
   6439         float_raise( float_flag_invalid STATUS_VAR);
   6440     } else if (v > 0xffffffff) {
   6441         res = 0xffffffff;
   6442         float_raise( float_flag_invalid STATUS_VAR);
   6443     } else {
   6444         res = v;
   6445     }
   6446     return res;
   6447 }
   6448 
   6449 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
   6450 {
   6451     int64_t v;
   6452     uint32 res;
   6453 
   6454     v = float32_to_int64_round_to_zero(a STATUS_VAR);
   6455     if (v < 0) {
   6456         res = 0;
   6457         float_raise( float_flag_invalid STATUS_VAR);
   6458     } else if (v > 0xffffffff) {
   6459         res = 0xffffffff;
   6460         float_raise( float_flag_invalid STATUS_VAR);
   6461     } else {
   6462         res = v;
   6463     }
   6464     return res;
   6465 }
   6466 
   6467 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
   6468 {
   6469     int64_t v;
   6470     uint_fast16_t res;
   6471 
   6472     v = float32_to_int64_round_to_zero(a STATUS_VAR);
   6473     if (v < 0) {
   6474         res = 0;
   6475         float_raise( float_flag_invalid STATUS_VAR);
   6476     } else if (v > 0xffff) {
   6477         res = 0xffff;
   6478         float_raise( float_flag_invalid STATUS_VAR);
   6479     } else {
   6480         res = v;
   6481     }
   6482     return res;
   6483 }
   6484 
   6485 uint32 float64_to_uint32( float64 a STATUS_PARAM )
   6486 {
   6487     int64_t v;
   6488     uint32 res;
   6489 
   6490     v = float64_to_int64(a STATUS_VAR);
   6491     if (v < 0) {
   6492         res = 0;
   6493         float_raise( float_flag_invalid STATUS_VAR);
   6494     } else if (v > 0xffffffff) {
   6495         res = 0xffffffff;
   6496         float_raise( float_flag_invalid STATUS_VAR);
   6497     } else {
   6498         res = v;
   6499     }
   6500     return res;
   6501 }
   6502 
   6503 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
   6504 {
   6505     int64_t v;
   6506     uint32 res;
   6507 
   6508     v = float64_to_int64_round_to_zero(a STATUS_VAR);
   6509     if (v < 0) {
   6510         res = 0;
   6511         float_raise( float_flag_invalid STATUS_VAR);
   6512     } else if (v > 0xffffffff) {
   6513         res = 0xffffffff;
   6514         float_raise( float_flag_invalid STATUS_VAR);
   6515     } else {
   6516         res = v;
   6517     }
   6518     return res;
   6519 }
   6520 
   6521 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
   6522 {
   6523     int64_t v;
   6524     uint_fast16_t res;
   6525 
   6526     v = float64_to_int64_round_to_zero(a STATUS_VAR);
   6527     if (v < 0) {
   6528         res = 0;
   6529         float_raise( float_flag_invalid STATUS_VAR);
   6530     } else if (v > 0xffff) {
   6531         res = 0xffff;
   6532         float_raise( float_flag_invalid STATUS_VAR);
   6533     } else {
   6534         res = v;
   6535     }
   6536     return res;
   6537 }
   6538 
   6539 /* FIXME: This looks broken.  */
   6540 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
   6541 {
   6542     int64_t v;
   6543 
   6544     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   6545     v += float64_val(a);
   6546     v = float64_to_int64(make_float64(v) STATUS_VAR);
   6547 
   6548     return v - INT64_MIN;
   6549 }
   6550 
   6551 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
   6552 {
   6553     int64_t v;
   6554 
   6555     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   6556     v += float64_val(a);
   6557     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
   6558 
   6559     return v - INT64_MIN;
   6560 }
   6561 
   6562 #define COMPARE(s, nan_exp)                                                  \
   6563 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
   6564                                       int is_quiet STATUS_PARAM )            \
   6565 {                                                                            \
   6566     flag aSign, bSign;                                                       \
   6567     uint ## s ## _t av, bv;                                                  \
   6568     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
   6569     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
   6570                                                                              \
   6571     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
   6572          extractFloat ## s ## Frac( a ) ) ||                                 \
   6573         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
   6574           extractFloat ## s ## Frac( b ) )) {                                \
   6575         if (!is_quiet ||                                                     \
   6576             float ## s ## _is_signaling_nan( a ) ||                          \
   6577             float ## s ## _is_signaling_nan( b ) ) {                         \
   6578             float_raise( float_flag_invalid STATUS_VAR);                     \
   6579         }                                                                    \
   6580         return float_relation_unordered;                                     \
   6581     }                                                                        \
   6582     aSign = extractFloat ## s ## Sign( a );                                  \
   6583     bSign = extractFloat ## s ## Sign( b );                                  \
   6584     av = float ## s ## _val(a);                                              \
   6585     bv = float ## s ## _val(b);                                              \
   6586     if ( aSign != bSign ) {                                                  \
   6587         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
   6588             /* zero case */                                                  \
   6589             return float_relation_equal;                                     \
   6590         } else {                                                             \
   6591             return 1 - (2 * aSign);                                          \
   6592         }                                                                    \
   6593     } else {                                                                 \
   6594         if (av == bv) {                                                      \
   6595             return float_relation_equal;                                     \
   6596         } else {                                                             \
   6597             return 1 - 2 * (aSign ^ ( av < bv ));                            \
   6598         }                                                                    \
   6599     }                                                                        \
   6600 }                                                                            \
   6601                                                                              \
   6602 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
   6603 {                                                                            \
   6604     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
   6605 }                                                                            \
   6606                                                                              \
   6607 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
   6608 {                                                                            \
   6609     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
   6610 }
   6611 
   6612 COMPARE(32, 0xff)
   6613 COMPARE(64, 0x7ff)
   6614 
   6615 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
   6616                                       int is_quiet STATUS_PARAM )
   6617 {
   6618     flag aSign, bSign;
   6619 
   6620     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
   6621           ( extractFloatx80Frac( a )<<1 ) ) ||
   6622         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
   6623           ( extractFloatx80Frac( b )<<1 ) )) {
   6624         if (!is_quiet ||
   6625             floatx80_is_signaling_nan( a ) ||
   6626             floatx80_is_signaling_nan( b ) ) {
   6627             float_raise( float_flag_invalid STATUS_VAR);
   6628         }
   6629         return float_relation_unordered;
   6630     }
   6631     aSign = extractFloatx80Sign( a );
   6632     bSign = extractFloatx80Sign( b );
   6633     if ( aSign != bSign ) {
   6634 
   6635         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
   6636              ( ( a.low | b.low ) == 0 ) ) {
   6637             /* zero case */
   6638             return float_relation_equal;
   6639         } else {
   6640             return 1 - (2 * aSign);
   6641         }
   6642     } else {
   6643         if (a.low == b.low && a.high == b.high) {
   6644             return float_relation_equal;
   6645         } else {
   6646             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
   6647         }
   6648     }
   6649 }
   6650 
   6651 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
   6652 {
   6653     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
   6654 }
   6655 
   6656 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   6657 {
   6658     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
   6659 }
   6660 
   6661 INLINE int float128_compare_internal( float128 a, float128 b,
   6662                                       int is_quiet STATUS_PARAM )
   6663 {
   6664     flag aSign, bSign;
   6665 
   6666     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
   6667           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
   6668         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
   6669           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
   6670         if (!is_quiet ||
   6671             float128_is_signaling_nan( a ) ||
   6672             float128_is_signaling_nan( b ) ) {
   6673             float_raise( float_flag_invalid STATUS_VAR);
   6674         }
   6675         return float_relation_unordered;
   6676     }
   6677     aSign = extractFloat128Sign( a );
   6678     bSign = extractFloat128Sign( b );
   6679     if ( aSign != bSign ) {
   6680         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
   6681             /* zero case */
   6682             return float_relation_equal;
   6683         } else {
   6684             return 1 - (2 * aSign);
   6685         }
   6686     } else {
   6687         if (a.low == b.low && a.high == b.high) {
   6688             return float_relation_equal;
   6689         } else {
   6690             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
   6691         }
   6692     }
   6693 }
   6694 
   6695 int float128_compare( float128 a, float128 b STATUS_PARAM )
   6696 {
   6697     return float128_compare_internal(a, b, 0 STATUS_VAR);
   6698 }
   6699 
   6700 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
   6701 {
   6702     return float128_compare_internal(a, b, 1 STATUS_VAR);
   6703 }
   6704 
   6705 /* min() and max() functions. These can't be implemented as
   6706  * 'compare and pick one input' because that would mishandle
   6707  * NaNs and +0 vs -0.
   6708  *
   6709  * minnum() and maxnum() functions. These are similar to the min()
   6710  * and max() functions but if one of the arguments is a QNaN and
   6711  * the other is numerical then the numerical argument is returned.
   6712  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
   6713  * and maxNum() operations. min() and max() are the typical min/max
   6714  * semantics provided by many CPUs which predate that specification.
   6715  */
   6716 #define MINMAX(s)                                                       \
   6717 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
   6718                                         int ismin, int isieee STATUS_PARAM) \
   6719 {                                                                       \
   6720     flag aSign, bSign;                                                  \
   6721     uint ## s ## _t av, bv;                                             \
   6722     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
   6723     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
   6724     if (float ## s ## _is_any_nan(a) ||                                 \
   6725         float ## s ## _is_any_nan(b)) {                                 \
   6726         if (isieee) {                                                   \
   6727             if (float ## s ## _is_quiet_nan(a) &&                       \
   6728                 !float ## s ##_is_any_nan(b)) {                         \
   6729                 return b;                                               \
   6730             } else if (float ## s ## _is_quiet_nan(b) &&                \
   6731                        !float ## s ## _is_any_nan(a)) {                 \
   6732                 return a;                                               \
   6733             }                                                           \
   6734         }                                                               \
   6735         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
   6736     }                                                                   \
   6737     aSign = extractFloat ## s ## Sign(a);                               \
   6738     bSign = extractFloat ## s ## Sign(b);                               \
   6739     av = float ## s ## _val(a);                                         \
   6740     bv = float ## s ## _val(b);                                         \
   6741     if (aSign != bSign) {                                               \
   6742         if (ismin) {                                                    \
   6743             return aSign ? a : b;                                       \
   6744         } else {                                                        \
   6745             return aSign ? b : a;                                       \
   6746         }                                                               \
   6747     } else {                                                            \
   6748         if (ismin) {                                                    \
   6749             return (aSign ^ (av < bv)) ? a : b;                         \
   6750         } else {                                                        \
   6751             return (aSign ^ (av < bv)) ? b : a;                         \
   6752         }                                                               \
   6753     }                                                                   \
   6754 }                                                                       \
   6755                                                                         \
   6756 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
   6757 {                                                                       \
   6758     return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
   6759 }                                                                       \
   6760                                                                         \
   6761 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
   6762 {                                                                       \
   6763     return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
   6764 }                                                                       \
   6765                                                                         \
   6766 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
   6767 {                                                                       \
   6768     return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
   6769 }                                                                       \
   6770                                                                         \
   6771 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
   6772 {                                                                       \
   6773     return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
   6774 }
   6775 
   6776 MINMAX(32)
   6777 MINMAX(64)
   6778 
   6779 
   6780 /* Multiply A by 2 raised to the power N.  */
   6781 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
   6782 {
   6783     flag aSign;
   6784     int16_t aExp;
   6785     uint32_t aSig;
   6786 
   6787     a = float32_squash_input_denormal(a STATUS_VAR);
   6788     aSig = extractFloat32Frac( a );
   6789     aExp = extractFloat32Exp( a );
   6790     aSign = extractFloat32Sign( a );
   6791 
   6792     if ( aExp == 0xFF ) {
   6793         if ( aSig ) {
   6794             return propagateFloat32NaN( a, a STATUS_VAR );
   6795         }
   6796         return a;
   6797     }
   6798     if ( aExp != 0 )
   6799         aSig |= 0x00800000;
   6800     else if ( aSig == 0 )
   6801         return a;
   6802 
   6803     if (n > 0x200) {
   6804         n = 0x200;
   6805     } else if (n < -0x200) {
   6806         n = -0x200;
   6807     }
   6808 
   6809     aExp += n - 1;
   6810     aSig <<= 7;
   6811     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   6812 }
   6813 
   6814 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
   6815 {
   6816     flag aSign;
   6817     int16_t aExp;
   6818     uint64_t aSig;
   6819 
   6820     a = float64_squash_input_denormal(a STATUS_VAR);
   6821     aSig = extractFloat64Frac( a );
   6822     aExp = extractFloat64Exp( a );
   6823     aSign = extractFloat64Sign( a );
   6824 
   6825     if ( aExp == 0x7FF ) {
   6826         if ( aSig ) {
   6827             return propagateFloat64NaN( a, a STATUS_VAR );
   6828         }
   6829         return a;
   6830     }
   6831     if ( aExp != 0 )
   6832         aSig |= LIT64( 0x0010000000000000 );
   6833     else if ( aSig == 0 )
   6834         return a;
   6835 
   6836     if (n > 0x1000) {
   6837         n = 0x1000;
   6838     } else if (n < -0x1000) {
   6839         n = -0x1000;
   6840     }
   6841 
   6842     aExp += n - 1;
   6843     aSig <<= 10;
   6844     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
   6845 }
   6846 
   6847 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
   6848 {
   6849     flag aSign;
   6850     int32_t aExp;
   6851     uint64_t aSig;
   6852 
   6853     aSig = extractFloatx80Frac( a );
   6854     aExp = extractFloatx80Exp( a );
   6855     aSign = extractFloatx80Sign( a );
   6856 
   6857     if ( aExp == 0x7FFF ) {
   6858         if ( aSig<<1 ) {
   6859             return propagateFloatx80NaN( a, a STATUS_VAR );
   6860         }
   6861         return a;
   6862     }
   6863 
   6864     if (aExp == 0 && aSig == 0)
   6865         return a;
   6866 
   6867     if (n > 0x10000) {
   6868         n = 0x10000;
   6869     } else if (n < -0x10000) {
   6870         n = -0x10000;
   6871     }
   6872 
   6873     aExp += n;
   6874     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
   6875                                           aSign, aExp, aSig, 0 STATUS_VAR );
   6876 }
   6877 
   6878 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
   6879 {
   6880     flag aSign;
   6881     int32_t aExp;
   6882     uint64_t aSig0, aSig1;
   6883 
   6884     aSig1 = extractFloat128Frac1( a );
   6885     aSig0 = extractFloat128Frac0( a );
   6886     aExp = extractFloat128Exp( a );
   6887     aSign = extractFloat128Sign( a );
   6888     if ( aExp == 0x7FFF ) {
   6889         if ( aSig0 | aSig1 ) {
   6890             return propagateFloat128NaN( a, a STATUS_VAR );
   6891         }
   6892         return a;
   6893     }
   6894     if ( aExp != 0 )
   6895         aSig0 |= LIT64( 0x0001000000000000 );
   6896     else if ( aSig0 == 0 && aSig1 == 0 )
   6897         return a;
   6898 
   6899     if (n > 0x10000) {
   6900         n = 0x10000;
   6901     } else if (n < -0x10000) {
   6902         n = -0x10000;
   6903     }
   6904 
   6905     aExp += n - 1;
   6906     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
   6907                                           STATUS_VAR );
   6908 
   6909 }
   6910