Home | History | Annotate | Download | only in fpu
      1 
      2 /*============================================================================
      3 
      4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
      5 Package, Release 2b.
      6 
      7 Written by John R. Hauser.  This work was made possible in part by the
      8 International Computer Science Institute, located at Suite 600, 1947 Center
      9 Street, Berkeley, California 94704.  Funding was partially provided by the
     10 National Science Foundation under grant MIP-9311980.  The original version
     11 of this code was written as part of a project to build a fixed-point vector
     12 processor in collaboration with the University of California at Berkeley,
     13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
     15 arithmetic/SoftFloat.html'.
     16 
     17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
     18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
     19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
     20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
     21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
     22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
     23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
     24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
     25 
     26 Derivative works are acceptable, even for commercial purposes, so long as
     27 (1) the source code for the derivative work includes prominent notice that
     28 the work is derivative, and (2) the source code includes prominent notice with
     29 these four paragraphs for those parts of this code that are retained.
     30 
     31 =============================================================================*/
     32 
     33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
     34    be flushed to zero.  */
     35 #include "softfloat.h"
     36 
     37 /*----------------------------------------------------------------------------
     38 | Primitive arithmetic functions, including multi-word arithmetic, and
     39 | division and square root approximations.  (Can be specialized to target if
     40 | desired.)
     41 *----------------------------------------------------------------------------*/
     42 #include "softfloat-macros.h"
     43 
     44 /*----------------------------------------------------------------------------
     45 | Functions and definitions to determine:  (1) whether tininess for underflow
     46 | is detected before or after rounding by default, (2) what (if anything)
     47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
     48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     49 | are propagated from function inputs to output.  These details are target-
     50 | specific.
     51 *----------------------------------------------------------------------------*/
     52 #include "softfloat-specialize.h"
     53 
     54 void set_float_rounding_mode(int val STATUS_PARAM)
     55 {
     56     STATUS(float_rounding_mode) = val;
     57 }
     58 
     59 void set_float_exception_flags(int val STATUS_PARAM)
     60 {
     61     STATUS(float_exception_flags) = val;
     62 }
     63 
     64 #ifdef FLOATX80
     65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
     66 {
     67     STATUS(floatx80_rounding_precision) = val;
     68 }
     69 #endif
     70 
     71 /*----------------------------------------------------------------------------
     72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
     73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
     74 | input.  If `zSign' is 1, the input is negated before being converted to an
     75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
     76 | is simply rounded to an integer, with the inexact exception raised if the
     77 | input cannot be represented exactly as an integer.  However, if the fixed-
     78 | point input is too large, the invalid exception is raised and the largest
     79 | positive or negative integer is returned.
     80 *----------------------------------------------------------------------------*/
     81 
     82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
     83 {
     84     int8 roundingMode;
     85     flag roundNearestEven;
     86     int8 roundIncrement, roundBits;
     87     int32 z;
     88 
     89     roundingMode = STATUS(float_rounding_mode);
     90     roundNearestEven = ( roundingMode == float_round_nearest_even );
     91     roundIncrement = 0x40;
     92     if ( ! roundNearestEven ) {
     93         if ( roundingMode == float_round_to_zero ) {
     94             roundIncrement = 0;
     95         }
     96         else {
     97             roundIncrement = 0x7F;
     98             if ( zSign ) {
     99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    100             }
    101             else {
    102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    103             }
    104         }
    105     }
    106     roundBits = absZ & 0x7F;
    107     absZ = ( absZ + roundIncrement )>>7;
    108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    109     z = absZ;
    110     if ( zSign ) z = - z;
    111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    112         float_raise( float_flag_invalid STATUS_VAR);
    113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    114     }
    115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    116     return z;
    117 
    118 }
    119 
    120 /*----------------------------------------------------------------------------
    121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
    123 | and returns the properly rounded 64-bit integer corresponding to the input.
    124 | If `zSign' is 1, the input is negated before being converted to an integer.
    125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
    126 | the inexact exception raised if the input cannot be represented exactly as
    127 | an integer.  However, if the fixed-point input is too large, the invalid
    128 | exception is raised and the largest positive or negative integer is
    129 | returned.
    130 *----------------------------------------------------------------------------*/
    131 
    132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
    133 {
    134     int8 roundingMode;
    135     flag roundNearestEven, increment;
    136     int64 z;
    137 
    138     roundingMode = STATUS(float_rounding_mode);
    139     roundNearestEven = ( roundingMode == float_round_nearest_even );
    140     increment = ( (sbits64) absZ1 < 0 );
    141     if ( ! roundNearestEven ) {
    142         if ( roundingMode == float_round_to_zero ) {
    143             increment = 0;
    144         }
    145         else {
    146             if ( zSign ) {
    147                 increment = ( roundingMode == float_round_down ) && absZ1;
    148             }
    149             else {
    150                 increment = ( roundingMode == float_round_up ) && absZ1;
    151             }
    152         }
    153     }
    154     if ( increment ) {
    155         ++absZ0;
    156         if ( absZ0 == 0 ) goto overflow;
    157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    158     }
    159     z = absZ0;
    160     if ( zSign ) z = - z;
    161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    162  overflow:
    163         float_raise( float_flag_invalid STATUS_VAR);
    164         return
    165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    166             : LIT64( 0x7FFFFFFFFFFFFFFF );
    167     }
    168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    169     return z;
    170 
    171 }
    172 
    173 /*----------------------------------------------------------------------------
    174 | Returns the fraction bits of the single-precision floating-point value `a'.
    175 *----------------------------------------------------------------------------*/
    176 
    177 INLINE bits32 extractFloat32Frac( float32 a )
    178 {
    179 
    180     return float32_val(a) & 0x007FFFFF;
    181 
    182 }
    183 
    184 /*----------------------------------------------------------------------------
    185 | Returns the exponent bits of the single-precision floating-point value `a'.
    186 *----------------------------------------------------------------------------*/
    187 
    188 INLINE int16 extractFloat32Exp( float32 a )
    189 {
    190 
    191     return ( float32_val(a)>>23 ) & 0xFF;
    192 
    193 }
    194 
    195 /*----------------------------------------------------------------------------
    196 | Returns the sign bit of the single-precision floating-point value `a'.
    197 *----------------------------------------------------------------------------*/
    198 
    199 INLINE flag extractFloat32Sign( float32 a )
    200 {
    201 
    202     return float32_val(a)>>31;
    203 
    204 }
    205 
    206 /*----------------------------------------------------------------------------
    207 | Normalizes the subnormal single-precision floating-point value represented
    208 | by the denormalized significand `aSig'.  The normalized exponent and
    209 | significand are stored at the locations pointed to by `zExpPtr' and
    210 | `zSigPtr', respectively.
    211 *----------------------------------------------------------------------------*/
    212 
    213 static void
    214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    215 {
    216     int8 shiftCount;
    217 
    218     shiftCount = countLeadingZeros32( aSig ) - 8;
    219     *zSigPtr = aSig<<shiftCount;
    220     *zExpPtr = 1 - shiftCount;
    221 
    222 }
    223 
    224 /*----------------------------------------------------------------------------
    225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    226 | single-precision floating-point value, returning the result.  After being
    227 | shifted into the proper positions, the three fields are simply added
    228 | together to form the result.  This means that any integer portion of `zSig'
    229 | will be added into the exponent.  Since a properly normalized significand
    230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    231 | than the desired result exponent whenever `zSig' is a complete, normalized
    232 | significand.
    233 *----------------------------------------------------------------------------*/
    234 
    235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    236 {
    237 
    238     return make_float32(
    239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
    240 
    241 }
    242 
    243 /*----------------------------------------------------------------------------
    244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    245 | and significand `zSig', and returns the proper single-precision floating-
    246 | point value corresponding to the abstract input.  Ordinarily, the abstract
    247 | value is simply rounded and packed into the single-precision format, with
    248 | the inexact exception raised if the abstract input cannot be represented
    249 | exactly.  However, if the abstract value is too large, the overflow and
    250 | inexact exceptions are raised and an infinity or maximal finite value is
    251 | returned.  If the abstract value is too small, the input value is rounded to
    252 | a subnormal number, and the underflow and inexact exceptions are raised if
    253 | the abstract input cannot be represented exactly as a subnormal single-
    254 | precision floating-point number.
    255 |     The input significand `zSig' has its binary point between bits 30
    256 | and 29, which is 7 bits to the left of the usual location.  This shifted
    257 | significand must be normalized or smaller.  If `zSig' is not normalized,
    258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    259 | and it must not require rounding.  In the usual case that `zSig' is
    260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    262 | Binary Floating-Point Arithmetic.
    263 *----------------------------------------------------------------------------*/
    264 
    265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
    266 {
    267     int8 roundingMode;
    268     flag roundNearestEven;
    269     int8 roundIncrement, roundBits;
    270     flag isTiny;
    271 
    272     roundingMode = STATUS(float_rounding_mode);
    273     roundNearestEven = ( roundingMode == float_round_nearest_even );
    274     roundIncrement = 0x40;
    275     if ( ! roundNearestEven ) {
    276         if ( roundingMode == float_round_to_zero ) {
    277             roundIncrement = 0;
    278         }
    279         else {
    280             roundIncrement = 0x7F;
    281             if ( zSign ) {
    282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    283             }
    284             else {
    285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    286             }
    287         }
    288     }
    289     roundBits = zSig & 0x7F;
    290     if ( 0xFD <= (bits16) zExp ) {
    291         if (    ( 0xFD < zExp )
    292              || (    ( zExp == 0xFD )
    293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    294            ) {
    295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
    297         }
    298         if ( zExp < 0 ) {
    299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
    300             isTiny =
    301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    302                 || ( zExp < -1 )
    303                 || ( zSig + roundIncrement < 0x80000000 );
    304             shift32RightJamming( zSig, - zExp, &zSig );
    305             zExp = 0;
    306             roundBits = zSig & 0x7F;
    307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    308         }
    309     }
    310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    311     zSig = ( zSig + roundIncrement )>>7;
    312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    313     if ( zSig == 0 ) zExp = 0;
    314     return packFloat32( zSign, zExp, zSig );
    315 
    316 }
    317 
    318 /*----------------------------------------------------------------------------
    319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    320 | and significand `zSig', and returns the proper single-precision floating-
    321 | point value corresponding to the abstract input.  This routine is just like
    322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    324 | floating-point exponent.
    325 *----------------------------------------------------------------------------*/
    326 
    327 static float32
    328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
    329 {
    330     int8 shiftCount;
    331 
    332     shiftCount = countLeadingZeros32( zSig ) - 1;
    333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    334 
    335 }
    336 
    337 /*----------------------------------------------------------------------------
    338 | Returns the fraction bits of the double-precision floating-point value `a'.
    339 *----------------------------------------------------------------------------*/
    340 
    341 INLINE bits64 extractFloat64Frac( float64 a )
    342 {
    343 
    344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
    345 
    346 }
    347 
    348 /*----------------------------------------------------------------------------
    349 | Returns the exponent bits of the double-precision floating-point value `a'.
    350 *----------------------------------------------------------------------------*/
    351 
    352 INLINE int16 extractFloat64Exp( float64 a )
    353 {
    354 
    355     return ( float64_val(a)>>52 ) & 0x7FF;
    356 
    357 }
    358 
    359 /*----------------------------------------------------------------------------
    360 | Returns the sign bit of the double-precision floating-point value `a'.
    361 *----------------------------------------------------------------------------*/
    362 
    363 INLINE flag extractFloat64Sign( float64 a )
    364 {
    365 
    366     return float64_val(a)>>63;
    367 
    368 }
    369 
    370 /*----------------------------------------------------------------------------
    371 | Normalizes the subnormal double-precision floating-point value represented
    372 | by the denormalized significand `aSig'.  The normalized exponent and
    373 | significand are stored at the locations pointed to by `zExpPtr' and
    374 | `zSigPtr', respectively.
    375 *----------------------------------------------------------------------------*/
    376 
    377 static void
    378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    379 {
    380     int8 shiftCount;
    381 
    382     shiftCount = countLeadingZeros64( aSig ) - 11;
    383     *zSigPtr = aSig<<shiftCount;
    384     *zExpPtr = 1 - shiftCount;
    385 
    386 }
    387 
    388 /*----------------------------------------------------------------------------
    389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    390 | double-precision floating-point value, returning the result.  After being
    391 | shifted into the proper positions, the three fields are simply added
    392 | together to form the result.  This means that any integer portion of `zSig'
    393 | will be added into the exponent.  Since a properly normalized significand
    394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    395 | than the desired result exponent whenever `zSig' is a complete, normalized
    396 | significand.
    397 *----------------------------------------------------------------------------*/
    398 
    399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    400 {
    401 
    402     return make_float64(
    403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
    404 
    405 }
    406 
    407 /*----------------------------------------------------------------------------
    408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    409 | and significand `zSig', and returns the proper double-precision floating-
    410 | point value corresponding to the abstract input.  Ordinarily, the abstract
    411 | value is simply rounded and packed into the double-precision format, with
    412 | the inexact exception raised if the abstract input cannot be represented
    413 | exactly.  However, if the abstract value is too large, the overflow and
    414 | inexact exceptions are raised and an infinity or maximal finite value is
    415 | returned.  If the abstract value is too small, the input value is rounded
    416 | to a subnormal number, and the underflow and inexact exceptions are raised
    417 | if the abstract input cannot be represented exactly as a subnormal double-
    418 | precision floating-point number.
    419 |     The input significand `zSig' has its binary point between bits 62
    420 | and 61, which is 10 bits to the left of the usual location.  This shifted
    421 | significand must be normalized or smaller.  If `zSig' is not normalized,
    422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    423 | and it must not require rounding.  In the usual case that `zSig' is
    424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    426 | Binary Floating-Point Arithmetic.
    427 *----------------------------------------------------------------------------*/
    428 
    429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
    430 {
    431     int8 roundingMode;
    432     flag roundNearestEven;
    433     int16 roundIncrement, roundBits;
    434     flag isTiny;
    435 
    436     roundingMode = STATUS(float_rounding_mode);
    437     roundNearestEven = ( roundingMode == float_round_nearest_even );
    438     roundIncrement = 0x200;
    439     if ( ! roundNearestEven ) {
    440         if ( roundingMode == float_round_to_zero ) {
    441             roundIncrement = 0;
    442         }
    443         else {
    444             roundIncrement = 0x3FF;
    445             if ( zSign ) {
    446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    447             }
    448             else {
    449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    450             }
    451         }
    452     }
    453     roundBits = zSig & 0x3FF;
    454     if ( 0x7FD <= (bits16) zExp ) {
    455         if (    ( 0x7FD < zExp )
    456              || (    ( zExp == 0x7FD )
    457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    458            ) {
    459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
    461         }
    462         if ( zExp < 0 ) {
    463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
    464             isTiny =
    465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    466                 || ( zExp < -1 )
    467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    468             shift64RightJamming( zSig, - zExp, &zSig );
    469             zExp = 0;
    470             roundBits = zSig & 0x3FF;
    471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    472         }
    473     }
    474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    475     zSig = ( zSig + roundIncrement )>>10;
    476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    477     if ( zSig == 0 ) zExp = 0;
    478     return packFloat64( zSign, zExp, zSig );
    479 
    480 }
    481 
    482 /*----------------------------------------------------------------------------
    483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    484 | and significand `zSig', and returns the proper double-precision floating-
    485 | point value corresponding to the abstract input.  This routine is just like
    486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    488 | floating-point exponent.
    489 *----------------------------------------------------------------------------*/
    490 
    491 static float64
    492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
    493 {
    494     int8 shiftCount;
    495 
    496     shiftCount = countLeadingZeros64( zSig ) - 1;
    497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    498 
    499 }
    500 
    501 #ifdef FLOATX80
    502 
    503 /*----------------------------------------------------------------------------
    504 | Returns the fraction bits of the extended double-precision floating-point
    505 | value `a'.
    506 *----------------------------------------------------------------------------*/
    507 
    508 INLINE bits64 extractFloatx80Frac( floatx80 a )
    509 {
    510 
    511     return a.low;
    512 
    513 }
    514 
    515 /*----------------------------------------------------------------------------
    516 | Returns the exponent bits of the extended double-precision floating-point
    517 | value `a'.
    518 *----------------------------------------------------------------------------*/
    519 
    520 INLINE int32 extractFloatx80Exp( floatx80 a )
    521 {
    522 
    523     return a.high & 0x7FFF;
    524 
    525 }
    526 
    527 /*----------------------------------------------------------------------------
    528 | Returns the sign bit of the extended double-precision floating-point value
    529 | `a'.
    530 *----------------------------------------------------------------------------*/
    531 
    532 INLINE flag extractFloatx80Sign( floatx80 a )
    533 {
    534 
    535     return a.high>>15;
    536 
    537 }
    538 
    539 /*----------------------------------------------------------------------------
    540 | Normalizes the subnormal extended double-precision floating-point value
    541 | represented by the denormalized significand `aSig'.  The normalized exponent
    542 | and significand are stored at the locations pointed to by `zExpPtr' and
    543 | `zSigPtr', respectively.
    544 *----------------------------------------------------------------------------*/
    545 
    546 static void
    547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    548 {
    549     int8 shiftCount;
    550 
    551     shiftCount = countLeadingZeros64( aSig );
    552     *zSigPtr = aSig<<shiftCount;
    553     *zExpPtr = 1 - shiftCount;
    554 
    555 }
    556 
    557 /*----------------------------------------------------------------------------
    558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    559 | extended double-precision floating-point value, returning the result.
    560 *----------------------------------------------------------------------------*/
    561 
    562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    563 {
    564     floatx80 z;
    565 
    566     z.low = zSig;
    567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    568     return z;
    569 
    570 }
    571 
    572 /*----------------------------------------------------------------------------
    573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
    575 | and returns the proper extended double-precision floating-point value
    576 | corresponding to the abstract input.  Ordinarily, the abstract value is
    577 | rounded and packed into the extended double-precision format, with the
    578 | inexact exception raised if the abstract input cannot be represented
    579 | exactly.  However, if the abstract value is too large, the overflow and
    580 | inexact exceptions are raised and an infinity or maximal finite value is
    581 | returned.  If the abstract value is too small, the input value is rounded to
    582 | a subnormal number, and the underflow and inexact exceptions are raised if
    583 | the abstract input cannot be represented exactly as a subnormal extended
    584 | double-precision floating-point number.
    585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    586 | number of bits as single or double precision, respectively.  Otherwise, the
    587 | result is rounded to the full precision of the extended double-precision
    588 | format.
    589 |     The input significand must be normalized or smaller.  If the input
    590 | significand is not normalized, `zExp' must be 0; in that case, the result
    591 | returned is a subnormal number, and it must not require rounding.  The
    592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    593 | Floating-Point Arithmetic.
    594 *----------------------------------------------------------------------------*/
    595 
    596 static floatx80
    597  roundAndPackFloatx80(
    598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    599  STATUS_PARAM)
    600 {
    601     int8 roundingMode;
    602     flag roundNearestEven, increment, isTiny;
    603     int64 roundIncrement, roundMask, roundBits;
    604 
    605     roundingMode = STATUS(float_rounding_mode);
    606     roundNearestEven = ( roundingMode == float_round_nearest_even );
    607     if ( roundingPrecision == 80 ) goto precision80;
    608     if ( roundingPrecision == 64 ) {
    609         roundIncrement = LIT64( 0x0000000000000400 );
    610         roundMask = LIT64( 0x00000000000007FF );
    611     }
    612     else if ( roundingPrecision == 32 ) {
    613         roundIncrement = LIT64( 0x0000008000000000 );
    614         roundMask = LIT64( 0x000000FFFFFFFFFF );
    615     }
    616     else {
    617         goto precision80;
    618     }
    619     zSig0 |= ( zSig1 != 0 );
    620     if ( ! roundNearestEven ) {
    621         if ( roundingMode == float_round_to_zero ) {
    622             roundIncrement = 0;
    623         }
    624         else {
    625             roundIncrement = roundMask;
    626             if ( zSign ) {
    627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    628             }
    629             else {
    630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    631             }
    632         }
    633     }
    634     roundBits = zSig0 & roundMask;
    635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    636         if (    ( 0x7FFE < zExp )
    637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    638            ) {
    639             goto overflow;
    640         }
    641         if ( zExp <= 0 ) {
    642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
    643             isTiny =
    644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    645                 || ( zExp < 0 )
    646                 || ( zSig0 <= zSig0 + roundIncrement );
    647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    648             zExp = 0;
    649             roundBits = zSig0 & roundMask;
    650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    652             zSig0 += roundIncrement;
    653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
    654             roundIncrement = roundMask + 1;
    655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    656                 roundMask |= roundIncrement;
    657             }
    658             zSig0 &= ~ roundMask;
    659             return packFloatx80( zSign, zExp, zSig0 );
    660         }
    661     }
    662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    663     zSig0 += roundIncrement;
    664     if ( zSig0 < roundIncrement ) {
    665         ++zExp;
    666         zSig0 = LIT64( 0x8000000000000000 );
    667     }
    668     roundIncrement = roundMask + 1;
    669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    670         roundMask |= roundIncrement;
    671     }
    672     zSig0 &= ~ roundMask;
    673     if ( zSig0 == 0 ) zExp = 0;
    674     return packFloatx80( zSign, zExp, zSig0 );
    675  precision80:
    676     increment = ( (sbits64) zSig1 < 0 );
    677     if ( ! roundNearestEven ) {
    678         if ( roundingMode == float_round_to_zero ) {
    679             increment = 0;
    680         }
    681         else {
    682             if ( zSign ) {
    683                 increment = ( roundingMode == float_round_down ) && zSig1;
    684             }
    685             else {
    686                 increment = ( roundingMode == float_round_up ) && zSig1;
    687             }
    688         }
    689     }
    690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    691         if (    ( 0x7FFE < zExp )
    692              || (    ( zExp == 0x7FFE )
    693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    694                   && increment
    695                 )
    696            ) {
    697             roundMask = 0;
    698  overflow:
    699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    700             if (    ( roundingMode == float_round_to_zero )
    701                  || ( zSign && ( roundingMode == float_round_up ) )
    702                  || ( ! zSign && ( roundingMode == float_round_down ) )
    703                ) {
    704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    705             }
    706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    707         }
    708         if ( zExp <= 0 ) {
    709             isTiny =
    710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    711                 || ( zExp < 0 )
    712                 || ! increment
    713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    715             zExp = 0;
    716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
    717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    718             if ( roundNearestEven ) {
    719                 increment = ( (sbits64) zSig1 < 0 );
    720             }
    721             else {
    722                 if ( zSign ) {
    723                     increment = ( roundingMode == float_round_down ) && zSig1;
    724                 }
    725                 else {
    726                     increment = ( roundingMode == float_round_up ) && zSig1;
    727                 }
    728             }
    729             if ( increment ) {
    730                 ++zSig0;
    731                 zSig0 &=
    732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    734             }
    735             return packFloatx80( zSign, zExp, zSig0 );
    736         }
    737     }
    738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    739     if ( increment ) {
    740         ++zSig0;
    741         if ( zSig0 == 0 ) {
    742             ++zExp;
    743             zSig0 = LIT64( 0x8000000000000000 );
    744         }
    745         else {
    746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    747         }
    748     }
    749     else {
    750         if ( zSig0 == 0 ) zExp = 0;
    751     }
    752     return packFloatx80( zSign, zExp, zSig0 );
    753 
    754 }
    755 
    756 /*----------------------------------------------------------------------------
    757 | Takes an abstract floating-point value having sign `zSign', exponent
    758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    759 | and returns the proper extended double-precision floating-point value
    760 | corresponding to the abstract input.  This routine is just like
    761 | `roundAndPackFloatx80' except that the input significand does not have to be
    762 | normalized.
    763 *----------------------------------------------------------------------------*/
    764 
    765 static floatx80
    766  normalizeRoundAndPackFloatx80(
    767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    768  STATUS_PARAM)
    769 {
    770     int8 shiftCount;
    771 
    772     if ( zSig0 == 0 ) {
    773         zSig0 = zSig1;
    774         zSig1 = 0;
    775         zExp -= 64;
    776     }
    777     shiftCount = countLeadingZeros64( zSig0 );
    778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    779     zExp -= shiftCount;
    780     return
    781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
    782 
    783 }
    784 
    785 #endif
    786 
    787 #ifdef FLOAT128
    788 
    789 /*----------------------------------------------------------------------------
    790 | Returns the least-significant 64 fraction bits of the quadruple-precision
    791 | floating-point value `a'.
    792 *----------------------------------------------------------------------------*/
    793 
    794 INLINE bits64 extractFloat128Frac1( float128 a )
    795 {
    796 
    797     return a.low;
    798 
    799 }
    800 
    801 /*----------------------------------------------------------------------------
    802 | Returns the most-significant 48 fraction bits of the quadruple-precision
    803 | floating-point value `a'.
    804 *----------------------------------------------------------------------------*/
    805 
    806 INLINE bits64 extractFloat128Frac0( float128 a )
    807 {
    808 
    809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    810 
    811 }
    812 
    813 /*----------------------------------------------------------------------------
    814 | Returns the exponent bits of the quadruple-precision floating-point value
    815 | `a'.
    816 *----------------------------------------------------------------------------*/
    817 
    818 INLINE int32 extractFloat128Exp( float128 a )
    819 {
    820 
    821     return ( a.high>>48 ) & 0x7FFF;
    822 
    823 }
    824 
    825 /*----------------------------------------------------------------------------
    826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
    827 *----------------------------------------------------------------------------*/
    828 
    829 INLINE flag extractFloat128Sign( float128 a )
    830 {
    831 
    832     return a.high>>63;
    833 
    834 }
    835 
    836 /*----------------------------------------------------------------------------
    837 | Normalizes the subnormal quadruple-precision floating-point value
    838 | represented by the denormalized significand formed by the concatenation of
    839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    841 | significand are stored at the location pointed to by `zSig0Ptr', and the
    842 | least significant 64 bits of the normalized significand are stored at the
    843 | location pointed to by `zSig1Ptr'.
    844 *----------------------------------------------------------------------------*/
    845 
    846 static void
    847  normalizeFloat128Subnormal(
    848      bits64 aSig0,
    849      bits64 aSig1,
    850      int32 *zExpPtr,
    851      bits64 *zSig0Ptr,
    852      bits64 *zSig1Ptr
    853  )
    854 {
    855     int8 shiftCount;
    856 
    857     if ( aSig0 == 0 ) {
    858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    859         if ( shiftCount < 0 ) {
    860             *zSig0Ptr = aSig1>>( - shiftCount );
    861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    862         }
    863         else {
    864             *zSig0Ptr = aSig1<<shiftCount;
    865             *zSig1Ptr = 0;
    866         }
    867         *zExpPtr = - shiftCount - 63;
    868     }
    869     else {
    870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    872         *zExpPtr = 1 - shiftCount;
    873     }
    874 
    875 }
    876 
    877 /*----------------------------------------------------------------------------
    878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
    879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    880 | floating-point value, returning the result.  After being shifted into the
    881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    882 | added together to form the most significant 32 bits of the result.  This
    883 | means that any integer portion of `zSig0' will be added into the exponent.
    884 | Since a properly normalized significand will have an integer portion equal
    885 | to 1, the `zExp' input should be 1 less than the desired result exponent
    886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    887 | significand.
    888 *----------------------------------------------------------------------------*/
    889 
    890 INLINE float128
    891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    892 {
    893     float128 z;
    894 
    895     z.low = zSig1;
    896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    897     return z;
    898 
    899 }
    900 
    901 /*----------------------------------------------------------------------------
    902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
    904 | and `zSig2', and returns the proper quadruple-precision floating-point value
    905 | corresponding to the abstract input.  Ordinarily, the abstract value is
    906 | simply rounded and packed into the quadruple-precision format, with the
    907 | inexact exception raised if the abstract input cannot be represented
    908 | exactly.  However, if the abstract value is too large, the overflow and
    909 | inexact exceptions are raised and an infinity or maximal finite value is
    910 | returned.  If the abstract value is too small, the input value is rounded to
    911 | a subnormal number, and the underflow and inexact exceptions are raised if
    912 | the abstract input cannot be represented exactly as a subnormal quadruple-
    913 | precision floating-point number.
    914 |     The input significand must be normalized or smaller.  If the input
    915 | significand is not normalized, `zExp' must be 0; in that case, the result
    916 | returned is a subnormal number, and it must not require rounding.  In the
    917 | usual case that the input significand is normalized, `zExp' must be 1 less
    918 | than the ``true'' floating-point exponent.  The handling of underflow and
    919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    920 *----------------------------------------------------------------------------*/
    921 
    922 static float128
    923  roundAndPackFloat128(
    924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
    925 {
    926     int8 roundingMode;
    927     flag roundNearestEven, increment, isTiny;
    928 
    929     roundingMode = STATUS(float_rounding_mode);
    930     roundNearestEven = ( roundingMode == float_round_nearest_even );
    931     increment = ( (sbits64) zSig2 < 0 );
    932     if ( ! roundNearestEven ) {
    933         if ( roundingMode == float_round_to_zero ) {
    934             increment = 0;
    935         }
    936         else {
    937             if ( zSign ) {
    938                 increment = ( roundingMode == float_round_down ) && zSig2;
    939             }
    940             else {
    941                 increment = ( roundingMode == float_round_up ) && zSig2;
    942             }
    943         }
    944     }
    945     if ( 0x7FFD <= (bits32) zExp ) {
    946         if (    ( 0x7FFD < zExp )
    947              || (    ( zExp == 0x7FFD )
    948                   && eq128(
    949                          LIT64( 0x0001FFFFFFFFFFFF ),
    950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
    951                          zSig0,
    952                          zSig1
    953                      )
    954                   && increment
    955                 )
    956            ) {
    957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    958             if (    ( roundingMode == float_round_to_zero )
    959                  || ( zSign && ( roundingMode == float_round_up ) )
    960                  || ( ! zSign && ( roundingMode == float_round_down ) )
    961                ) {
    962                 return
    963                     packFloat128(
    964                         zSign,
    965                         0x7FFE,
    966                         LIT64( 0x0000FFFFFFFFFFFF ),
    967                         LIT64( 0xFFFFFFFFFFFFFFFF )
    968                     );
    969             }
    970             return packFloat128( zSign, 0x7FFF, 0, 0 );
    971         }
    972         if ( zExp < 0 ) {
    973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
    974             isTiny =
    975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    976                 || ( zExp < -1 )
    977                 || ! increment
    978                 || lt128(
    979                        zSig0,
    980                        zSig1,
    981                        LIT64( 0x0001FFFFFFFFFFFF ),
    982                        LIT64( 0xFFFFFFFFFFFFFFFF )
    983                    );
    984             shift128ExtraRightJamming(
    985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
    986             zExp = 0;
    987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
    988             if ( roundNearestEven ) {
    989                 increment = ( (sbits64) zSig2 < 0 );
    990             }
    991             else {
    992                 if ( zSign ) {
    993                     increment = ( roundingMode == float_round_down ) && zSig2;
    994                 }
    995                 else {
    996                     increment = ( roundingMode == float_round_up ) && zSig2;
    997                 }
    998             }
    999         }
   1000     }
   1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
   1002     if ( increment ) {
   1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1005     }
   1006     else {
   1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1008     }
   1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1010 
   1011 }
   1012 
   1013 /*----------------------------------------------------------------------------
   1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
   1016 | returns the proper quadruple-precision floating-point value corresponding
   1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
   1018 | except that the input significand has fewer bits and does not have to be
   1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1020 | point exponent.
   1021 *----------------------------------------------------------------------------*/
   1022 
   1023 static float128
   1024  normalizeRoundAndPackFloat128(
   1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
   1026 {
   1027     int8 shiftCount;
   1028     bits64 zSig2;
   1029 
   1030     if ( zSig0 == 0 ) {
   1031         zSig0 = zSig1;
   1032         zSig1 = 0;
   1033         zExp -= 64;
   1034     }
   1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1036     if ( 0 <= shiftCount ) {
   1037         zSig2 = 0;
   1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1039     }
   1040     else {
   1041         shift128ExtraRightJamming(
   1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1043     }
   1044     zExp -= shiftCount;
   1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
   1046 
   1047 }
   1048 
   1049 #endif
   1050 
   1051 /*----------------------------------------------------------------------------
   1052 | Returns the result of converting the 32-bit two's complement integer `a'
   1053 | to the single-precision floating-point format.  The conversion is performed
   1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1055 *----------------------------------------------------------------------------*/
   1056 
   1057 float32 int32_to_float32( int32 a STATUS_PARAM )
   1058 {
   1059     flag zSign;
   1060 
   1061     if ( a == 0 ) return float32_zero;
   1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1063     zSign = ( a < 0 );
   1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
   1065 
   1066 }
   1067 
   1068 /*----------------------------------------------------------------------------
   1069 | Returns the result of converting the 32-bit two's complement integer `a'
   1070 | to the double-precision floating-point format.  The conversion is performed
   1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1072 *----------------------------------------------------------------------------*/
   1073 
   1074 float64 int32_to_float64( int32 a STATUS_PARAM )
   1075 {
   1076     flag zSign;
   1077     uint32 absA;
   1078     int8 shiftCount;
   1079     bits64 zSig;
   1080 
   1081     if ( a == 0 ) return float64_zero;
   1082     zSign = ( a < 0 );
   1083     absA = zSign ? - a : a;
   1084     shiftCount = countLeadingZeros32( absA ) + 21;
   1085     zSig = absA;
   1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1087 
   1088 }
   1089 
   1090 #ifdef FLOATX80
   1091 
   1092 /*----------------------------------------------------------------------------
   1093 | Returns the result of converting the 32-bit two's complement integer `a'
   1094 | to the extended double-precision floating-point format.  The conversion
   1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1096 | Arithmetic.
   1097 *----------------------------------------------------------------------------*/
   1098 
   1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
   1100 {
   1101     flag zSign;
   1102     uint32 absA;
   1103     int8 shiftCount;
   1104     bits64 zSig;
   1105 
   1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1107     zSign = ( a < 0 );
   1108     absA = zSign ? - a : a;
   1109     shiftCount = countLeadingZeros32( absA ) + 32;
   1110     zSig = absA;
   1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1112 
   1113 }
   1114 
   1115 #endif
   1116 
   1117 #ifdef FLOAT128
   1118 
   1119 /*----------------------------------------------------------------------------
   1120 | Returns the result of converting the 32-bit two's complement integer `a' to
   1121 | the quadruple-precision floating-point format.  The conversion is performed
   1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1123 *----------------------------------------------------------------------------*/
   1124 
   1125 float128 int32_to_float128( int32 a STATUS_PARAM )
   1126 {
   1127     flag zSign;
   1128     uint32 absA;
   1129     int8 shiftCount;
   1130     bits64 zSig0;
   1131 
   1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1133     zSign = ( a < 0 );
   1134     absA = zSign ? - a : a;
   1135     shiftCount = countLeadingZeros32( absA ) + 17;
   1136     zSig0 = absA;
   1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1138 
   1139 }
   1140 
   1141 #endif
   1142 
   1143 /*----------------------------------------------------------------------------
   1144 | Returns the result of converting the 64-bit two's complement integer `a'
   1145 | to the single-precision floating-point format.  The conversion is performed
   1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1147 *----------------------------------------------------------------------------*/
   1148 
   1149 float32 int64_to_float32( int64 a STATUS_PARAM )
   1150 {
   1151     flag zSign;
   1152     uint64 absA;
   1153     int8 shiftCount;
   1154 
   1155     if ( a == 0 ) return float32_zero;
   1156     zSign = ( a < 0 );
   1157     absA = zSign ? - a : a;
   1158     shiftCount = countLeadingZeros64( absA ) - 40;
   1159     if ( 0 <= shiftCount ) {
   1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1161     }
   1162     else {
   1163         shiftCount += 7;
   1164         if ( shiftCount < 0 ) {
   1165             shift64RightJamming( absA, - shiftCount, &absA );
   1166         }
   1167         else {
   1168             absA <<= shiftCount;
   1169         }
   1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
   1171     }
   1172 
   1173 }
   1174 
   1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
   1176 {
   1177     int8 shiftCount;
   1178 
   1179     if ( a == 0 ) return float32_zero;
   1180     shiftCount = countLeadingZeros64( a ) - 40;
   1181     if ( 0 <= shiftCount ) {
   1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
   1183     }
   1184     else {
   1185         shiftCount += 7;
   1186         if ( shiftCount < 0 ) {
   1187             shift64RightJamming( a, - shiftCount, &a );
   1188         }
   1189         else {
   1190             a <<= shiftCount;
   1191         }
   1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
   1193     }
   1194 }
   1195 
   1196 /*----------------------------------------------------------------------------
   1197 | Returns the result of converting the 64-bit two's complement integer `a'
   1198 | to the double-precision floating-point format.  The conversion is performed
   1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1200 *----------------------------------------------------------------------------*/
   1201 
   1202 float64 int64_to_float64( int64 a STATUS_PARAM )
   1203 {
   1204     flag zSign;
   1205 
   1206     if ( a == 0 ) return float64_zero;
   1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1208         return packFloat64( 1, 0x43E, 0 );
   1209     }
   1210     zSign = ( a < 0 );
   1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
   1212 
   1213 }
   1214 
   1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
   1216 {
   1217     if ( a == 0 ) return float64_zero;
   1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
   1219 
   1220 }
   1221 
   1222 #ifdef FLOATX80
   1223 
   1224 /*----------------------------------------------------------------------------
   1225 | Returns the result of converting the 64-bit two's complement integer `a'
   1226 | to the extended double-precision floating-point format.  The conversion
   1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1228 | Arithmetic.
   1229 *----------------------------------------------------------------------------*/
   1230 
   1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
   1232 {
   1233     flag zSign;
   1234     uint64 absA;
   1235     int8 shiftCount;
   1236 
   1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1238     zSign = ( a < 0 );
   1239     absA = zSign ? - a : a;
   1240     shiftCount = countLeadingZeros64( absA );
   1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1242 
   1243 }
   1244 
   1245 #endif
   1246 
   1247 #ifdef FLOAT128
   1248 
   1249 /*----------------------------------------------------------------------------
   1250 | Returns the result of converting the 64-bit two's complement integer `a' to
   1251 | the quadruple-precision floating-point format.  The conversion is performed
   1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1253 *----------------------------------------------------------------------------*/
   1254 
   1255 float128 int64_to_float128( int64 a STATUS_PARAM )
   1256 {
   1257     flag zSign;
   1258     uint64 absA;
   1259     int8 shiftCount;
   1260     int32 zExp;
   1261     bits64 zSig0, zSig1;
   1262 
   1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1264     zSign = ( a < 0 );
   1265     absA = zSign ? - a : a;
   1266     shiftCount = countLeadingZeros64( absA ) + 49;
   1267     zExp = 0x406E - shiftCount;
   1268     if ( 64 <= shiftCount ) {
   1269         zSig1 = 0;
   1270         zSig0 = absA;
   1271         shiftCount -= 64;
   1272     }
   1273     else {
   1274         zSig1 = absA;
   1275         zSig0 = 0;
   1276     }
   1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1279 
   1280 }
   1281 
   1282 #endif
   1283 
   1284 /*----------------------------------------------------------------------------
   1285 | Returns the result of converting the single-precision floating-point value
   1286 | `a' to the 32-bit two's complement integer format.  The conversion is
   1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1288 | Arithmetic---which means in particular that the conversion is rounded
   1289 | according to the current rounding mode.  If `a' is a NaN, the largest
   1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1291 | largest integer with the same sign as `a' is returned.
   1292 *----------------------------------------------------------------------------*/
   1293 
   1294 int32 float32_to_int32( float32 a STATUS_PARAM )
   1295 {
   1296     flag aSign;
   1297     int16 aExp, shiftCount;
   1298     bits32 aSig;
   1299     bits64 aSig64;
   1300 
   1301     aSig = extractFloat32Frac( a );
   1302     aExp = extractFloat32Exp( a );
   1303     aSign = extractFloat32Sign( a );
   1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1305     if ( aExp ) aSig |= 0x00800000;
   1306     shiftCount = 0xAF - aExp;
   1307     aSig64 = aSig;
   1308     aSig64 <<= 32;
   1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
   1311 
   1312 }
   1313 
   1314 /*----------------------------------------------------------------------------
   1315 | Returns the result of converting the single-precision floating-point value
   1316 | `a' to the 32-bit two's complement integer format.  The conversion is
   1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1318 | Arithmetic, except that the conversion is always rounded toward zero.
   1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1320 | the conversion overflows, the largest integer with the same sign as `a' is
   1321 | returned.
   1322 *----------------------------------------------------------------------------*/
   1323 
   1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
   1325 {
   1326     flag aSign;
   1327     int16 aExp, shiftCount;
   1328     bits32 aSig;
   1329     int32 z;
   1330 
   1331     aSig = extractFloat32Frac( a );
   1332     aExp = extractFloat32Exp( a );
   1333     aSign = extractFloat32Sign( a );
   1334     shiftCount = aExp - 0x9E;
   1335     if ( 0 <= shiftCount ) {
   1336         if ( float32_val(a) != 0xCF000000 ) {
   1337             float_raise( float_flag_invalid STATUS_VAR);
   1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1339         }
   1340         return (sbits32) 0x80000000;
   1341     }
   1342     else if ( aExp <= 0x7E ) {
   1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1344         return 0;
   1345     }
   1346     aSig = ( aSig | 0x00800000 )<<8;
   1347     z = aSig>>( - shiftCount );
   1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1349         STATUS(float_exception_flags) |= float_flag_inexact;
   1350     }
   1351     if ( aSign ) z = - z;
   1352     return z;
   1353 
   1354 }
   1355 
   1356 /*----------------------------------------------------------------------------
   1357 | Returns the result of converting the single-precision floating-point value
   1358 | `a' to the 64-bit two's complement integer format.  The conversion is
   1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1360 | Arithmetic---which means in particular that the conversion is rounded
   1361 | according to the current rounding mode.  If `a' is a NaN, the largest
   1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1363 | largest integer with the same sign as `a' is returned.
   1364 *----------------------------------------------------------------------------*/
   1365 
   1366 int64 float32_to_int64( float32 a STATUS_PARAM )
   1367 {
   1368     flag aSign;
   1369     int16 aExp, shiftCount;
   1370     bits32 aSig;
   1371     bits64 aSig64, aSigExtra;
   1372 
   1373     aSig = extractFloat32Frac( a );
   1374     aExp = extractFloat32Exp( a );
   1375     aSign = extractFloat32Sign( a );
   1376     shiftCount = 0xBE - aExp;
   1377     if ( shiftCount < 0 ) {
   1378         float_raise( float_flag_invalid STATUS_VAR);
   1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1381         }
   1382         return (sbits64) LIT64( 0x8000000000000000 );
   1383     }
   1384     if ( aExp ) aSig |= 0x00800000;
   1385     aSig64 = aSig;
   1386     aSig64 <<= 40;
   1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
   1389 
   1390 }
   1391 
   1392 /*----------------------------------------------------------------------------
   1393 | Returns the result of converting the single-precision floating-point value
   1394 | `a' to the 64-bit two's complement integer format.  The conversion is
   1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
   1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1398 | conversion overflows, the largest integer with the same sign as `a' is
   1399 | returned.
   1400 *----------------------------------------------------------------------------*/
   1401 
   1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
   1403 {
   1404     flag aSign;
   1405     int16 aExp, shiftCount;
   1406     bits32 aSig;
   1407     bits64 aSig64;
   1408     int64 z;
   1409 
   1410     aSig = extractFloat32Frac( a );
   1411     aExp = extractFloat32Exp( a );
   1412     aSign = extractFloat32Sign( a );
   1413     shiftCount = aExp - 0xBE;
   1414     if ( 0 <= shiftCount ) {
   1415         if ( float32_val(a) != 0xDF000000 ) {
   1416             float_raise( float_flag_invalid STATUS_VAR);
   1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1419             }
   1420         }
   1421         return (sbits64) LIT64( 0x8000000000000000 );
   1422     }
   1423     else if ( aExp <= 0x7E ) {
   1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1425         return 0;
   1426     }
   1427     aSig64 = aSig | 0x00800000;
   1428     aSig64 <<= 40;
   1429     z = aSig64>>( - shiftCount );
   1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1431         STATUS(float_exception_flags) |= float_flag_inexact;
   1432     }
   1433     if ( aSign ) z = - z;
   1434     return z;
   1435 
   1436 }
   1437 
   1438 /*----------------------------------------------------------------------------
   1439 | Returns the result of converting the single-precision floating-point value
   1440 | `a' to the double-precision floating-point format.  The conversion is
   1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1442 | Arithmetic.
   1443 *----------------------------------------------------------------------------*/
   1444 
   1445 float64 float32_to_float64( float32 a STATUS_PARAM )
   1446 {
   1447     flag aSign;
   1448     int16 aExp;
   1449     bits32 aSig;
   1450 
   1451     aSig = extractFloat32Frac( a );
   1452     aExp = extractFloat32Exp( a );
   1453     aSign = extractFloat32Sign( a );
   1454     if ( aExp == 0xFF ) {
   1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
   1456         return packFloat64( aSign, 0x7FF, 0 );
   1457     }
   1458     if ( aExp == 0 ) {
   1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1461         --aExp;
   1462     }
   1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1464 
   1465 }
   1466 
   1467 #ifdef FLOATX80
   1468 
   1469 /*----------------------------------------------------------------------------
   1470 | Returns the result of converting the single-precision floating-point value
   1471 | `a' to the extended double-precision floating-point format.  The conversion
   1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1473 | Arithmetic.
   1474 *----------------------------------------------------------------------------*/
   1475 
   1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
   1477 {
   1478     flag aSign;
   1479     int16 aExp;
   1480     bits32 aSig;
   1481 
   1482     aSig = extractFloat32Frac( a );
   1483     aExp = extractFloat32Exp( a );
   1484     aSign = extractFloat32Sign( a );
   1485     if ( aExp == 0xFF ) {
   1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
   1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1488     }
   1489     if ( aExp == 0 ) {
   1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1492     }
   1493     aSig |= 0x00800000;
   1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1495 
   1496 }
   1497 
   1498 #endif
   1499 
   1500 #ifdef FLOAT128
   1501 
   1502 /*----------------------------------------------------------------------------
   1503 | Returns the result of converting the single-precision floating-point value
   1504 | `a' to the double-precision floating-point format.  The conversion is
   1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1506 | Arithmetic.
   1507 *----------------------------------------------------------------------------*/
   1508 
   1509 float128 float32_to_float128( float32 a STATUS_PARAM )
   1510 {
   1511     flag aSign;
   1512     int16 aExp;
   1513     bits32 aSig;
   1514 
   1515     aSig = extractFloat32Frac( a );
   1516     aExp = extractFloat32Exp( a );
   1517     aSign = extractFloat32Sign( a );
   1518     if ( aExp == 0xFF ) {
   1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
   1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1521     }
   1522     if ( aExp == 0 ) {
   1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1525         --aExp;
   1526     }
   1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1528 
   1529 }
   1530 
   1531 #endif
   1532 
   1533 /*----------------------------------------------------------------------------
   1534 | Rounds the single-precision floating-point value `a' to an integer, and
   1535 | returns the result as a single-precision floating-point value.  The
   1536 | operation is performed according to the IEC/IEEE Standard for Binary
   1537 | Floating-Point Arithmetic.
   1538 *----------------------------------------------------------------------------*/
   1539 
   1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
   1541 {
   1542     flag aSign;
   1543     int16 aExp;
   1544     bits32 lastBitMask, roundBitsMask;
   1545     int8 roundingMode;
   1546     bits32 z;
   1547 
   1548     aExp = extractFloat32Exp( a );
   1549     if ( 0x96 <= aExp ) {
   1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1551             return propagateFloat32NaN( a, a STATUS_VAR );
   1552         }
   1553         return a;
   1554     }
   1555     if ( aExp <= 0x7E ) {
   1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
   1557         STATUS(float_exception_flags) |= float_flag_inexact;
   1558         aSign = extractFloat32Sign( a );
   1559         switch ( STATUS(float_rounding_mode) ) {
   1560          case float_round_nearest_even:
   1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1562                 return packFloat32( aSign, 0x7F, 0 );
   1563             }
   1564             break;
   1565          case float_round_down:
   1566             return make_float32(aSign ? 0xBF800000 : 0);
   1567          case float_round_up:
   1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
   1569         }
   1570         return packFloat32( aSign, 0, 0 );
   1571     }
   1572     lastBitMask = 1;
   1573     lastBitMask <<= 0x96 - aExp;
   1574     roundBitsMask = lastBitMask - 1;
   1575     z = float32_val(a);
   1576     roundingMode = STATUS(float_rounding_mode);
   1577     if ( roundingMode == float_round_nearest_even ) {
   1578         z += lastBitMask>>1;
   1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1580     }
   1581     else if ( roundingMode != float_round_to_zero ) {
   1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
   1583             z += roundBitsMask;
   1584         }
   1585     }
   1586     z &= ~ roundBitsMask;
   1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
   1588     return make_float32(z);
   1589 
   1590 }
   1591 
   1592 /*----------------------------------------------------------------------------
   1593 | Returns the result of adding the absolute values of the single-precision
   1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1595 | before being returned.  `zSign' is ignored if the result is a NaN.
   1596 | The addition is performed according to the IEC/IEEE Standard for Binary
   1597 | Floating-Point Arithmetic.
   1598 *----------------------------------------------------------------------------*/
   1599 
   1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1601 {
   1602     int16 aExp, bExp, zExp;
   1603     bits32 aSig, bSig, zSig;
   1604     int16 expDiff;
   1605 
   1606     aSig = extractFloat32Frac( a );
   1607     aExp = extractFloat32Exp( a );
   1608     bSig = extractFloat32Frac( b );
   1609     bExp = extractFloat32Exp( b );
   1610     expDiff = aExp - bExp;
   1611     aSig <<= 6;
   1612     bSig <<= 6;
   1613     if ( 0 < expDiff ) {
   1614         if ( aExp == 0xFF ) {
   1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1616             return a;
   1617         }
   1618         if ( bExp == 0 ) {
   1619             --expDiff;
   1620         }
   1621         else {
   1622             bSig |= 0x20000000;
   1623         }
   1624         shift32RightJamming( bSig, expDiff, &bSig );
   1625         zExp = aExp;
   1626     }
   1627     else if ( expDiff < 0 ) {
   1628         if ( bExp == 0xFF ) {
   1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1630             return packFloat32( zSign, 0xFF, 0 );
   1631         }
   1632         if ( aExp == 0 ) {
   1633             ++expDiff;
   1634         }
   1635         else {
   1636             aSig |= 0x20000000;
   1637         }
   1638         shift32RightJamming( aSig, - expDiff, &aSig );
   1639         zExp = bExp;
   1640     }
   1641     else {
   1642         if ( aExp == 0xFF ) {
   1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1644             return a;
   1645         }
   1646         if ( aExp == 0 ) {
   1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
   1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1649         }
   1650         zSig = 0x40000000 + aSig + bSig;
   1651         zExp = aExp;
   1652         goto roundAndPack;
   1653     }
   1654     aSig |= 0x20000000;
   1655     zSig = ( aSig + bSig )<<1;
   1656     --zExp;
   1657     if ( (sbits32) zSig < 0 ) {
   1658         zSig = aSig + bSig;
   1659         ++zExp;
   1660     }
   1661  roundAndPack:
   1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1663 
   1664 }
   1665 
   1666 /*----------------------------------------------------------------------------
   1667 | Returns the result of subtracting the absolute values of the single-
   1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1669 | difference is negated before being returned.  `zSign' is ignored if the
   1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1671 | Standard for Binary Floating-Point Arithmetic.
   1672 *----------------------------------------------------------------------------*/
   1673 
   1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1675 {
   1676     int16 aExp, bExp, zExp;
   1677     bits32 aSig, bSig, zSig;
   1678     int16 expDiff;
   1679 
   1680     aSig = extractFloat32Frac( a );
   1681     aExp = extractFloat32Exp( a );
   1682     bSig = extractFloat32Frac( b );
   1683     bExp = extractFloat32Exp( b );
   1684     expDiff = aExp - bExp;
   1685     aSig <<= 7;
   1686     bSig <<= 7;
   1687     if ( 0 < expDiff ) goto aExpBigger;
   1688     if ( expDiff < 0 ) goto bExpBigger;
   1689     if ( aExp == 0xFF ) {
   1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1691         float_raise( float_flag_invalid STATUS_VAR);
   1692         return float32_default_nan;
   1693     }
   1694     if ( aExp == 0 ) {
   1695         aExp = 1;
   1696         bExp = 1;
   1697     }
   1698     if ( bSig < aSig ) goto aBigger;
   1699     if ( aSig < bSig ) goto bBigger;
   1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   1701  bExpBigger:
   1702     if ( bExp == 0xFF ) {
   1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1705     }
   1706     if ( aExp == 0 ) {
   1707         ++expDiff;
   1708     }
   1709     else {
   1710         aSig |= 0x40000000;
   1711     }
   1712     shift32RightJamming( aSig, - expDiff, &aSig );
   1713     bSig |= 0x40000000;
   1714  bBigger:
   1715     zSig = bSig - aSig;
   1716     zExp = bExp;
   1717     zSign ^= 1;
   1718     goto normalizeRoundAndPack;
   1719  aExpBigger:
   1720     if ( aExp == 0xFF ) {
   1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1722         return a;
   1723     }
   1724     if ( bExp == 0 ) {
   1725         --expDiff;
   1726     }
   1727     else {
   1728         bSig |= 0x40000000;
   1729     }
   1730     shift32RightJamming( bSig, expDiff, &bSig );
   1731     aSig |= 0x40000000;
   1732  aBigger:
   1733     zSig = aSig - bSig;
   1734     zExp = aExp;
   1735  normalizeRoundAndPack:
   1736     --zExp;
   1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1738 
   1739 }
   1740 
   1741 /*----------------------------------------------------------------------------
   1742 | Returns the result of adding the single-precision floating-point values `a'
   1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1744 | Binary Floating-Point Arithmetic.
   1745 *----------------------------------------------------------------------------*/
   1746 
   1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
   1748 {
   1749     flag aSign, bSign;
   1750 
   1751     aSign = extractFloat32Sign( a );
   1752     bSign = extractFloat32Sign( b );
   1753     if ( aSign == bSign ) {
   1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
   1755     }
   1756     else {
   1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1758     }
   1759 
   1760 }
   1761 
   1762 /*----------------------------------------------------------------------------
   1763 | Returns the result of subtracting the single-precision floating-point values
   1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1765 | for Binary Floating-Point Arithmetic.
   1766 *----------------------------------------------------------------------------*/
   1767 
   1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
   1769 {
   1770     flag aSign, bSign;
   1771 
   1772     aSign = extractFloat32Sign( a );
   1773     bSign = extractFloat32Sign( b );
   1774     if ( aSign == bSign ) {
   1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1776     }
   1777     else {
   1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
   1779     }
   1780 
   1781 }
   1782 
   1783 /*----------------------------------------------------------------------------
   1784 | Returns the result of multiplying the single-precision floating-point values
   1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1786 | for Binary Floating-Point Arithmetic.
   1787 *----------------------------------------------------------------------------*/
   1788 
   1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
   1790 {
   1791     flag aSign, bSign, zSign;
   1792     int16 aExp, bExp, zExp;
   1793     bits32 aSig, bSig;
   1794     bits64 zSig64;
   1795     bits32 zSig;
   1796 
   1797     aSig = extractFloat32Frac( a );
   1798     aExp = extractFloat32Exp( a );
   1799     aSign = extractFloat32Sign( a );
   1800     bSig = extractFloat32Frac( b );
   1801     bExp = extractFloat32Exp( b );
   1802     bSign = extractFloat32Sign( b );
   1803     zSign = aSign ^ bSign;
   1804     if ( aExp == 0xFF ) {
   1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1806             return propagateFloat32NaN( a, b STATUS_VAR );
   1807         }
   1808         if ( ( bExp | bSig ) == 0 ) {
   1809             float_raise( float_flag_invalid STATUS_VAR);
   1810             return float32_default_nan;
   1811         }
   1812         return packFloat32( zSign, 0xFF, 0 );
   1813     }
   1814     if ( bExp == 0xFF ) {
   1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1816         if ( ( aExp | aSig ) == 0 ) {
   1817             float_raise( float_flag_invalid STATUS_VAR);
   1818             return float32_default_nan;
   1819         }
   1820         return packFloat32( zSign, 0xFF, 0 );
   1821     }
   1822     if ( aExp == 0 ) {
   1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1825     }
   1826     if ( bExp == 0 ) {
   1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1829     }
   1830     zExp = aExp + bExp - 0x7F;
   1831     aSig = ( aSig | 0x00800000 )<<7;
   1832     bSig = ( bSig | 0x00800000 )<<8;
   1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1834     zSig = zSig64;
   1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1836         zSig <<= 1;
   1837         --zExp;
   1838     }
   1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1840 
   1841 }
   1842 
   1843 /*----------------------------------------------------------------------------
   1844 | Returns the result of dividing the single-precision floating-point value `a'
   1845 | by the corresponding value `b'.  The operation is performed according to the
   1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1847 *----------------------------------------------------------------------------*/
   1848 
   1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
   1850 {
   1851     flag aSign, bSign, zSign;
   1852     int16 aExp, bExp, zExp;
   1853     bits32 aSig, bSig, zSig;
   1854 
   1855     aSig = extractFloat32Frac( a );
   1856     aExp = extractFloat32Exp( a );
   1857     aSign = extractFloat32Sign( a );
   1858     bSig = extractFloat32Frac( b );
   1859     bExp = extractFloat32Exp( b );
   1860     bSign = extractFloat32Sign( b );
   1861     zSign = aSign ^ bSign;
   1862     if ( aExp == 0xFF ) {
   1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1864         if ( bExp == 0xFF ) {
   1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1866             float_raise( float_flag_invalid STATUS_VAR);
   1867             return float32_default_nan;
   1868         }
   1869         return packFloat32( zSign, 0xFF, 0 );
   1870     }
   1871     if ( bExp == 0xFF ) {
   1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1873         return packFloat32( zSign, 0, 0 );
   1874     }
   1875     if ( bExp == 0 ) {
   1876         if ( bSig == 0 ) {
   1877             if ( ( aExp | aSig ) == 0 ) {
   1878                 float_raise( float_flag_invalid STATUS_VAR);
   1879                 return float32_default_nan;
   1880             }
   1881             float_raise( float_flag_divbyzero STATUS_VAR);
   1882             return packFloat32( zSign, 0xFF, 0 );
   1883         }
   1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1885     }
   1886     if ( aExp == 0 ) {
   1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1889     }
   1890     zExp = aExp - bExp + 0x7D;
   1891     aSig = ( aSig | 0x00800000 )<<7;
   1892     bSig = ( bSig | 0x00800000 )<<8;
   1893     if ( bSig <= ( aSig + aSig ) ) {
   1894         aSig >>= 1;
   1895         ++zExp;
   1896     }
   1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
   1898     if ( ( zSig & 0x3F ) == 0 ) {
   1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   1900     }
   1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1902 
   1903 }
   1904 
   1905 /*----------------------------------------------------------------------------
   1906 | Returns the remainder of the single-precision floating-point value `a'
   1907 | with respect to the corresponding value `b'.  The operation is performed
   1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1909 *----------------------------------------------------------------------------*/
   1910 
   1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
   1912 {
   1913     flag aSign, bSign, zSign;
   1914     int16 aExp, bExp, expDiff;
   1915     bits32 aSig, bSig;
   1916     bits32 q;
   1917     bits64 aSig64, bSig64, q64;
   1918     bits32 alternateASig;
   1919     sbits32 sigMean;
   1920 
   1921     aSig = extractFloat32Frac( a );
   1922     aExp = extractFloat32Exp( a );
   1923     aSign = extractFloat32Sign( a );
   1924     bSig = extractFloat32Frac( b );
   1925     bExp = extractFloat32Exp( b );
   1926     bSign = extractFloat32Sign( b );
   1927     if ( aExp == 0xFF ) {
   1928         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1929             return propagateFloat32NaN( a, b STATUS_VAR );
   1930         }
   1931         float_raise( float_flag_invalid STATUS_VAR);
   1932         return float32_default_nan;
   1933     }
   1934     if ( bExp == 0xFF ) {
   1935         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1936         return a;
   1937     }
   1938     if ( bExp == 0 ) {
   1939         if ( bSig == 0 ) {
   1940             float_raise( float_flag_invalid STATUS_VAR);
   1941             return float32_default_nan;
   1942         }
   1943         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1944     }
   1945     if ( aExp == 0 ) {
   1946         if ( aSig == 0 ) return a;
   1947         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1948     }
   1949     expDiff = aExp - bExp;
   1950     aSig |= 0x00800000;
   1951     bSig |= 0x00800000;
   1952     if ( expDiff < 32 ) {
   1953         aSig <<= 8;
   1954         bSig <<= 8;
   1955         if ( expDiff < 0 ) {
   1956             if ( expDiff < -1 ) return a;
   1957             aSig >>= 1;
   1958         }
   1959         q = ( bSig <= aSig );
   1960         if ( q ) aSig -= bSig;
   1961         if ( 0 < expDiff ) {
   1962             q = ( ( (bits64) aSig )<<32 ) / bSig;
   1963             q >>= 32 - expDiff;
   1964             bSig >>= 2;
   1965             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   1966         }
   1967         else {
   1968             aSig >>= 2;
   1969             bSig >>= 2;
   1970         }
   1971     }
   1972     else {
   1973         if ( bSig <= aSig ) aSig -= bSig;
   1974         aSig64 = ( (bits64) aSig )<<40;
   1975         bSig64 = ( (bits64) bSig )<<40;
   1976         expDiff -= 64;
   1977         while ( 0 < expDiff ) {
   1978             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   1979             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   1980             aSig64 = - ( ( bSig * q64 )<<38 );
   1981             expDiff -= 62;
   1982         }
   1983         expDiff += 64;
   1984         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   1985         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   1986         q = q64>>( 64 - expDiff );
   1987         bSig <<= 6;
   1988         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   1989     }
   1990     do {
   1991         alternateASig = aSig;
   1992         ++q;
   1993         aSig -= bSig;
   1994     } while ( 0 <= (sbits32) aSig );
   1995     sigMean = aSig + alternateASig;
   1996     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   1997         aSig = alternateASig;
   1998     }
   1999     zSign = ( (sbits32) aSig < 0 );
   2000     if ( zSign ) aSig = - aSig;
   2001     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
   2002 
   2003 }
   2004 
   2005 /*----------------------------------------------------------------------------
   2006 | Returns the square root of the single-precision floating-point value `a'.
   2007 | The operation is performed according to the IEC/IEEE Standard for Binary
   2008 | Floating-Point Arithmetic.
   2009 *----------------------------------------------------------------------------*/
   2010 
   2011 float32 float32_sqrt( float32 a STATUS_PARAM )
   2012 {
   2013     flag aSign;
   2014     int16 aExp, zExp;
   2015     bits32 aSig, zSig;
   2016     bits64 rem, term;
   2017 
   2018     aSig = extractFloat32Frac( a );
   2019     aExp = extractFloat32Exp( a );
   2020     aSign = extractFloat32Sign( a );
   2021     if ( aExp == 0xFF ) {
   2022         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2023         if ( ! aSign ) return a;
   2024         float_raise( float_flag_invalid STATUS_VAR);
   2025         return float32_default_nan;
   2026     }
   2027     if ( aSign ) {
   2028         if ( ( aExp | aSig ) == 0 ) return a;
   2029         float_raise( float_flag_invalid STATUS_VAR);
   2030         return float32_default_nan;
   2031     }
   2032     if ( aExp == 0 ) {
   2033         if ( aSig == 0 ) return float32_zero;
   2034         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2035     }
   2036     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2037     aSig = ( aSig | 0x00800000 )<<8;
   2038     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2039     if ( ( zSig & 0x7F ) <= 5 ) {
   2040         if ( zSig < 2 ) {
   2041             zSig = 0x7FFFFFFF;
   2042             goto roundAndPack;
   2043         }
   2044         aSig >>= aExp & 1;
   2045         term = ( (bits64) zSig ) * zSig;
   2046         rem = ( ( (bits64) aSig )<<32 ) - term;
   2047         while ( (sbits64) rem < 0 ) {
   2048             --zSig;
   2049             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2050         }
   2051         zSig |= ( rem != 0 );
   2052     }
   2053     shift32RightJamming( zSig, 1, &zSig );
   2054  roundAndPack:
   2055     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
   2056 
   2057 }
   2058 
   2059 /*----------------------------------------------------------------------------
   2060 | Returns the binary log of the single-precision floating-point value `a'.
   2061 | The operation is performed according to the IEC/IEEE Standard for Binary
   2062 | Floating-Point Arithmetic.
   2063 *----------------------------------------------------------------------------*/
   2064 float32 float32_log2( float32 a STATUS_PARAM )
   2065 {
   2066     flag aSign, zSign;
   2067     int16 aExp;
   2068     bits32 aSig, zSig, i;
   2069 
   2070     aSig = extractFloat32Frac( a );
   2071     aExp = extractFloat32Exp( a );
   2072     aSign = extractFloat32Sign( a );
   2073 
   2074     if ( aExp == 0 ) {
   2075         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
   2076         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2077     }
   2078     if ( aSign ) {
   2079         float_raise( float_flag_invalid STATUS_VAR);
   2080         return float32_default_nan;
   2081     }
   2082     if ( aExp == 0xFF ) {
   2083         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2084         return a;
   2085     }
   2086 
   2087     aExp -= 0x7F;
   2088     aSig |= 0x00800000;
   2089     zSign = aExp < 0;
   2090     zSig = aExp << 23;
   2091 
   2092     for (i = 1 << 22; i > 0; i >>= 1) {
   2093         aSig = ( (bits64)aSig * aSig ) >> 23;
   2094         if ( aSig & 0x01000000 ) {
   2095             aSig >>= 1;
   2096             zSig |= i;
   2097         }
   2098     }
   2099 
   2100     if ( zSign )
   2101         zSig = -zSig;
   2102 
   2103     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
   2104 }
   2105 
   2106 /*----------------------------------------------------------------------------
   2107 | Returns 1 if the single-precision floating-point value `a' is equal to
   2108 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2110 *----------------------------------------------------------------------------*/
   2111 
   2112 int float32_eq( float32 a, float32 b STATUS_PARAM )
   2113 {
   2114 
   2115     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2116          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2117        ) {
   2118         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2119             float_raise( float_flag_invalid STATUS_VAR);
   2120         }
   2121         return 0;
   2122     }
   2123     return ( float32_val(a) == float32_val(b) ) ||
   2124             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
   2125 
   2126 }
   2127 
   2128 /*----------------------------------------------------------------------------
   2129 | Returns 1 if the single-precision floating-point value `a' is less than
   2130 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2131 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2132 | Arithmetic.
   2133 *----------------------------------------------------------------------------*/
   2134 
   2135 int float32_le( float32 a, float32 b STATUS_PARAM )
   2136 {
   2137     flag aSign, bSign;
   2138     bits32 av, bv;
   2139 
   2140     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2141          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2142        ) {
   2143         float_raise( float_flag_invalid STATUS_VAR);
   2144         return 0;
   2145     }
   2146     aSign = extractFloat32Sign( a );
   2147     bSign = extractFloat32Sign( b );
   2148     av = float32_val(a);
   2149     bv = float32_val(b);
   2150     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
   2151     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2152 
   2153 }
   2154 
   2155 /*----------------------------------------------------------------------------
   2156 | Returns 1 if the single-precision floating-point value `a' is less than
   2157 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2158 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2159 *----------------------------------------------------------------------------*/
   2160 
   2161 int float32_lt( float32 a, float32 b STATUS_PARAM )
   2162 {
   2163     flag aSign, bSign;
   2164     bits32 av, bv;
   2165 
   2166     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2167          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2168        ) {
   2169         float_raise( float_flag_invalid STATUS_VAR);
   2170         return 0;
   2171     }
   2172     aSign = extractFloat32Sign( a );
   2173     bSign = extractFloat32Sign( b );
   2174     av = float32_val(a);
   2175     bv = float32_val(b);
   2176     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
   2177     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2178 
   2179 }
   2180 
   2181 /*----------------------------------------------------------------------------
   2182 | Returns 1 if the single-precision floating-point value `a' is equal to
   2183 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2184 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   2185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2186 *----------------------------------------------------------------------------*/
   2187 
   2188 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
   2189 {
   2190     bits32 av, bv;
   2191 
   2192     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2193          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2194        ) {
   2195         float_raise( float_flag_invalid STATUS_VAR);
   2196         return 0;
   2197     }
   2198     av = float32_val(a);
   2199     bv = float32_val(b);
   2200     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
   2201 
   2202 }
   2203 
   2204 /*----------------------------------------------------------------------------
   2205 | Returns 1 if the single-precision floating-point value `a' is less than or
   2206 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2207 | cause an exception.  Otherwise, the comparison is performed according to the
   2208 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2209 *----------------------------------------------------------------------------*/
   2210 
   2211 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
   2212 {
   2213     flag aSign, bSign;
   2214     bits32 av, bv;
   2215 
   2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2218        ) {
   2219         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2220             float_raise( float_flag_invalid STATUS_VAR);
   2221         }
   2222         return 0;
   2223     }
   2224     aSign = extractFloat32Sign( a );
   2225     bSign = extractFloat32Sign( b );
   2226     av = float32_val(a);
   2227     bv = float32_val(b);
   2228     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
   2229     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2230 
   2231 }
   2232 
   2233 /*----------------------------------------------------------------------------
   2234 | Returns 1 if the single-precision floating-point value `a' is less than
   2235 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2236 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2237 | Standard for Binary Floating-Point Arithmetic.
   2238 *----------------------------------------------------------------------------*/
   2239 
   2240 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
   2241 {
   2242     flag aSign, bSign;
   2243     bits32 av, bv;
   2244 
   2245     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2246          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2247        ) {
   2248         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2249             float_raise( float_flag_invalid STATUS_VAR);
   2250         }
   2251         return 0;
   2252     }
   2253     aSign = extractFloat32Sign( a );
   2254     bSign = extractFloat32Sign( b );
   2255     av = float32_val(a);
   2256     bv = float32_val(b);
   2257     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
   2258     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2259 
   2260 }
   2261 
   2262 /*----------------------------------------------------------------------------
   2263 | Returns the result of converting the double-precision floating-point value
   2264 | `a' to the 32-bit two's complement integer format.  The conversion is
   2265 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2266 | Arithmetic---which means in particular that the conversion is rounded
   2267 | according to the current rounding mode.  If `a' is a NaN, the largest
   2268 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2269 | largest integer with the same sign as `a' is returned.
   2270 *----------------------------------------------------------------------------*/
   2271 
   2272 int32 float64_to_int32( float64 a STATUS_PARAM )
   2273 {
   2274     flag aSign;
   2275     int16 aExp, shiftCount;
   2276     bits64 aSig;
   2277 
   2278     aSig = extractFloat64Frac( a );
   2279     aExp = extractFloat64Exp( a );
   2280     aSign = extractFloat64Sign( a );
   2281     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2282     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2283     shiftCount = 0x42C - aExp;
   2284     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2285     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   2286 
   2287 }
   2288 
   2289 /*----------------------------------------------------------------------------
   2290 | Returns the result of converting the double-precision floating-point value
   2291 | `a' to the 32-bit two's complement integer format.  The conversion is
   2292 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2293 | Arithmetic, except that the conversion is always rounded toward zero.
   2294 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2295 | the conversion overflows, the largest integer with the same sign as `a' is
   2296 | returned.
   2297 *----------------------------------------------------------------------------*/
   2298 
   2299 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
   2300 {
   2301     flag aSign;
   2302     int16 aExp, shiftCount;
   2303     bits64 aSig, savedASig;
   2304     int32 z;
   2305 
   2306     aSig = extractFloat64Frac( a );
   2307     aExp = extractFloat64Exp( a );
   2308     aSign = extractFloat64Sign( a );
   2309     if ( 0x41E < aExp ) {
   2310         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2311         goto invalid;
   2312     }
   2313     else if ( aExp < 0x3FF ) {
   2314         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2315         return 0;
   2316     }
   2317     aSig |= LIT64( 0x0010000000000000 );
   2318     shiftCount = 0x433 - aExp;
   2319     savedASig = aSig;
   2320     aSig >>= shiftCount;
   2321     z = aSig;
   2322     if ( aSign ) z = - z;
   2323     if ( ( z < 0 ) ^ aSign ) {
   2324  invalid:
   2325         float_raise( float_flag_invalid STATUS_VAR);
   2326         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2327     }
   2328     if ( ( aSig<<shiftCount ) != savedASig ) {
   2329         STATUS(float_exception_flags) |= float_flag_inexact;
   2330     }
   2331     return z;
   2332 
   2333 }
   2334 
   2335 /*----------------------------------------------------------------------------
   2336 | Returns the result of converting the double-precision floating-point value
   2337 | `a' to the 64-bit two's complement integer format.  The conversion is
   2338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2339 | Arithmetic---which means in particular that the conversion is rounded
   2340 | according to the current rounding mode.  If `a' is a NaN, the largest
   2341 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2342 | largest integer with the same sign as `a' is returned.
   2343 *----------------------------------------------------------------------------*/
   2344 
   2345 int64 float64_to_int64( float64 a STATUS_PARAM )
   2346 {
   2347     flag aSign;
   2348     int16 aExp, shiftCount;
   2349     bits64 aSig, aSigExtra;
   2350 
   2351     aSig = extractFloat64Frac( a );
   2352     aExp = extractFloat64Exp( a );
   2353     aSign = extractFloat64Sign( a );
   2354     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2355     shiftCount = 0x433 - aExp;
   2356     if ( shiftCount <= 0 ) {
   2357         if ( 0x43E < aExp ) {
   2358             float_raise( float_flag_invalid STATUS_VAR);
   2359             if (    ! aSign
   2360                  || (    ( aExp == 0x7FF )
   2361                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2362                ) {
   2363                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2364             }
   2365             return (sbits64) LIT64( 0x8000000000000000 );
   2366         }
   2367         aSigExtra = 0;
   2368         aSig <<= - shiftCount;
   2369     }
   2370     else {
   2371         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2372     }
   2373     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   2374 
   2375 }
   2376 
   2377 /*----------------------------------------------------------------------------
   2378 | Returns the result of converting the double-precision floating-point value
   2379 | `a' to the 64-bit two's complement integer format.  The conversion is
   2380 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2381 | Arithmetic, except that the conversion is always rounded toward zero.
   2382 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2383 | the conversion overflows, the largest integer with the same sign as `a' is
   2384 | returned.
   2385 *----------------------------------------------------------------------------*/
   2386 
   2387 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
   2388 {
   2389     flag aSign;
   2390     int16 aExp, shiftCount;
   2391     bits64 aSig;
   2392     int64 z;
   2393 
   2394     aSig = extractFloat64Frac( a );
   2395     aExp = extractFloat64Exp( a );
   2396     aSign = extractFloat64Sign( a );
   2397     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2398     shiftCount = aExp - 0x433;
   2399     if ( 0 <= shiftCount ) {
   2400         if ( 0x43E <= aExp ) {
   2401             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
   2402                 float_raise( float_flag_invalid STATUS_VAR);
   2403                 if (    ! aSign
   2404                      || (    ( aExp == 0x7FF )
   2405                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2406                    ) {
   2407                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2408                 }
   2409             }
   2410             return (sbits64) LIT64( 0x8000000000000000 );
   2411         }
   2412         z = aSig<<shiftCount;
   2413     }
   2414     else {
   2415         if ( aExp < 0x3FE ) {
   2416             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2417             return 0;
   2418         }
   2419         z = aSig>>( - shiftCount );
   2420         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2421             STATUS(float_exception_flags) |= float_flag_inexact;
   2422         }
   2423     }
   2424     if ( aSign ) z = - z;
   2425     return z;
   2426 
   2427 }
   2428 
   2429 /*----------------------------------------------------------------------------
   2430 | Returns the result of converting the double-precision floating-point value
   2431 | `a' to the single-precision floating-point format.  The conversion is
   2432 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2433 | Arithmetic.
   2434 *----------------------------------------------------------------------------*/
   2435 
   2436 float32 float64_to_float32( float64 a STATUS_PARAM )
   2437 {
   2438     flag aSign;
   2439     int16 aExp;
   2440     bits64 aSig;
   2441     bits32 zSig;
   2442 
   2443     aSig = extractFloat64Frac( a );
   2444     aExp = extractFloat64Exp( a );
   2445     aSign = extractFloat64Sign( a );
   2446     if ( aExp == 0x7FF ) {
   2447         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
   2448         return packFloat32( aSign, 0xFF, 0 );
   2449     }
   2450     shift64RightJamming( aSig, 22, &aSig );
   2451     zSig = aSig;
   2452     if ( aExp || zSig ) {
   2453         zSig |= 0x40000000;
   2454         aExp -= 0x381;
   2455     }
   2456     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   2457 
   2458 }
   2459 
   2460 #ifdef FLOATX80
   2461 
   2462 /*----------------------------------------------------------------------------
   2463 | Returns the result of converting the double-precision floating-point value
   2464 | `a' to the extended double-precision floating-point format.  The conversion
   2465 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2466 | Arithmetic.
   2467 *----------------------------------------------------------------------------*/
   2468 
   2469 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
   2470 {
   2471     flag aSign;
   2472     int16 aExp;
   2473     bits64 aSig;
   2474 
   2475     aSig = extractFloat64Frac( a );
   2476     aExp = extractFloat64Exp( a );
   2477     aSign = extractFloat64Sign( a );
   2478     if ( aExp == 0x7FF ) {
   2479         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
   2480         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2481     }
   2482     if ( aExp == 0 ) {
   2483         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2484         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2485     }
   2486     return
   2487         packFloatx80(
   2488             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2489 
   2490 }
   2491 
   2492 #endif
   2493 
   2494 #ifdef FLOAT128
   2495 
   2496 /*----------------------------------------------------------------------------
   2497 | Returns the result of converting the double-precision floating-point value
   2498 | `a' to the quadruple-precision floating-point format.  The conversion is
   2499 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2500 | Arithmetic.
   2501 *----------------------------------------------------------------------------*/
   2502 
   2503 float128 float64_to_float128( float64 a STATUS_PARAM )
   2504 {
   2505     flag aSign;
   2506     int16 aExp;
   2507     bits64 aSig, zSig0, zSig1;
   2508 
   2509     aSig = extractFloat64Frac( a );
   2510     aExp = extractFloat64Exp( a );
   2511     aSign = extractFloat64Sign( a );
   2512     if ( aExp == 0x7FF ) {
   2513         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
   2514         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2515     }
   2516     if ( aExp == 0 ) {
   2517         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2518         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2519         --aExp;
   2520     }
   2521     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2522     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2523 
   2524 }
   2525 
   2526 #endif
   2527 
   2528 /*----------------------------------------------------------------------------
   2529 | Rounds the double-precision floating-point value `a' to an integer, and
   2530 | returns the result as a double-precision floating-point value.  The
   2531 | operation is performed according to the IEC/IEEE Standard for Binary
   2532 | Floating-Point Arithmetic.
   2533 *----------------------------------------------------------------------------*/
   2534 
   2535 float64 float64_round_to_int( float64 a STATUS_PARAM )
   2536 {
   2537     flag aSign;
   2538     int16 aExp;
   2539     bits64 lastBitMask, roundBitsMask;
   2540     int8 roundingMode;
   2541     bits64 z;
   2542 
   2543     aExp = extractFloat64Exp( a );
   2544     if ( 0x433 <= aExp ) {
   2545         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2546             return propagateFloat64NaN( a, a STATUS_VAR );
   2547         }
   2548         return a;
   2549     }
   2550     if ( aExp < 0x3FF ) {
   2551         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
   2552         STATUS(float_exception_flags) |= float_flag_inexact;
   2553         aSign = extractFloat64Sign( a );
   2554         switch ( STATUS(float_rounding_mode) ) {
   2555          case float_round_nearest_even:
   2556             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2557                 return packFloat64( aSign, 0x3FF, 0 );
   2558             }
   2559             break;
   2560          case float_round_down:
   2561             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
   2562          case float_round_up:
   2563             return make_float64(
   2564             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
   2565         }
   2566         return packFloat64( aSign, 0, 0 );
   2567     }
   2568     lastBitMask = 1;
   2569     lastBitMask <<= 0x433 - aExp;
   2570     roundBitsMask = lastBitMask - 1;
   2571     z = float64_val(a);
   2572     roundingMode = STATUS(float_rounding_mode);
   2573     if ( roundingMode == float_round_nearest_even ) {
   2574         z += lastBitMask>>1;
   2575         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2576     }
   2577     else if ( roundingMode != float_round_to_zero ) {
   2578         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
   2579             z += roundBitsMask;
   2580         }
   2581     }
   2582     z &= ~ roundBitsMask;
   2583     if ( z != float64_val(a) )
   2584         STATUS(float_exception_flags) |= float_flag_inexact;
   2585     return make_float64(z);
   2586 
   2587 }
   2588 
   2589 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
   2590 {
   2591     int oldmode;
   2592     float64 res;
   2593     oldmode = STATUS(float_rounding_mode);
   2594     STATUS(float_rounding_mode) = float_round_to_zero;
   2595     res = float64_round_to_int(a STATUS_VAR);
   2596     STATUS(float_rounding_mode) = oldmode;
   2597     return res;
   2598 }
   2599 
   2600 /*----------------------------------------------------------------------------
   2601 | Returns the result of adding the absolute values of the double-precision
   2602 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2603 | before being returned.  `zSign' is ignored if the result is a NaN.
   2604 | The addition is performed according to the IEC/IEEE Standard for Binary
   2605 | Floating-Point Arithmetic.
   2606 *----------------------------------------------------------------------------*/
   2607 
   2608 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   2609 {
   2610     int16 aExp, bExp, zExp;
   2611     bits64 aSig, bSig, zSig;
   2612     int16 expDiff;
   2613 
   2614     aSig = extractFloat64Frac( a );
   2615     aExp = extractFloat64Exp( a );
   2616     bSig = extractFloat64Frac( b );
   2617     bExp = extractFloat64Exp( b );
   2618     expDiff = aExp - bExp;
   2619     aSig <<= 9;
   2620     bSig <<= 9;
   2621     if ( 0 < expDiff ) {
   2622         if ( aExp == 0x7FF ) {
   2623             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2624             return a;
   2625         }
   2626         if ( bExp == 0 ) {
   2627             --expDiff;
   2628         }
   2629         else {
   2630             bSig |= LIT64( 0x2000000000000000 );
   2631         }
   2632         shift64RightJamming( bSig, expDiff, &bSig );
   2633         zExp = aExp;
   2634     }
   2635     else if ( expDiff < 0 ) {
   2636         if ( bExp == 0x7FF ) {
   2637             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2638             return packFloat64( zSign, 0x7FF, 0 );
   2639         }
   2640         if ( aExp == 0 ) {
   2641             ++expDiff;
   2642         }
   2643         else {
   2644             aSig |= LIT64( 0x2000000000000000 );
   2645         }
   2646         shift64RightJamming( aSig, - expDiff, &aSig );
   2647         zExp = bExp;
   2648     }
   2649     else {
   2650         if ( aExp == 0x7FF ) {
   2651             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2652             return a;
   2653         }
   2654         if ( aExp == 0 ) {
   2655             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
   2656             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2657         }
   2658         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2659         zExp = aExp;
   2660         goto roundAndPack;
   2661     }
   2662     aSig |= LIT64( 0x2000000000000000 );
   2663     zSig = ( aSig + bSig )<<1;
   2664     --zExp;
   2665     if ( (sbits64) zSig < 0 ) {
   2666         zSig = aSig + bSig;
   2667         ++zExp;
   2668     }
   2669  roundAndPack:
   2670     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   2671 
   2672 }
   2673 
   2674 /*----------------------------------------------------------------------------
   2675 | Returns the result of subtracting the absolute values of the double-
   2676 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2677 | difference is negated before being returned.  `zSign' is ignored if the
   2678 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2679 | Standard for Binary Floating-Point Arithmetic.
   2680 *----------------------------------------------------------------------------*/
   2681 
   2682 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   2683 {
   2684     int16 aExp, bExp, zExp;
   2685     bits64 aSig, bSig, zSig;
   2686     int16 expDiff;
   2687 
   2688     aSig = extractFloat64Frac( a );
   2689     aExp = extractFloat64Exp( a );
   2690     bSig = extractFloat64Frac( b );
   2691     bExp = extractFloat64Exp( b );
   2692     expDiff = aExp - bExp;
   2693     aSig <<= 10;
   2694     bSig <<= 10;
   2695     if ( 0 < expDiff ) goto aExpBigger;
   2696     if ( expDiff < 0 ) goto bExpBigger;
   2697     if ( aExp == 0x7FF ) {
   2698         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2699         float_raise( float_flag_invalid STATUS_VAR);
   2700         return float64_default_nan;
   2701     }
   2702     if ( aExp == 0 ) {
   2703         aExp = 1;
   2704         bExp = 1;
   2705     }
   2706     if ( bSig < aSig ) goto aBigger;
   2707     if ( aSig < bSig ) goto bBigger;
   2708     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   2709  bExpBigger:
   2710     if ( bExp == 0x7FF ) {
   2711         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2712         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2713     }
   2714     if ( aExp == 0 ) {
   2715         ++expDiff;
   2716     }
   2717     else {
   2718         aSig |= LIT64( 0x4000000000000000 );
   2719     }
   2720     shift64RightJamming( aSig, - expDiff, &aSig );
   2721     bSig |= LIT64( 0x4000000000000000 );
   2722  bBigger:
   2723     zSig = bSig - aSig;
   2724     zExp = bExp;
   2725     zSign ^= 1;
   2726     goto normalizeRoundAndPack;
   2727  aExpBigger:
   2728     if ( aExp == 0x7FF ) {
   2729         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2730         return a;
   2731     }
   2732     if ( bExp == 0 ) {
   2733         --expDiff;
   2734     }
   2735     else {
   2736         bSig |= LIT64( 0x4000000000000000 );
   2737     }
   2738     shift64RightJamming( bSig, expDiff, &bSig );
   2739     aSig |= LIT64( 0x4000000000000000 );
   2740  aBigger:
   2741     zSig = aSig - bSig;
   2742     zExp = aExp;
   2743  normalizeRoundAndPack:
   2744     --zExp;
   2745     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   2746 
   2747 }
   2748 
   2749 /*----------------------------------------------------------------------------
   2750 | Returns the result of adding the double-precision floating-point values `a'
   2751 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2752 | Binary Floating-Point Arithmetic.
   2753 *----------------------------------------------------------------------------*/
   2754 
   2755 float64 float64_add( float64 a, float64 b STATUS_PARAM )
   2756 {
   2757     flag aSign, bSign;
   2758 
   2759     aSign = extractFloat64Sign( a );
   2760     bSign = extractFloat64Sign( b );
   2761     if ( aSign == bSign ) {
   2762         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   2763     }
   2764     else {
   2765         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   2766     }
   2767 
   2768 }
   2769 
   2770 /*----------------------------------------------------------------------------
   2771 | Returns the result of subtracting the double-precision floating-point values
   2772 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2773 | for Binary Floating-Point Arithmetic.
   2774 *----------------------------------------------------------------------------*/
   2775 
   2776 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
   2777 {
   2778     flag aSign, bSign;
   2779 
   2780     aSign = extractFloat64Sign( a );
   2781     bSign = extractFloat64Sign( b );
   2782     if ( aSign == bSign ) {
   2783         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   2784     }
   2785     else {
   2786         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   2787     }
   2788 
   2789 }
   2790 
   2791 /*----------------------------------------------------------------------------
   2792 | Returns the result of multiplying the double-precision floating-point values
   2793 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2794 | for Binary Floating-Point Arithmetic.
   2795 *----------------------------------------------------------------------------*/
   2796 
   2797 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
   2798 {
   2799     flag aSign, bSign, zSign;
   2800     int16 aExp, bExp, zExp;
   2801     bits64 aSig, bSig, zSig0, zSig1;
   2802 
   2803     aSig = extractFloat64Frac( a );
   2804     aExp = extractFloat64Exp( a );
   2805     aSign = extractFloat64Sign( a );
   2806     bSig = extractFloat64Frac( b );
   2807     bExp = extractFloat64Exp( b );
   2808     bSign = extractFloat64Sign( b );
   2809     zSign = aSign ^ bSign;
   2810     if ( aExp == 0x7FF ) {
   2811         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2812             return propagateFloat64NaN( a, b STATUS_VAR );
   2813         }
   2814         if ( ( bExp | bSig ) == 0 ) {
   2815             float_raise( float_flag_invalid STATUS_VAR);
   2816             return float64_default_nan;
   2817         }
   2818         return packFloat64( zSign, 0x7FF, 0 );
   2819     }
   2820     if ( bExp == 0x7FF ) {
   2821         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2822         if ( ( aExp | aSig ) == 0 ) {
   2823             float_raise( float_flag_invalid STATUS_VAR);
   2824             return float64_default_nan;
   2825         }
   2826         return packFloat64( zSign, 0x7FF, 0 );
   2827     }
   2828     if ( aExp == 0 ) {
   2829         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2830         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2831     }
   2832     if ( bExp == 0 ) {
   2833         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2834         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2835     }
   2836     zExp = aExp + bExp - 0x3FF;
   2837     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2838     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2839     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2840     zSig0 |= ( zSig1 != 0 );
   2841     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2842         zSig0 <<= 1;
   2843         --zExp;
   2844     }
   2845     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
   2846 
   2847 }
   2848 
   2849 /*----------------------------------------------------------------------------
   2850 | Returns the result of dividing the double-precision floating-point value `a'
   2851 | by the corresponding value `b'.  The operation is performed according to
   2852 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2853 *----------------------------------------------------------------------------*/
   2854 
   2855 float64 float64_div( float64 a, float64 b STATUS_PARAM )
   2856 {
   2857     flag aSign, bSign, zSign;
   2858     int16 aExp, bExp, zExp;
   2859     bits64 aSig, bSig, zSig;
   2860     bits64 rem0, rem1;
   2861     bits64 term0, term1;
   2862 
   2863     aSig = extractFloat64Frac( a );
   2864     aExp = extractFloat64Exp( a );
   2865     aSign = extractFloat64Sign( a );
   2866     bSig = extractFloat64Frac( b );
   2867     bExp = extractFloat64Exp( b );
   2868     bSign = extractFloat64Sign( b );
   2869     zSign = aSign ^ bSign;
   2870     if ( aExp == 0x7FF ) {
   2871         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2872         if ( bExp == 0x7FF ) {
   2873             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2874             float_raise( float_flag_invalid STATUS_VAR);
   2875             return float64_default_nan;
   2876         }
   2877         return packFloat64( zSign, 0x7FF, 0 );
   2878     }
   2879     if ( bExp == 0x7FF ) {
   2880         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2881         return packFloat64( zSign, 0, 0 );
   2882     }
   2883     if ( bExp == 0 ) {
   2884         if ( bSig == 0 ) {
   2885             if ( ( aExp | aSig ) == 0 ) {
   2886                 float_raise( float_flag_invalid STATUS_VAR);
   2887                 return float64_default_nan;
   2888             }
   2889             float_raise( float_flag_divbyzero STATUS_VAR);
   2890             return packFloat64( zSign, 0x7FF, 0 );
   2891         }
   2892         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2893     }
   2894     if ( aExp == 0 ) {
   2895         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2896         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2897     }
   2898     zExp = aExp - bExp + 0x3FD;
   2899     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2900     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2901     if ( bSig <= ( aSig + aSig ) ) {
   2902         aSig >>= 1;
   2903         ++zExp;
   2904     }
   2905     zSig = estimateDiv128To64( aSig, 0, bSig );
   2906     if ( ( zSig & 0x1FF ) <= 2 ) {
   2907         mul64To128( bSig, zSig, &term0, &term1 );
   2908         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2909         while ( (sbits64) rem0 < 0 ) {
   2910             --zSig;
   2911             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   2912         }
   2913         zSig |= ( rem1 != 0 );
   2914     }
   2915     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   2916 
   2917 }
   2918 
   2919 /*----------------------------------------------------------------------------
   2920 | Returns the remainder of the double-precision floating-point value `a'
   2921 | with respect to the corresponding value `b'.  The operation is performed
   2922 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2923 *----------------------------------------------------------------------------*/
   2924 
   2925 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
   2926 {
   2927     flag aSign, bSign, zSign;
   2928     int16 aExp, bExp, expDiff;
   2929     bits64 aSig, bSig;
   2930     bits64 q, alternateASig;
   2931     sbits64 sigMean;
   2932 
   2933     aSig = extractFloat64Frac( a );
   2934     aExp = extractFloat64Exp( a );
   2935     aSign = extractFloat64Sign( a );
   2936     bSig = extractFloat64Frac( b );
   2937     bExp = extractFloat64Exp( b );
   2938     bSign = extractFloat64Sign( b );
   2939     if ( aExp == 0x7FF ) {
   2940         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2941             return propagateFloat64NaN( a, b STATUS_VAR );
   2942         }
   2943         float_raise( float_flag_invalid STATUS_VAR);
   2944         return float64_default_nan;
   2945     }
   2946     if ( bExp == 0x7FF ) {
   2947         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2948         return a;
   2949     }
   2950     if ( bExp == 0 ) {
   2951         if ( bSig == 0 ) {
   2952             float_raise( float_flag_invalid STATUS_VAR);
   2953             return float64_default_nan;
   2954         }
   2955         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2956     }
   2957     if ( aExp == 0 ) {
   2958         if ( aSig == 0 ) return a;
   2959         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2960     }
   2961     expDiff = aExp - bExp;
   2962     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   2963     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2964     if ( expDiff < 0 ) {
   2965         if ( expDiff < -1 ) return a;
   2966         aSig >>= 1;
   2967     }
   2968     q = ( bSig <= aSig );
   2969     if ( q ) aSig -= bSig;
   2970     expDiff -= 64;
   2971     while ( 0 < expDiff ) {
   2972         q = estimateDiv128To64( aSig, 0, bSig );
   2973         q = ( 2 < q ) ? q - 2 : 0;
   2974         aSig = - ( ( bSig>>2 ) * q );
   2975         expDiff -= 62;
   2976     }
   2977     expDiff += 64;
   2978     if ( 0 < expDiff ) {
   2979         q = estimateDiv128To64( aSig, 0, bSig );
   2980         q = ( 2 < q ) ? q - 2 : 0;
   2981         q >>= 64 - expDiff;
   2982         bSig >>= 2;
   2983         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2984     }
   2985     else {
   2986         aSig >>= 2;
   2987         bSig >>= 2;
   2988     }
   2989     do {
   2990         alternateASig = aSig;
   2991         ++q;
   2992         aSig -= bSig;
   2993     } while ( 0 <= (sbits64) aSig );
   2994     sigMean = aSig + alternateASig;
   2995     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2996         aSig = alternateASig;
   2997     }
   2998     zSign = ( (sbits64) aSig < 0 );
   2999     if ( zSign ) aSig = - aSig;
   3000     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
   3001 
   3002 }
   3003 
   3004 /*----------------------------------------------------------------------------
   3005 | Returns the square root of the double-precision floating-point value `a'.
   3006 | The operation is performed according to the IEC/IEEE Standard for Binary
   3007 | Floating-Point Arithmetic.
   3008 *----------------------------------------------------------------------------*/
   3009 
   3010 float64 float64_sqrt( float64 a STATUS_PARAM )
   3011 {
   3012     flag aSign;
   3013     int16 aExp, zExp;
   3014     bits64 aSig, zSig, doubleZSig;
   3015     bits64 rem0, rem1, term0, term1;
   3016 
   3017     aSig = extractFloat64Frac( a );
   3018     aExp = extractFloat64Exp( a );
   3019     aSign = extractFloat64Sign( a );
   3020     if ( aExp == 0x7FF ) {
   3021         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
   3022         if ( ! aSign ) return a;
   3023         float_raise( float_flag_invalid STATUS_VAR);
   3024         return float64_default_nan;
   3025     }
   3026     if ( aSign ) {
   3027         if ( ( aExp | aSig ) == 0 ) return a;
   3028         float_raise( float_flag_invalid STATUS_VAR);
   3029         return float64_default_nan;
   3030     }
   3031     if ( aExp == 0 ) {
   3032         if ( aSig == 0 ) return float64_zero;
   3033         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3034     }
   3035     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3036     aSig |= LIT64( 0x0010000000000000 );
   3037     zSig = estimateSqrt32( aExp, aSig>>21 );
   3038     aSig <<= 9 - ( aExp & 1 );
   3039     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3040     if ( ( zSig & 0x1FF ) <= 5 ) {
   3041         doubleZSig = zSig<<1;
   3042         mul64To128( zSig, zSig, &term0, &term1 );
   3043         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3044         while ( (sbits64) rem0 < 0 ) {
   3045             --zSig;
   3046             doubleZSig -= 2;
   3047             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3048         }
   3049         zSig |= ( ( rem0 | rem1 ) != 0 );
   3050     }
   3051     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
   3052 
   3053 }
   3054 
   3055 /*----------------------------------------------------------------------------
   3056 | Returns the binary log of the double-precision floating-point value `a'.
   3057 | The operation is performed according to the IEC/IEEE Standard for Binary
   3058 | Floating-Point Arithmetic.
   3059 *----------------------------------------------------------------------------*/
   3060 float64 float64_log2( float64 a STATUS_PARAM )
   3061 {
   3062     flag aSign, zSign;
   3063     int16 aExp;
   3064     bits64 aSig, aSig0, aSig1, zSig, i;
   3065 
   3066     aSig = extractFloat64Frac( a );
   3067     aExp = extractFloat64Exp( a );
   3068     aSign = extractFloat64Sign( a );
   3069 
   3070     if ( aExp == 0 ) {
   3071         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
   3072         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3073     }
   3074     if ( aSign ) {
   3075         float_raise( float_flag_invalid STATUS_VAR);
   3076         return float64_default_nan;
   3077     }
   3078     if ( aExp == 0x7FF ) {
   3079         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
   3080         return a;
   3081     }
   3082 
   3083     aExp -= 0x3FF;
   3084     aSig |= LIT64( 0x0010000000000000 );
   3085     zSign = aExp < 0;
   3086     zSig = (bits64)aExp << 52;
   3087     for (i = 1LL << 51; i > 0; i >>= 1) {
   3088         mul64To128( aSig, aSig, &aSig0, &aSig1 );
   3089         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
   3090         if ( aSig & LIT64( 0x0020000000000000 ) ) {
   3091             aSig >>= 1;
   3092             zSig |= i;
   3093         }
   3094     }
   3095 
   3096     if ( zSign )
   3097         zSig = -zSig;
   3098     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
   3099 }
   3100 
   3101 /*----------------------------------------------------------------------------
   3102 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3103 | corresponding value `b', and 0 otherwise.  The comparison is performed
   3104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3105 *----------------------------------------------------------------------------*/
   3106 
   3107 int float64_eq( float64 a, float64 b STATUS_PARAM )
   3108 {
   3109     bits64 av, bv;
   3110 
   3111     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3112          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3113        ) {
   3114         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3115             float_raise( float_flag_invalid STATUS_VAR);
   3116         }
   3117         return 0;
   3118     }
   3119     av = float64_val(a);
   3120     bv = float64_val(b);
   3121     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3122 
   3123 }
   3124 
   3125 /*----------------------------------------------------------------------------
   3126 | Returns 1 if the double-precision floating-point value `a' is less than or
   3127 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3128 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   3129 | Arithmetic.
   3130 *----------------------------------------------------------------------------*/
   3131 
   3132 int float64_le( float64 a, float64 b STATUS_PARAM )
   3133 {
   3134     flag aSign, bSign;
   3135     bits64 av, bv;
   3136 
   3137     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3138          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3139        ) {
   3140         float_raise( float_flag_invalid STATUS_VAR);
   3141         return 0;
   3142     }
   3143     aSign = extractFloat64Sign( a );
   3144     bSign = extractFloat64Sign( b );
   3145     av = float64_val(a);
   3146     bv = float64_val(b);
   3147     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3148     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   3149 
   3150 }
   3151 
   3152 /*----------------------------------------------------------------------------
   3153 | Returns 1 if the double-precision floating-point value `a' is less than
   3154 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   3155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3156 *----------------------------------------------------------------------------*/
   3157 
   3158 int float64_lt( float64 a, float64 b STATUS_PARAM )
   3159 {
   3160     flag aSign, bSign;
   3161     bits64 av, bv;
   3162 
   3163     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3164          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3165        ) {
   3166         float_raise( float_flag_invalid STATUS_VAR);
   3167         return 0;
   3168     }
   3169     aSign = extractFloat64Sign( a );
   3170     bSign = extractFloat64Sign( b );
   3171     av = float64_val(a);
   3172     bv = float64_val(b);
   3173     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
   3174     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   3175 
   3176 }
   3177 
   3178 /*----------------------------------------------------------------------------
   3179 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3180 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3181 | if either operand is a NaN.  Otherwise, the comparison is performed
   3182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3183 *----------------------------------------------------------------------------*/
   3184 
   3185 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
   3186 {
   3187     bits64 av, bv;
   3188 
   3189     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3190          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3191        ) {
   3192         float_raise( float_flag_invalid STATUS_VAR);
   3193         return 0;
   3194     }
   3195     av = float64_val(a);
   3196     bv = float64_val(b);
   3197     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3198 
   3199 }
   3200 
   3201 /*----------------------------------------------------------------------------
   3202 | Returns 1 if the double-precision floating-point value `a' is less than or
   3203 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3204 | cause an exception.  Otherwise, the comparison is performed according to the
   3205 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3206 *----------------------------------------------------------------------------*/
   3207 
   3208 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
   3209 {
   3210     flag aSign, bSign;
   3211     bits64 av, bv;
   3212 
   3213     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3214          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3215        ) {
   3216         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3217             float_raise( float_flag_invalid STATUS_VAR);
   3218         }
   3219         return 0;
   3220     }
   3221     aSign = extractFloat64Sign( a );
   3222     bSign = extractFloat64Sign( b );
   3223     av = float64_val(a);
   3224     bv = float64_val(b);
   3225     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3226     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   3227 
   3228 }
   3229 
   3230 /*----------------------------------------------------------------------------
   3231 | Returns 1 if the double-precision floating-point value `a' is less than
   3232 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3233 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3234 | Standard for Binary Floating-Point Arithmetic.
   3235 *----------------------------------------------------------------------------*/
   3236 
   3237 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
   3238 {
   3239     flag aSign, bSign;
   3240     bits64 av, bv;
   3241 
   3242     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3243          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3244        ) {
   3245         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3246             float_raise( float_flag_invalid STATUS_VAR);
   3247         }
   3248         return 0;
   3249     }
   3250     aSign = extractFloat64Sign( a );
   3251     bSign = extractFloat64Sign( b );
   3252     av = float64_val(a);
   3253     bv = float64_val(b);
   3254     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
   3255     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   3256 
   3257 }
   3258 
   3259 #ifdef FLOATX80
   3260 
   3261 /*----------------------------------------------------------------------------
   3262 | Returns the result of converting the extended double-precision floating-
   3263 | point value `a' to the 32-bit two's complement integer format.  The
   3264 | conversion is performed according to the IEC/IEEE Standard for Binary
   3265 | Floating-Point Arithmetic---which means in particular that the conversion
   3266 | is rounded according to the current rounding mode.  If `a' is a NaN, the
   3267 | largest positive integer is returned.  Otherwise, if the conversion
   3268 | overflows, the largest integer with the same sign as `a' is returned.
   3269 *----------------------------------------------------------------------------*/
   3270 
   3271 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
   3272 {
   3273     flag aSign;
   3274     int32 aExp, shiftCount;
   3275     bits64 aSig;
   3276 
   3277     aSig = extractFloatx80Frac( a );
   3278     aExp = extractFloatx80Exp( a );
   3279     aSign = extractFloatx80Sign( a );
   3280     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3281     shiftCount = 0x4037 - aExp;
   3282     if ( shiftCount <= 0 ) shiftCount = 1;
   3283     shift64RightJamming( aSig, shiftCount, &aSig );
   3284     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   3285 
   3286 }
   3287 
   3288 /*----------------------------------------------------------------------------
   3289 | Returns the result of converting the extended double-precision floating-
   3290 | point value `a' to the 32-bit two's complement integer format.  The
   3291 | conversion is performed according to the IEC/IEEE Standard for Binary
   3292 | Floating-Point Arithmetic, except that the conversion is always rounded
   3293 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3294 | Otherwise, if the conversion overflows, the largest integer with the same
   3295 | sign as `a' is returned.
   3296 *----------------------------------------------------------------------------*/
   3297 
   3298 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
   3299 {
   3300     flag aSign;
   3301     int32 aExp, shiftCount;
   3302     bits64 aSig, savedASig;
   3303     int32 z;
   3304 
   3305     aSig = extractFloatx80Frac( a );
   3306     aExp = extractFloatx80Exp( a );
   3307     aSign = extractFloatx80Sign( a );
   3308     if ( 0x401E < aExp ) {
   3309         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3310         goto invalid;
   3311     }
   3312     else if ( aExp < 0x3FFF ) {
   3313         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   3314         return 0;
   3315     }
   3316     shiftCount = 0x403E - aExp;
   3317     savedASig = aSig;
   3318     aSig >>= shiftCount;
   3319     z = aSig;
   3320     if ( aSign ) z = - z;
   3321     if ( ( z < 0 ) ^ aSign ) {
   3322  invalid:
   3323         float_raise( float_flag_invalid STATUS_VAR);
   3324         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   3325     }
   3326     if ( ( aSig<<shiftCount ) != savedASig ) {
   3327         STATUS(float_exception_flags) |= float_flag_inexact;
   3328     }
   3329     return z;
   3330 
   3331 }
   3332 
   3333 /*----------------------------------------------------------------------------
   3334 | Returns the result of converting the extended double-precision floating-
   3335 | point value `a' to the 64-bit two's complement integer format.  The
   3336 | conversion is performed according to the IEC/IEEE Standard for Binary
   3337 | Floating-Point Arithmetic---which means in particular that the conversion
   3338 | is rounded according to the current rounding mode.  If `a' is a NaN,
   3339 | the largest positive integer is returned.  Otherwise, if the conversion
   3340 | overflows, the largest integer with the same sign as `a' is returned.
   3341 *----------------------------------------------------------------------------*/
   3342 
   3343 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
   3344 {
   3345     flag aSign;
   3346     int32 aExp, shiftCount;
   3347     bits64 aSig, aSigExtra;
   3348 
   3349     aSig = extractFloatx80Frac( a );
   3350     aExp = extractFloatx80Exp( a );
   3351     aSign = extractFloatx80Sign( a );
   3352     shiftCount = 0x403E - aExp;
   3353     if ( shiftCount <= 0 ) {
   3354         if ( shiftCount ) {
   3355             float_raise( float_flag_invalid STATUS_VAR);
   3356             if (    ! aSign
   3357                  || (    ( aExp == 0x7FFF )
   3358                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3359                ) {
   3360                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3361             }
   3362             return (sbits64) LIT64( 0x8000000000000000 );
   3363         }
   3364         aSigExtra = 0;
   3365     }
   3366     else {
   3367         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3368     }
   3369     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   3370 
   3371 }
   3372 
   3373 /*----------------------------------------------------------------------------
   3374 | Returns the result of converting the extended double-precision floating-
   3375 | point value `a' to the 64-bit two's complement integer format.  The
   3376 | conversion is performed according to the IEC/IEEE Standard for Binary
   3377 | Floating-Point Arithmetic, except that the conversion is always rounded
   3378 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3379 | Otherwise, if the conversion overflows, the largest integer with the same
   3380 | sign as `a' is returned.
   3381 *----------------------------------------------------------------------------*/
   3382 
   3383 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
   3384 {
   3385     flag aSign;
   3386     int32 aExp, shiftCount;
   3387     bits64 aSig;
   3388     int64 z;
   3389 
   3390     aSig = extractFloatx80Frac( a );
   3391     aExp = extractFloatx80Exp( a );
   3392     aSign = extractFloatx80Sign( a );
   3393     shiftCount = aExp - 0x403E;
   3394     if ( 0 <= shiftCount ) {
   3395         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3396         if ( ( a.high != 0xC03E ) || aSig ) {
   3397             float_raise( float_flag_invalid STATUS_VAR);
   3398             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3399                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3400             }
   3401         }
   3402         return (sbits64) LIT64( 0x8000000000000000 );
   3403     }
   3404     else if ( aExp < 0x3FFF ) {
   3405         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   3406         return 0;
   3407     }
   3408     z = aSig>>( - shiftCount );
   3409     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3410         STATUS(float_exception_flags) |= float_flag_inexact;
   3411     }
   3412     if ( aSign ) z = - z;
   3413     return z;
   3414 
   3415 }
   3416 
   3417 /*----------------------------------------------------------------------------
   3418 | Returns the result of converting the extended double-precision floating-
   3419 | point value `a' to the single-precision floating-point format.  The
   3420 | conversion is performed according to the IEC/IEEE Standard for Binary
   3421 | Floating-Point Arithmetic.
   3422 *----------------------------------------------------------------------------*/
   3423 
   3424 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
   3425 {
   3426     flag aSign;
   3427     int32 aExp;
   3428     bits64 aSig;
   3429 
   3430     aSig = extractFloatx80Frac( a );
   3431     aExp = extractFloatx80Exp( a );
   3432     aSign = extractFloatx80Sign( a );
   3433     if ( aExp == 0x7FFF ) {
   3434         if ( (bits64) ( aSig<<1 ) ) {
   3435             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
   3436         }
   3437         return packFloat32( aSign, 0xFF, 0 );
   3438     }
   3439     shift64RightJamming( aSig, 33, &aSig );
   3440     if ( aExp || aSig ) aExp -= 0x3F81;
   3441     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   3442 
   3443 }
   3444 
   3445 /*----------------------------------------------------------------------------
   3446 | Returns the result of converting the extended double-precision floating-
   3447 | point value `a' to the double-precision floating-point format.  The
   3448 | conversion is performed according to the IEC/IEEE Standard for Binary
   3449 | Floating-Point Arithmetic.
   3450 *----------------------------------------------------------------------------*/
   3451 
   3452 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
   3453 {
   3454     flag aSign;
   3455     int32 aExp;
   3456     bits64 aSig, zSig;
   3457 
   3458     aSig = extractFloatx80Frac( a );
   3459     aExp = extractFloatx80Exp( a );
   3460     aSign = extractFloatx80Sign( a );
   3461     if ( aExp == 0x7FFF ) {
   3462         if ( (bits64) ( aSig<<1 ) ) {
   3463             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
   3464         }
   3465         return packFloat64( aSign, 0x7FF, 0 );
   3466     }
   3467     shift64RightJamming( aSig, 1, &zSig );
   3468     if ( aExp || aSig ) aExp -= 0x3C01;
   3469     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
   3470 
   3471 }
   3472 
   3473 #ifdef FLOAT128
   3474 
   3475 /*----------------------------------------------------------------------------
   3476 | Returns the result of converting the extended double-precision floating-
   3477 | point value `a' to the quadruple-precision floating-point format.  The
   3478 | conversion is performed according to the IEC/IEEE Standard for Binary
   3479 | Floating-Point Arithmetic.
   3480 *----------------------------------------------------------------------------*/
   3481 
   3482 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
   3483 {
   3484     flag aSign;
   3485     int16 aExp;
   3486     bits64 aSig, zSig0, zSig1;
   3487 
   3488     aSig = extractFloatx80Frac( a );
   3489     aExp = extractFloatx80Exp( a );
   3490     aSign = extractFloatx80Sign( a );
   3491     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
   3492         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
   3493     }
   3494     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   3495     return packFloat128( aSign, aExp, zSig0, zSig1 );
   3496 
   3497 }
   3498 
   3499 #endif
   3500 
   3501 /*----------------------------------------------------------------------------
   3502 | Rounds the extended double-precision floating-point value `a' to an integer,
   3503 | and returns the result as an extended quadruple-precision floating-point
   3504 | value.  The operation is performed according to the IEC/IEEE Standard for
   3505 | Binary Floating-Point Arithmetic.
   3506 *----------------------------------------------------------------------------*/
   3507 
   3508 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
   3509 {
   3510     flag aSign;
   3511     int32 aExp;
   3512     bits64 lastBitMask, roundBitsMask;
   3513     int8 roundingMode;
   3514     floatx80 z;
   3515 
   3516     aExp = extractFloatx80Exp( a );
   3517     if ( 0x403E <= aExp ) {
   3518         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3519             return propagateFloatx80NaN( a, a STATUS_VAR );
   3520         }
   3521         return a;
   3522     }
   3523     if ( aExp < 0x3FFF ) {
   3524         if (    ( aExp == 0 )
   3525              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3526             return a;
   3527         }
   3528         STATUS(float_exception_flags) |= float_flag_inexact;
   3529         aSign = extractFloatx80Sign( a );
   3530         switch ( STATUS(float_rounding_mode) ) {
   3531          case float_round_nearest_even:
   3532             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3533                ) {
   3534                 return
   3535                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3536             }
   3537             break;
   3538          case float_round_down:
   3539             return
   3540                   aSign ?
   3541                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3542                 : packFloatx80( 0, 0, 0 );
   3543          case float_round_up:
   3544             return
   3545                   aSign ? packFloatx80( 1, 0, 0 )
   3546                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3547         }
   3548         return packFloatx80( aSign, 0, 0 );
   3549     }
   3550     lastBitMask = 1;
   3551     lastBitMask <<= 0x403E - aExp;
   3552     roundBitsMask = lastBitMask - 1;
   3553     z = a;
   3554     roundingMode = STATUS(float_rounding_mode);
   3555     if ( roundingMode == float_round_nearest_even ) {
   3556         z.low += lastBitMask>>1;
   3557         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3558     }
   3559     else if ( roundingMode != float_round_to_zero ) {
   3560         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3561             z.low += roundBitsMask;
   3562         }
   3563     }
   3564     z.low &= ~ roundBitsMask;
   3565     if ( z.low == 0 ) {
   3566         ++z.high;
   3567         z.low = LIT64( 0x8000000000000000 );
   3568     }
   3569     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
   3570     return z;
   3571 
   3572 }
   3573 
   3574 /*----------------------------------------------------------------------------
   3575 | Returns the result of adding the absolute values of the extended double-
   3576 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3577 | negated before being returned.  `zSign' is ignored if the result is a NaN.
   3578 | The addition is performed according to the IEC/IEEE Standard for Binary
   3579 | Floating-Point Arithmetic.
   3580 *----------------------------------------------------------------------------*/
   3581 
   3582 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
   3583 {
   3584     int32 aExp, bExp, zExp;
   3585     bits64 aSig, bSig, zSig0, zSig1;
   3586     int32 expDiff;
   3587 
   3588     aSig = extractFloatx80Frac( a );
   3589     aExp = extractFloatx80Exp( a );
   3590     bSig = extractFloatx80Frac( b );
   3591     bExp = extractFloatx80Exp( b );
   3592     expDiff = aExp - bExp;
   3593     if ( 0 < expDiff ) {
   3594         if ( aExp == 0x7FFF ) {
   3595             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3596             return a;
   3597         }
   3598         if ( bExp == 0 ) --expDiff;
   3599         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3600         zExp = aExp;
   3601     }
   3602     else if ( expDiff < 0 ) {
   3603         if ( bExp == 0x7FFF ) {
   3604             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3605             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3606         }
   3607         if ( aExp == 0 ) ++expDiff;
   3608         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3609         zExp = bExp;
   3610     }
   3611     else {
   3612         if ( aExp == 0x7FFF ) {
   3613             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3614                 return propagateFloatx80NaN( a, b STATUS_VAR );
   3615             }
   3616             return a;
   3617         }
   3618         zSig1 = 0;
   3619         zSig0 = aSig + bSig;
   3620         if ( aExp == 0 ) {
   3621             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3622             goto roundAndPack;
   3623         }
   3624         zExp = aExp;
   3625         goto shiftRight1;
   3626     }
   3627     zSig0 = aSig + bSig;
   3628     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
   3629  shiftRight1:
   3630     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3631     zSig0 |= LIT64( 0x8000000000000000 );
   3632     ++zExp;
   3633  roundAndPack:
   3634     return
   3635         roundAndPackFloatx80(
   3636             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3637 
   3638 }
   3639 
   3640 /*----------------------------------------------------------------------------
   3641 | Returns the result of subtracting the absolute values of the extended
   3642 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3643 | difference is negated before being returned.  `zSign' is ignored if the
   3644 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3645 | Standard for Binary Floating-Point Arithmetic.
   3646 *----------------------------------------------------------------------------*/
   3647 
   3648 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
   3649 {
   3650     int32 aExp, bExp, zExp;
   3651     bits64 aSig, bSig, zSig0, zSig1;
   3652     int32 expDiff;
   3653     floatx80 z;
   3654 
   3655     aSig = extractFloatx80Frac( a );
   3656     aExp = extractFloatx80Exp( a );
   3657     bSig = extractFloatx80Frac( b );
   3658     bExp = extractFloatx80Exp( b );
   3659     expDiff = aExp - bExp;
   3660     if ( 0 < expDiff ) goto aExpBigger;
   3661     if ( expDiff < 0 ) goto bExpBigger;
   3662     if ( aExp == 0x7FFF ) {
   3663         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3664             return propagateFloatx80NaN( a, b STATUS_VAR );
   3665         }
   3666         float_raise( float_flag_invalid STATUS_VAR);
   3667         z.low = floatx80_default_nan_low;
   3668         z.high = floatx80_default_nan_high;
   3669         return z;
   3670     }
   3671     if ( aExp == 0 ) {
   3672         aExp = 1;
   3673         bExp = 1;
   3674     }
   3675     zSig1 = 0;
   3676     if ( bSig < aSig ) goto aBigger;
   3677     if ( aSig < bSig ) goto bBigger;
   3678     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   3679  bExpBigger:
   3680     if ( bExp == 0x7FFF ) {
   3681         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3682         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3683     }
   3684     if ( aExp == 0 ) ++expDiff;
   3685     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3686  bBigger:
   3687     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3688     zExp = bExp;
   3689     zSign ^= 1;
   3690     goto normalizeRoundAndPack;
   3691  aExpBigger:
   3692     if ( aExp == 0x7FFF ) {
   3693         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3694         return a;
   3695     }
   3696     if ( bExp == 0 ) --expDiff;
   3697     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3698  aBigger:
   3699     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3700     zExp = aExp;
   3701  normalizeRoundAndPack:
   3702     return
   3703         normalizeRoundAndPackFloatx80(
   3704             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3705 
   3706 }
   3707 
   3708 /*----------------------------------------------------------------------------
   3709 | Returns the result of adding the extended double-precision floating-point
   3710 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3711 | Standard for Binary Floating-Point Arithmetic.
   3712 *----------------------------------------------------------------------------*/
   3713 
   3714 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
   3715 {
   3716     flag aSign, bSign;
   3717 
   3718     aSign = extractFloatx80Sign( a );
   3719     bSign = extractFloatx80Sign( b );
   3720     if ( aSign == bSign ) {
   3721         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   3722     }
   3723     else {
   3724         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   3725     }
   3726 
   3727 }
   3728 
   3729 /*----------------------------------------------------------------------------
   3730 | Returns the result of subtracting the extended double-precision floating-
   3731 | point values `a' and `b'.  The operation is performed according to the
   3732 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3733 *----------------------------------------------------------------------------*/
   3734 
   3735 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
   3736 {
   3737     flag aSign, bSign;
   3738 
   3739     aSign = extractFloatx80Sign( a );
   3740     bSign = extractFloatx80Sign( b );
   3741     if ( aSign == bSign ) {
   3742         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   3743     }
   3744     else {
   3745         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   3746     }
   3747 
   3748 }
   3749 
   3750 /*----------------------------------------------------------------------------
   3751 | Returns the result of multiplying the extended double-precision floating-
   3752 | point values `a' and `b'.  The operation is performed according to the
   3753 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3754 *----------------------------------------------------------------------------*/
   3755 
   3756 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
   3757 {
   3758     flag aSign, bSign, zSign;
   3759     int32 aExp, bExp, zExp;
   3760     bits64 aSig, bSig, zSig0, zSig1;
   3761     floatx80 z;
   3762 
   3763     aSig = extractFloatx80Frac( a );
   3764     aExp = extractFloatx80Exp( a );
   3765     aSign = extractFloatx80Sign( a );
   3766     bSig = extractFloatx80Frac( b );
   3767     bExp = extractFloatx80Exp( b );
   3768     bSign = extractFloatx80Sign( b );
   3769     zSign = aSign ^ bSign;
   3770     if ( aExp == 0x7FFF ) {
   3771         if (    (bits64) ( aSig<<1 )
   3772              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3773             return propagateFloatx80NaN( a, b STATUS_VAR );
   3774         }
   3775         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3776         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3777     }
   3778     if ( bExp == 0x7FFF ) {
   3779         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3780         if ( ( aExp | aSig ) == 0 ) {
   3781  invalid:
   3782             float_raise( float_flag_invalid STATUS_VAR);
   3783             z.low = floatx80_default_nan_low;
   3784             z.high = floatx80_default_nan_high;
   3785             return z;
   3786         }
   3787         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3788     }
   3789     if ( aExp == 0 ) {
   3790         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3791         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3792     }
   3793     if ( bExp == 0 ) {
   3794         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3795         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3796     }
   3797     zExp = aExp + bExp - 0x3FFE;
   3798     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3799     if ( 0 < (sbits64) zSig0 ) {
   3800         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3801         --zExp;
   3802     }
   3803     return
   3804         roundAndPackFloatx80(
   3805             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3806 
   3807 }
   3808 
   3809 /*----------------------------------------------------------------------------
   3810 | Returns the result of dividing the extended double-precision floating-point
   3811 | value `a' by the corresponding value `b'.  The operation is performed
   3812 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3813 *----------------------------------------------------------------------------*/
   3814 
   3815 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
   3816 {
   3817     flag aSign, bSign, zSign;
   3818     int32 aExp, bExp, zExp;
   3819     bits64 aSig, bSig, zSig0, zSig1;
   3820     bits64 rem0, rem1, rem2, term0, term1, term2;
   3821     floatx80 z;
   3822 
   3823     aSig = extractFloatx80Frac( a );
   3824     aExp = extractFloatx80Exp( a );
   3825     aSign = extractFloatx80Sign( a );
   3826     bSig = extractFloatx80Frac( b );
   3827     bExp = extractFloatx80Exp( b );
   3828     bSign = extractFloatx80Sign( b );
   3829     zSign = aSign ^ bSign;
   3830     if ( aExp == 0x7FFF ) {
   3831         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3832         if ( bExp == 0x7FFF ) {
   3833             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3834             goto invalid;
   3835         }
   3836         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3837     }
   3838     if ( bExp == 0x7FFF ) {
   3839         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3840         return packFloatx80( zSign, 0, 0 );
   3841     }
   3842     if ( bExp == 0 ) {
   3843         if ( bSig == 0 ) {
   3844             if ( ( aExp | aSig ) == 0 ) {
   3845  invalid:
   3846                 float_raise( float_flag_invalid STATUS_VAR);
   3847                 z.low = floatx80_default_nan_low;
   3848                 z.high = floatx80_default_nan_high;
   3849                 return z;
   3850             }
   3851             float_raise( float_flag_divbyzero STATUS_VAR);
   3852             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3853         }
   3854         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3855     }
   3856     if ( aExp == 0 ) {
   3857         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3858         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3859     }
   3860     zExp = aExp - bExp + 0x3FFE;
   3861     rem1 = 0;
   3862     if ( bSig <= aSig ) {
   3863         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3864         ++zExp;
   3865     }
   3866     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3867     mul64To128( bSig, zSig0, &term0, &term1 );
   3868     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3869     while ( (sbits64) rem0 < 0 ) {
   3870         --zSig0;
   3871         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3872     }
   3873     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   3874     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3875         mul64To128( bSig, zSig1, &term1, &term2 );
   3876         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3877         while ( (sbits64) rem1 < 0 ) {
   3878             --zSig1;
   3879             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3880         }
   3881         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3882     }
   3883     return
   3884         roundAndPackFloatx80(
   3885             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3886 
   3887 }
   3888 
   3889 /*----------------------------------------------------------------------------
   3890 | Returns the remainder of the extended double-precision floating-point value
   3891 | `a' with respect to the corresponding value `b'.  The operation is performed
   3892 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3893 *----------------------------------------------------------------------------*/
   3894 
   3895 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
   3896 {
   3897     flag aSign, bSign, zSign;
   3898     int32 aExp, bExp, expDiff;
   3899     bits64 aSig0, aSig1, bSig;
   3900     bits64 q, term0, term1, alternateASig0, alternateASig1;
   3901     floatx80 z;
   3902 
   3903     aSig0 = extractFloatx80Frac( a );
   3904     aExp = extractFloatx80Exp( a );
   3905     aSign = extractFloatx80Sign( a );
   3906     bSig = extractFloatx80Frac( b );
   3907     bExp = extractFloatx80Exp( b );
   3908     bSign = extractFloatx80Sign( b );
   3909     if ( aExp == 0x7FFF ) {
   3910         if (    (bits64) ( aSig0<<1 )
   3911              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3912             return propagateFloatx80NaN( a, b STATUS_VAR );
   3913         }
   3914         goto invalid;
   3915     }
   3916     if ( bExp == 0x7FFF ) {
   3917         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3918         return a;
   3919     }
   3920     if ( bExp == 0 ) {
   3921         if ( bSig == 0 ) {
   3922  invalid:
   3923             float_raise( float_flag_invalid STATUS_VAR);
   3924             z.low = floatx80_default_nan_low;
   3925             z.high = floatx80_default_nan_high;
   3926             return z;
   3927         }
   3928         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3929     }
   3930     if ( aExp == 0 ) {
   3931         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3932         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3933     }
   3934     bSig |= LIT64( 0x8000000000000000 );
   3935     zSign = aSign;
   3936     expDiff = aExp - bExp;
   3937     aSig1 = 0;
   3938     if ( expDiff < 0 ) {
   3939         if ( expDiff < -1 ) return a;
   3940         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   3941         expDiff = 0;
   3942     }
   3943     q = ( bSig <= aSig0 );
   3944     if ( q ) aSig0 -= bSig;
   3945     expDiff -= 64;
   3946     while ( 0 < expDiff ) {
   3947         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3948         q = ( 2 < q ) ? q - 2 : 0;
   3949         mul64To128( bSig, q, &term0, &term1 );
   3950         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3951         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   3952         expDiff -= 62;
   3953     }
   3954     expDiff += 64;
   3955     if ( 0 < expDiff ) {
   3956         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3957         q = ( 2 < q ) ? q - 2 : 0;
   3958         q >>= 64 - expDiff;
   3959         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   3960         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3961         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   3962         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   3963             ++q;
   3964             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3965         }
   3966     }
   3967     else {
   3968         term1 = 0;
   3969         term0 = bSig;
   3970     }
   3971     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   3972     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3973          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3974               && ( q & 1 ) )
   3975        ) {
   3976         aSig0 = alternateASig0;
   3977         aSig1 = alternateASig1;
   3978         zSign = ! zSign;
   3979     }
   3980     return
   3981         normalizeRoundAndPackFloatx80(
   3982             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
   3983 
   3984 }
   3985 
   3986 /*----------------------------------------------------------------------------
   3987 | Returns the square root of the extended double-precision floating-point
   3988 | value `a'.  The operation is performed according to the IEC/IEEE Standard
   3989 | for Binary Floating-Point Arithmetic.
   3990 *----------------------------------------------------------------------------*/
   3991 
   3992 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
   3993 {
   3994     flag aSign;
   3995     int32 aExp, zExp;
   3996     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   3997     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   3998     floatx80 z;
   3999 
   4000     aSig0 = extractFloatx80Frac( a );
   4001     aExp = extractFloatx80Exp( a );
   4002     aSign = extractFloatx80Sign( a );
   4003     if ( aExp == 0x7FFF ) {
   4004         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
   4005         if ( ! aSign ) return a;
   4006         goto invalid;
   4007     }
   4008     if ( aSign ) {
   4009         if ( ( aExp | aSig0 ) == 0 ) return a;
   4010  invalid:
   4011         float_raise( float_flag_invalid STATUS_VAR);
   4012         z.low = floatx80_default_nan_low;
   4013         z.high = floatx80_default_nan_high;
   4014         return z;
   4015     }
   4016     if ( aExp == 0 ) {
   4017         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4018         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4019     }
   4020     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   4021     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4022     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4023     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4024     doubleZSig0 = zSig0<<1;
   4025     mul64To128( zSig0, zSig0, &term0, &term1 );
   4026     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4027     while ( (sbits64) rem0 < 0 ) {
   4028         --zSig0;
   4029         doubleZSig0 -= 2;
   4030         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4031     }
   4032     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   4033     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4034         if ( zSig1 == 0 ) zSig1 = 1;
   4035         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   4036         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4037         mul64To128( zSig1, zSig1, &term2, &term3 );
   4038         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   4039         while ( (sbits64) rem1 < 0 ) {
   4040             --zSig1;
   4041             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4042             term3 |= 1;
   4043             term2 |= doubleZSig0;
   4044             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4045         }
   4046         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4047     }
   4048     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4049     zSig0 |= doubleZSig0;
   4050     return
   4051         roundAndPackFloatx80(
   4052             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
   4053 
   4054 }
   4055 
   4056 /*----------------------------------------------------------------------------
   4057 | Returns 1 if the extended double-precision floating-point value `a' is
   4058 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   4059 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   4060 | Arithmetic.
   4061 *----------------------------------------------------------------------------*/
   4062 
   4063 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
   4064 {
   4065 
   4066     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4067               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4068          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4069               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4070        ) {
   4071         if (    floatx80_is_signaling_nan( a )
   4072              || floatx80_is_signaling_nan( b ) ) {
   4073             float_raise( float_flag_invalid STATUS_VAR);
   4074         }
   4075         return 0;
   4076     }
   4077     return
   4078            ( a.low == b.low )
   4079         && (    ( a.high == b.high )
   4080              || (    ( a.low == 0 )
   4081                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4082            );
   4083 
   4084 }
   4085 
   4086 /*----------------------------------------------------------------------------
   4087 | Returns 1 if the extended double-precision floating-point value `a' is
   4088 | less than or equal to the corresponding value `b', and 0 otherwise.  The
   4089 | comparison is performed according to the IEC/IEEE Standard for Binary
   4090 | Floating-Point Arithmetic.
   4091 *----------------------------------------------------------------------------*/
   4092 
   4093 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
   4094 {
   4095     flag aSign, bSign;
   4096 
   4097     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4098               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4099          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4100               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4101        ) {
   4102         float_raise( float_flag_invalid STATUS_VAR);
   4103         return 0;
   4104     }
   4105     aSign = extractFloatx80Sign( a );
   4106     bSign = extractFloatx80Sign( b );
   4107     if ( aSign != bSign ) {
   4108         return
   4109                aSign
   4110             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4111                  == 0 );
   4112     }
   4113     return
   4114           aSign ? le128( b.high, b.low, a.high, a.low )
   4115         : le128( a.high, a.low, b.high, b.low );
   4116 
   4117 }
   4118 
   4119 /*----------------------------------------------------------------------------
   4120 | Returns 1 if the extended double-precision floating-point value `a' is
   4121 | less than the corresponding value `b', and 0 otherwise.  The comparison
   4122 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4123 | Arithmetic.
   4124 *----------------------------------------------------------------------------*/
   4125 
   4126 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
   4127 {
   4128     flag aSign, bSign;
   4129 
   4130     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4131               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4132          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4133               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4134        ) {
   4135         float_raise( float_flag_invalid STATUS_VAR);
   4136         return 0;
   4137     }
   4138     aSign = extractFloatx80Sign( a );
   4139     bSign = extractFloatx80Sign( b );
   4140     if ( aSign != bSign ) {
   4141         return
   4142                aSign
   4143             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4144                  != 0 );
   4145     }
   4146     return
   4147           aSign ? lt128( b.high, b.low, a.high, a.low )
   4148         : lt128( a.high, a.low, b.high, b.low );
   4149 
   4150 }
   4151 
   4152 /*----------------------------------------------------------------------------
   4153 | Returns 1 if the extended double-precision floating-point value `a' is equal
   4154 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4155 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   4156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4157 *----------------------------------------------------------------------------*/
   4158 
   4159 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
   4160 {
   4161 
   4162     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4163               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4164          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4165               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4166        ) {
   4167         float_raise( float_flag_invalid STATUS_VAR);
   4168         return 0;
   4169     }
   4170     return
   4171            ( a.low == b.low )
   4172         && (    ( a.high == b.high )
   4173              || (    ( a.low == 0 )
   4174                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4175            );
   4176 
   4177 }
   4178 
   4179 /*----------------------------------------------------------------------------
   4180 | Returns 1 if the extended double-precision floating-point value `a' is less
   4181 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4182 | do not cause an exception.  Otherwise, the comparison is performed according
   4183 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4184 *----------------------------------------------------------------------------*/
   4185 
   4186 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4187 {
   4188     flag aSign, bSign;
   4189 
   4190     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4191               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4192          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4193               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4194        ) {
   4195         if (    floatx80_is_signaling_nan( a )
   4196              || floatx80_is_signaling_nan( b ) ) {
   4197             float_raise( float_flag_invalid STATUS_VAR);
   4198         }
   4199         return 0;
   4200     }
   4201     aSign = extractFloatx80Sign( a );
   4202     bSign = extractFloatx80Sign( b );
   4203     if ( aSign != bSign ) {
   4204         return
   4205                aSign
   4206             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4207                  == 0 );
   4208     }
   4209     return
   4210           aSign ? le128( b.high, b.low, a.high, a.low )
   4211         : le128( a.high, a.low, b.high, b.low );
   4212 
   4213 }
   4214 
   4215 /*----------------------------------------------------------------------------
   4216 | Returns 1 if the extended double-precision floating-point value `a' is less
   4217 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4218 | an exception.  Otherwise, the comparison is performed according to the
   4219 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4220 *----------------------------------------------------------------------------*/
   4221 
   4222 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4223 {
   4224     flag aSign, bSign;
   4225 
   4226     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4227               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4228          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4229               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4230        ) {
   4231         if (    floatx80_is_signaling_nan( a )
   4232              || floatx80_is_signaling_nan( b ) ) {
   4233             float_raise( float_flag_invalid STATUS_VAR);
   4234         }
   4235         return 0;
   4236     }
   4237     aSign = extractFloatx80Sign( a );
   4238     bSign = extractFloatx80Sign( b );
   4239     if ( aSign != bSign ) {
   4240         return
   4241                aSign
   4242             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4243                  != 0 );
   4244     }
   4245     return
   4246           aSign ? lt128( b.high, b.low, a.high, a.low )
   4247         : lt128( a.high, a.low, b.high, b.low );
   4248 
   4249 }
   4250 
   4251 #endif
   4252 
   4253 #ifdef FLOAT128
   4254 
   4255 /*----------------------------------------------------------------------------
   4256 | Returns the result of converting the quadruple-precision floating-point
   4257 | value `a' to the 32-bit two's complement integer format.  The conversion
   4258 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4259 | Arithmetic---which means in particular that the conversion is rounded
   4260 | according to the current rounding mode.  If `a' is a NaN, the largest
   4261 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4262 | largest integer with the same sign as `a' is returned.
   4263 *----------------------------------------------------------------------------*/
   4264 
   4265 int32 float128_to_int32( float128 a STATUS_PARAM )
   4266 {
   4267     flag aSign;
   4268     int32 aExp, shiftCount;
   4269     bits64 aSig0, aSig1;
   4270 
   4271     aSig1 = extractFloat128Frac1( a );
   4272     aSig0 = extractFloat128Frac0( a );
   4273     aExp = extractFloat128Exp( a );
   4274     aSign = extractFloat128Sign( a );
   4275     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4276     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4277     aSig0 |= ( aSig1 != 0 );
   4278     shiftCount = 0x4028 - aExp;
   4279     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4280     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
   4281 
   4282 }
   4283 
   4284 /*----------------------------------------------------------------------------
   4285 | Returns the result of converting the quadruple-precision floating-point
   4286 | value `a' to the 32-bit two's complement integer format.  The conversion
   4287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4288 | Arithmetic, except that the conversion is always rounded toward zero.  If
   4289 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4290 | conversion overflows, the largest integer with the same sign as `a' is
   4291 | returned.
   4292 *----------------------------------------------------------------------------*/
   4293 
   4294 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
   4295 {
   4296     flag aSign;
   4297     int32 aExp, shiftCount;
   4298     bits64 aSig0, aSig1, savedASig;
   4299     int32 z;
   4300 
   4301     aSig1 = extractFloat128Frac1( a );
   4302     aSig0 = extractFloat128Frac0( a );
   4303     aExp = extractFloat128Exp( a );
   4304     aSign = extractFloat128Sign( a );
   4305     aSig0 |= ( aSig1 != 0 );
   4306     if ( 0x401E < aExp ) {
   4307         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4308         goto invalid;
   4309     }
   4310     else if ( aExp < 0x3FFF ) {
   4311         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
   4312         return 0;
   4313     }
   4314     aSig0 |= LIT64( 0x0001000000000000 );
   4315     shiftCount = 0x402F - aExp;
   4316     savedASig = aSig0;
   4317     aSig0 >>= shiftCount;
   4318     z = aSig0;
   4319     if ( aSign ) z = - z;
   4320     if ( ( z < 0 ) ^ aSign ) {
   4321  invalid:
   4322         float_raise( float_flag_invalid STATUS_VAR);
   4323         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4324     }
   4325     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4326         STATUS(float_exception_flags) |= float_flag_inexact;
   4327     }
   4328     return z;
   4329 
   4330 }
   4331 
   4332 /*----------------------------------------------------------------------------
   4333 | Returns the result of converting the quadruple-precision floating-point
   4334 | value `a' to the 64-bit two's complement integer format.  The conversion
   4335 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4336 | Arithmetic---which means in particular that the conversion is rounded
   4337 | according to the current rounding mode.  If `a' is a NaN, the largest
   4338 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4339 | largest integer with the same sign as `a' is returned.
   4340 *----------------------------------------------------------------------------*/
   4341 
   4342 int64 float128_to_int64( float128 a STATUS_PARAM )
   4343 {
   4344     flag aSign;
   4345     int32 aExp, shiftCount;
   4346     bits64 aSig0, aSig1;
   4347 
   4348     aSig1 = extractFloat128Frac1( a );
   4349     aSig0 = extractFloat128Frac0( a );
   4350     aExp = extractFloat128Exp( a );
   4351     aSign = extractFloat128Sign( a );
   4352     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4353     shiftCount = 0x402F - aExp;
   4354     if ( shiftCount <= 0 ) {
   4355         if ( 0x403E < aExp ) {
   4356             float_raise( float_flag_invalid STATUS_VAR);
   4357             if (    ! aSign
   4358                  || (    ( aExp == 0x7FFF )
   4359                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4360                     )
   4361                ) {
   4362                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4363             }
   4364             return (sbits64) LIT64( 0x8000000000000000 );
   4365         }
   4366         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4367     }
   4368     else {
   4369         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4370     }
   4371     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
   4372 
   4373 }
   4374 
   4375 /*----------------------------------------------------------------------------
   4376 | Returns the result of converting the quadruple-precision floating-point
   4377 | value `a' to the 64-bit two's complement integer format.  The conversion
   4378 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4379 | Arithmetic, except that the conversion is always rounded toward zero.
   4380 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4381 | the conversion overflows, the largest integer with the same sign as `a' is
   4382 | returned.
   4383 *----------------------------------------------------------------------------*/
   4384 
   4385 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
   4386 {
   4387     flag aSign;
   4388     int32 aExp, shiftCount;
   4389     bits64 aSig0, aSig1;
   4390     int64 z;
   4391 
   4392     aSig1 = extractFloat128Frac1( a );
   4393     aSig0 = extractFloat128Frac0( a );
   4394     aExp = extractFloat128Exp( a );
   4395     aSign = extractFloat128Sign( a );
   4396     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4397     shiftCount = aExp - 0x402F;
   4398     if ( 0 < shiftCount ) {
   4399         if ( 0x403E <= aExp ) {
   4400             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4401             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4402                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4403                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
   4404             }
   4405             else {
   4406                 float_raise( float_flag_invalid STATUS_VAR);
   4407                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4408                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4409                 }
   4410             }
   4411             return (sbits64) LIT64( 0x8000000000000000 );
   4412         }
   4413         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4414         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4415             STATUS(float_exception_flags) |= float_flag_inexact;
   4416         }
   4417     }
   4418     else {
   4419         if ( aExp < 0x3FFF ) {
   4420             if ( aExp | aSig0 | aSig1 ) {
   4421                 STATUS(float_exception_flags) |= float_flag_inexact;
   4422             }
   4423             return 0;
   4424         }
   4425         z = aSig0>>( - shiftCount );
   4426         if (    aSig1
   4427              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4428             STATUS(float_exception_flags) |= float_flag_inexact;
   4429         }
   4430     }
   4431     if ( aSign ) z = - z;
   4432     return z;
   4433 
   4434 }
   4435 
   4436 /*----------------------------------------------------------------------------
   4437 | Returns the result of converting the quadruple-precision floating-point
   4438 | value `a' to the single-precision floating-point format.  The conversion
   4439 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4440 | Arithmetic.
   4441 *----------------------------------------------------------------------------*/
   4442 
   4443 float32 float128_to_float32( float128 a STATUS_PARAM )
   4444 {
   4445     flag aSign;
   4446     int32 aExp;
   4447     bits64 aSig0, aSig1;
   4448     bits32 zSig;
   4449 
   4450     aSig1 = extractFloat128Frac1( a );
   4451     aSig0 = extractFloat128Frac0( a );
   4452     aExp = extractFloat128Exp( a );
   4453     aSign = extractFloat128Sign( a );
   4454     if ( aExp == 0x7FFF ) {
   4455         if ( aSig0 | aSig1 ) {
   4456             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
   4457         }
   4458         return packFloat32( aSign, 0xFF, 0 );
   4459     }
   4460     aSig0 |= ( aSig1 != 0 );
   4461     shift64RightJamming( aSig0, 18, &aSig0 );
   4462     zSig = aSig0;
   4463     if ( aExp || zSig ) {
   4464         zSig |= 0x40000000;
   4465         aExp -= 0x3F81;
   4466     }
   4467     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   4468 
   4469 }
   4470 
   4471 /*----------------------------------------------------------------------------
   4472 | Returns the result of converting the quadruple-precision floating-point
   4473 | value `a' to the double-precision floating-point format.  The conversion
   4474 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4475 | Arithmetic.
   4476 *----------------------------------------------------------------------------*/
   4477 
   4478 float64 float128_to_float64( float128 a STATUS_PARAM )
   4479 {
   4480     flag aSign;
   4481     int32 aExp;
   4482     bits64 aSig0, aSig1;
   4483 
   4484     aSig1 = extractFloat128Frac1( a );
   4485     aSig0 = extractFloat128Frac0( a );
   4486     aExp = extractFloat128Exp( a );
   4487     aSign = extractFloat128Sign( a );
   4488     if ( aExp == 0x7FFF ) {
   4489         if ( aSig0 | aSig1 ) {
   4490             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
   4491         }
   4492         return packFloat64( aSign, 0x7FF, 0 );
   4493     }
   4494     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4495     aSig0 |= ( aSig1 != 0 );
   4496     if ( aExp || aSig0 ) {
   4497         aSig0 |= LIT64( 0x4000000000000000 );
   4498         aExp -= 0x3C01;
   4499     }
   4500     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
   4501 
   4502 }
   4503 
   4504 #ifdef FLOATX80
   4505 
   4506 /*----------------------------------------------------------------------------
   4507 | Returns the result of converting the quadruple-precision floating-point
   4508 | value `a' to the extended double-precision floating-point format.  The
   4509 | conversion is performed according to the IEC/IEEE Standard for Binary
   4510 | Floating-Point Arithmetic.
   4511 *----------------------------------------------------------------------------*/
   4512 
   4513 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
   4514 {
   4515     flag aSign;
   4516     int32 aExp;
   4517     bits64 aSig0, aSig1;
   4518 
   4519     aSig1 = extractFloat128Frac1( a );
   4520     aSig0 = extractFloat128Frac0( a );
   4521     aExp = extractFloat128Exp( a );
   4522     aSign = extractFloat128Sign( a );
   4523     if ( aExp == 0x7FFF ) {
   4524         if ( aSig0 | aSig1 ) {
   4525             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
   4526         }
   4527         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4528     }
   4529     if ( aExp == 0 ) {
   4530         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4531         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4532     }
   4533     else {
   4534         aSig0 |= LIT64( 0x0001000000000000 );
   4535     }
   4536     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   4537     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
   4538 
   4539 }
   4540 
   4541 #endif
   4542 
   4543 /*----------------------------------------------------------------------------
   4544 | Rounds the quadruple-precision floating-point value `a' to an integer, and
   4545 | returns the result as a quadruple-precision floating-point value.  The
   4546 | operation is performed according to the IEC/IEEE Standard for Binary
   4547 | Floating-Point Arithmetic.
   4548 *----------------------------------------------------------------------------*/
   4549 
   4550 float128 float128_round_to_int( float128 a STATUS_PARAM )
   4551 {
   4552     flag aSign;
   4553     int32 aExp;
   4554     bits64 lastBitMask, roundBitsMask;
   4555     int8 roundingMode;
   4556     float128 z;
   4557 
   4558     aExp = extractFloat128Exp( a );
   4559     if ( 0x402F <= aExp ) {
   4560         if ( 0x406F <= aExp ) {
   4561             if (    ( aExp == 0x7FFF )
   4562                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4563                ) {
   4564                 return propagateFloat128NaN( a, a STATUS_VAR );
   4565             }
   4566             return a;
   4567         }
   4568         lastBitMask = 1;
   4569         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
   4570         roundBitsMask = lastBitMask - 1;
   4571         z = a;
   4572         roundingMode = STATUS(float_rounding_mode);
   4573         if ( roundingMode == float_round_nearest_even ) {
   4574             if ( lastBitMask ) {
   4575                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   4576                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4577             }
   4578             else {
   4579                 if ( (sbits64) z.low < 0 ) {
   4580                     ++z.high;
   4581                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
   4582                 }
   4583             }
   4584         }
   4585         else if ( roundingMode != float_round_to_zero ) {
   4586             if (   extractFloat128Sign( z )
   4587                  ^ ( roundingMode == float_round_up ) ) {
   4588                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   4589             }
   4590         }
   4591         z.low &= ~ roundBitsMask;
   4592     }
   4593     else {
   4594         if ( aExp < 0x3FFF ) {
   4595             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   4596             STATUS(float_exception_flags) |= float_flag_inexact;
   4597             aSign = extractFloat128Sign( a );
   4598             switch ( STATUS(float_rounding_mode) ) {
   4599              case float_round_nearest_even:
   4600                 if (    ( aExp == 0x3FFE )
   4601                      && (   extractFloat128Frac0( a )
   4602                           | extractFloat128Frac1( a ) )
   4603                    ) {
   4604                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4605                 }
   4606                 break;
   4607              case float_round_down:
   4608                 return
   4609                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4610                     : packFloat128( 0, 0, 0, 0 );
   4611              case float_round_up:
   4612                 return
   4613                       aSign ? packFloat128( 1, 0, 0, 0 )
   4614                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4615             }
   4616             return packFloat128( aSign, 0, 0, 0 );
   4617         }
   4618         lastBitMask = 1;
   4619         lastBitMask <<= 0x402F - aExp;
   4620         roundBitsMask = lastBitMask - 1;
   4621         z.low = 0;
   4622         z.high = a.high;
   4623         roundingMode = STATUS(float_rounding_mode);
   4624         if ( roundingMode == float_round_nearest_even ) {
   4625             z.high += lastBitMask>>1;
   4626             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   4627                 z.high &= ~ lastBitMask;
   4628             }
   4629         }
   4630         else if ( roundingMode != float_round_to_zero ) {
   4631             if (   extractFloat128Sign( z )
   4632                  ^ ( roundingMode == float_round_up ) ) {
   4633                 z.high |= ( a.low != 0 );
   4634                 z.high += roundBitsMask;
   4635             }
   4636         }
   4637         z.high &= ~ roundBitsMask;
   4638     }
   4639     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   4640         STATUS(float_exception_flags) |= float_flag_inexact;
   4641     }
   4642     return z;
   4643 
   4644 }
   4645 
   4646 /*----------------------------------------------------------------------------
   4647 | Returns the result of adding the absolute values of the quadruple-precision
   4648 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4649 | before being returned.  `zSign' is ignored if the result is a NaN.
   4650 | The addition is performed according to the IEC/IEEE Standard for Binary
   4651 | Floating-Point Arithmetic.
   4652 *----------------------------------------------------------------------------*/
   4653 
   4654 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
   4655 {
   4656     int32 aExp, bExp, zExp;
   4657     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4658     int32 expDiff;
   4659 
   4660     aSig1 = extractFloat128Frac1( a );
   4661     aSig0 = extractFloat128Frac0( a );
   4662     aExp = extractFloat128Exp( a );
   4663     bSig1 = extractFloat128Frac1( b );
   4664     bSig0 = extractFloat128Frac0( b );
   4665     bExp = extractFloat128Exp( b );
   4666     expDiff = aExp - bExp;
   4667     if ( 0 < expDiff ) {
   4668         if ( aExp == 0x7FFF ) {
   4669             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4670             return a;
   4671         }
   4672         if ( bExp == 0 ) {
   4673             --expDiff;
   4674         }
   4675         else {
   4676             bSig0 |= LIT64( 0x0001000000000000 );
   4677         }
   4678         shift128ExtraRightJamming(
   4679             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   4680         zExp = aExp;
   4681     }
   4682     else if ( expDiff < 0 ) {
   4683         if ( bExp == 0x7FFF ) {
   4684             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4685             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4686         }
   4687         if ( aExp == 0 ) {
   4688             ++expDiff;
   4689         }
   4690         else {
   4691             aSig0 |= LIT64( 0x0001000000000000 );
   4692         }
   4693         shift128ExtraRightJamming(
   4694             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4695         zExp = bExp;
   4696     }
   4697     else {
   4698         if ( aExp == 0x7FFF ) {
   4699             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4700                 return propagateFloat128NaN( a, b STATUS_VAR );
   4701             }
   4702             return a;
   4703         }
   4704         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4705         if ( aExp == 0 ) {
   4706             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
   4707             return packFloat128( zSign, 0, zSig0, zSig1 );
   4708         }
   4709         zSig2 = 0;
   4710         zSig0 |= LIT64( 0x0002000000000000 );
   4711         zExp = aExp;
   4712         goto shiftRight1;
   4713     }
   4714     aSig0 |= LIT64( 0x0001000000000000 );
   4715     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4716     --zExp;
   4717     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   4718     ++zExp;
   4719  shiftRight1:
   4720     shift128ExtraRightJamming(
   4721         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4722  roundAndPack:
   4723     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   4724 
   4725 }
   4726 
   4727 /*----------------------------------------------------------------------------
   4728 | Returns the result of subtracting the absolute values of the quadruple-
   4729 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4730 | difference is negated before being returned.  `zSign' is ignored if the
   4731 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4732 | Standard for Binary Floating-Point Arithmetic.
   4733 *----------------------------------------------------------------------------*/
   4734 
   4735 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
   4736 {
   4737     int32 aExp, bExp, zExp;
   4738     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4739     int32 expDiff;
   4740     float128 z;
   4741 
   4742     aSig1 = extractFloat128Frac1( a );
   4743     aSig0 = extractFloat128Frac0( a );
   4744     aExp = extractFloat128Exp( a );
   4745     bSig1 = extractFloat128Frac1( b );
   4746     bSig0 = extractFloat128Frac0( b );
   4747     bExp = extractFloat128Exp( b );
   4748     expDiff = aExp - bExp;
   4749     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4750     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4751     if ( 0 < expDiff ) goto aExpBigger;
   4752     if ( expDiff < 0 ) goto bExpBigger;
   4753     if ( aExp == 0x7FFF ) {
   4754         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4755             return propagateFloat128NaN( a, b STATUS_VAR );
   4756         }
   4757         float_raise( float_flag_invalid STATUS_VAR);
   4758         z.low = float128_default_nan_low;
   4759         z.high = float128_default_nan_high;
   4760         return z;
   4761     }
   4762     if ( aExp == 0 ) {
   4763         aExp = 1;
   4764         bExp = 1;
   4765     }
   4766     if ( bSig0 < aSig0 ) goto aBigger;
   4767     if ( aSig0 < bSig0 ) goto bBigger;
   4768     if ( bSig1 < aSig1 ) goto aBigger;
   4769     if ( aSig1 < bSig1 ) goto bBigger;
   4770     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
   4771  bExpBigger:
   4772     if ( bExp == 0x7FFF ) {
   4773         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4774         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   4775     }
   4776     if ( aExp == 0 ) {
   4777         ++expDiff;
   4778     }
   4779     else {
   4780         aSig0 |= LIT64( 0x4000000000000000 );
   4781     }
   4782     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4783     bSig0 |= LIT64( 0x4000000000000000 );
   4784  bBigger:
   4785     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4786     zExp = bExp;
   4787     zSign ^= 1;
   4788     goto normalizeRoundAndPack;
   4789  aExpBigger:
   4790     if ( aExp == 0x7FFF ) {
   4791         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4792         return a;
   4793     }
   4794     if ( bExp == 0 ) {
   4795         --expDiff;
   4796     }
   4797     else {
   4798         bSig0 |= LIT64( 0x4000000000000000 );
   4799     }
   4800     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4801     aSig0 |= LIT64( 0x4000000000000000 );
   4802  aBigger:
   4803     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4804     zExp = aExp;
   4805  normalizeRoundAndPack:
   4806     --zExp;
   4807     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
   4808 
   4809 }
   4810 
   4811 /*----------------------------------------------------------------------------
   4812 | Returns the result of adding the quadruple-precision floating-point values
   4813 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4814 | for Binary Floating-Point Arithmetic.
   4815 *----------------------------------------------------------------------------*/
   4816 
   4817 float128 float128_add( float128 a, float128 b STATUS_PARAM )
   4818 {
   4819     flag aSign, bSign;
   4820 
   4821     aSign = extractFloat128Sign( a );
   4822     bSign = extractFloat128Sign( b );
   4823     if ( aSign == bSign ) {
   4824         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   4825     }
   4826     else {
   4827         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   4828     }
   4829 
   4830 }
   4831 
   4832 /*----------------------------------------------------------------------------
   4833 | Returns the result of subtracting the quadruple-precision floating-point
   4834 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4835 | Standard for Binary Floating-Point Arithmetic.
   4836 *----------------------------------------------------------------------------*/
   4837 
   4838 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
   4839 {
   4840     flag aSign, bSign;
   4841 
   4842     aSign = extractFloat128Sign( a );
   4843     bSign = extractFloat128Sign( b );
   4844     if ( aSign == bSign ) {
   4845         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   4846     }
   4847     else {
   4848         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   4849     }
   4850 
   4851 }
   4852 
   4853 /*----------------------------------------------------------------------------
   4854 | Returns the result of multiplying the quadruple-precision floating-point
   4855 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4856 | Standard for Binary Floating-Point Arithmetic.
   4857 *----------------------------------------------------------------------------*/
   4858 
   4859 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
   4860 {
   4861     flag aSign, bSign, zSign;
   4862     int32 aExp, bExp, zExp;
   4863     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   4864     float128 z;
   4865 
   4866     aSig1 = extractFloat128Frac1( a );
   4867     aSig0 = extractFloat128Frac0( a );
   4868     aExp = extractFloat128Exp( a );
   4869     aSign = extractFloat128Sign( a );
   4870     bSig1 = extractFloat128Frac1( b );
   4871     bSig0 = extractFloat128Frac0( b );
   4872     bExp = extractFloat128Exp( b );
   4873     bSign = extractFloat128Sign( b );
   4874     zSign = aSign ^ bSign;
   4875     if ( aExp == 0x7FFF ) {
   4876         if (    ( aSig0 | aSig1 )
   4877              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4878             return propagateFloat128NaN( a, b STATUS_VAR );
   4879         }
   4880         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   4881         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4882     }
   4883     if ( bExp == 0x7FFF ) {
   4884         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4885         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4886  invalid:
   4887             float_raise( float_flag_invalid STATUS_VAR);
   4888             z.low = float128_default_nan_low;
   4889             z.high = float128_default_nan_high;
   4890             return z;
   4891         }
   4892         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4893     }
   4894     if ( aExp == 0 ) {
   4895         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4896         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4897     }
   4898     if ( bExp == 0 ) {
   4899         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4900         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4901     }
   4902     zExp = aExp + bExp - 0x4000;
   4903     aSig0 |= LIT64( 0x0001000000000000 );
   4904     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   4905     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   4906     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4907     zSig2 |= ( zSig3 != 0 );
   4908     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   4909         shift128ExtraRightJamming(
   4910             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4911         ++zExp;
   4912     }
   4913     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   4914 
   4915 }
   4916 
   4917 /*----------------------------------------------------------------------------
   4918 | Returns the result of dividing the quadruple-precision floating-point value
   4919 | `a' by the corresponding value `b'.  The operation is performed according to
   4920 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4921 *----------------------------------------------------------------------------*/
   4922 
   4923 float128 float128_div( float128 a, float128 b STATUS_PARAM )
   4924 {
   4925     flag aSign, bSign, zSign;
   4926     int32 aExp, bExp, zExp;
   4927     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4928     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4929     float128 z;
   4930 
   4931     aSig1 = extractFloat128Frac1( a );
   4932     aSig0 = extractFloat128Frac0( a );
   4933     aExp = extractFloat128Exp( a );
   4934     aSign = extractFloat128Sign( a );
   4935     bSig1 = extractFloat128Frac1( b );
   4936     bSig0 = extractFloat128Frac0( b );
   4937     bExp = extractFloat128Exp( b );
   4938     bSign = extractFloat128Sign( b );
   4939     zSign = aSign ^ bSign;
   4940     if ( aExp == 0x7FFF ) {
   4941         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4942         if ( bExp == 0x7FFF ) {
   4943             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4944             goto invalid;
   4945         }
   4946         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4947     }
   4948     if ( bExp == 0x7FFF ) {
   4949         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4950         return packFloat128( zSign, 0, 0, 0 );
   4951     }
   4952     if ( bExp == 0 ) {
   4953         if ( ( bSig0 | bSig1 ) == 0 ) {
   4954             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4955  invalid:
   4956                 float_raise( float_flag_invalid STATUS_VAR);
   4957                 z.low = float128_default_nan_low;
   4958                 z.high = float128_default_nan_high;
   4959                 return z;
   4960             }
   4961             float_raise( float_flag_divbyzero STATUS_VAR);
   4962             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4963         }
   4964         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4965     }
   4966     if ( aExp == 0 ) {
   4967         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4968         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4969     }
   4970     zExp = aExp - bExp + 0x3FFD;
   4971     shortShift128Left(
   4972         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
   4973     shortShift128Left(
   4974         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   4975     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
   4976         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   4977         ++zExp;
   4978     }
   4979     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
   4980     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   4981     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   4982     while ( (sbits64) rem0 < 0 ) {
   4983         --zSig0;
   4984         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   4985     }
   4986     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
   4987     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
   4988         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   4989         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   4990         while ( (sbits64) rem1 < 0 ) {
   4991             --zSig1;
   4992             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   4993         }
   4994         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4995     }
   4996     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
   4997     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   4998 
   4999 }
   5000 
   5001 /*----------------------------------------------------------------------------
   5002 | Returns the remainder of the quadruple-precision floating-point value `a'
   5003 | with respect to the corresponding value `b'.  The operation is performed
   5004 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5005 *----------------------------------------------------------------------------*/
   5006 
   5007 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
   5008 {
   5009     flag aSign, bSign, zSign;
   5010     int32 aExp, bExp, expDiff;
   5011     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   5012     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   5013     sbits64 sigMean0;
   5014     float128 z;
   5015 
   5016     aSig1 = extractFloat128Frac1( a );
   5017     aSig0 = extractFloat128Frac0( a );
   5018     aExp = extractFloat128Exp( a );
   5019     aSign = extractFloat128Sign( a );
   5020     bSig1 = extractFloat128Frac1( b );
   5021     bSig0 = extractFloat128Frac0( b );
   5022     bExp = extractFloat128Exp( b );
   5023     bSign = extractFloat128Sign( b );
   5024     if ( aExp == 0x7FFF ) {
   5025         if (    ( aSig0 | aSig1 )
   5026              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5027             return propagateFloat128NaN( a, b STATUS_VAR );
   5028         }
   5029         goto invalid;
   5030     }
   5031     if ( bExp == 0x7FFF ) {
   5032         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5033         return a;
   5034     }
   5035     if ( bExp == 0 ) {
   5036         if ( ( bSig0 | bSig1 ) == 0 ) {
   5037  invalid:
   5038             float_raise( float_flag_invalid STATUS_VAR);
   5039             z.low = float128_default_nan_low;
   5040             z.high = float128_default_nan_high;
   5041             return z;
   5042         }
   5043         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5044     }
   5045     if ( aExp == 0 ) {
   5046         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   5047         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5048     }
   5049     expDiff = aExp - bExp;
   5050     if ( expDiff < -1 ) return a;
   5051     shortShift128Left(
   5052         aSig0 | LIT64( 0x0001000000000000 ),
   5053         aSig1,
   5054         15 - ( expDiff < 0 ),
   5055         &aSig0,
   5056         &aSig1
   5057     );
   5058     shortShift128Left(
   5059         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5060     q = le128( bSig0, bSig1, aSig0, aSig1 );
   5061     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5062     expDiff -= 64;
   5063     while ( 0 < expDiff ) {
   5064         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5065         q = ( 4 < q ) ? q - 4 : 0;
   5066         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5067         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5068         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5069         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5070         expDiff -= 61;
   5071     }
   5072     if ( -64 < expDiff ) {
   5073         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5074         q = ( 4 < q ) ? q - 4 : 0;
   5075         q >>= - expDiff;
   5076         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5077         expDiff += 52;
   5078         if ( expDiff < 0 ) {
   5079             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5080         }
   5081         else {
   5082             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5083         }
   5084         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5085         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5086     }
   5087     else {
   5088         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5089         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5090     }
   5091     do {
   5092         alternateASig0 = aSig0;
   5093         alternateASig1 = aSig1;
   5094         ++q;
   5095         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5096     } while ( 0 <= (sbits64) aSig0 );
   5097     add128(
   5098         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
   5099     if (    ( sigMean0 < 0 )
   5100          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5101         aSig0 = alternateASig0;
   5102         aSig1 = alternateASig1;
   5103     }
   5104     zSign = ( (sbits64) aSig0 < 0 );
   5105     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5106     return
   5107         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
   5108 
   5109 }
   5110 
   5111 /*----------------------------------------------------------------------------
   5112 | Returns the square root of the quadruple-precision floating-point value `a'.
   5113 | The operation is performed according to the IEC/IEEE Standard for Binary
   5114 | Floating-Point Arithmetic.
   5115 *----------------------------------------------------------------------------*/
   5116 
   5117 float128 float128_sqrt( float128 a STATUS_PARAM )
   5118 {
   5119     flag aSign;
   5120     int32 aExp, zExp;
   5121     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5122     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5123     float128 z;
   5124 
   5125     aSig1 = extractFloat128Frac1( a );
   5126     aSig0 = extractFloat128Frac0( a );
   5127     aExp = extractFloat128Exp( a );
   5128     aSign = extractFloat128Sign( a );
   5129     if ( aExp == 0x7FFF ) {
   5130         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
   5131         if ( ! aSign ) return a;
   5132         goto invalid;
   5133     }
   5134     if ( aSign ) {
   5135         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5136  invalid:
   5137         float_raise( float_flag_invalid STATUS_VAR);
   5138         z.low = float128_default_nan_low;
   5139         z.high = float128_default_nan_high;
   5140         return z;
   5141     }
   5142     if ( aExp == 0 ) {
   5143         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5144         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5145     }
   5146     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
   5147     aSig0 |= LIT64( 0x0001000000000000 );
   5148     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
   5149     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5150     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5151     doubleZSig0 = zSig0<<1;
   5152     mul64To128( zSig0, zSig0, &term0, &term1 );
   5153     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5154     while ( (sbits64) rem0 < 0 ) {
   5155         --zSig0;
   5156         doubleZSig0 -= 2;
   5157         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5158     }
   5159     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5160     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5161         if ( zSig1 == 0 ) zSig1 = 1;
   5162         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5163         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5164         mul64To128( zSig1, zSig1, &term2, &term3 );
   5165         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5166         while ( (sbits64) rem1 < 0 ) {
   5167             --zSig1;
   5168             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5169             term3 |= 1;
   5170             term2 |= doubleZSig0;
   5171             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5172         }
   5173         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5174     }
   5175     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5176     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   5177 
   5178 }
   5179 
   5180 /*----------------------------------------------------------------------------
   5181 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5182 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5184 *----------------------------------------------------------------------------*/
   5185 
   5186 int float128_eq( float128 a, float128 b STATUS_PARAM )
   5187 {
   5188 
   5189     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5190               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5191          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5192               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5193        ) {
   5194         if (    float128_is_signaling_nan( a )
   5195              || float128_is_signaling_nan( b ) ) {
   5196             float_raise( float_flag_invalid STATUS_VAR);
   5197         }
   5198         return 0;
   5199     }
   5200     return
   5201            ( a.low == b.low )
   5202         && (    ( a.high == b.high )
   5203              || (    ( a.low == 0 )
   5204                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5205            );
   5206 
   5207 }
   5208 
   5209 /*----------------------------------------------------------------------------
   5210 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5211 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5212 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5213 | Arithmetic.
   5214 *----------------------------------------------------------------------------*/
   5215 
   5216 int float128_le( float128 a, float128 b STATUS_PARAM )
   5217 {
   5218     flag aSign, bSign;
   5219 
   5220     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5221               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5222          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5223               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5224        ) {
   5225         float_raise( float_flag_invalid STATUS_VAR);
   5226         return 0;
   5227     }
   5228     aSign = extractFloat128Sign( a );
   5229     bSign = extractFloat128Sign( b );
   5230     if ( aSign != bSign ) {
   5231         return
   5232                aSign
   5233             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5234                  == 0 );
   5235     }
   5236     return
   5237           aSign ? le128( b.high, b.low, a.high, a.low )
   5238         : le128( a.high, a.low, b.high, b.low );
   5239 
   5240 }
   5241 
   5242 /*----------------------------------------------------------------------------
   5243 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5244 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5245 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5246 *----------------------------------------------------------------------------*/
   5247 
   5248 int float128_lt( float128 a, float128 b STATUS_PARAM )
   5249 {
   5250     flag aSign, bSign;
   5251 
   5252     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5253               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5254          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5255               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5256        ) {
   5257         float_raise( float_flag_invalid STATUS_VAR);
   5258         return 0;
   5259     }
   5260     aSign = extractFloat128Sign( a );
   5261     bSign = extractFloat128Sign( b );
   5262     if ( aSign != bSign ) {
   5263         return
   5264                aSign
   5265             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5266                  != 0 );
   5267     }
   5268     return
   5269           aSign ? lt128( b.high, b.low, a.high, a.low )
   5270         : lt128( a.high, a.low, b.high, b.low );
   5271 
   5272 }
   5273 
   5274 /*----------------------------------------------------------------------------
   5275 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5276 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   5277 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   5278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5279 *----------------------------------------------------------------------------*/
   5280 
   5281 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
   5282 {
   5283 
   5284     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5285               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5286          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5287               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5288        ) {
   5289         float_raise( float_flag_invalid STATUS_VAR);
   5290         return 0;
   5291     }
   5292     return
   5293            ( a.low == b.low )
   5294         && (    ( a.high == b.high )
   5295              || (    ( a.low == 0 )
   5296                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5297            );
   5298 
   5299 }
   5300 
   5301 /*----------------------------------------------------------------------------
   5302 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5303 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5304 | cause an exception.  Otherwise, the comparison is performed according to the
   5305 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5306 *----------------------------------------------------------------------------*/
   5307 
   5308 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
   5309 {
   5310     flag aSign, bSign;
   5311 
   5312     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5313               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5314          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5315               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5316        ) {
   5317         if (    float128_is_signaling_nan( a )
   5318              || float128_is_signaling_nan( b ) ) {
   5319             float_raise( float_flag_invalid STATUS_VAR);
   5320         }
   5321         return 0;
   5322     }
   5323     aSign = extractFloat128Sign( a );
   5324     bSign = extractFloat128Sign( b );
   5325     if ( aSign != bSign ) {
   5326         return
   5327                aSign
   5328             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5329                  == 0 );
   5330     }
   5331     return
   5332           aSign ? le128( b.high, b.low, a.high, a.low )
   5333         : le128( a.high, a.low, b.high, b.low );
   5334 
   5335 }
   5336 
   5337 /*----------------------------------------------------------------------------
   5338 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5339 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5340 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5341 | Standard for Binary Floating-Point Arithmetic.
   5342 *----------------------------------------------------------------------------*/
   5343 
   5344 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
   5345 {
   5346     flag aSign, bSign;
   5347 
   5348     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5349               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5350          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5351               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5352        ) {
   5353         if (    float128_is_signaling_nan( a )
   5354              || float128_is_signaling_nan( b ) ) {
   5355             float_raise( float_flag_invalid STATUS_VAR);
   5356         }
   5357         return 0;
   5358     }
   5359     aSign = extractFloat128Sign( a );
   5360     bSign = extractFloat128Sign( b );
   5361     if ( aSign != bSign ) {
   5362         return
   5363                aSign
   5364             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5365                  != 0 );
   5366     }
   5367     return
   5368           aSign ? lt128( b.high, b.low, a.high, a.low )
   5369         : lt128( a.high, a.low, b.high, b.low );
   5370 
   5371 }
   5372 
   5373 #endif
   5374 
   5375 /* misc functions */
   5376 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
   5377 {
   5378     return int64_to_float32(a STATUS_VAR);
   5379 }
   5380 
   5381 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
   5382 {
   5383     return int64_to_float64(a STATUS_VAR);
   5384 }
   5385 
   5386 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
   5387 {
   5388     int64_t v;
   5389     unsigned int res;
   5390 
   5391     v = float32_to_int64(a STATUS_VAR);
   5392     if (v < 0) {
   5393         res = 0;
   5394         float_raise( float_flag_invalid STATUS_VAR);
   5395     } else if (v > 0xffffffff) {
   5396         res = 0xffffffff;
   5397         float_raise( float_flag_invalid STATUS_VAR);
   5398     } else {
   5399         res = v;
   5400     }
   5401     return res;
   5402 }
   5403 
   5404 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
   5405 {
   5406     int64_t v;
   5407     unsigned int res;
   5408 
   5409     v = float32_to_int64_round_to_zero(a STATUS_VAR);
   5410     if (v < 0) {
   5411         res = 0;
   5412         float_raise( float_flag_invalid STATUS_VAR);
   5413     } else if (v > 0xffffffff) {
   5414         res = 0xffffffff;
   5415         float_raise( float_flag_invalid STATUS_VAR);
   5416     } else {
   5417         res = v;
   5418     }
   5419     return res;
   5420 }
   5421 
   5422 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
   5423 {
   5424     int64_t v;
   5425     unsigned int res;
   5426 
   5427     v = float64_to_int64(a STATUS_VAR);
   5428     if (v < 0) {
   5429         res = 0;
   5430         float_raise( float_flag_invalid STATUS_VAR);
   5431     } else if (v > 0xffffffff) {
   5432         res = 0xffffffff;
   5433         float_raise( float_flag_invalid STATUS_VAR);
   5434     } else {
   5435         res = v;
   5436     }
   5437     return res;
   5438 }
   5439 
   5440 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
   5441 {
   5442     int64_t v;
   5443     unsigned int res;
   5444 
   5445     v = float64_to_int64_round_to_zero(a STATUS_VAR);
   5446     if (v < 0) {
   5447         res = 0;
   5448         float_raise( float_flag_invalid STATUS_VAR);
   5449     } else if (v > 0xffffffff) {
   5450         res = 0xffffffff;
   5451         float_raise( float_flag_invalid STATUS_VAR);
   5452     } else {
   5453         res = v;
   5454     }
   5455     return res;
   5456 }
   5457 
   5458 /* FIXME: This looks broken.  */
   5459 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
   5460 {
   5461     int64_t v;
   5462 
   5463     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   5464     v += float64_val(a);
   5465     v = float64_to_int64(make_float64(v) STATUS_VAR);
   5466 
   5467     return v - INT64_MIN;
   5468 }
   5469 
   5470 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
   5471 {
   5472     int64_t v;
   5473 
   5474     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   5475     v += float64_val(a);
   5476     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
   5477 
   5478     return v - INT64_MIN;
   5479 }
   5480 
   5481 #define COMPARE(s, nan_exp)                                                  \
   5482 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
   5483                                       int is_quiet STATUS_PARAM )            \
   5484 {                                                                            \
   5485     flag aSign, bSign;                                                       \
   5486     bits ## s av, bv;                                                        \
   5487                                                                              \
   5488     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
   5489          extractFloat ## s ## Frac( a ) ) ||                                 \
   5490         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
   5491           extractFloat ## s ## Frac( b ) )) {                                \
   5492         if (!is_quiet ||                                                     \
   5493             float ## s ## _is_signaling_nan( a ) ||                          \
   5494             float ## s ## _is_signaling_nan( b ) ) {                         \
   5495             float_raise( float_flag_invalid STATUS_VAR);                     \
   5496         }                                                                    \
   5497         return float_relation_unordered;                                     \
   5498     }                                                                        \
   5499     aSign = extractFloat ## s ## Sign( a );                                  \
   5500     bSign = extractFloat ## s ## Sign( b );                                  \
   5501     av = float ## s ## _val(a);                                              \
   5502     bv = float ## s ## _val(b);                                              \
   5503     if ( aSign != bSign ) {                                                  \
   5504         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
   5505             /* zero case */                                                  \
   5506             return float_relation_equal;                                     \
   5507         } else {                                                             \
   5508             return 1 - (2 * aSign);                                          \
   5509         }                                                                    \
   5510     } else {                                                                 \
   5511         if (av == bv) {                                                      \
   5512             return float_relation_equal;                                     \
   5513         } else {                                                             \
   5514             return 1 - 2 * (aSign ^ ( av < bv ));                            \
   5515         }                                                                    \
   5516     }                                                                        \
   5517 }                                                                            \
   5518                                                                              \
   5519 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
   5520 {                                                                            \
   5521     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
   5522 }                                                                            \
   5523                                                                              \
   5524 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
   5525 {                                                                            \
   5526     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
   5527 }
   5528 
   5529 COMPARE(32, 0xff)
   5530 COMPARE(64, 0x7ff)
   5531 
   5532 INLINE int float128_compare_internal( float128 a, float128 b,
   5533                                       int is_quiet STATUS_PARAM )
   5534 {
   5535     flag aSign, bSign;
   5536 
   5537     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
   5538           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
   5539         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
   5540           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
   5541         if (!is_quiet ||
   5542             float128_is_signaling_nan( a ) ||
   5543             float128_is_signaling_nan( b ) ) {
   5544             float_raise( float_flag_invalid STATUS_VAR);
   5545         }
   5546         return float_relation_unordered;
   5547     }
   5548     aSign = extractFloat128Sign( a );
   5549     bSign = extractFloat128Sign( b );
   5550     if ( aSign != bSign ) {
   5551         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
   5552             /* zero case */
   5553             return float_relation_equal;
   5554         } else {
   5555             return 1 - (2 * aSign);
   5556         }
   5557     } else {
   5558         if (a.low == b.low && a.high == b.high) {
   5559             return float_relation_equal;
   5560         } else {
   5561             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
   5562         }
   5563     }
   5564 }
   5565 
   5566 int float128_compare( float128 a, float128 b STATUS_PARAM )
   5567 {
   5568     return float128_compare_internal(a, b, 0 STATUS_VAR);
   5569 }
   5570 
   5571 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
   5572 {
   5573     return float128_compare_internal(a, b, 1 STATUS_VAR);
   5574 }
   5575 
   5576 /* Multiply A by 2 raised to the power N.  */
   5577 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
   5578 {
   5579     flag aSign;
   5580     int16 aExp;
   5581     bits32 aSig;
   5582 
   5583     aSig = extractFloat32Frac( a );
   5584     aExp = extractFloat32Exp( a );
   5585     aSign = extractFloat32Sign( a );
   5586 
   5587     if ( aExp == 0xFF ) {
   5588         return a;
   5589     }
   5590     if ( aExp != 0 )
   5591         aSig |= 0x00800000;
   5592     else if ( aSig == 0 )
   5593         return a;
   5594 
   5595     aExp += n - 1;
   5596     aSig <<= 7;
   5597     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   5598 }
   5599 
   5600 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
   5601 {
   5602     flag aSign;
   5603     int16 aExp;
   5604     bits64 aSig;
   5605 
   5606     aSig = extractFloat64Frac( a );
   5607     aExp = extractFloat64Exp( a );
   5608     aSign = extractFloat64Sign( a );
   5609 
   5610     if ( aExp == 0x7FF ) {
   5611         return a;
   5612     }
   5613     if ( aExp != 0 )
   5614         aSig |= LIT64( 0x0010000000000000 );
   5615     else if ( aSig == 0 )
   5616         return a;
   5617 
   5618     aExp += n - 1;
   5619     aSig <<= 10;
   5620     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
   5621 }
   5622 
   5623 #ifdef FLOATX80
   5624 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
   5625 {
   5626     flag aSign;
   5627     int16 aExp;
   5628     bits64 aSig;
   5629 
   5630     aSig = extractFloatx80Frac( a );
   5631     aExp = extractFloatx80Exp( a );
   5632     aSign = extractFloatx80Sign( a );
   5633 
   5634     if ( aExp == 0x7FF ) {
   5635         return a;
   5636     }
   5637     if (aExp == 0 && aSig == 0)
   5638         return a;
   5639 
   5640     aExp += n;
   5641     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
   5642                                           aSign, aExp, aSig, 0 STATUS_VAR );
   5643 }
   5644 #endif
   5645 
   5646 #ifdef FLOAT128
   5647 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
   5648 {
   5649     flag aSign;
   5650     int32 aExp;
   5651     bits64 aSig0, aSig1;
   5652 
   5653     aSig1 = extractFloat128Frac1( a );
   5654     aSig0 = extractFloat128Frac0( a );
   5655     aExp = extractFloat128Exp( a );
   5656     aSign = extractFloat128Sign( a );
   5657     if ( aExp == 0x7FFF ) {
   5658         return a;
   5659     }
   5660     if ( aExp != 0 )
   5661         aSig0 |= LIT64( 0x0001000000000000 );
   5662     else if ( aSig0 == 0 && aSig1 == 0 )
   5663         return a;
   5664 
   5665     aExp += n - 1;
   5666     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
   5667                                           STATUS_VAR );
   5668 
   5669 }
   5670 #endif
   5671