Home | History | Annotate | Download | only in fpu
      1 
      2 /*============================================================================
      3 
      4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
      5 Package, Release 2b.
      6 
      7 Written by John R. Hauser.  This work was made possible in part by the
      8 International Computer Science Institute, located at Suite 600, 1947 Center
      9 Street, Berkeley, California 94704.  Funding was partially provided by the
     10 National Science Foundation under grant MIP-9311980.  The original version
     11 of this code was written as part of a project to build a fixed-point vector
     12 processor in collaboration with the University of California at Berkeley,
     13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
     15 arithmetic/SoftFloat.html'.
     16 
     17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
     18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
     19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
     20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
     21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
     22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
     23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
     24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
     25 
     26 Derivative works are acceptable, even for commercial purposes, so long as
     27 (1) the source code for the derivative work includes prominent notice that
     28 the work is derivative, and (2) the source code includes prominent notice with
     29 these four paragraphs for those parts of this code that are retained.
     30 
     31 =============================================================================*/
     32 
     33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
     34    be flushed to zero.  */
     35 #include "softfloat.h"
     36 
     37 /*----------------------------------------------------------------------------
     38 | Primitive arithmetic functions, including multi-word arithmetic, and
     39 | division and square root approximations.  (Can be specialized to target if
     40 | desired.)
     41 *----------------------------------------------------------------------------*/
     42 #include "softfloat-macros.h"
     43 
     44 /*----------------------------------------------------------------------------
     45 | Functions and definitions to determine:  (1) whether tininess for underflow
     46 | is detected before or after rounding by default, (2) what (if anything)
     47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
     48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     49 | are propagated from function inputs to output.  These details are target-
     50 | specific.
     51 *----------------------------------------------------------------------------*/
     52 #include "softfloat-specialize.h"
     53 
     54 void set_float_rounding_mode(int val STATUS_PARAM)
     55 {
     56     STATUS(float_rounding_mode) = val;
     57 }
     58 
     59 void set_float_exception_flags(int val STATUS_PARAM)
     60 {
     61     STATUS(float_exception_flags) = val;
     62 }
     63 
     64 #ifdef FLOATX80
     65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
     66 {
     67     STATUS(floatx80_rounding_precision) = val;
     68 }
     69 #endif
     70 
     71 /*----------------------------------------------------------------------------
     72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
     73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
     74 | input.  If `zSign' is 1, the input is negated before being converted to an
     75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
     76 | is simply rounded to an integer, with the inexact exception raised if the
     77 | input cannot be represented exactly as an integer.  However, if the fixed-
     78 | point input is too large, the invalid exception is raised and the largest
     79 | positive or negative integer is returned.
     80 *----------------------------------------------------------------------------*/
     81 
     82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
     83 {
     84     int8 roundingMode;
     85     flag roundNearestEven;
     86     int8 roundIncrement, roundBits;
     87     int32 z;
     88 
     89     roundingMode = STATUS(float_rounding_mode);
     90     roundNearestEven = ( roundingMode == float_round_nearest_even );
     91     roundIncrement = 0x40;
     92     if ( ! roundNearestEven ) {
     93         if ( roundingMode == float_round_to_zero ) {
     94             roundIncrement = 0;
     95         }
     96         else {
     97             roundIncrement = 0x7F;
     98             if ( zSign ) {
     99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    100             }
    101             else {
    102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    103             }
    104         }
    105     }
    106     roundBits = absZ & 0x7F;
    107     absZ = ( absZ + roundIncrement )>>7;
    108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    109     z = absZ;
    110     if ( zSign ) z = - z;
    111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    112         float_raise( float_flag_invalid STATUS_VAR);
    113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    114     }
    115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    116     return z;
    117 
    118 }
    119 
    120 /*----------------------------------------------------------------------------
    121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
    123 | and returns the properly rounded 64-bit integer corresponding to the input.
    124 | If `zSign' is 1, the input is negated before being converted to an integer.
    125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
    126 | the inexact exception raised if the input cannot be represented exactly as
    127 | an integer.  However, if the fixed-point input is too large, the invalid
    128 | exception is raised and the largest positive or negative integer is
    129 | returned.
    130 *----------------------------------------------------------------------------*/
    131 
    132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
    133 {
    134     int8 roundingMode;
    135     flag roundNearestEven, increment;
    136     int64 z;
    137 
    138     roundingMode = STATUS(float_rounding_mode);
    139     roundNearestEven = ( roundingMode == float_round_nearest_even );
    140     increment = ( (sbits64) absZ1 < 0 );
    141     if ( ! roundNearestEven ) {
    142         if ( roundingMode == float_round_to_zero ) {
    143             increment = 0;
    144         }
    145         else {
    146             if ( zSign ) {
    147                 increment = ( roundingMode == float_round_down ) && absZ1;
    148             }
    149             else {
    150                 increment = ( roundingMode == float_round_up ) && absZ1;
    151             }
    152         }
    153     }
    154     if ( increment ) {
    155         ++absZ0;
    156         if ( absZ0 == 0 ) goto overflow;
    157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    158     }
    159     z = absZ0;
    160     if ( zSign ) z = - z;
    161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    162  overflow:
    163         float_raise( float_flag_invalid STATUS_VAR);
    164         return
    165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    166             : LIT64( 0x7FFFFFFFFFFFFFFF );
    167     }
    168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    169     return z;
    170 
    171 }
    172 
    173 /*----------------------------------------------------------------------------
    174 | Returns the fraction bits of the single-precision floating-point value `a'.
    175 *----------------------------------------------------------------------------*/
    176 
    177 INLINE bits32 extractFloat32Frac( float32 a )
    178 {
    179 
    180     return float32_val(a) & 0x007FFFFF;
    181 
    182 }
    183 
    184 /*----------------------------------------------------------------------------
    185 | Returns the exponent bits of the single-precision floating-point value `a'.
    186 *----------------------------------------------------------------------------*/
    187 
    188 INLINE int16 extractFloat32Exp( float32 a )
    189 {
    190 
    191     return ( float32_val(a)>>23 ) & 0xFF;
    192 
    193 }
    194 
    195 /*----------------------------------------------------------------------------
    196 | Returns the sign bit of the single-precision floating-point value `a'.
    197 *----------------------------------------------------------------------------*/
    198 
    199 INLINE flag extractFloat32Sign( float32 a )
    200 {
    201 
    202     return float32_val(a)>>31;
    203 
    204 }
    205 
    206 /*----------------------------------------------------------------------------
    207 | Normalizes the subnormal single-precision floating-point value represented
    208 | by the denormalized significand `aSig'.  The normalized exponent and
    209 | significand are stored at the locations pointed to by `zExpPtr' and
    210 | `zSigPtr', respectively.
    211 *----------------------------------------------------------------------------*/
    212 
    213 static void
    214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    215 {
    216     int8 shiftCount;
    217 
    218     shiftCount = countLeadingZeros32( aSig ) - 8;
    219     *zSigPtr = aSig<<shiftCount;
    220     *zExpPtr = 1 - shiftCount;
    221 
    222 }
    223 
    224 /*----------------------------------------------------------------------------
    225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    226 | single-precision floating-point value, returning the result.  After being
    227 | shifted into the proper positions, the three fields are simply added
    228 | together to form the result.  This means that any integer portion of `zSig'
    229 | will be added into the exponent.  Since a properly normalized significand
    230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    231 | than the desired result exponent whenever `zSig' is a complete, normalized
    232 | significand.
    233 *----------------------------------------------------------------------------*/
    234 
    235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    236 {
    237 
    238     return make_float32(
    239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
    240 
    241 }
    242 
    243 /*----------------------------------------------------------------------------
    244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    245 | and significand `zSig', and returns the proper single-precision floating-
    246 | point value corresponding to the abstract input.  Ordinarily, the abstract
    247 | value is simply rounded and packed into the single-precision format, with
    248 | the inexact exception raised if the abstract input cannot be represented
    249 | exactly.  However, if the abstract value is too large, the overflow and
    250 | inexact exceptions are raised and an infinity or maximal finite value is
    251 | returned.  If the abstract value is too small, the input value is rounded to
    252 | a subnormal number, and the underflow and inexact exceptions are raised if
    253 | the abstract input cannot be represented exactly as a subnormal single-
    254 | precision floating-point number.
    255 |     The input significand `zSig' has its binary point between bits 30
    256 | and 29, which is 7 bits to the left of the usual location.  This shifted
    257 | significand must be normalized or smaller.  If `zSig' is not normalized,
    258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    259 | and it must not require rounding.  In the usual case that `zSig' is
    260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    262 | Binary Floating-Point Arithmetic.
    263 *----------------------------------------------------------------------------*/
    264 
    265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
    266 {
    267     int8 roundingMode;
    268     flag roundNearestEven;
    269     int8 roundIncrement, roundBits;
    270     flag isTiny;
    271 
    272     roundingMode = STATUS(float_rounding_mode);
    273     roundNearestEven = ( roundingMode == float_round_nearest_even );
    274     roundIncrement = 0x40;
    275     if ( ! roundNearestEven ) {
    276         if ( roundingMode == float_round_to_zero ) {
    277             roundIncrement = 0;
    278         }
    279         else {
    280             roundIncrement = 0x7F;
    281             if ( zSign ) {
    282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    283             }
    284             else {
    285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    286             }
    287         }
    288     }
    289     roundBits = zSig & 0x7F;
    290     if ( 0xFD <= (bits16) zExp ) {
    291         if (    ( 0xFD < zExp )
    292              || (    ( zExp == 0xFD )
    293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    294            ) {
    295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
    297         }
    298         if ( zExp < 0 ) {
    299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
    300             isTiny =
    301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    302                 || ( zExp < -1 )
    303                 || ( zSig + roundIncrement < 0x80000000 );
    304             shift32RightJamming( zSig, - zExp, &zSig );
    305             zExp = 0;
    306             roundBits = zSig & 0x7F;
    307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    308         }
    309     }
    310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    311     zSig = ( zSig + roundIncrement )>>7;
    312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    313     if ( zSig == 0 ) zExp = 0;
    314     return packFloat32( zSign, zExp, zSig );
    315 
    316 }
    317 
    318 /*----------------------------------------------------------------------------
    319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    320 | and significand `zSig', and returns the proper single-precision floating-
    321 | point value corresponding to the abstract input.  This routine is just like
    322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    324 | floating-point exponent.
    325 *----------------------------------------------------------------------------*/
    326 
    327 static float32
    328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
    329 {
    330     int8 shiftCount;
    331 
    332     shiftCount = countLeadingZeros32( zSig ) - 1;
    333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    334 
    335 }
    336 
    337 /*----------------------------------------------------------------------------
    338 | Returns the fraction bits of the double-precision floating-point value `a'.
    339 *----------------------------------------------------------------------------*/
    340 
    341 INLINE bits64 extractFloat64Frac( float64 a )
    342 {
    343 
    344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
    345 
    346 }
    347 
    348 /*----------------------------------------------------------------------------
    349 | Returns the exponent bits of the double-precision floating-point value `a'.
    350 *----------------------------------------------------------------------------*/
    351 
    352 INLINE int16 extractFloat64Exp( float64 a )
    353 {
    354 
    355     return ( float64_val(a)>>52 ) & 0x7FF;
    356 
    357 }
    358 
    359 /*----------------------------------------------------------------------------
    360 | Returns the sign bit of the double-precision floating-point value `a'.
    361 *----------------------------------------------------------------------------*/
    362 
    363 INLINE flag extractFloat64Sign( float64 a )
    364 {
    365 
    366     return float64_val(a)>>63;
    367 
    368 }
    369 
    370 /*----------------------------------------------------------------------------
    371 | Normalizes the subnormal double-precision floating-point value represented
    372 | by the denormalized significand `aSig'.  The normalized exponent and
    373 | significand are stored at the locations pointed to by `zExpPtr' and
    374 | `zSigPtr', respectively.
    375 *----------------------------------------------------------------------------*/
    376 
    377 static void
    378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    379 {
    380     int8 shiftCount;
    381 
    382     shiftCount = countLeadingZeros64( aSig ) - 11;
    383     *zSigPtr = aSig<<shiftCount;
    384     *zExpPtr = 1 - shiftCount;
    385 
    386 }
    387 
    388 /*----------------------------------------------------------------------------
    389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    390 | double-precision floating-point value, returning the result.  After being
    391 | shifted into the proper positions, the three fields are simply added
    392 | together to form the result.  This means that any integer portion of `zSig'
    393 | will be added into the exponent.  Since a properly normalized significand
    394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    395 | than the desired result exponent whenever `zSig' is a complete, normalized
    396 | significand.
    397 *----------------------------------------------------------------------------*/
    398 
    399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    400 {
    401 
    402     return make_float64(
    403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
    404 
    405 }
    406 
    407 /*----------------------------------------------------------------------------
    408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    409 | and significand `zSig', and returns the proper double-precision floating-
    410 | point value corresponding to the abstract input.  Ordinarily, the abstract
    411 | value is simply rounded and packed into the double-precision format, with
    412 | the inexact exception raised if the abstract input cannot be represented
    413 | exactly.  However, if the abstract value is too large, the overflow and
    414 | inexact exceptions are raised and an infinity or maximal finite value is
    415 | returned.  If the abstract value is too small, the input value is rounded
    416 | to a subnormal number, and the underflow and inexact exceptions are raised
    417 | if the abstract input cannot be represented exactly as a subnormal double-
    418 | precision floating-point number.
    419 |     The input significand `zSig' has its binary point between bits 62
    420 | and 61, which is 10 bits to the left of the usual location.  This shifted
    421 | significand must be normalized or smaller.  If `zSig' is not normalized,
    422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    423 | and it must not require rounding.  In the usual case that `zSig' is
    424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    426 | Binary Floating-Point Arithmetic.
    427 *----------------------------------------------------------------------------*/
    428 
    429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
    430 {
    431     int8 roundingMode;
    432     flag roundNearestEven;
    433     int16 roundIncrement, roundBits;
    434     flag isTiny;
    435 
    436     roundingMode = STATUS(float_rounding_mode);
    437     roundNearestEven = ( roundingMode == float_round_nearest_even );
    438     roundIncrement = 0x200;
    439     if ( ! roundNearestEven ) {
    440         if ( roundingMode == float_round_to_zero ) {
    441             roundIncrement = 0;
    442         }
    443         else {
    444             roundIncrement = 0x3FF;
    445             if ( zSign ) {
    446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    447             }
    448             else {
    449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    450             }
    451         }
    452     }
    453     roundBits = zSig & 0x3FF;
    454     if ( 0x7FD <= (bits16) zExp ) {
    455         if (    ( 0x7FD < zExp )
    456              || (    ( zExp == 0x7FD )
    457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    458            ) {
    459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
    461         }
    462         if ( zExp < 0 ) {
    463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
    464             isTiny =
    465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    466                 || ( zExp < -1 )
    467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    468             shift64RightJamming( zSig, - zExp, &zSig );
    469             zExp = 0;
    470             roundBits = zSig & 0x3FF;
    471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    472         }
    473     }
    474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    475     zSig = ( zSig + roundIncrement )>>10;
    476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    477     if ( zSig == 0 ) zExp = 0;
    478     return packFloat64( zSign, zExp, zSig );
    479 
    480 }
    481 
    482 /*----------------------------------------------------------------------------
    483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    484 | and significand `zSig', and returns the proper double-precision floating-
    485 | point value corresponding to the abstract input.  This routine is just like
    486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    488 | floating-point exponent.
    489 *----------------------------------------------------------------------------*/
    490 
    491 static float64
    492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
    493 {
    494     int8 shiftCount;
    495 
    496     shiftCount = countLeadingZeros64( zSig ) - 1;
    497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
    498 
    499 }
    500 
    501 #ifdef FLOATX80
    502 
    503 /*----------------------------------------------------------------------------
    504 | Returns the fraction bits of the extended double-precision floating-point
    505 | value `a'.
    506 *----------------------------------------------------------------------------*/
    507 
    508 INLINE bits64 extractFloatx80Frac( floatx80 a )
    509 {
    510 
    511     return a.low;
    512 
    513 }
    514 
    515 /*----------------------------------------------------------------------------
    516 | Returns the exponent bits of the extended double-precision floating-point
    517 | value `a'.
    518 *----------------------------------------------------------------------------*/
    519 
    520 INLINE int32 extractFloatx80Exp( floatx80 a )
    521 {
    522 
    523     return a.high & 0x7FFF;
    524 
    525 }
    526 
    527 /*----------------------------------------------------------------------------
    528 | Returns the sign bit of the extended double-precision floating-point value
    529 | `a'.
    530 *----------------------------------------------------------------------------*/
    531 
    532 INLINE flag extractFloatx80Sign( floatx80 a )
    533 {
    534 
    535     return a.high>>15;
    536 
    537 }
    538 
    539 /*----------------------------------------------------------------------------
    540 | Normalizes the subnormal extended double-precision floating-point value
    541 | represented by the denormalized significand `aSig'.  The normalized exponent
    542 | and significand are stored at the locations pointed to by `zExpPtr' and
    543 | `zSigPtr', respectively.
    544 *----------------------------------------------------------------------------*/
    545 
    546 static void
    547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    548 {
    549     int8 shiftCount;
    550 
    551     shiftCount = countLeadingZeros64( aSig );
    552     *zSigPtr = aSig<<shiftCount;
    553     *zExpPtr = 1 - shiftCount;
    554 
    555 }
    556 
    557 /*----------------------------------------------------------------------------
    558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    559 | extended double-precision floating-point value, returning the result.
    560 *----------------------------------------------------------------------------*/
    561 
    562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    563 {
    564     floatx80 z;
    565 
    566     z.low = zSig;
    567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    568     return z;
    569 
    570 }
    571 
    572 /*----------------------------------------------------------------------------
    573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
    575 | and returns the proper extended double-precision floating-point value
    576 | corresponding to the abstract input.  Ordinarily, the abstract value is
    577 | rounded and packed into the extended double-precision format, with the
    578 | inexact exception raised if the abstract input cannot be represented
    579 | exactly.  However, if the abstract value is too large, the overflow and
    580 | inexact exceptions are raised and an infinity or maximal finite value is
    581 | returned.  If the abstract value is too small, the input value is rounded to
    582 | a subnormal number, and the underflow and inexact exceptions are raised if
    583 | the abstract input cannot be represented exactly as a subnormal extended
    584 | double-precision floating-point number.
    585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    586 | number of bits as single or double precision, respectively.  Otherwise, the
    587 | result is rounded to the full precision of the extended double-precision
    588 | format.
    589 |     The input significand must be normalized or smaller.  If the input
    590 | significand is not normalized, `zExp' must be 0; in that case, the result
    591 | returned is a subnormal number, and it must not require rounding.  The
    592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    593 | Floating-Point Arithmetic.
    594 *----------------------------------------------------------------------------*/
    595 
    596 static floatx80
    597  roundAndPackFloatx80(
    598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    599  STATUS_PARAM)
    600 {
    601     int8 roundingMode;
    602     flag roundNearestEven, increment, isTiny;
    603     int64 roundIncrement, roundMask, roundBits;
    604 
    605     roundingMode = STATUS(float_rounding_mode);
    606     roundNearestEven = ( roundingMode == float_round_nearest_even );
    607     if ( roundingPrecision == 80 ) goto precision80;
    608     if ( roundingPrecision == 64 ) {
    609         roundIncrement = LIT64( 0x0000000000000400 );
    610         roundMask = LIT64( 0x00000000000007FF );
    611     }
    612     else if ( roundingPrecision == 32 ) {
    613         roundIncrement = LIT64( 0x0000008000000000 );
    614         roundMask = LIT64( 0x000000FFFFFFFFFF );
    615     }
    616     else {
    617         goto precision80;
    618     }
    619     zSig0 |= ( zSig1 != 0 );
    620     if ( ! roundNearestEven ) {
    621         if ( roundingMode == float_round_to_zero ) {
    622             roundIncrement = 0;
    623         }
    624         else {
    625             roundIncrement = roundMask;
    626             if ( zSign ) {
    627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    628             }
    629             else {
    630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    631             }
    632         }
    633     }
    634     roundBits = zSig0 & roundMask;
    635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    636         if (    ( 0x7FFE < zExp )
    637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    638            ) {
    639             goto overflow;
    640         }
    641         if ( zExp <= 0 ) {
    642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
    643             isTiny =
    644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    645                 || ( zExp < 0 )
    646                 || ( zSig0 <= zSig0 + roundIncrement );
    647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    648             zExp = 0;
    649             roundBits = zSig0 & roundMask;
    650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
    651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    652             zSig0 += roundIncrement;
    653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
    654             roundIncrement = roundMask + 1;
    655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    656                 roundMask |= roundIncrement;
    657             }
    658             zSig0 &= ~ roundMask;
    659             return packFloatx80( zSign, zExp, zSig0 );
    660         }
    661     }
    662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    663     zSig0 += roundIncrement;
    664     if ( zSig0 < roundIncrement ) {
    665         ++zExp;
    666         zSig0 = LIT64( 0x8000000000000000 );
    667     }
    668     roundIncrement = roundMask + 1;
    669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    670         roundMask |= roundIncrement;
    671     }
    672     zSig0 &= ~ roundMask;
    673     if ( zSig0 == 0 ) zExp = 0;
    674     return packFloatx80( zSign, zExp, zSig0 );
    675  precision80:
    676     increment = ( (sbits64) zSig1 < 0 );
    677     if ( ! roundNearestEven ) {
    678         if ( roundingMode == float_round_to_zero ) {
    679             increment = 0;
    680         }
    681         else {
    682             if ( zSign ) {
    683                 increment = ( roundingMode == float_round_down ) && zSig1;
    684             }
    685             else {
    686                 increment = ( roundingMode == float_round_up ) && zSig1;
    687             }
    688         }
    689     }
    690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    691         if (    ( 0x7FFE < zExp )
    692              || (    ( zExp == 0x7FFE )
    693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    694                   && increment
    695                 )
    696            ) {
    697             roundMask = 0;
    698  overflow:
    699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    700             if (    ( roundingMode == float_round_to_zero )
    701                  || ( zSign && ( roundingMode == float_round_up ) )
    702                  || ( ! zSign && ( roundingMode == float_round_down ) )
    703                ) {
    704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    705             }
    706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    707         }
    708         if ( zExp <= 0 ) {
    709             isTiny =
    710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    711                 || ( zExp < 0 )
    712                 || ! increment
    713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    715             zExp = 0;
    716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
    717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    718             if ( roundNearestEven ) {
    719                 increment = ( (sbits64) zSig1 < 0 );
    720             }
    721             else {
    722                 if ( zSign ) {
    723                     increment = ( roundingMode == float_round_down ) && zSig1;
    724                 }
    725                 else {
    726                     increment = ( roundingMode == float_round_up ) && zSig1;
    727                 }
    728             }
    729             if ( increment ) {
    730                 ++zSig0;
    731                 zSig0 &=
    732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    734             }
    735             return packFloatx80( zSign, zExp, zSig0 );
    736         }
    737     }
    738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    739     if ( increment ) {
    740         ++zSig0;
    741         if ( zSig0 == 0 ) {
    742             ++zExp;
    743             zSig0 = LIT64( 0x8000000000000000 );
    744         }
    745         else {
    746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    747         }
    748     }
    749     else {
    750         if ( zSig0 == 0 ) zExp = 0;
    751     }
    752     return packFloatx80( zSign, zExp, zSig0 );
    753 
    754 }
    755 
    756 /*----------------------------------------------------------------------------
    757 | Takes an abstract floating-point value having sign `zSign', exponent
    758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    759 | and returns the proper extended double-precision floating-point value
    760 | corresponding to the abstract input.  This routine is just like
    761 | `roundAndPackFloatx80' except that the input significand does not have to be
    762 | normalized.
    763 *----------------------------------------------------------------------------*/
    764 
    765 static floatx80
    766  normalizeRoundAndPackFloatx80(
    767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    768  STATUS_PARAM)
    769 {
    770     int8 shiftCount;
    771 
    772     if ( zSig0 == 0 ) {
    773         zSig0 = zSig1;
    774         zSig1 = 0;
    775         zExp -= 64;
    776     }
    777     shiftCount = countLeadingZeros64( zSig0 );
    778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    779     zExp -= shiftCount;
    780     return
    781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
    782 
    783 }
    784 
    785 #endif
    786 
    787 #ifdef FLOAT128
    788 
    789 /*----------------------------------------------------------------------------
    790 | Returns the least-significant 64 fraction bits of the quadruple-precision
    791 | floating-point value `a'.
    792 *----------------------------------------------------------------------------*/
    793 
    794 INLINE bits64 extractFloat128Frac1( float128 a )
    795 {
    796 
    797     return a.low;
    798 
    799 }
    800 
    801 /*----------------------------------------------------------------------------
    802 | Returns the most-significant 48 fraction bits of the quadruple-precision
    803 | floating-point value `a'.
    804 *----------------------------------------------------------------------------*/
    805 
    806 INLINE bits64 extractFloat128Frac0( float128 a )
    807 {
    808 
    809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    810 
    811 }
    812 
    813 /*----------------------------------------------------------------------------
    814 | Returns the exponent bits of the quadruple-precision floating-point value
    815 | `a'.
    816 *----------------------------------------------------------------------------*/
    817 
    818 INLINE int32 extractFloat128Exp( float128 a )
    819 {
    820 
    821     return ( a.high>>48 ) & 0x7FFF;
    822 
    823 }
    824 
    825 /*----------------------------------------------------------------------------
    826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
    827 *----------------------------------------------------------------------------*/
    828 
    829 INLINE flag extractFloat128Sign( float128 a )
    830 {
    831 
    832     return a.high>>63;
    833 
    834 }
    835 
    836 /*----------------------------------------------------------------------------
    837 | Normalizes the subnormal quadruple-precision floating-point value
    838 | represented by the denormalized significand formed by the concatenation of
    839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    841 | significand are stored at the location pointed to by `zSig0Ptr', and the
    842 | least significant 64 bits of the normalized significand are stored at the
    843 | location pointed to by `zSig1Ptr'.
    844 *----------------------------------------------------------------------------*/
    845 
    846 static void
    847  normalizeFloat128Subnormal(
    848      bits64 aSig0,
    849      bits64 aSig1,
    850      int32 *zExpPtr,
    851      bits64 *zSig0Ptr,
    852      bits64 *zSig1Ptr
    853  )
    854 {
    855     int8 shiftCount;
    856 
    857     if ( aSig0 == 0 ) {
    858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    859         if ( shiftCount < 0 ) {
    860             *zSig0Ptr = aSig1>>( - shiftCount );
    861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    862         }
    863         else {
    864             *zSig0Ptr = aSig1<<shiftCount;
    865             *zSig1Ptr = 0;
    866         }
    867         *zExpPtr = - shiftCount - 63;
    868     }
    869     else {
    870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    872         *zExpPtr = 1 - shiftCount;
    873     }
    874 
    875 }
    876 
    877 /*----------------------------------------------------------------------------
    878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
    879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    880 | floating-point value, returning the result.  After being shifted into the
    881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    882 | added together to form the most significant 32 bits of the result.  This
    883 | means that any integer portion of `zSig0' will be added into the exponent.
    884 | Since a properly normalized significand will have an integer portion equal
    885 | to 1, the `zExp' input should be 1 less than the desired result exponent
    886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    887 | significand.
    888 *----------------------------------------------------------------------------*/
    889 
    890 INLINE float128
    891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    892 {
    893     float128 z;
    894 
    895     z.low = zSig1;
    896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    897     return z;
    898 
    899 }
    900 
    901 /*----------------------------------------------------------------------------
    902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
    904 | and `zSig2', and returns the proper quadruple-precision floating-point value
    905 | corresponding to the abstract input.  Ordinarily, the abstract value is
    906 | simply rounded and packed into the quadruple-precision format, with the
    907 | inexact exception raised if the abstract input cannot be represented
    908 | exactly.  However, if the abstract value is too large, the overflow and
    909 | inexact exceptions are raised and an infinity or maximal finite value is
    910 | returned.  If the abstract value is too small, the input value is rounded to
    911 | a subnormal number, and the underflow and inexact exceptions are raised if
    912 | the abstract input cannot be represented exactly as a subnormal quadruple-
    913 | precision floating-point number.
    914 |     The input significand must be normalized or smaller.  If the input
    915 | significand is not normalized, `zExp' must be 0; in that case, the result
    916 | returned is a subnormal number, and it must not require rounding.  In the
    917 | usual case that the input significand is normalized, `zExp' must be 1 less
    918 | than the ``true'' floating-point exponent.  The handling of underflow and
    919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    920 *----------------------------------------------------------------------------*/
    921 
    922 static float128
    923  roundAndPackFloat128(
    924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
    925 {
    926     int8 roundingMode;
    927     flag roundNearestEven, increment, isTiny;
    928 
    929     roundingMode = STATUS(float_rounding_mode);
    930     roundNearestEven = ( roundingMode == float_round_nearest_even );
    931     increment = ( (sbits64) zSig2 < 0 );
    932     if ( ! roundNearestEven ) {
    933         if ( roundingMode == float_round_to_zero ) {
    934             increment = 0;
    935         }
    936         else {
    937             if ( zSign ) {
    938                 increment = ( roundingMode == float_round_down ) && zSig2;
    939             }
    940             else {
    941                 increment = ( roundingMode == float_round_up ) && zSig2;
    942             }
    943         }
    944     }
    945     if ( 0x7FFD <= (bits32) zExp ) {
    946         if (    ( 0x7FFD < zExp )
    947              || (    ( zExp == 0x7FFD )
    948                   && eq128(
    949                          LIT64( 0x0001FFFFFFFFFFFF ),
    950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
    951                          zSig0,
    952                          zSig1
    953                      )
    954                   && increment
    955                 )
    956            ) {
    957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
    958             if (    ( roundingMode == float_round_to_zero )
    959                  || ( zSign && ( roundingMode == float_round_up ) )
    960                  || ( ! zSign && ( roundingMode == float_round_down ) )
    961                ) {
    962                 return
    963                     packFloat128(
    964                         zSign,
    965                         0x7FFE,
    966                         LIT64( 0x0000FFFFFFFFFFFF ),
    967                         LIT64( 0xFFFFFFFFFFFFFFFF )
    968                     );
    969             }
    970             return packFloat128( zSign, 0x7FFF, 0, 0 );
    971         }
    972         if ( zExp < 0 ) {
    973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
    974             isTiny =
    975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
    976                 || ( zExp < -1 )
    977                 || ! increment
    978                 || lt128(
    979                        zSig0,
    980                        zSig1,
    981                        LIT64( 0x0001FFFFFFFFFFFF ),
    982                        LIT64( 0xFFFFFFFFFFFFFFFF )
    983                    );
    984             shift128ExtraRightJamming(
    985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
    986             zExp = 0;
    987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
    988             if ( roundNearestEven ) {
    989                 increment = ( (sbits64) zSig2 < 0 );
    990             }
    991             else {
    992                 if ( zSign ) {
    993                     increment = ( roundingMode == float_round_down ) && zSig2;
    994                 }
    995                 else {
    996                     increment = ( roundingMode == float_round_up ) && zSig2;
    997                 }
    998             }
    999         }
   1000     }
   1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
   1002     if ( increment ) {
   1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1005     }
   1006     else {
   1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1008     }
   1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1010 
   1011 }
   1012 
   1013 /*----------------------------------------------------------------------------
   1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
   1016 | returns the proper quadruple-precision floating-point value corresponding
   1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
   1018 | except that the input significand has fewer bits and does not have to be
   1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1020 | point exponent.
   1021 *----------------------------------------------------------------------------*/
   1022 
   1023 static float128
   1024  normalizeRoundAndPackFloat128(
   1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
   1026 {
   1027     int8 shiftCount;
   1028     bits64 zSig2;
   1029 
   1030     if ( zSig0 == 0 ) {
   1031         zSig0 = zSig1;
   1032         zSig1 = 0;
   1033         zExp -= 64;
   1034     }
   1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1036     if ( 0 <= shiftCount ) {
   1037         zSig2 = 0;
   1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1039     }
   1040     else {
   1041         shift128ExtraRightJamming(
   1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1043     }
   1044     zExp -= shiftCount;
   1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
   1046 
   1047 }
   1048 
   1049 #endif
   1050 
   1051 /*----------------------------------------------------------------------------
   1052 | Returns the result of converting the 32-bit two's complement integer `a'
   1053 | to the single-precision floating-point format.  The conversion is performed
   1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1055 *----------------------------------------------------------------------------*/
   1056 
   1057 float32 int32_to_float32( int32 a STATUS_PARAM )
   1058 {
   1059     flag zSign;
   1060 
   1061     if ( a == 0 ) return float32_zero;
   1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1063     zSign = ( a < 0 );
   1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
   1065 
   1066 }
   1067 
   1068 /*----------------------------------------------------------------------------
   1069 | Returns the result of converting the 32-bit two's complement integer `a'
   1070 | to the double-precision floating-point format.  The conversion is performed
   1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1072 *----------------------------------------------------------------------------*/
   1073 
   1074 float64 int32_to_float64( int32 a STATUS_PARAM )
   1075 {
   1076     flag zSign;
   1077     uint32 absA;
   1078     int8 shiftCount;
   1079     bits64 zSig;
   1080 
   1081     if ( a == 0 ) return float64_zero;
   1082     zSign = ( a < 0 );
   1083     absA = zSign ? - a : a;
   1084     shiftCount = countLeadingZeros32( absA ) + 21;
   1085     zSig = absA;
   1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1087 
   1088 }
   1089 
   1090 #ifdef FLOATX80
   1091 
   1092 /*----------------------------------------------------------------------------
   1093 | Returns the result of converting the 32-bit two's complement integer `a'
   1094 | to the extended double-precision floating-point format.  The conversion
   1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1096 | Arithmetic.
   1097 *----------------------------------------------------------------------------*/
   1098 
   1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
   1100 {
   1101     flag zSign;
   1102     uint32 absA;
   1103     int8 shiftCount;
   1104     bits64 zSig;
   1105 
   1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1107     zSign = ( a < 0 );
   1108     absA = zSign ? - a : a;
   1109     shiftCount = countLeadingZeros32( absA ) + 32;
   1110     zSig = absA;
   1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1112 
   1113 }
   1114 
   1115 #endif
   1116 
   1117 #ifdef FLOAT128
   1118 
   1119 /*----------------------------------------------------------------------------
   1120 | Returns the result of converting the 32-bit two's complement integer `a' to
   1121 | the quadruple-precision floating-point format.  The conversion is performed
   1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1123 *----------------------------------------------------------------------------*/
   1124 
   1125 float128 int32_to_float128( int32 a STATUS_PARAM )
   1126 {
   1127     flag zSign;
   1128     uint32 absA;
   1129     int8 shiftCount;
   1130     bits64 zSig0;
   1131 
   1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1133     zSign = ( a < 0 );
   1134     absA = zSign ? - a : a;
   1135     shiftCount = countLeadingZeros32( absA ) + 17;
   1136     zSig0 = absA;
   1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1138 
   1139 }
   1140 
   1141 #endif
   1142 
   1143 /*----------------------------------------------------------------------------
   1144 | Returns the result of converting the 64-bit two's complement integer `a'
   1145 | to the single-precision floating-point format.  The conversion is performed
   1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1147 *----------------------------------------------------------------------------*/
   1148 
   1149 float32 int64_to_float32( int64 a STATUS_PARAM )
   1150 {
   1151     flag zSign;
   1152     uint64 absA;
   1153     int8 shiftCount;
   1154 
   1155     if ( a == 0 ) return float32_zero;
   1156     zSign = ( a < 0 );
   1157     absA = zSign ? - a : a;
   1158     shiftCount = countLeadingZeros64( absA ) - 40;
   1159     if ( 0 <= shiftCount ) {
   1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1161     }
   1162     else {
   1163         shiftCount += 7;
   1164         if ( shiftCount < 0 ) {
   1165             shift64RightJamming( absA, - shiftCount, &absA );
   1166         }
   1167         else {
   1168             absA <<= shiftCount;
   1169         }
   1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
   1171     }
   1172 
   1173 }
   1174 
   1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
   1176 {
   1177     int8 shiftCount;
   1178 
   1179     if ( a == 0 ) return float32_zero;
   1180     shiftCount = countLeadingZeros64( a ) - 40;
   1181     if ( 0 <= shiftCount ) {
   1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
   1183     }
   1184     else {
   1185         shiftCount += 7;
   1186         if ( shiftCount < 0 ) {
   1187             shift64RightJamming( a, - shiftCount, &a );
   1188         }
   1189         else {
   1190             a <<= shiftCount;
   1191         }
   1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
   1193     }
   1194 }
   1195 
   1196 /*----------------------------------------------------------------------------
   1197 | Returns the result of converting the 64-bit two's complement integer `a'
   1198 | to the double-precision floating-point format.  The conversion is performed
   1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1200 *----------------------------------------------------------------------------*/
   1201 
   1202 float64 int64_to_float64( int64 a STATUS_PARAM )
   1203 {
   1204     flag zSign;
   1205 
   1206     if ( a == 0 ) return float64_zero;
   1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1208         return packFloat64( 1, 0x43E, 0 );
   1209     }
   1210     zSign = ( a < 0 );
   1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
   1212 
   1213 }
   1214 
   1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
   1216 {
   1217     if ( a == 0 ) return float64_zero;
   1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
   1219 
   1220 }
   1221 
   1222 #ifdef FLOATX80
   1223 
   1224 /*----------------------------------------------------------------------------
   1225 | Returns the result of converting the 64-bit two's complement integer `a'
   1226 | to the extended double-precision floating-point format.  The conversion
   1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1228 | Arithmetic.
   1229 *----------------------------------------------------------------------------*/
   1230 
   1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
   1232 {
   1233     flag zSign;
   1234     uint64 absA;
   1235     int8 shiftCount;
   1236 
   1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1238     zSign = ( a < 0 );
   1239     absA = zSign ? - a : a;
   1240     shiftCount = countLeadingZeros64( absA );
   1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1242 
   1243 }
   1244 
   1245 #endif
   1246 
   1247 #ifdef FLOAT128
   1248 
   1249 /*----------------------------------------------------------------------------
   1250 | Returns the result of converting the 64-bit two's complement integer `a' to
   1251 | the quadruple-precision floating-point format.  The conversion is performed
   1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1253 *----------------------------------------------------------------------------*/
   1254 
   1255 float128 int64_to_float128( int64 a STATUS_PARAM )
   1256 {
   1257     flag zSign;
   1258     uint64 absA;
   1259     int8 shiftCount;
   1260     int32 zExp;
   1261     bits64 zSig0, zSig1;
   1262 
   1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1264     zSign = ( a < 0 );
   1265     absA = zSign ? - a : a;
   1266     shiftCount = countLeadingZeros64( absA ) + 49;
   1267     zExp = 0x406E - shiftCount;
   1268     if ( 64 <= shiftCount ) {
   1269         zSig1 = 0;
   1270         zSig0 = absA;
   1271         shiftCount -= 64;
   1272     }
   1273     else {
   1274         zSig1 = absA;
   1275         zSig0 = 0;
   1276     }
   1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1279 
   1280 }
   1281 
   1282 #endif
   1283 
   1284 /*----------------------------------------------------------------------------
   1285 | Returns the result of converting the single-precision floating-point value
   1286 | `a' to the 32-bit two's complement integer format.  The conversion is
   1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1288 | Arithmetic---which means in particular that the conversion is rounded
   1289 | according to the current rounding mode.  If `a' is a NaN, the largest
   1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1291 | largest integer with the same sign as `a' is returned.
   1292 *----------------------------------------------------------------------------*/
   1293 
   1294 int32 float32_to_int32( float32 a STATUS_PARAM )
   1295 {
   1296     flag aSign;
   1297     int16 aExp, shiftCount;
   1298     bits32 aSig;
   1299     bits64 aSig64;
   1300 
   1301     aSig = extractFloat32Frac( a );
   1302     aExp = extractFloat32Exp( a );
   1303     aSign = extractFloat32Sign( a );
   1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1305     if ( aExp ) aSig |= 0x00800000;
   1306     shiftCount = 0xAF - aExp;
   1307     aSig64 = aSig;
   1308     aSig64 <<= 32;
   1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
   1311 
   1312 }
   1313 
   1314 /*----------------------------------------------------------------------------
   1315 | Returns the result of converting the single-precision floating-point value
   1316 | `a' to the 32-bit two's complement integer format.  The conversion is
   1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1318 | Arithmetic, except that the conversion is always rounded toward zero.
   1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1320 | the conversion overflows, the largest integer with the same sign as `a' is
   1321 | returned.
   1322 *----------------------------------------------------------------------------*/
   1323 
   1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
   1325 {
   1326     flag aSign;
   1327     int16 aExp, shiftCount;
   1328     bits32 aSig;
   1329     int32 z;
   1330 
   1331     aSig = extractFloat32Frac( a );
   1332     aExp = extractFloat32Exp( a );
   1333     aSign = extractFloat32Sign( a );
   1334     shiftCount = aExp - 0x9E;
   1335     if ( 0 <= shiftCount ) {
   1336         if ( float32_val(a) != 0xCF000000 ) {
   1337             float_raise( float_flag_invalid STATUS_VAR);
   1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1339         }
   1340         return (sbits32) 0x80000000;
   1341     }
   1342     else if ( aExp <= 0x7E ) {
   1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1344         return 0;
   1345     }
   1346     aSig = ( aSig | 0x00800000 )<<8;
   1347     z = aSig>>( - shiftCount );
   1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1349         STATUS(float_exception_flags) |= float_flag_inexact;
   1350     }
   1351     if ( aSign ) z = - z;
   1352     return z;
   1353 
   1354 }
   1355 
   1356 /*----------------------------------------------------------------------------
   1357 | Returns the result of converting the single-precision floating-point value
   1358 | `a' to the 64-bit two's complement integer format.  The conversion is
   1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1360 | Arithmetic---which means in particular that the conversion is rounded
   1361 | according to the current rounding mode.  If `a' is a NaN, the largest
   1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1363 | largest integer with the same sign as `a' is returned.
   1364 *----------------------------------------------------------------------------*/
   1365 
   1366 int64 float32_to_int64( float32 a STATUS_PARAM )
   1367 {
   1368     flag aSign;
   1369     int16 aExp, shiftCount;
   1370     bits32 aSig;
   1371     bits64 aSig64, aSigExtra;
   1372 
   1373     aSig = extractFloat32Frac( a );
   1374     aExp = extractFloat32Exp( a );
   1375     aSign = extractFloat32Sign( a );
   1376     shiftCount = 0xBE - aExp;
   1377     if ( shiftCount < 0 ) {
   1378         float_raise( float_flag_invalid STATUS_VAR);
   1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1381         }
   1382         return (sbits64) LIT64( 0x8000000000000000 );
   1383     }
   1384     if ( aExp ) aSig |= 0x00800000;
   1385     aSig64 = aSig;
   1386     aSig64 <<= 40;
   1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
   1389 
   1390 }
   1391 
   1392 /*----------------------------------------------------------------------------
   1393 | Returns the result of converting the single-precision floating-point value
   1394 | `a' to the 64-bit two's complement integer format.  The conversion is
   1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
   1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1398 | conversion overflows, the largest integer with the same sign as `a' is
   1399 | returned.
   1400 *----------------------------------------------------------------------------*/
   1401 
   1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
   1403 {
   1404     flag aSign;
   1405     int16 aExp, shiftCount;
   1406     bits32 aSig;
   1407     bits64 aSig64;
   1408     int64 z;
   1409 
   1410     aSig = extractFloat32Frac( a );
   1411     aExp = extractFloat32Exp( a );
   1412     aSign = extractFloat32Sign( a );
   1413     shiftCount = aExp - 0xBE;
   1414     if ( 0 <= shiftCount ) {
   1415         if ( float32_val(a) != 0xDF000000 ) {
   1416             float_raise( float_flag_invalid STATUS_VAR);
   1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1419             }
   1420         }
   1421         return (sbits64) LIT64( 0x8000000000000000 );
   1422     }
   1423     else if ( aExp <= 0x7E ) {
   1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   1425         return 0;
   1426     }
   1427     aSig64 = aSig | 0x00800000;
   1428     aSig64 <<= 40;
   1429     z = aSig64>>( - shiftCount );
   1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1431         STATUS(float_exception_flags) |= float_flag_inexact;
   1432     }
   1433     if ( aSign ) z = - z;
   1434     return z;
   1435 
   1436 }
   1437 
   1438 /*----------------------------------------------------------------------------
   1439 | Returns the result of converting the single-precision floating-point value
   1440 | `a' to the double-precision floating-point format.  The conversion is
   1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1442 | Arithmetic.
   1443 *----------------------------------------------------------------------------*/
   1444 
   1445 float64 float32_to_float64( float32 a STATUS_PARAM )
   1446 {
   1447     flag aSign;
   1448     int16 aExp;
   1449     bits32 aSig;
   1450 
   1451     aSig = extractFloat32Frac( a );
   1452     aExp = extractFloat32Exp( a );
   1453     aSign = extractFloat32Sign( a );
   1454     if ( aExp == 0xFF ) {
   1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
   1456         return packFloat64( aSign, 0x7FF, 0 );
   1457     }
   1458     if ( aExp == 0 ) {
   1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1461         --aExp;
   1462     }
   1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1464 
   1465 }
   1466 
   1467 #ifdef FLOATX80
   1468 
   1469 /*----------------------------------------------------------------------------
   1470 | Returns the result of converting the single-precision floating-point value
   1471 | `a' to the extended double-precision floating-point format.  The conversion
   1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1473 | Arithmetic.
   1474 *----------------------------------------------------------------------------*/
   1475 
   1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
   1477 {
   1478     flag aSign;
   1479     int16 aExp;
   1480     bits32 aSig;
   1481 
   1482     aSig = extractFloat32Frac( a );
   1483     aExp = extractFloat32Exp( a );
   1484     aSign = extractFloat32Sign( a );
   1485     if ( aExp == 0xFF ) {
   1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
   1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1488     }
   1489     if ( aExp == 0 ) {
   1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1492     }
   1493     aSig |= 0x00800000;
   1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1495 
   1496 }
   1497 
   1498 #endif
   1499 
   1500 #ifdef FLOAT128
   1501 
   1502 /*----------------------------------------------------------------------------
   1503 | Returns the result of converting the single-precision floating-point value
   1504 | `a' to the double-precision floating-point format.  The conversion is
   1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1506 | Arithmetic.
   1507 *----------------------------------------------------------------------------*/
   1508 
   1509 float128 float32_to_float128( float32 a STATUS_PARAM )
   1510 {
   1511     flag aSign;
   1512     int16 aExp;
   1513     bits32 aSig;
   1514 
   1515     aSig = extractFloat32Frac( a );
   1516     aExp = extractFloat32Exp( a );
   1517     aSign = extractFloat32Sign( a );
   1518     if ( aExp == 0xFF ) {
   1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
   1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1521     }
   1522     if ( aExp == 0 ) {
   1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1525         --aExp;
   1526     }
   1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1528 
   1529 }
   1530 
   1531 #endif
   1532 
   1533 /*----------------------------------------------------------------------------
   1534 | Rounds the single-precision floating-point value `a' to an integer, and
   1535 | returns the result as a single-precision floating-point value.  The
   1536 | operation is performed according to the IEC/IEEE Standard for Binary
   1537 | Floating-Point Arithmetic.
   1538 *----------------------------------------------------------------------------*/
   1539 
   1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
   1541 {
   1542     flag aSign;
   1543     int16 aExp;
   1544     bits32 lastBitMask, roundBitsMask;
   1545     int8 roundingMode;
   1546     bits32 z;
   1547 
   1548     aExp = extractFloat32Exp( a );
   1549     if ( 0x96 <= aExp ) {
   1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1551             return propagateFloat32NaN( a, a STATUS_VAR );
   1552         }
   1553         return a;
   1554     }
   1555     if ( aExp <= 0x7E ) {
   1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
   1557         STATUS(float_exception_flags) |= float_flag_inexact;
   1558         aSign = extractFloat32Sign( a );
   1559         switch ( STATUS(float_rounding_mode) ) {
   1560          case float_round_nearest_even:
   1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1562                 return packFloat32( aSign, 0x7F, 0 );
   1563             }
   1564             break;
   1565          case float_round_down:
   1566             return make_float32(aSign ? 0xBF800000 : 0);
   1567          case float_round_up:
   1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
   1569         }
   1570         return packFloat32( aSign, 0, 0 );
   1571     }
   1572     lastBitMask = 1;
   1573     lastBitMask <<= 0x96 - aExp;
   1574     roundBitsMask = lastBitMask - 1;
   1575     z = float32_val(a);
   1576     roundingMode = STATUS(float_rounding_mode);
   1577     if ( roundingMode == float_round_nearest_even ) {
   1578         z += lastBitMask>>1;
   1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1580     }
   1581     else if ( roundingMode != float_round_to_zero ) {
   1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
   1583             z += roundBitsMask;
   1584         }
   1585     }
   1586     z &= ~ roundBitsMask;
   1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
   1588     return make_float32(z);
   1589 
   1590 }
   1591 
   1592 /*----------------------------------------------------------------------------
   1593 | Returns the result of adding the absolute values of the single-precision
   1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1595 | before being returned.  `zSign' is ignored if the result is a NaN.
   1596 | The addition is performed according to the IEC/IEEE Standard for Binary
   1597 | Floating-Point Arithmetic.
   1598 *----------------------------------------------------------------------------*/
   1599 
   1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1601 {
   1602     int16 aExp, bExp, zExp;
   1603     bits32 aSig, bSig, zSig;
   1604     int16 expDiff;
   1605 
   1606     aSig = extractFloat32Frac( a );
   1607     aExp = extractFloat32Exp( a );
   1608     bSig = extractFloat32Frac( b );
   1609     bExp = extractFloat32Exp( b );
   1610     expDiff = aExp - bExp;
   1611     aSig <<= 6;
   1612     bSig <<= 6;
   1613     if ( 0 < expDiff ) {
   1614         if ( aExp == 0xFF ) {
   1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1616             return a;
   1617         }
   1618         if ( bExp == 0 ) {
   1619             --expDiff;
   1620         }
   1621         else {
   1622             bSig |= 0x20000000;
   1623         }
   1624         shift32RightJamming( bSig, expDiff, &bSig );
   1625         zExp = aExp;
   1626     }
   1627     else if ( expDiff < 0 ) {
   1628         if ( bExp == 0xFF ) {
   1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1630             return packFloat32( zSign, 0xFF, 0 );
   1631         }
   1632         if ( aExp == 0 ) {
   1633             ++expDiff;
   1634         }
   1635         else {
   1636             aSig |= 0x20000000;
   1637         }
   1638         shift32RightJamming( aSig, - expDiff, &aSig );
   1639         zExp = bExp;
   1640     }
   1641     else {
   1642         if ( aExp == 0xFF ) {
   1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1644             return a;
   1645         }
   1646         if ( aExp == 0 ) {
   1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
   1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1649         }
   1650         zSig = 0x40000000 + aSig + bSig;
   1651         zExp = aExp;
   1652         goto roundAndPack;
   1653     }
   1654     aSig |= 0x20000000;
   1655     zSig = ( aSig + bSig )<<1;
   1656     --zExp;
   1657     if ( (sbits32) zSig < 0 ) {
   1658         zSig = aSig + bSig;
   1659         ++zExp;
   1660     }
   1661  roundAndPack:
   1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1663 
   1664 }
   1665 
   1666 /*----------------------------------------------------------------------------
   1667 | Returns the result of subtracting the absolute values of the single-
   1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1669 | difference is negated before being returned.  `zSign' is ignored if the
   1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1671 | Standard for Binary Floating-Point Arithmetic.
   1672 *----------------------------------------------------------------------------*/
   1673 
   1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
   1675 {
   1676     int16 aExp, bExp, zExp;
   1677     bits32 aSig, bSig, zSig;
   1678     int16 expDiff;
   1679 
   1680     aSig = extractFloat32Frac( a );
   1681     aExp = extractFloat32Exp( a );
   1682     bSig = extractFloat32Frac( b );
   1683     bExp = extractFloat32Exp( b );
   1684     expDiff = aExp - bExp;
   1685     aSig <<= 7;
   1686     bSig <<= 7;
   1687     if ( 0 < expDiff ) goto aExpBigger;
   1688     if ( expDiff < 0 ) goto bExpBigger;
   1689     if ( aExp == 0xFF ) {
   1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1691         float_raise( float_flag_invalid STATUS_VAR);
   1692         return float32_default_nan;
   1693     }
   1694     if ( aExp == 0 ) {
   1695         aExp = 1;
   1696         bExp = 1;
   1697     }
   1698     if ( bSig < aSig ) goto aBigger;
   1699     if ( aSig < bSig ) goto bBigger;
   1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   1701  bExpBigger:
   1702     if ( bExp == 0xFF ) {
   1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1705     }
   1706     if ( aExp == 0 ) {
   1707         ++expDiff;
   1708     }
   1709     else {
   1710         aSig |= 0x40000000;
   1711     }
   1712     shift32RightJamming( aSig, - expDiff, &aSig );
   1713     bSig |= 0x40000000;
   1714  bBigger:
   1715     zSig = bSig - aSig;
   1716     zExp = bExp;
   1717     zSign ^= 1;
   1718     goto normalizeRoundAndPack;
   1719  aExpBigger:
   1720     if ( aExp == 0xFF ) {
   1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1722         return a;
   1723     }
   1724     if ( bExp == 0 ) {
   1725         --expDiff;
   1726     }
   1727     else {
   1728         bSig |= 0x40000000;
   1729     }
   1730     shift32RightJamming( bSig, expDiff, &bSig );
   1731     aSig |= 0x40000000;
   1732  aBigger:
   1733     zSig = aSig - bSig;
   1734     zExp = aExp;
   1735  normalizeRoundAndPack:
   1736     --zExp;
   1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1738 
   1739 }
   1740 
   1741 /*----------------------------------------------------------------------------
   1742 | Returns the result of adding the single-precision floating-point values `a'
   1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1744 | Binary Floating-Point Arithmetic.
   1745 *----------------------------------------------------------------------------*/
   1746 
   1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
   1748 {
   1749     flag aSign, bSign;
   1750 
   1751     aSign = extractFloat32Sign( a );
   1752     bSign = extractFloat32Sign( b );
   1753     if ( aSign == bSign ) {
   1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
   1755     }
   1756     else {
   1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1758     }
   1759 
   1760 }
   1761 
   1762 /*----------------------------------------------------------------------------
   1763 | Returns the result of subtracting the single-precision floating-point values
   1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1765 | for Binary Floating-Point Arithmetic.
   1766 *----------------------------------------------------------------------------*/
   1767 
   1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
   1769 {
   1770     flag aSign, bSign;
   1771 
   1772     aSign = extractFloat32Sign( a );
   1773     bSign = extractFloat32Sign( b );
   1774     if ( aSign == bSign ) {
   1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
   1776     }
   1777     else {
   1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
   1779     }
   1780 
   1781 }
   1782 
   1783 /*----------------------------------------------------------------------------
   1784 | Returns the result of multiplying the single-precision floating-point values
   1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1786 | for Binary Floating-Point Arithmetic.
   1787 *----------------------------------------------------------------------------*/
   1788 
   1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
   1790 {
   1791     flag aSign, bSign, zSign;
   1792     int16 aExp, bExp, zExp;
   1793     bits32 aSig, bSig;
   1794     bits64 zSig64;
   1795     bits32 zSig;
   1796 
   1797     aSig = extractFloat32Frac( a );
   1798     aExp = extractFloat32Exp( a );
   1799     aSign = extractFloat32Sign( a );
   1800     bSig = extractFloat32Frac( b );
   1801     bExp = extractFloat32Exp( b );
   1802     bSign = extractFloat32Sign( b );
   1803     zSign = aSign ^ bSign;
   1804     if ( aExp == 0xFF ) {
   1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1806             return propagateFloat32NaN( a, b STATUS_VAR );
   1807         }
   1808         if ( ( bExp | bSig ) == 0 ) {
   1809             float_raise( float_flag_invalid STATUS_VAR);
   1810             return float32_default_nan;
   1811         }
   1812         return packFloat32( zSign, 0xFF, 0 );
   1813     }
   1814     if ( bExp == 0xFF ) {
   1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1816         if ( ( aExp | aSig ) == 0 ) {
   1817             float_raise( float_flag_invalid STATUS_VAR);
   1818             return float32_default_nan;
   1819         }
   1820         return packFloat32( zSign, 0xFF, 0 );
   1821     }
   1822     if ( aExp == 0 ) {
   1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1825     }
   1826     if ( bExp == 0 ) {
   1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1829     }
   1830     zExp = aExp + bExp - 0x7F;
   1831     aSig = ( aSig | 0x00800000 )<<7;
   1832     bSig = ( bSig | 0x00800000 )<<8;
   1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1834     zSig = zSig64;
   1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1836         zSig <<= 1;
   1837         --zExp;
   1838     }
   1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1840 
   1841 }
   1842 
   1843 /*----------------------------------------------------------------------------
   1844 | Returns the result of dividing the single-precision floating-point value `a'
   1845 | by the corresponding value `b'.  The operation is performed according to the
   1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1847 *----------------------------------------------------------------------------*/
   1848 
   1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
   1850 {
   1851     flag aSign, bSign, zSign;
   1852     int16 aExp, bExp, zExp;
   1853     bits32 aSig, bSig, zSig;
   1854 
   1855     aSig = extractFloat32Frac( a );
   1856     aExp = extractFloat32Exp( a );
   1857     aSign = extractFloat32Sign( a );
   1858     bSig = extractFloat32Frac( b );
   1859     bExp = extractFloat32Exp( b );
   1860     bSign = extractFloat32Sign( b );
   1861     zSign = aSign ^ bSign;
   1862     if ( aExp == 0xFF ) {
   1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1864         if ( bExp == 0xFF ) {
   1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1866             float_raise( float_flag_invalid STATUS_VAR);
   1867             return float32_default_nan;
   1868         }
   1869         return packFloat32( zSign, 0xFF, 0 );
   1870     }
   1871     if ( bExp == 0xFF ) {
   1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1873         return packFloat32( zSign, 0, 0 );
   1874     }
   1875     if ( bExp == 0 ) {
   1876         if ( bSig == 0 ) {
   1877             if ( ( aExp | aSig ) == 0 ) {
   1878                 float_raise( float_flag_invalid STATUS_VAR);
   1879                 return float32_default_nan;
   1880             }
   1881             float_raise( float_flag_divbyzero STATUS_VAR);
   1882             return packFloat32( zSign, 0xFF, 0 );
   1883         }
   1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1885     }
   1886     if ( aExp == 0 ) {
   1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1889     }
   1890     zExp = aExp - bExp + 0x7D;
   1891     aSig = ( aSig | 0x00800000 )<<7;
   1892     bSig = ( bSig | 0x00800000 )<<8;
   1893     if ( bSig <= ( aSig + aSig ) ) {
   1894         aSig >>= 1;
   1895         ++zExp;
   1896     }
   1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
   1898     if ( ( zSig & 0x3F ) == 0 ) {
   1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   1900     }
   1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
   1902 
   1903 }
   1904 
   1905 /*----------------------------------------------------------------------------
   1906 | Returns the remainder of the single-precision floating-point value `a'
   1907 | with respect to the corresponding value `b'.  The operation is performed
   1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1909 *----------------------------------------------------------------------------*/
   1910 
   1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
   1912 {
   1913     flag aSign, bSign, zSign;
   1914     int16 aExp, bExp, expDiff;
   1915     bits32 aSig, bSig;
   1916     bits32 q;
   1917     bits64 aSig64, bSig64, q64;
   1918     bits32 alternateASig;
   1919     sbits32 sigMean;
   1920 
   1921     aSig = extractFloat32Frac( a );
   1922     aExp = extractFloat32Exp( a );
   1923     aSign = extractFloat32Sign( a );
   1924     bSig = extractFloat32Frac( b );
   1925     bExp = extractFloat32Exp( b );
   1926     bSign = extractFloat32Sign( b );
   1927     if ( aExp == 0xFF ) {
   1928         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1929             return propagateFloat32NaN( a, b STATUS_VAR );
   1930         }
   1931         float_raise( float_flag_invalid STATUS_VAR);
   1932         return float32_default_nan;
   1933     }
   1934     if ( bExp == 0xFF ) {
   1935         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
   1936         return a;
   1937     }
   1938     if ( bExp == 0 ) {
   1939         if ( bSig == 0 ) {
   1940             float_raise( float_flag_invalid STATUS_VAR);
   1941             return float32_default_nan;
   1942         }
   1943         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1944     }
   1945     if ( aExp == 0 ) {
   1946         if ( aSig == 0 ) return a;
   1947         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1948     }
   1949     expDiff = aExp - bExp;
   1950     aSig |= 0x00800000;
   1951     bSig |= 0x00800000;
   1952     if ( expDiff < 32 ) {
   1953         aSig <<= 8;
   1954         bSig <<= 8;
   1955         if ( expDiff < 0 ) {
   1956             if ( expDiff < -1 ) return a;
   1957             aSig >>= 1;
   1958         }
   1959         q = ( bSig <= aSig );
   1960         if ( q ) aSig -= bSig;
   1961         if ( 0 < expDiff ) {
   1962             q = ( ( (bits64) aSig )<<32 ) / bSig;
   1963             q >>= 32 - expDiff;
   1964             bSig >>= 2;
   1965             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   1966         }
   1967         else {
   1968             aSig >>= 2;
   1969             bSig >>= 2;
   1970         }
   1971     }
   1972     else {
   1973         if ( bSig <= aSig ) aSig -= bSig;
   1974         aSig64 = ( (bits64) aSig )<<40;
   1975         bSig64 = ( (bits64) bSig )<<40;
   1976         expDiff -= 64;
   1977         while ( 0 < expDiff ) {
   1978             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   1979             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   1980             aSig64 = - ( ( bSig * q64 )<<38 );
   1981             expDiff -= 62;
   1982         }
   1983         expDiff += 64;
   1984         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   1985         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   1986         q = q64>>( 64 - expDiff );
   1987         bSig <<= 6;
   1988         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   1989     }
   1990     do {
   1991         alternateASig = aSig;
   1992         ++q;
   1993         aSig -= bSig;
   1994     } while ( 0 <= (sbits32) aSig );
   1995     sigMean = aSig + alternateASig;
   1996     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   1997         aSig = alternateASig;
   1998     }
   1999     zSign = ( (sbits32) aSig < 0 );
   2000     if ( zSign ) aSig = - aSig;
   2001     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
   2002 
   2003 }
   2004 
   2005 /*----------------------------------------------------------------------------
   2006 | Returns the square root of the single-precision floating-point value `a'.
   2007 | The operation is performed according to the IEC/IEEE Standard for Binary
   2008 | Floating-Point Arithmetic.
   2009 *----------------------------------------------------------------------------*/
   2010 
   2011 float32 float32_sqrt( float32 a STATUS_PARAM )
   2012 {
   2013     flag aSign;
   2014     int16 aExp, zExp;
   2015     bits32 aSig, zSig;
   2016     bits64 rem, term;
   2017 
   2018     aSig = extractFloat32Frac( a );
   2019     aExp = extractFloat32Exp( a );
   2020     aSign = extractFloat32Sign( a );
   2021     if ( aExp == 0xFF ) {
   2022         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2023         if ( ! aSign ) return a;
   2024         float_raise( float_flag_invalid STATUS_VAR);
   2025         return float32_default_nan;
   2026     }
   2027     if ( aSign ) {
   2028         if ( ( aExp | aSig ) == 0 ) return a;
   2029         float_raise( float_flag_invalid STATUS_VAR);
   2030         return float32_default_nan;
   2031     }
   2032     if ( aExp == 0 ) {
   2033         if ( aSig == 0 ) return float32_zero;
   2034         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2035     }
   2036     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2037     aSig = ( aSig | 0x00800000 )<<8;
   2038     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2039     if ( ( zSig & 0x7F ) <= 5 ) {
   2040         if ( zSig < 2 ) {
   2041             zSig = 0x7FFFFFFF;
   2042             goto roundAndPack;
   2043         }
   2044         aSig >>= aExp & 1;
   2045         term = ( (bits64) zSig ) * zSig;
   2046         rem = ( ( (bits64) aSig )<<32 ) - term;
   2047         while ( (sbits64) rem < 0 ) {
   2048             --zSig;
   2049             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2050         }
   2051         zSig |= ( rem != 0 );
   2052     }
   2053     shift32RightJamming( zSig, 1, &zSig );
   2054  roundAndPack:
   2055     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
   2056 
   2057 }
   2058 
   2059 /*----------------------------------------------------------------------------
   2060 | Returns the binary log of the single-precision floating-point value `a'.
   2061 | The operation is performed according to the IEC/IEEE Standard for Binary
   2062 | Floating-Point Arithmetic.
   2063 *----------------------------------------------------------------------------*/
   2064 float32 float32_log2( float32 a STATUS_PARAM )
   2065 {
   2066     flag aSign, zSign;
   2067     int16 aExp;
   2068     bits32 aSig, zSig, i;
   2069 
   2070     aSig = extractFloat32Frac( a );
   2071     aExp = extractFloat32Exp( a );
   2072     aSign = extractFloat32Sign( a );
   2073 
   2074     if ( aExp == 0 ) {
   2075         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
   2076         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2077     }
   2078     if ( aSign ) {
   2079         float_raise( float_flag_invalid STATUS_VAR);
   2080         return float32_default_nan;
   2081     }
   2082     if ( aExp == 0xFF ) {
   2083         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
   2084         return a;
   2085     }
   2086 
   2087     aExp -= 0x7F;
   2088     aSig |= 0x00800000;
   2089     zSign = aExp < 0;
   2090     zSig = aExp << 23;
   2091 
   2092     for (i = 1 << 22; i > 0; i >>= 1) {
   2093         aSig = ( (bits64)aSig * aSig ) >> 23;
   2094         if ( aSig & 0x01000000 ) {
   2095             aSig >>= 1;
   2096             zSig |= i;
   2097         }
   2098     }
   2099 
   2100     if ( zSign )
   2101         zSig = -zSig;
   2102 
   2103     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
   2104 }
   2105 
   2106 /*----------------------------------------------------------------------------
   2107 | Returns 1 if the single-precision floating-point value `a' is equal to
   2108 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2110 *----------------------------------------------------------------------------*/
   2111 
   2112 int float32_eq( float32 a, float32 b STATUS_PARAM )
   2113 {
   2114 
   2115     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2116          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2117        ) {
   2118         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2119             float_raise( float_flag_invalid STATUS_VAR);
   2120         }
   2121         return 0;
   2122     }
   2123     return ( float32_val(a) == float32_val(b) ) ||
   2124             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
   2125 
   2126 }
   2127 
   2128 /*----------------------------------------------------------------------------
   2129 | Returns 1 if the single-precision floating-point value `a' is less than
   2130 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2131 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2132 | Arithmetic.
   2133 *----------------------------------------------------------------------------*/
   2134 
   2135 int float32_le( float32 a, float32 b STATUS_PARAM )
   2136 {
   2137     flag aSign, bSign;
   2138     bits32 av, bv;
   2139 
   2140     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2141          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2142        ) {
   2143         float_raise( float_flag_invalid STATUS_VAR);
   2144         return 0;
   2145     }
   2146     aSign = extractFloat32Sign( a );
   2147     bSign = extractFloat32Sign( b );
   2148     av = float32_val(a);
   2149     bv = float32_val(b);
   2150     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
   2151     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2152 
   2153 }
   2154 
   2155 /*----------------------------------------------------------------------------
   2156 | Returns 1 if the single-precision floating-point value `a' is less than
   2157 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2158 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2159 *----------------------------------------------------------------------------*/
   2160 
   2161 int float32_lt( float32 a, float32 b STATUS_PARAM )
   2162 {
   2163     flag aSign, bSign;
   2164     bits32 av, bv;
   2165 
   2166     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2167          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2168        ) {
   2169         float_raise( float_flag_invalid STATUS_VAR);
   2170         return 0;
   2171     }
   2172     aSign = extractFloat32Sign( a );
   2173     bSign = extractFloat32Sign( b );
   2174     av = float32_val(a);
   2175     bv = float32_val(b);
   2176     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
   2177     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2178 
   2179 }
   2180 
   2181 /*----------------------------------------------------------------------------
   2182 | Returns 1 if the single-precision floating-point value `a' is equal to
   2183 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2184 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   2185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2186 *----------------------------------------------------------------------------*/
   2187 
   2188 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
   2189 {
   2190     bits32 av, bv;
   2191 
   2192     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2193          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2194        ) {
   2195         float_raise( float_flag_invalid STATUS_VAR);
   2196         return 0;
   2197     }
   2198     av = float32_val(a);
   2199     bv = float32_val(b);
   2200     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
   2201 
   2202 }
   2203 
   2204 /*----------------------------------------------------------------------------
   2205 | Returns 1 if the single-precision floating-point value `a' is less than or
   2206 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2207 | cause an exception.  Otherwise, the comparison is performed according to the
   2208 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2209 *----------------------------------------------------------------------------*/
   2210 
   2211 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
   2212 {
   2213     flag aSign, bSign;
   2214     bits32 av, bv;
   2215 
   2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2218        ) {
   2219         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2220             float_raise( float_flag_invalid STATUS_VAR);
   2221         }
   2222         return 0;
   2223     }
   2224     aSign = extractFloat32Sign( a );
   2225     bSign = extractFloat32Sign( b );
   2226     av = float32_val(a);
   2227     bv = float32_val(b);
   2228     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
   2229     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   2230 
   2231 }
   2232 
   2233 /*----------------------------------------------------------------------------
   2234 | Returns 1 if the single-precision floating-point value `a' is less than
   2235 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2236 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2237 | Standard for Binary Floating-Point Arithmetic.
   2238 *----------------------------------------------------------------------------*/
   2239 
   2240 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
   2241 {
   2242     flag aSign, bSign;
   2243     bits32 av, bv;
   2244 
   2245     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2246          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2247        ) {
   2248         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2249             float_raise( float_flag_invalid STATUS_VAR);
   2250         }
   2251         return 0;
   2252     }
   2253     aSign = extractFloat32Sign( a );
   2254     bSign = extractFloat32Sign( b );
   2255     av = float32_val(a);
   2256     bv = float32_val(b);
   2257     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
   2258     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   2259 
   2260 }
   2261 
   2262 /*----------------------------------------------------------------------------
   2263 | Returns the result of converting the double-precision floating-point value
   2264 | `a' to the 32-bit two's complement integer format.  The conversion is
   2265 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2266 | Arithmetic---which means in particular that the conversion is rounded
   2267 | according to the current rounding mode.  If `a' is a NaN, the largest
   2268 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2269 | largest integer with the same sign as `a' is returned.
   2270 *----------------------------------------------------------------------------*/
   2271 
   2272 int32 float64_to_int32( float64 a STATUS_PARAM )
   2273 {
   2274     flag aSign;
   2275     int16 aExp, shiftCount;
   2276     bits64 aSig;
   2277 
   2278     aSig = extractFloat64Frac( a );
   2279     aExp = extractFloat64Exp( a );
   2280     aSign = extractFloat64Sign( a );
   2281     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2282     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2283     shiftCount = 0x42C - aExp;
   2284     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2285     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   2286 
   2287 }
   2288 
   2289 /*----------------------------------------------------------------------------
   2290 | Returns the result of converting the double-precision floating-point value
   2291 | `a' to the 32-bit two's complement integer format.  The conversion is
   2292 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2293 | Arithmetic, except that the conversion is always rounded toward zero.
   2294 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2295 | the conversion overflows, the largest integer with the same sign as `a' is
   2296 | returned.
   2297 *----------------------------------------------------------------------------*/
   2298 
   2299 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
   2300 {
   2301     flag aSign;
   2302     int16 aExp, shiftCount;
   2303     bits64 aSig, savedASig;
   2304     int32 z;
   2305 
   2306     aSig = extractFloat64Frac( a );
   2307     aExp = extractFloat64Exp( a );
   2308     aSign = extractFloat64Sign( a );
   2309     if ( 0x41E < aExp ) {
   2310         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2311         goto invalid;
   2312     }
   2313     else if ( aExp < 0x3FF ) {
   2314         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2315         return 0;
   2316     }
   2317     aSig |= LIT64( 0x0010000000000000 );
   2318     shiftCount = 0x433 - aExp;
   2319     savedASig = aSig;
   2320     aSig >>= shiftCount;
   2321     z = aSig;
   2322     if ( aSign ) z = - z;
   2323     if ( ( z < 0 ) ^ aSign ) {
   2324  invalid:
   2325         float_raise( float_flag_invalid STATUS_VAR);
   2326         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2327     }
   2328     if ( ( aSig<<shiftCount ) != savedASig ) {
   2329         STATUS(float_exception_flags) |= float_flag_inexact;
   2330     }
   2331     return z;
   2332 
   2333 }
   2334 
   2335 /*----------------------------------------------------------------------------
   2336 | Returns the result of converting the double-precision floating-point value
   2337 | `a' to the 64-bit two's complement integer format.  The conversion is
   2338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2339 | Arithmetic---which means in particular that the conversion is rounded
   2340 | according to the current rounding mode.  If `a' is a NaN, the largest
   2341 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2342 | largest integer with the same sign as `a' is returned.
   2343 *----------------------------------------------------------------------------*/
   2344 
   2345 int64 float64_to_int64( float64 a STATUS_PARAM )
   2346 {
   2347     flag aSign;
   2348     int16 aExp, shiftCount;
   2349     bits64 aSig, aSigExtra;
   2350 
   2351     aSig = extractFloat64Frac( a );
   2352     aExp = extractFloat64Exp( a );
   2353     aSign = extractFloat64Sign( a );
   2354     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2355     shiftCount = 0x433 - aExp;
   2356     if ( shiftCount <= 0 ) {
   2357         if ( 0x43E < aExp ) {
   2358             float_raise( float_flag_invalid STATUS_VAR);
   2359             if (    ! aSign
   2360                  || (    ( aExp == 0x7FF )
   2361                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2362                ) {
   2363                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2364             }
   2365             return (sbits64) LIT64( 0x8000000000000000 );
   2366         }
   2367         aSigExtra = 0;
   2368         aSig <<= - shiftCount;
   2369     }
   2370     else {
   2371         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2372     }
   2373     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   2374 
   2375 }
   2376 
   2377 /*----------------------------------------------------------------------------
   2378 | Returns the result of converting the double-precision floating-point value
   2379 | `a' to the 64-bit two's complement integer format.  The conversion is
   2380 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2381 | Arithmetic, except that the conversion is always rounded toward zero.
   2382 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2383 | the conversion overflows, the largest integer with the same sign as `a' is
   2384 | returned.
   2385 *----------------------------------------------------------------------------*/
   2386 
   2387 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
   2388 {
   2389     flag aSign;
   2390     int16 aExp, shiftCount;
   2391     bits64 aSig;
   2392     int64 z;
   2393 
   2394     aSig = extractFloat64Frac( a );
   2395     aExp = extractFloat64Exp( a );
   2396     aSign = extractFloat64Sign( a );
   2397     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2398     shiftCount = aExp - 0x433;
   2399     if ( 0 <= shiftCount ) {
   2400         if ( 0x43E <= aExp ) {
   2401             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
   2402                 float_raise( float_flag_invalid STATUS_VAR);
   2403                 if (    ! aSign
   2404                      || (    ( aExp == 0x7FF )
   2405                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2406                    ) {
   2407                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2408                 }
   2409             }
   2410             return (sbits64) LIT64( 0x8000000000000000 );
   2411         }
   2412         z = aSig<<shiftCount;
   2413     }
   2414     else {
   2415         if ( aExp < 0x3FE ) {
   2416             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   2417             return 0;
   2418         }
   2419         z = aSig>>( - shiftCount );
   2420         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2421             STATUS(float_exception_flags) |= float_flag_inexact;
   2422         }
   2423     }
   2424     if ( aSign ) z = - z;
   2425     return z;
   2426 
   2427 }
   2428 
   2429 /*----------------------------------------------------------------------------
   2430 | Returns the result of converting the double-precision floating-point value
   2431 | `a' to the single-precision floating-point format.  The conversion is
   2432 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2433 | Arithmetic.
   2434 *----------------------------------------------------------------------------*/
   2435 
   2436 float32 float64_to_float32( float64 a STATUS_PARAM )
   2437 {
   2438     flag aSign;
   2439     int16 aExp;
   2440     bits64 aSig;
   2441     bits32 zSig;
   2442 
   2443     aSig = extractFloat64Frac( a );
   2444     aExp = extractFloat64Exp( a );
   2445     aSign = extractFloat64Sign( a );
   2446     if ( aExp == 0x7FF ) {
   2447         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
   2448         return packFloat32( aSign, 0xFF, 0 );
   2449     }
   2450     shift64RightJamming( aSig, 22, &aSig );
   2451     zSig = aSig;
   2452     if ( aExp || zSig ) {
   2453         zSig |= 0x40000000;
   2454         aExp -= 0x381;
   2455     }
   2456     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   2457 
   2458 }
   2459 
   2460 
   2461 /*----------------------------------------------------------------------------
   2462 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
   2463 | half-precision floating-point value, returning the result.  After being
   2464 | shifted into the proper positions, the three fields are simply added
   2465 | together to form the result.  This means that any integer portion of `zSig'
   2466 | will be added into the exponent.  Since a properly normalized significand
   2467 | will have an integer portion equal to 1, the `zExp' input should be 1 less
   2468 | than the desired result exponent whenever `zSig' is a complete, normalized
   2469 | significand.
   2470 *----------------------------------------------------------------------------*/
   2471 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
   2472 {
   2473     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
   2474 }
   2475 
   2476 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
   2477    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
   2478 
   2479 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
   2480 {
   2481     flag aSign;
   2482     int16 aExp;
   2483     bits32 aSig;
   2484 
   2485     aSign = a >> 15;
   2486     aExp = (a >> 10) & 0x1f;
   2487     aSig = a & 0x3ff;
   2488 
   2489     if (aExp == 0x1f && ieee) {
   2490         if (aSig) {
   2491             /* Make sure correct exceptions are raised.  */
   2492             float32ToCommonNaN(a STATUS_VAR);
   2493             aSig |= 0x200;
   2494         }
   2495         return packFloat32(aSign, 0xff, aSig << 13);
   2496     }
   2497     if (aExp == 0) {
   2498         int8 shiftCount;
   2499 
   2500         if (aSig == 0) {
   2501             return packFloat32(aSign, 0, 0);
   2502         }
   2503 
   2504         shiftCount = countLeadingZeros32( aSig ) - 21;
   2505         aSig = aSig << shiftCount;
   2506         aExp = -shiftCount;
   2507     }
   2508     return packFloat32( aSign, aExp + 0x70, aSig << 13);
   2509 }
   2510 
   2511 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
   2512 {
   2513     flag aSign;
   2514     int16 aExp;
   2515     bits32 aSig;
   2516     bits32 mask;
   2517     bits32 increment;
   2518     int8 roundingMode;
   2519 
   2520     aSig = extractFloat32Frac( a );
   2521     aExp = extractFloat32Exp( a );
   2522     aSign = extractFloat32Sign( a );
   2523     if ( aExp == 0xFF ) {
   2524         if (aSig) {
   2525             /* Make sure correct exceptions are raised.  */
   2526             float32ToCommonNaN(a STATUS_VAR);
   2527             aSig |= 0x00400000;
   2528         }
   2529         return packFloat16(aSign, 0x1f, aSig >> 13);
   2530     }
   2531     if (aExp == 0 && aSign == 0) {
   2532         return packFloat16(aSign, 0, 0);
   2533     }
   2534     /* Decimal point between bits 22 and 23.  */
   2535     aSig |= 0x00800000;
   2536     aExp -= 0x7f;
   2537     if (aExp < -14) {
   2538         mask = 0x007fffff;
   2539         if (aExp < -24) {
   2540             aExp = -25;
   2541         } else {
   2542             mask >>= 24 + aExp;
   2543         }
   2544     } else {
   2545         mask = 0x00001fff;
   2546     }
   2547     if (aSig & mask) {
   2548         float_raise( float_flag_underflow STATUS_VAR );
   2549         roundingMode = STATUS(float_rounding_mode);
   2550         switch (roundingMode) {
   2551         case float_round_nearest_even:
   2552             increment = (mask + 1) >> 1;
   2553             if ((aSig & mask) == increment) {
   2554                 increment = aSig & (increment << 1);
   2555             }
   2556             break;
   2557         case float_round_up:
   2558             increment = aSign ? 0 : mask;
   2559             break;
   2560         case float_round_down:
   2561             increment = aSign ? mask : 0;
   2562             break;
   2563         default: /* round_to_zero */
   2564             increment = 0;
   2565             break;
   2566         }
   2567         aSig += increment;
   2568         if (aSig >= 0x01000000) {
   2569             aSig >>= 1;
   2570             aExp++;
   2571         }
   2572     } else if (aExp < -14
   2573           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
   2574         float_raise( float_flag_underflow STATUS_VAR);
   2575     }
   2576 
   2577     if (ieee) {
   2578         if (aExp > 15) {
   2579             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
   2580             return packFloat16(aSign, 0x1f, 0);
   2581         }
   2582     } else {
   2583         if (aExp > 16) {
   2584             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
   2585             return packFloat16(aSign, 0x1f, 0x3ff);
   2586         }
   2587     }
   2588     if (aExp < -24) {
   2589         return packFloat16(aSign, 0, 0);
   2590     }
   2591     if (aExp < -14) {
   2592         aSig >>= -14 - aExp;
   2593         aExp = -14;
   2594     }
   2595     return packFloat16(aSign, aExp + 14, aSig >> 13);
   2596 }
   2597 
   2598 #ifdef FLOATX80
   2599 
   2600 /*----------------------------------------------------------------------------
   2601 | Returns the result of converting the double-precision floating-point value
   2602 | `a' to the extended double-precision floating-point format.  The conversion
   2603 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2604 | Arithmetic.
   2605 *----------------------------------------------------------------------------*/
   2606 
   2607 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
   2608 {
   2609     flag aSign;
   2610     int16 aExp;
   2611     bits64 aSig;
   2612 
   2613     aSig = extractFloat64Frac( a );
   2614     aExp = extractFloat64Exp( a );
   2615     aSign = extractFloat64Sign( a );
   2616     if ( aExp == 0x7FF ) {
   2617         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
   2618         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2619     }
   2620     if ( aExp == 0 ) {
   2621         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2622         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2623     }
   2624     return
   2625         packFloatx80(
   2626             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2627 
   2628 }
   2629 
   2630 #endif
   2631 
   2632 #ifdef FLOAT128
   2633 
   2634 /*----------------------------------------------------------------------------
   2635 | Returns the result of converting the double-precision floating-point value
   2636 | `a' to the quadruple-precision floating-point format.  The conversion is
   2637 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2638 | Arithmetic.
   2639 *----------------------------------------------------------------------------*/
   2640 
   2641 float128 float64_to_float128( float64 a STATUS_PARAM )
   2642 {
   2643     flag aSign;
   2644     int16 aExp;
   2645     bits64 aSig, zSig0, zSig1;
   2646 
   2647     aSig = extractFloat64Frac( a );
   2648     aExp = extractFloat64Exp( a );
   2649     aSign = extractFloat64Sign( a );
   2650     if ( aExp == 0x7FF ) {
   2651         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
   2652         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2653     }
   2654     if ( aExp == 0 ) {
   2655         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2656         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2657         --aExp;
   2658     }
   2659     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2660     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2661 
   2662 }
   2663 
   2664 #endif
   2665 
   2666 /*----------------------------------------------------------------------------
   2667 | Rounds the double-precision floating-point value `a' to an integer, and
   2668 | returns the result as a double-precision floating-point value.  The
   2669 | operation is performed according to the IEC/IEEE Standard for Binary
   2670 | Floating-Point Arithmetic.
   2671 *----------------------------------------------------------------------------*/
   2672 
   2673 float64 float64_round_to_int( float64 a STATUS_PARAM )
   2674 {
   2675     flag aSign;
   2676     int16 aExp;
   2677     bits64 lastBitMask, roundBitsMask;
   2678     int8 roundingMode;
   2679     bits64 z;
   2680 
   2681     aExp = extractFloat64Exp( a );
   2682     if ( 0x433 <= aExp ) {
   2683         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2684             return propagateFloat64NaN( a, a STATUS_VAR );
   2685         }
   2686         return a;
   2687     }
   2688     if ( aExp < 0x3FF ) {
   2689         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
   2690         STATUS(float_exception_flags) |= float_flag_inexact;
   2691         aSign = extractFloat64Sign( a );
   2692         switch ( STATUS(float_rounding_mode) ) {
   2693          case float_round_nearest_even:
   2694             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2695                 return packFloat64( aSign, 0x3FF, 0 );
   2696             }
   2697             break;
   2698          case float_round_down:
   2699             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
   2700          case float_round_up:
   2701             return make_float64(
   2702             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
   2703         }
   2704         return packFloat64( aSign, 0, 0 );
   2705     }
   2706     lastBitMask = 1;
   2707     lastBitMask <<= 0x433 - aExp;
   2708     roundBitsMask = lastBitMask - 1;
   2709     z = float64_val(a);
   2710     roundingMode = STATUS(float_rounding_mode);
   2711     if ( roundingMode == float_round_nearest_even ) {
   2712         z += lastBitMask>>1;
   2713         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2714     }
   2715     else if ( roundingMode != float_round_to_zero ) {
   2716         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
   2717             z += roundBitsMask;
   2718         }
   2719     }
   2720     z &= ~ roundBitsMask;
   2721     if ( z != float64_val(a) )
   2722         STATUS(float_exception_flags) |= float_flag_inexact;
   2723     return make_float64(z);
   2724 
   2725 }
   2726 
   2727 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
   2728 {
   2729     int oldmode;
   2730     float64 res;
   2731     oldmode = STATUS(float_rounding_mode);
   2732     STATUS(float_rounding_mode) = float_round_to_zero;
   2733     res = float64_round_to_int(a STATUS_VAR);
   2734     STATUS(float_rounding_mode) = oldmode;
   2735     return res;
   2736 }
   2737 
   2738 /*----------------------------------------------------------------------------
   2739 | Returns the result of adding the absolute values of the double-precision
   2740 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2741 | before being returned.  `zSign' is ignored if the result is a NaN.
   2742 | The addition is performed according to the IEC/IEEE Standard for Binary
   2743 | Floating-Point Arithmetic.
   2744 *----------------------------------------------------------------------------*/
   2745 
   2746 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   2747 {
   2748     int16 aExp, bExp, zExp;
   2749     bits64 aSig, bSig, zSig;
   2750     int16 expDiff;
   2751 
   2752     aSig = extractFloat64Frac( a );
   2753     aExp = extractFloat64Exp( a );
   2754     bSig = extractFloat64Frac( b );
   2755     bExp = extractFloat64Exp( b );
   2756     expDiff = aExp - bExp;
   2757     aSig <<= 9;
   2758     bSig <<= 9;
   2759     if ( 0 < expDiff ) {
   2760         if ( aExp == 0x7FF ) {
   2761             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2762             return a;
   2763         }
   2764         if ( bExp == 0 ) {
   2765             --expDiff;
   2766         }
   2767         else {
   2768             bSig |= LIT64( 0x2000000000000000 );
   2769         }
   2770         shift64RightJamming( bSig, expDiff, &bSig );
   2771         zExp = aExp;
   2772     }
   2773     else if ( expDiff < 0 ) {
   2774         if ( bExp == 0x7FF ) {
   2775             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2776             return packFloat64( zSign, 0x7FF, 0 );
   2777         }
   2778         if ( aExp == 0 ) {
   2779             ++expDiff;
   2780         }
   2781         else {
   2782             aSig |= LIT64( 0x2000000000000000 );
   2783         }
   2784         shift64RightJamming( aSig, - expDiff, &aSig );
   2785         zExp = bExp;
   2786     }
   2787     else {
   2788         if ( aExp == 0x7FF ) {
   2789             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2790             return a;
   2791         }
   2792         if ( aExp == 0 ) {
   2793             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
   2794             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2795         }
   2796         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2797         zExp = aExp;
   2798         goto roundAndPack;
   2799     }
   2800     aSig |= LIT64( 0x2000000000000000 );
   2801     zSig = ( aSig + bSig )<<1;
   2802     --zExp;
   2803     if ( (sbits64) zSig < 0 ) {
   2804         zSig = aSig + bSig;
   2805         ++zExp;
   2806     }
   2807  roundAndPack:
   2808     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   2809 
   2810 }
   2811 
   2812 /*----------------------------------------------------------------------------
   2813 | Returns the result of subtracting the absolute values of the double-
   2814 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2815 | difference is negated before being returned.  `zSign' is ignored if the
   2816 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2817 | Standard for Binary Floating-Point Arithmetic.
   2818 *----------------------------------------------------------------------------*/
   2819 
   2820 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
   2821 {
   2822     int16 aExp, bExp, zExp;
   2823     bits64 aSig, bSig, zSig;
   2824     int16 expDiff;
   2825 
   2826     aSig = extractFloat64Frac( a );
   2827     aExp = extractFloat64Exp( a );
   2828     bSig = extractFloat64Frac( b );
   2829     bExp = extractFloat64Exp( b );
   2830     expDiff = aExp - bExp;
   2831     aSig <<= 10;
   2832     bSig <<= 10;
   2833     if ( 0 < expDiff ) goto aExpBigger;
   2834     if ( expDiff < 0 ) goto bExpBigger;
   2835     if ( aExp == 0x7FF ) {
   2836         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2837         float_raise( float_flag_invalid STATUS_VAR);
   2838         return float64_default_nan;
   2839     }
   2840     if ( aExp == 0 ) {
   2841         aExp = 1;
   2842         bExp = 1;
   2843     }
   2844     if ( bSig < aSig ) goto aBigger;
   2845     if ( aSig < bSig ) goto bBigger;
   2846     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   2847  bExpBigger:
   2848     if ( bExp == 0x7FF ) {
   2849         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2850         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2851     }
   2852     if ( aExp == 0 ) {
   2853         ++expDiff;
   2854     }
   2855     else {
   2856         aSig |= LIT64( 0x4000000000000000 );
   2857     }
   2858     shift64RightJamming( aSig, - expDiff, &aSig );
   2859     bSig |= LIT64( 0x4000000000000000 );
   2860  bBigger:
   2861     zSig = bSig - aSig;
   2862     zExp = bExp;
   2863     zSign ^= 1;
   2864     goto normalizeRoundAndPack;
   2865  aExpBigger:
   2866     if ( aExp == 0x7FF ) {
   2867         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2868         return a;
   2869     }
   2870     if ( bExp == 0 ) {
   2871         --expDiff;
   2872     }
   2873     else {
   2874         bSig |= LIT64( 0x4000000000000000 );
   2875     }
   2876     shift64RightJamming( bSig, expDiff, &bSig );
   2877     aSig |= LIT64( 0x4000000000000000 );
   2878  aBigger:
   2879     zSig = aSig - bSig;
   2880     zExp = aExp;
   2881  normalizeRoundAndPack:
   2882     --zExp;
   2883     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   2884 
   2885 }
   2886 
   2887 /*----------------------------------------------------------------------------
   2888 | Returns the result of adding the double-precision floating-point values `a'
   2889 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2890 | Binary Floating-Point Arithmetic.
   2891 *----------------------------------------------------------------------------*/
   2892 
   2893 float64 float64_add( float64 a, float64 b STATUS_PARAM )
   2894 {
   2895     flag aSign, bSign;
   2896 
   2897     aSign = extractFloat64Sign( a );
   2898     bSign = extractFloat64Sign( b );
   2899     if ( aSign == bSign ) {
   2900         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   2901     }
   2902     else {
   2903         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   2904     }
   2905 
   2906 }
   2907 
   2908 /*----------------------------------------------------------------------------
   2909 | Returns the result of subtracting the double-precision floating-point values
   2910 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2911 | for Binary Floating-Point Arithmetic.
   2912 *----------------------------------------------------------------------------*/
   2913 
   2914 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
   2915 {
   2916     flag aSign, bSign;
   2917 
   2918     aSign = extractFloat64Sign( a );
   2919     bSign = extractFloat64Sign( b );
   2920     if ( aSign == bSign ) {
   2921         return subFloat64Sigs( a, b, aSign STATUS_VAR );
   2922     }
   2923     else {
   2924         return addFloat64Sigs( a, b, aSign STATUS_VAR );
   2925     }
   2926 
   2927 }
   2928 
   2929 /*----------------------------------------------------------------------------
   2930 | Returns the result of multiplying the double-precision floating-point values
   2931 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2932 | for Binary Floating-Point Arithmetic.
   2933 *----------------------------------------------------------------------------*/
   2934 
   2935 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
   2936 {
   2937     flag aSign, bSign, zSign;
   2938     int16 aExp, bExp, zExp;
   2939     bits64 aSig, bSig, zSig0, zSig1;
   2940 
   2941     aSig = extractFloat64Frac( a );
   2942     aExp = extractFloat64Exp( a );
   2943     aSign = extractFloat64Sign( a );
   2944     bSig = extractFloat64Frac( b );
   2945     bExp = extractFloat64Exp( b );
   2946     bSign = extractFloat64Sign( b );
   2947     zSign = aSign ^ bSign;
   2948     if ( aExp == 0x7FF ) {
   2949         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2950             return propagateFloat64NaN( a, b STATUS_VAR );
   2951         }
   2952         if ( ( bExp | bSig ) == 0 ) {
   2953             float_raise( float_flag_invalid STATUS_VAR);
   2954             return float64_default_nan;
   2955         }
   2956         return packFloat64( zSign, 0x7FF, 0 );
   2957     }
   2958     if ( bExp == 0x7FF ) {
   2959         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   2960         if ( ( aExp | aSig ) == 0 ) {
   2961             float_raise( float_flag_invalid STATUS_VAR);
   2962             return float64_default_nan;
   2963         }
   2964         return packFloat64( zSign, 0x7FF, 0 );
   2965     }
   2966     if ( aExp == 0 ) {
   2967         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2968         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2969     }
   2970     if ( bExp == 0 ) {
   2971         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2972         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2973     }
   2974     zExp = aExp + bExp - 0x3FF;
   2975     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2976     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2977     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2978     zSig0 |= ( zSig1 != 0 );
   2979     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2980         zSig0 <<= 1;
   2981         --zExp;
   2982     }
   2983     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
   2984 
   2985 }
   2986 
   2987 /*----------------------------------------------------------------------------
   2988 | Returns the result of dividing the double-precision floating-point value `a'
   2989 | by the corresponding value `b'.  The operation is performed according to
   2990 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2991 *----------------------------------------------------------------------------*/
   2992 
   2993 float64 float64_div( float64 a, float64 b STATUS_PARAM )
   2994 {
   2995     flag aSign, bSign, zSign;
   2996     int16 aExp, bExp, zExp;
   2997     bits64 aSig, bSig, zSig;
   2998     bits64 rem0, rem1;
   2999     bits64 term0, term1;
   3000 
   3001     aSig = extractFloat64Frac( a );
   3002     aExp = extractFloat64Exp( a );
   3003     aSign = extractFloat64Sign( a );
   3004     bSig = extractFloat64Frac( b );
   3005     bExp = extractFloat64Exp( b );
   3006     bSign = extractFloat64Sign( b );
   3007     zSign = aSign ^ bSign;
   3008     if ( aExp == 0x7FF ) {
   3009         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3010         if ( bExp == 0x7FF ) {
   3011             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3012             float_raise( float_flag_invalid STATUS_VAR);
   3013             return float64_default_nan;
   3014         }
   3015         return packFloat64( zSign, 0x7FF, 0 );
   3016     }
   3017     if ( bExp == 0x7FF ) {
   3018         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3019         return packFloat64( zSign, 0, 0 );
   3020     }
   3021     if ( bExp == 0 ) {
   3022         if ( bSig == 0 ) {
   3023             if ( ( aExp | aSig ) == 0 ) {
   3024                 float_raise( float_flag_invalid STATUS_VAR);
   3025                 return float64_default_nan;
   3026             }
   3027             float_raise( float_flag_divbyzero STATUS_VAR);
   3028             return packFloat64( zSign, 0x7FF, 0 );
   3029         }
   3030         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3031     }
   3032     if ( aExp == 0 ) {
   3033         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   3034         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3035     }
   3036     zExp = aExp - bExp + 0x3FD;
   3037     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   3038     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3039     if ( bSig <= ( aSig + aSig ) ) {
   3040         aSig >>= 1;
   3041         ++zExp;
   3042     }
   3043     zSig = estimateDiv128To64( aSig, 0, bSig );
   3044     if ( ( zSig & 0x1FF ) <= 2 ) {
   3045         mul64To128( bSig, zSig, &term0, &term1 );
   3046         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3047         while ( (sbits64) rem0 < 0 ) {
   3048             --zSig;
   3049             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3050         }
   3051         zSig |= ( rem1 != 0 );
   3052     }
   3053     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
   3054 
   3055 }
   3056 
   3057 /*----------------------------------------------------------------------------
   3058 | Returns the remainder of the double-precision floating-point value `a'
   3059 | with respect to the corresponding value `b'.  The operation is performed
   3060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3061 *----------------------------------------------------------------------------*/
   3062 
   3063 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
   3064 {
   3065     flag aSign, bSign, zSign;
   3066     int16 aExp, bExp, expDiff;
   3067     bits64 aSig, bSig;
   3068     bits64 q, alternateASig;
   3069     sbits64 sigMean;
   3070 
   3071     aSig = extractFloat64Frac( a );
   3072     aExp = extractFloat64Exp( a );
   3073     aSign = extractFloat64Sign( a );
   3074     bSig = extractFloat64Frac( b );
   3075     bExp = extractFloat64Exp( b );
   3076     bSign = extractFloat64Sign( b );
   3077     if ( aExp == 0x7FF ) {
   3078         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   3079             return propagateFloat64NaN( a, b STATUS_VAR );
   3080         }
   3081         float_raise( float_flag_invalid STATUS_VAR);
   3082         return float64_default_nan;
   3083     }
   3084     if ( bExp == 0x7FF ) {
   3085         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
   3086         return a;
   3087     }
   3088     if ( bExp == 0 ) {
   3089         if ( bSig == 0 ) {
   3090             float_raise( float_flag_invalid STATUS_VAR);
   3091             return float64_default_nan;
   3092         }
   3093         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3094     }
   3095     if ( aExp == 0 ) {
   3096         if ( aSig == 0 ) return a;
   3097         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3098     }
   3099     expDiff = aExp - bExp;
   3100     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   3101     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3102     if ( expDiff < 0 ) {
   3103         if ( expDiff < -1 ) return a;
   3104         aSig >>= 1;
   3105     }
   3106     q = ( bSig <= aSig );
   3107     if ( q ) aSig -= bSig;
   3108     expDiff -= 64;
   3109     while ( 0 < expDiff ) {
   3110         q = estimateDiv128To64( aSig, 0, bSig );
   3111         q = ( 2 < q ) ? q - 2 : 0;
   3112         aSig = - ( ( bSig>>2 ) * q );
   3113         expDiff -= 62;
   3114     }
   3115     expDiff += 64;
   3116     if ( 0 < expDiff ) {
   3117         q = estimateDiv128To64( aSig, 0, bSig );
   3118         q = ( 2 < q ) ? q - 2 : 0;
   3119         q >>= 64 - expDiff;
   3120         bSig >>= 2;
   3121         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   3122     }
   3123     else {
   3124         aSig >>= 2;
   3125         bSig >>= 2;
   3126     }
   3127     do {
   3128         alternateASig = aSig;
   3129         ++q;
   3130         aSig -= bSig;
   3131     } while ( 0 <= (sbits64) aSig );
   3132     sigMean = aSig + alternateASig;
   3133     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   3134         aSig = alternateASig;
   3135     }
   3136     zSign = ( (sbits64) aSig < 0 );
   3137     if ( zSign ) aSig = - aSig;
   3138     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
   3139 
   3140 }
   3141 
   3142 /*----------------------------------------------------------------------------
   3143 | Returns the square root of the double-precision floating-point value `a'.
   3144 | The operation is performed according to the IEC/IEEE Standard for Binary
   3145 | Floating-Point Arithmetic.
   3146 *----------------------------------------------------------------------------*/
   3147 
   3148 float64 float64_sqrt( float64 a STATUS_PARAM )
   3149 {
   3150     flag aSign;
   3151     int16 aExp, zExp;
   3152     bits64 aSig, zSig, doubleZSig;
   3153     bits64 rem0, rem1, term0, term1;
   3154 
   3155     aSig = extractFloat64Frac( a );
   3156     aExp = extractFloat64Exp( a );
   3157     aSign = extractFloat64Sign( a );
   3158     if ( aExp == 0x7FF ) {
   3159         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
   3160         if ( ! aSign ) return a;
   3161         float_raise( float_flag_invalid STATUS_VAR);
   3162         return float64_default_nan;
   3163     }
   3164     if ( aSign ) {
   3165         if ( ( aExp | aSig ) == 0 ) return a;
   3166         float_raise( float_flag_invalid STATUS_VAR);
   3167         return float64_default_nan;
   3168     }
   3169     if ( aExp == 0 ) {
   3170         if ( aSig == 0 ) return float64_zero;
   3171         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3172     }
   3173     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3174     aSig |= LIT64( 0x0010000000000000 );
   3175     zSig = estimateSqrt32( aExp, aSig>>21 );
   3176     aSig <<= 9 - ( aExp & 1 );
   3177     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3178     if ( ( zSig & 0x1FF ) <= 5 ) {
   3179         doubleZSig = zSig<<1;
   3180         mul64To128( zSig, zSig, &term0, &term1 );
   3181         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3182         while ( (sbits64) rem0 < 0 ) {
   3183             --zSig;
   3184             doubleZSig -= 2;
   3185             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3186         }
   3187         zSig |= ( ( rem0 | rem1 ) != 0 );
   3188     }
   3189     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
   3190 
   3191 }
   3192 
   3193 /*----------------------------------------------------------------------------
   3194 | Returns the binary log of the double-precision floating-point value `a'.
   3195 | The operation is performed according to the IEC/IEEE Standard for Binary
   3196 | Floating-Point Arithmetic.
   3197 *----------------------------------------------------------------------------*/
   3198 float64 float64_log2( float64 a STATUS_PARAM )
   3199 {
   3200     flag aSign, zSign;
   3201     int16 aExp;
   3202     bits64 aSig, aSig0, aSig1, zSig, i;
   3203 
   3204     aSig = extractFloat64Frac( a );
   3205     aExp = extractFloat64Exp( a );
   3206     aSign = extractFloat64Sign( a );
   3207 
   3208     if ( aExp == 0 ) {
   3209         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
   3210         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3211     }
   3212     if ( aSign ) {
   3213         float_raise( float_flag_invalid STATUS_VAR);
   3214         return float64_default_nan;
   3215     }
   3216     if ( aExp == 0x7FF ) {
   3217         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
   3218         return a;
   3219     }
   3220 
   3221     aExp -= 0x3FF;
   3222     aSig |= LIT64( 0x0010000000000000 );
   3223     zSign = aExp < 0;
   3224     zSig = (bits64)aExp << 52;
   3225     for (i = 1LL << 51; i > 0; i >>= 1) {
   3226         mul64To128( aSig, aSig, &aSig0, &aSig1 );
   3227         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
   3228         if ( aSig & LIT64( 0x0020000000000000 ) ) {
   3229             aSig >>= 1;
   3230             zSig |= i;
   3231         }
   3232     }
   3233 
   3234     if ( zSign )
   3235         zSig = -zSig;
   3236     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
   3237 }
   3238 
   3239 /*----------------------------------------------------------------------------
   3240 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3241 | corresponding value `b', and 0 otherwise.  The comparison is performed
   3242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3243 *----------------------------------------------------------------------------*/
   3244 
   3245 int float64_eq( float64 a, float64 b STATUS_PARAM )
   3246 {
   3247     bits64 av, bv;
   3248 
   3249     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3250          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3251        ) {
   3252         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3253             float_raise( float_flag_invalid STATUS_VAR);
   3254         }
   3255         return 0;
   3256     }
   3257     av = float64_val(a);
   3258     bv = float64_val(b);
   3259     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3260 
   3261 }
   3262 
   3263 /*----------------------------------------------------------------------------
   3264 | Returns 1 if the double-precision floating-point value `a' is less than or
   3265 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3266 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   3267 | Arithmetic.
   3268 *----------------------------------------------------------------------------*/
   3269 
   3270 int float64_le( float64 a, float64 b STATUS_PARAM )
   3271 {
   3272     flag aSign, bSign;
   3273     bits64 av, bv;
   3274 
   3275     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3276          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3277        ) {
   3278         float_raise( float_flag_invalid STATUS_VAR);
   3279         return 0;
   3280     }
   3281     aSign = extractFloat64Sign( a );
   3282     bSign = extractFloat64Sign( b );
   3283     av = float64_val(a);
   3284     bv = float64_val(b);
   3285     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3286     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   3287 
   3288 }
   3289 
   3290 /*----------------------------------------------------------------------------
   3291 | Returns 1 if the double-precision floating-point value `a' is less than
   3292 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   3293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3294 *----------------------------------------------------------------------------*/
   3295 
   3296 int float64_lt( float64 a, float64 b STATUS_PARAM )
   3297 {
   3298     flag aSign, bSign;
   3299     bits64 av, bv;
   3300 
   3301     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3302          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3303        ) {
   3304         float_raise( float_flag_invalid STATUS_VAR);
   3305         return 0;
   3306     }
   3307     aSign = extractFloat64Sign( a );
   3308     bSign = extractFloat64Sign( b );
   3309     av = float64_val(a);
   3310     bv = float64_val(b);
   3311     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
   3312     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   3313 
   3314 }
   3315 
   3316 /*----------------------------------------------------------------------------
   3317 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3318 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3319 | if either operand is a NaN.  Otherwise, the comparison is performed
   3320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3321 *----------------------------------------------------------------------------*/
   3322 
   3323 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
   3324 {
   3325     bits64 av, bv;
   3326 
   3327     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3328          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3329        ) {
   3330         float_raise( float_flag_invalid STATUS_VAR);
   3331         return 0;
   3332     }
   3333     av = float64_val(a);
   3334     bv = float64_val(b);
   3335     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3336 
   3337 }
   3338 
   3339 /*----------------------------------------------------------------------------
   3340 | Returns 1 if the double-precision floating-point value `a' is less than or
   3341 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3342 | cause an exception.  Otherwise, the comparison is performed according to the
   3343 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3344 *----------------------------------------------------------------------------*/
   3345 
   3346 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
   3347 {
   3348     flag aSign, bSign;
   3349     bits64 av, bv;
   3350 
   3351     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3352          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3353        ) {
   3354         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3355             float_raise( float_flag_invalid STATUS_VAR);
   3356         }
   3357         return 0;
   3358     }
   3359     aSign = extractFloat64Sign( a );
   3360     bSign = extractFloat64Sign( b );
   3361     av = float64_val(a);
   3362     bv = float64_val(b);
   3363     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
   3364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
   3365 
   3366 }
   3367 
   3368 /*----------------------------------------------------------------------------
   3369 | Returns 1 if the double-precision floating-point value `a' is less than
   3370 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3371 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3372 | Standard for Binary Floating-Point Arithmetic.
   3373 *----------------------------------------------------------------------------*/
   3374 
   3375 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
   3376 {
   3377     flag aSign, bSign;
   3378     bits64 av, bv;
   3379 
   3380     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3381          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3382        ) {
   3383         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3384             float_raise( float_flag_invalid STATUS_VAR);
   3385         }
   3386         return 0;
   3387     }
   3388     aSign = extractFloat64Sign( a );
   3389     bSign = extractFloat64Sign( b );
   3390     av = float64_val(a);
   3391     bv = float64_val(b);
   3392     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
   3393     return ( av != bv ) && ( aSign ^ ( av < bv ) );
   3394 
   3395 }
   3396 
   3397 #ifdef FLOATX80
   3398 
   3399 /*----------------------------------------------------------------------------
   3400 | Returns the result of converting the extended double-precision floating-
   3401 | point value `a' to the 32-bit two's complement integer format.  The
   3402 | conversion is performed according to the IEC/IEEE Standard for Binary
   3403 | Floating-Point Arithmetic---which means in particular that the conversion
   3404 | is rounded according to the current rounding mode.  If `a' is a NaN, the
   3405 | largest positive integer is returned.  Otherwise, if the conversion
   3406 | overflows, the largest integer with the same sign as `a' is returned.
   3407 *----------------------------------------------------------------------------*/
   3408 
   3409 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
   3410 {
   3411     flag aSign;
   3412     int32 aExp, shiftCount;
   3413     bits64 aSig;
   3414 
   3415     aSig = extractFloatx80Frac( a );
   3416     aExp = extractFloatx80Exp( a );
   3417     aSign = extractFloatx80Sign( a );
   3418     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3419     shiftCount = 0x4037 - aExp;
   3420     if ( shiftCount <= 0 ) shiftCount = 1;
   3421     shift64RightJamming( aSig, shiftCount, &aSig );
   3422     return roundAndPackInt32( aSign, aSig STATUS_VAR );
   3423 
   3424 }
   3425 
   3426 /*----------------------------------------------------------------------------
   3427 | Returns the result of converting the extended double-precision floating-
   3428 | point value `a' to the 32-bit two's complement integer format.  The
   3429 | conversion is performed according to the IEC/IEEE Standard for Binary
   3430 | Floating-Point Arithmetic, except that the conversion is always rounded
   3431 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3432 | Otherwise, if the conversion overflows, the largest integer with the same
   3433 | sign as `a' is returned.
   3434 *----------------------------------------------------------------------------*/
   3435 
   3436 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
   3437 {
   3438     flag aSign;
   3439     int32 aExp, shiftCount;
   3440     bits64 aSig, savedASig;
   3441     int32 z;
   3442 
   3443     aSig = extractFloatx80Frac( a );
   3444     aExp = extractFloatx80Exp( a );
   3445     aSign = extractFloatx80Sign( a );
   3446     if ( 0x401E < aExp ) {
   3447         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3448         goto invalid;
   3449     }
   3450     else if ( aExp < 0x3FFF ) {
   3451         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   3452         return 0;
   3453     }
   3454     shiftCount = 0x403E - aExp;
   3455     savedASig = aSig;
   3456     aSig >>= shiftCount;
   3457     z = aSig;
   3458     if ( aSign ) z = - z;
   3459     if ( ( z < 0 ) ^ aSign ) {
   3460  invalid:
   3461         float_raise( float_flag_invalid STATUS_VAR);
   3462         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   3463     }
   3464     if ( ( aSig<<shiftCount ) != savedASig ) {
   3465         STATUS(float_exception_flags) |= float_flag_inexact;
   3466     }
   3467     return z;
   3468 
   3469 }
   3470 
   3471 /*----------------------------------------------------------------------------
   3472 | Returns the result of converting the extended double-precision floating-
   3473 | point value `a' to the 64-bit two's complement integer format.  The
   3474 | conversion is performed according to the IEC/IEEE Standard for Binary
   3475 | Floating-Point Arithmetic---which means in particular that the conversion
   3476 | is rounded according to the current rounding mode.  If `a' is a NaN,
   3477 | the largest positive integer is returned.  Otherwise, if the conversion
   3478 | overflows, the largest integer with the same sign as `a' is returned.
   3479 *----------------------------------------------------------------------------*/
   3480 
   3481 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
   3482 {
   3483     flag aSign;
   3484     int32 aExp, shiftCount;
   3485     bits64 aSig, aSigExtra;
   3486 
   3487     aSig = extractFloatx80Frac( a );
   3488     aExp = extractFloatx80Exp( a );
   3489     aSign = extractFloatx80Sign( a );
   3490     shiftCount = 0x403E - aExp;
   3491     if ( shiftCount <= 0 ) {
   3492         if ( shiftCount ) {
   3493             float_raise( float_flag_invalid STATUS_VAR);
   3494             if (    ! aSign
   3495                  || (    ( aExp == 0x7FFF )
   3496                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3497                ) {
   3498                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3499             }
   3500             return (sbits64) LIT64( 0x8000000000000000 );
   3501         }
   3502         aSigExtra = 0;
   3503     }
   3504     else {
   3505         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3506     }
   3507     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
   3508 
   3509 }
   3510 
   3511 /*----------------------------------------------------------------------------
   3512 | Returns the result of converting the extended double-precision floating-
   3513 | point value `a' to the 64-bit two's complement integer format.  The
   3514 | conversion is performed according to the IEC/IEEE Standard for Binary
   3515 | Floating-Point Arithmetic, except that the conversion is always rounded
   3516 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3517 | Otherwise, if the conversion overflows, the largest integer with the same
   3518 | sign as `a' is returned.
   3519 *----------------------------------------------------------------------------*/
   3520 
   3521 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
   3522 {
   3523     flag aSign;
   3524     int32 aExp, shiftCount;
   3525     bits64 aSig;
   3526     int64 z;
   3527 
   3528     aSig = extractFloatx80Frac( a );
   3529     aExp = extractFloatx80Exp( a );
   3530     aSign = extractFloatx80Sign( a );
   3531     shiftCount = aExp - 0x403E;
   3532     if ( 0 <= shiftCount ) {
   3533         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3534         if ( ( a.high != 0xC03E ) || aSig ) {
   3535             float_raise( float_flag_invalid STATUS_VAR);
   3536             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3537                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3538             }
   3539         }
   3540         return (sbits64) LIT64( 0x8000000000000000 );
   3541     }
   3542     else if ( aExp < 0x3FFF ) {
   3543         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
   3544         return 0;
   3545     }
   3546     z = aSig>>( - shiftCount );
   3547     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3548         STATUS(float_exception_flags) |= float_flag_inexact;
   3549     }
   3550     if ( aSign ) z = - z;
   3551     return z;
   3552 
   3553 }
   3554 
   3555 /*----------------------------------------------------------------------------
   3556 | Returns the result of converting the extended double-precision floating-
   3557 | point value `a' to the single-precision floating-point format.  The
   3558 | conversion is performed according to the IEC/IEEE Standard for Binary
   3559 | Floating-Point Arithmetic.
   3560 *----------------------------------------------------------------------------*/
   3561 
   3562 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
   3563 {
   3564     flag aSign;
   3565     int32 aExp;
   3566     bits64 aSig;
   3567 
   3568     aSig = extractFloatx80Frac( a );
   3569     aExp = extractFloatx80Exp( a );
   3570     aSign = extractFloatx80Sign( a );
   3571     if ( aExp == 0x7FFF ) {
   3572         if ( (bits64) ( aSig<<1 ) ) {
   3573             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
   3574         }
   3575         return packFloat32( aSign, 0xFF, 0 );
   3576     }
   3577     shift64RightJamming( aSig, 33, &aSig );
   3578     if ( aExp || aSig ) aExp -= 0x3F81;
   3579     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   3580 
   3581 }
   3582 
   3583 /*----------------------------------------------------------------------------
   3584 | Returns the result of converting the extended double-precision floating-
   3585 | point value `a' to the double-precision floating-point format.  The
   3586 | conversion is performed according to the IEC/IEEE Standard for Binary
   3587 | Floating-Point Arithmetic.
   3588 *----------------------------------------------------------------------------*/
   3589 
   3590 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
   3591 {
   3592     flag aSign;
   3593     int32 aExp;
   3594     bits64 aSig, zSig;
   3595 
   3596     aSig = extractFloatx80Frac( a );
   3597     aExp = extractFloatx80Exp( a );
   3598     aSign = extractFloatx80Sign( a );
   3599     if ( aExp == 0x7FFF ) {
   3600         if ( (bits64) ( aSig<<1 ) ) {
   3601             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
   3602         }
   3603         return packFloat64( aSign, 0x7FF, 0 );
   3604     }
   3605     shift64RightJamming( aSig, 1, &zSig );
   3606     if ( aExp || aSig ) aExp -= 0x3C01;
   3607     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
   3608 
   3609 }
   3610 
   3611 #ifdef FLOAT128
   3612 
   3613 /*----------------------------------------------------------------------------
   3614 | Returns the result of converting the extended double-precision floating-
   3615 | point value `a' to the quadruple-precision floating-point format.  The
   3616 | conversion is performed according to the IEC/IEEE Standard for Binary
   3617 | Floating-Point Arithmetic.
   3618 *----------------------------------------------------------------------------*/
   3619 
   3620 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
   3621 {
   3622     flag aSign;
   3623     int16 aExp;
   3624     bits64 aSig, zSig0, zSig1;
   3625 
   3626     aSig = extractFloatx80Frac( a );
   3627     aExp = extractFloatx80Exp( a );
   3628     aSign = extractFloatx80Sign( a );
   3629     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
   3630         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
   3631     }
   3632     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   3633     return packFloat128( aSign, aExp, zSig0, zSig1 );
   3634 
   3635 }
   3636 
   3637 #endif
   3638 
   3639 /*----------------------------------------------------------------------------
   3640 | Rounds the extended double-precision floating-point value `a' to an integer,
   3641 | and returns the result as an extended quadruple-precision floating-point
   3642 | value.  The operation is performed according to the IEC/IEEE Standard for
   3643 | Binary Floating-Point Arithmetic.
   3644 *----------------------------------------------------------------------------*/
   3645 
   3646 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
   3647 {
   3648     flag aSign;
   3649     int32 aExp;
   3650     bits64 lastBitMask, roundBitsMask;
   3651     int8 roundingMode;
   3652     floatx80 z;
   3653 
   3654     aExp = extractFloatx80Exp( a );
   3655     if ( 0x403E <= aExp ) {
   3656         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3657             return propagateFloatx80NaN( a, a STATUS_VAR );
   3658         }
   3659         return a;
   3660     }
   3661     if ( aExp < 0x3FFF ) {
   3662         if (    ( aExp == 0 )
   3663              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3664             return a;
   3665         }
   3666         STATUS(float_exception_flags) |= float_flag_inexact;
   3667         aSign = extractFloatx80Sign( a );
   3668         switch ( STATUS(float_rounding_mode) ) {
   3669          case float_round_nearest_even:
   3670             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3671                ) {
   3672                 return
   3673                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3674             }
   3675             break;
   3676          case float_round_down:
   3677             return
   3678                   aSign ?
   3679                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3680                 : packFloatx80( 0, 0, 0 );
   3681          case float_round_up:
   3682             return
   3683                   aSign ? packFloatx80( 1, 0, 0 )
   3684                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3685         }
   3686         return packFloatx80( aSign, 0, 0 );
   3687     }
   3688     lastBitMask = 1;
   3689     lastBitMask <<= 0x403E - aExp;
   3690     roundBitsMask = lastBitMask - 1;
   3691     z = a;
   3692     roundingMode = STATUS(float_rounding_mode);
   3693     if ( roundingMode == float_round_nearest_even ) {
   3694         z.low += lastBitMask>>1;
   3695         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3696     }
   3697     else if ( roundingMode != float_round_to_zero ) {
   3698         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3699             z.low += roundBitsMask;
   3700         }
   3701     }
   3702     z.low &= ~ roundBitsMask;
   3703     if ( z.low == 0 ) {
   3704         ++z.high;
   3705         z.low = LIT64( 0x8000000000000000 );
   3706     }
   3707     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
   3708     return z;
   3709 
   3710 }
   3711 
   3712 /*----------------------------------------------------------------------------
   3713 | Returns the result of adding the absolute values of the extended double-
   3714 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3715 | negated before being returned.  `zSign' is ignored if the result is a NaN.
   3716 | The addition is performed according to the IEC/IEEE Standard for Binary
   3717 | Floating-Point Arithmetic.
   3718 *----------------------------------------------------------------------------*/
   3719 
   3720 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
   3721 {
   3722     int32 aExp, bExp, zExp;
   3723     bits64 aSig, bSig, zSig0, zSig1;
   3724     int32 expDiff;
   3725 
   3726     aSig = extractFloatx80Frac( a );
   3727     aExp = extractFloatx80Exp( a );
   3728     bSig = extractFloatx80Frac( b );
   3729     bExp = extractFloatx80Exp( b );
   3730     expDiff = aExp - bExp;
   3731     if ( 0 < expDiff ) {
   3732         if ( aExp == 0x7FFF ) {
   3733             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3734             return a;
   3735         }
   3736         if ( bExp == 0 ) --expDiff;
   3737         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3738         zExp = aExp;
   3739     }
   3740     else if ( expDiff < 0 ) {
   3741         if ( bExp == 0x7FFF ) {
   3742             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3743             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3744         }
   3745         if ( aExp == 0 ) ++expDiff;
   3746         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3747         zExp = bExp;
   3748     }
   3749     else {
   3750         if ( aExp == 0x7FFF ) {
   3751             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3752                 return propagateFloatx80NaN( a, b STATUS_VAR );
   3753             }
   3754             return a;
   3755         }
   3756         zSig1 = 0;
   3757         zSig0 = aSig + bSig;
   3758         if ( aExp == 0 ) {
   3759             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3760             goto roundAndPack;
   3761         }
   3762         zExp = aExp;
   3763         goto shiftRight1;
   3764     }
   3765     zSig0 = aSig + bSig;
   3766     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
   3767  shiftRight1:
   3768     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3769     zSig0 |= LIT64( 0x8000000000000000 );
   3770     ++zExp;
   3771  roundAndPack:
   3772     return
   3773         roundAndPackFloatx80(
   3774             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3775 
   3776 }
   3777 
   3778 /*----------------------------------------------------------------------------
   3779 | Returns the result of subtracting the absolute values of the extended
   3780 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3781 | difference is negated before being returned.  `zSign' is ignored if the
   3782 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3783 | Standard for Binary Floating-Point Arithmetic.
   3784 *----------------------------------------------------------------------------*/
   3785 
   3786 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
   3787 {
   3788     int32 aExp, bExp, zExp;
   3789     bits64 aSig, bSig, zSig0, zSig1;
   3790     int32 expDiff;
   3791     floatx80 z;
   3792 
   3793     aSig = extractFloatx80Frac( a );
   3794     aExp = extractFloatx80Exp( a );
   3795     bSig = extractFloatx80Frac( b );
   3796     bExp = extractFloatx80Exp( b );
   3797     expDiff = aExp - bExp;
   3798     if ( 0 < expDiff ) goto aExpBigger;
   3799     if ( expDiff < 0 ) goto bExpBigger;
   3800     if ( aExp == 0x7FFF ) {
   3801         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3802             return propagateFloatx80NaN( a, b STATUS_VAR );
   3803         }
   3804         float_raise( float_flag_invalid STATUS_VAR);
   3805         z.low = floatx80_default_nan_low;
   3806         z.high = floatx80_default_nan_high;
   3807         return z;
   3808     }
   3809     if ( aExp == 0 ) {
   3810         aExp = 1;
   3811         bExp = 1;
   3812     }
   3813     zSig1 = 0;
   3814     if ( bSig < aSig ) goto aBigger;
   3815     if ( aSig < bSig ) goto bBigger;
   3816     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
   3817  bExpBigger:
   3818     if ( bExp == 0x7FFF ) {
   3819         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3820         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3821     }
   3822     if ( aExp == 0 ) ++expDiff;
   3823     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3824  bBigger:
   3825     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3826     zExp = bExp;
   3827     zSign ^= 1;
   3828     goto normalizeRoundAndPack;
   3829  aExpBigger:
   3830     if ( aExp == 0x7FFF ) {
   3831         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3832         return a;
   3833     }
   3834     if ( bExp == 0 ) --expDiff;
   3835     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3836  aBigger:
   3837     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3838     zExp = aExp;
   3839  normalizeRoundAndPack:
   3840     return
   3841         normalizeRoundAndPackFloatx80(
   3842             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3843 
   3844 }
   3845 
   3846 /*----------------------------------------------------------------------------
   3847 | Returns the result of adding the extended double-precision floating-point
   3848 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3849 | Standard for Binary Floating-Point Arithmetic.
   3850 *----------------------------------------------------------------------------*/
   3851 
   3852 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
   3853 {
   3854     flag aSign, bSign;
   3855 
   3856     aSign = extractFloatx80Sign( a );
   3857     bSign = extractFloatx80Sign( b );
   3858     if ( aSign == bSign ) {
   3859         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   3860     }
   3861     else {
   3862         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   3863     }
   3864 
   3865 }
   3866 
   3867 /*----------------------------------------------------------------------------
   3868 | Returns the result of subtracting the extended double-precision floating-
   3869 | point values `a' and `b'.  The operation is performed according to the
   3870 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3871 *----------------------------------------------------------------------------*/
   3872 
   3873 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
   3874 {
   3875     flag aSign, bSign;
   3876 
   3877     aSign = extractFloatx80Sign( a );
   3878     bSign = extractFloatx80Sign( b );
   3879     if ( aSign == bSign ) {
   3880         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
   3881     }
   3882     else {
   3883         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
   3884     }
   3885 
   3886 }
   3887 
   3888 /*----------------------------------------------------------------------------
   3889 | Returns the result of multiplying the extended double-precision floating-
   3890 | point values `a' and `b'.  The operation is performed according to the
   3891 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3892 *----------------------------------------------------------------------------*/
   3893 
   3894 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
   3895 {
   3896     flag aSign, bSign, zSign;
   3897     int32 aExp, bExp, zExp;
   3898     bits64 aSig, bSig, zSig0, zSig1;
   3899     floatx80 z;
   3900 
   3901     aSig = extractFloatx80Frac( a );
   3902     aExp = extractFloatx80Exp( a );
   3903     aSign = extractFloatx80Sign( a );
   3904     bSig = extractFloatx80Frac( b );
   3905     bExp = extractFloatx80Exp( b );
   3906     bSign = extractFloatx80Sign( b );
   3907     zSign = aSign ^ bSign;
   3908     if ( aExp == 0x7FFF ) {
   3909         if (    (bits64) ( aSig<<1 )
   3910              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3911             return propagateFloatx80NaN( a, b STATUS_VAR );
   3912         }
   3913         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3914         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3915     }
   3916     if ( bExp == 0x7FFF ) {
   3917         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3918         if ( ( aExp | aSig ) == 0 ) {
   3919  invalid:
   3920             float_raise( float_flag_invalid STATUS_VAR);
   3921             z.low = floatx80_default_nan_low;
   3922             z.high = floatx80_default_nan_high;
   3923             return z;
   3924         }
   3925         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3926     }
   3927     if ( aExp == 0 ) {
   3928         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3929         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3930     }
   3931     if ( bExp == 0 ) {
   3932         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3933         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3934     }
   3935     zExp = aExp + bExp - 0x3FFE;
   3936     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3937     if ( 0 < (sbits64) zSig0 ) {
   3938         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3939         --zExp;
   3940     }
   3941     return
   3942         roundAndPackFloatx80(
   3943             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   3944 
   3945 }
   3946 
   3947 /*----------------------------------------------------------------------------
   3948 | Returns the result of dividing the extended double-precision floating-point
   3949 | value `a' by the corresponding value `b'.  The operation is performed
   3950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3951 *----------------------------------------------------------------------------*/
   3952 
   3953 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
   3954 {
   3955     flag aSign, bSign, zSign;
   3956     int32 aExp, bExp, zExp;
   3957     bits64 aSig, bSig, zSig0, zSig1;
   3958     bits64 rem0, rem1, rem2, term0, term1, term2;
   3959     floatx80 z;
   3960 
   3961     aSig = extractFloatx80Frac( a );
   3962     aExp = extractFloatx80Exp( a );
   3963     aSign = extractFloatx80Sign( a );
   3964     bSig = extractFloatx80Frac( b );
   3965     bExp = extractFloatx80Exp( b );
   3966     bSign = extractFloatx80Sign( b );
   3967     zSign = aSign ^ bSign;
   3968     if ( aExp == 0x7FFF ) {
   3969         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3970         if ( bExp == 0x7FFF ) {
   3971             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3972             goto invalid;
   3973         }
   3974         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3975     }
   3976     if ( bExp == 0x7FFF ) {
   3977         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   3978         return packFloatx80( zSign, 0, 0 );
   3979     }
   3980     if ( bExp == 0 ) {
   3981         if ( bSig == 0 ) {
   3982             if ( ( aExp | aSig ) == 0 ) {
   3983  invalid:
   3984                 float_raise( float_flag_invalid STATUS_VAR);
   3985                 z.low = floatx80_default_nan_low;
   3986                 z.high = floatx80_default_nan_high;
   3987                 return z;
   3988             }
   3989             float_raise( float_flag_divbyzero STATUS_VAR);
   3990             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3991         }
   3992         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3993     }
   3994     if ( aExp == 0 ) {
   3995         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3996         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3997     }
   3998     zExp = aExp - bExp + 0x3FFE;
   3999     rem1 = 0;
   4000     if ( bSig <= aSig ) {
   4001         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   4002         ++zExp;
   4003     }
   4004     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   4005     mul64To128( bSig, zSig0, &term0, &term1 );
   4006     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   4007     while ( (sbits64) rem0 < 0 ) {
   4008         --zSig0;
   4009         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   4010     }
   4011     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   4012     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   4013         mul64To128( bSig, zSig1, &term1, &term2 );
   4014         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4015         while ( (sbits64) rem1 < 0 ) {
   4016             --zSig1;
   4017             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   4018         }
   4019         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   4020     }
   4021     return
   4022         roundAndPackFloatx80(
   4023             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
   4024 
   4025 }
   4026 
   4027 /*----------------------------------------------------------------------------
   4028 | Returns the remainder of the extended double-precision floating-point value
   4029 | `a' with respect to the corresponding value `b'.  The operation is performed
   4030 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4031 *----------------------------------------------------------------------------*/
   4032 
   4033 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
   4034 {
   4035     flag aSign, bSign, zSign;
   4036     int32 aExp, bExp, expDiff;
   4037     bits64 aSig0, aSig1, bSig;
   4038     bits64 q, term0, term1, alternateASig0, alternateASig1;
   4039     floatx80 z;
   4040 
   4041     aSig0 = extractFloatx80Frac( a );
   4042     aExp = extractFloatx80Exp( a );
   4043     aSign = extractFloatx80Sign( a );
   4044     bSig = extractFloatx80Frac( b );
   4045     bExp = extractFloatx80Exp( b );
   4046     bSign = extractFloatx80Sign( b );
   4047     if ( aExp == 0x7FFF ) {
   4048         if (    (bits64) ( aSig0<<1 )
   4049              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   4050             return propagateFloatx80NaN( a, b STATUS_VAR );
   4051         }
   4052         goto invalid;
   4053     }
   4054     if ( bExp == 0x7FFF ) {
   4055         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
   4056         return a;
   4057     }
   4058     if ( bExp == 0 ) {
   4059         if ( bSig == 0 ) {
   4060  invalid:
   4061             float_raise( float_flag_invalid STATUS_VAR);
   4062             z.low = floatx80_default_nan_low;
   4063             z.high = floatx80_default_nan_high;
   4064             return z;
   4065         }
   4066         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   4067     }
   4068     if ( aExp == 0 ) {
   4069         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   4070         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4071     }
   4072     bSig |= LIT64( 0x8000000000000000 );
   4073     zSign = aSign;
   4074     expDiff = aExp - bExp;
   4075     aSig1 = 0;
   4076     if ( expDiff < 0 ) {
   4077         if ( expDiff < -1 ) return a;
   4078         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   4079         expDiff = 0;
   4080     }
   4081     q = ( bSig <= aSig0 );
   4082     if ( q ) aSig0 -= bSig;
   4083     expDiff -= 64;
   4084     while ( 0 < expDiff ) {
   4085         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4086         q = ( 2 < q ) ? q - 2 : 0;
   4087         mul64To128( bSig, q, &term0, &term1 );
   4088         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4089         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   4090         expDiff -= 62;
   4091     }
   4092     expDiff += 64;
   4093     if ( 0 < expDiff ) {
   4094         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4095         q = ( 2 < q ) ? q - 2 : 0;
   4096         q >>= 64 - expDiff;
   4097         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   4098         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4099         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   4100         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   4101             ++q;
   4102             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4103         }
   4104     }
   4105     else {
   4106         term1 = 0;
   4107         term0 = bSig;
   4108     }
   4109     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   4110     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4111          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4112               && ( q & 1 ) )
   4113        ) {
   4114         aSig0 = alternateASig0;
   4115         aSig1 = alternateASig1;
   4116         zSign = ! zSign;
   4117     }
   4118     return
   4119         normalizeRoundAndPackFloatx80(
   4120             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
   4121 
   4122 }
   4123 
   4124 /*----------------------------------------------------------------------------
   4125 | Returns the square root of the extended double-precision floating-point
   4126 | value `a'.  The operation is performed according to the IEC/IEEE Standard
   4127 | for Binary Floating-Point Arithmetic.
   4128 *----------------------------------------------------------------------------*/
   4129 
   4130 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
   4131 {
   4132     flag aSign;
   4133     int32 aExp, zExp;
   4134     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   4135     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4136     floatx80 z;
   4137 
   4138     aSig0 = extractFloatx80Frac( a );
   4139     aExp = extractFloatx80Exp( a );
   4140     aSign = extractFloatx80Sign( a );
   4141     if ( aExp == 0x7FFF ) {
   4142         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
   4143         if ( ! aSign ) return a;
   4144         goto invalid;
   4145     }
   4146     if ( aSign ) {
   4147         if ( ( aExp | aSig0 ) == 0 ) return a;
   4148  invalid:
   4149         float_raise( float_flag_invalid STATUS_VAR);
   4150         z.low = floatx80_default_nan_low;
   4151         z.high = floatx80_default_nan_high;
   4152         return z;
   4153     }
   4154     if ( aExp == 0 ) {
   4155         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4156         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4157     }
   4158     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   4159     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4160     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4161     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4162     doubleZSig0 = zSig0<<1;
   4163     mul64To128( zSig0, zSig0, &term0, &term1 );
   4164     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4165     while ( (sbits64) rem0 < 0 ) {
   4166         --zSig0;
   4167         doubleZSig0 -= 2;
   4168         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4169     }
   4170     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   4171     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4172         if ( zSig1 == 0 ) zSig1 = 1;
   4173         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   4174         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4175         mul64To128( zSig1, zSig1, &term2, &term3 );
   4176         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   4177         while ( (sbits64) rem1 < 0 ) {
   4178             --zSig1;
   4179             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4180             term3 |= 1;
   4181             term2 |= doubleZSig0;
   4182             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4183         }
   4184         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4185     }
   4186     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4187     zSig0 |= doubleZSig0;
   4188     return
   4189         roundAndPackFloatx80(
   4190             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
   4191 
   4192 }
   4193 
   4194 /*----------------------------------------------------------------------------
   4195 | Returns 1 if the extended double-precision floating-point value `a' is
   4196 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   4197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   4198 | Arithmetic.
   4199 *----------------------------------------------------------------------------*/
   4200 
   4201 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
   4202 {
   4203 
   4204     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4205               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4206          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4207               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4208        ) {
   4209         if (    floatx80_is_signaling_nan( a )
   4210              || floatx80_is_signaling_nan( b ) ) {
   4211             float_raise( float_flag_invalid STATUS_VAR);
   4212         }
   4213         return 0;
   4214     }
   4215     return
   4216            ( a.low == b.low )
   4217         && (    ( a.high == b.high )
   4218              || (    ( a.low == 0 )
   4219                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4220            );
   4221 
   4222 }
   4223 
   4224 /*----------------------------------------------------------------------------
   4225 | Returns 1 if the extended double-precision floating-point value `a' is
   4226 | less than or equal to the corresponding value `b', and 0 otherwise.  The
   4227 | comparison is performed according to the IEC/IEEE Standard for Binary
   4228 | Floating-Point Arithmetic.
   4229 *----------------------------------------------------------------------------*/
   4230 
   4231 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
   4232 {
   4233     flag aSign, bSign;
   4234 
   4235     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4236               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4237          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4238               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4239        ) {
   4240         float_raise( float_flag_invalid STATUS_VAR);
   4241         return 0;
   4242     }
   4243     aSign = extractFloatx80Sign( a );
   4244     bSign = extractFloatx80Sign( b );
   4245     if ( aSign != bSign ) {
   4246         return
   4247                aSign
   4248             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4249                  == 0 );
   4250     }
   4251     return
   4252           aSign ? le128( b.high, b.low, a.high, a.low )
   4253         : le128( a.high, a.low, b.high, b.low );
   4254 
   4255 }
   4256 
   4257 /*----------------------------------------------------------------------------
   4258 | Returns 1 if the extended double-precision floating-point value `a' is
   4259 | less than the corresponding value `b', and 0 otherwise.  The comparison
   4260 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4261 | Arithmetic.
   4262 *----------------------------------------------------------------------------*/
   4263 
   4264 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
   4265 {
   4266     flag aSign, bSign;
   4267 
   4268     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4269               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4270          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4271               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4272        ) {
   4273         float_raise( float_flag_invalid STATUS_VAR);
   4274         return 0;
   4275     }
   4276     aSign = extractFloatx80Sign( a );
   4277     bSign = extractFloatx80Sign( b );
   4278     if ( aSign != bSign ) {
   4279         return
   4280                aSign
   4281             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4282                  != 0 );
   4283     }
   4284     return
   4285           aSign ? lt128( b.high, b.low, a.high, a.low )
   4286         : lt128( a.high, a.low, b.high, b.low );
   4287 
   4288 }
   4289 
   4290 /*----------------------------------------------------------------------------
   4291 | Returns 1 if the extended double-precision floating-point value `a' is equal
   4292 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4293 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   4294 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4295 *----------------------------------------------------------------------------*/
   4296 
   4297 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
   4298 {
   4299 
   4300     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4301               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4302          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4303               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4304        ) {
   4305         float_raise( float_flag_invalid STATUS_VAR);
   4306         return 0;
   4307     }
   4308     return
   4309            ( a.low == b.low )
   4310         && (    ( a.high == b.high )
   4311              || (    ( a.low == 0 )
   4312                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4313            );
   4314 
   4315 }
   4316 
   4317 /*----------------------------------------------------------------------------
   4318 | Returns 1 if the extended double-precision floating-point value `a' is less
   4319 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4320 | do not cause an exception.  Otherwise, the comparison is performed according
   4321 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4322 *----------------------------------------------------------------------------*/
   4323 
   4324 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4325 {
   4326     flag aSign, bSign;
   4327 
   4328     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4329               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4330          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4331               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4332        ) {
   4333         if (    floatx80_is_signaling_nan( a )
   4334              || floatx80_is_signaling_nan( b ) ) {
   4335             float_raise( float_flag_invalid STATUS_VAR);
   4336         }
   4337         return 0;
   4338     }
   4339     aSign = extractFloatx80Sign( a );
   4340     bSign = extractFloatx80Sign( b );
   4341     if ( aSign != bSign ) {
   4342         return
   4343                aSign
   4344             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4345                  == 0 );
   4346     }
   4347     return
   4348           aSign ? le128( b.high, b.low, a.high, a.low )
   4349         : le128( a.high, a.low, b.high, b.low );
   4350 
   4351 }
   4352 
   4353 /*----------------------------------------------------------------------------
   4354 | Returns 1 if the extended double-precision floating-point value `a' is less
   4355 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4356 | an exception.  Otherwise, the comparison is performed according to the
   4357 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4358 *----------------------------------------------------------------------------*/
   4359 
   4360 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
   4361 {
   4362     flag aSign, bSign;
   4363 
   4364     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4365               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4366          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4367               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4368        ) {
   4369         if (    floatx80_is_signaling_nan( a )
   4370              || floatx80_is_signaling_nan( b ) ) {
   4371             float_raise( float_flag_invalid STATUS_VAR);
   4372         }
   4373         return 0;
   4374     }
   4375     aSign = extractFloatx80Sign( a );
   4376     bSign = extractFloatx80Sign( b );
   4377     if ( aSign != bSign ) {
   4378         return
   4379                aSign
   4380             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4381                  != 0 );
   4382     }
   4383     return
   4384           aSign ? lt128( b.high, b.low, a.high, a.low )
   4385         : lt128( a.high, a.low, b.high, b.low );
   4386 
   4387 }
   4388 
   4389 #endif
   4390 
   4391 #ifdef FLOAT128
   4392 
   4393 /*----------------------------------------------------------------------------
   4394 | Returns the result of converting the quadruple-precision floating-point
   4395 | value `a' to the 32-bit two's complement integer format.  The conversion
   4396 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4397 | Arithmetic---which means in particular that the conversion is rounded
   4398 | according to the current rounding mode.  If `a' is a NaN, the largest
   4399 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4400 | largest integer with the same sign as `a' is returned.
   4401 *----------------------------------------------------------------------------*/
   4402 
   4403 int32 float128_to_int32( float128 a STATUS_PARAM )
   4404 {
   4405     flag aSign;
   4406     int32 aExp, shiftCount;
   4407     bits64 aSig0, aSig1;
   4408 
   4409     aSig1 = extractFloat128Frac1( a );
   4410     aSig0 = extractFloat128Frac0( a );
   4411     aExp = extractFloat128Exp( a );
   4412     aSign = extractFloat128Sign( a );
   4413     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4414     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4415     aSig0 |= ( aSig1 != 0 );
   4416     shiftCount = 0x4028 - aExp;
   4417     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4418     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
   4419 
   4420 }
   4421 
   4422 /*----------------------------------------------------------------------------
   4423 | Returns the result of converting the quadruple-precision floating-point
   4424 | value `a' to the 32-bit two's complement integer format.  The conversion
   4425 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4426 | Arithmetic, except that the conversion is always rounded toward zero.  If
   4427 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4428 | conversion overflows, the largest integer with the same sign as `a' is
   4429 | returned.
   4430 *----------------------------------------------------------------------------*/
   4431 
   4432 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
   4433 {
   4434     flag aSign;
   4435     int32 aExp, shiftCount;
   4436     bits64 aSig0, aSig1, savedASig;
   4437     int32 z;
   4438 
   4439     aSig1 = extractFloat128Frac1( a );
   4440     aSig0 = extractFloat128Frac0( a );
   4441     aExp = extractFloat128Exp( a );
   4442     aSign = extractFloat128Sign( a );
   4443     aSig0 |= ( aSig1 != 0 );
   4444     if ( 0x401E < aExp ) {
   4445         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4446         goto invalid;
   4447     }
   4448     else if ( aExp < 0x3FFF ) {
   4449         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
   4450         return 0;
   4451     }
   4452     aSig0 |= LIT64( 0x0001000000000000 );
   4453     shiftCount = 0x402F - aExp;
   4454     savedASig = aSig0;
   4455     aSig0 >>= shiftCount;
   4456     z = aSig0;
   4457     if ( aSign ) z = - z;
   4458     if ( ( z < 0 ) ^ aSign ) {
   4459  invalid:
   4460         float_raise( float_flag_invalid STATUS_VAR);
   4461         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4462     }
   4463     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4464         STATUS(float_exception_flags) |= float_flag_inexact;
   4465     }
   4466     return z;
   4467 
   4468 }
   4469 
   4470 /*----------------------------------------------------------------------------
   4471 | Returns the result of converting the quadruple-precision floating-point
   4472 | value `a' to the 64-bit two's complement integer format.  The conversion
   4473 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4474 | Arithmetic---which means in particular that the conversion is rounded
   4475 | according to the current rounding mode.  If `a' is a NaN, the largest
   4476 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4477 | largest integer with the same sign as `a' is returned.
   4478 *----------------------------------------------------------------------------*/
   4479 
   4480 int64 float128_to_int64( float128 a STATUS_PARAM )
   4481 {
   4482     flag aSign;
   4483     int32 aExp, shiftCount;
   4484     bits64 aSig0, aSig1;
   4485 
   4486     aSig1 = extractFloat128Frac1( a );
   4487     aSig0 = extractFloat128Frac0( a );
   4488     aExp = extractFloat128Exp( a );
   4489     aSign = extractFloat128Sign( a );
   4490     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4491     shiftCount = 0x402F - aExp;
   4492     if ( shiftCount <= 0 ) {
   4493         if ( 0x403E < aExp ) {
   4494             float_raise( float_flag_invalid STATUS_VAR);
   4495             if (    ! aSign
   4496                  || (    ( aExp == 0x7FFF )
   4497                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4498                     )
   4499                ) {
   4500                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4501             }
   4502             return (sbits64) LIT64( 0x8000000000000000 );
   4503         }
   4504         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4505     }
   4506     else {
   4507         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4508     }
   4509     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
   4510 
   4511 }
   4512 
   4513 /*----------------------------------------------------------------------------
   4514 | Returns the result of converting the quadruple-precision floating-point
   4515 | value `a' to the 64-bit two's complement integer format.  The conversion
   4516 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4517 | Arithmetic, except that the conversion is always rounded toward zero.
   4518 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4519 | the conversion overflows, the largest integer with the same sign as `a' is
   4520 | returned.
   4521 *----------------------------------------------------------------------------*/
   4522 
   4523 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
   4524 {
   4525     flag aSign;
   4526     int32 aExp, shiftCount;
   4527     bits64 aSig0, aSig1;
   4528     int64 z;
   4529 
   4530     aSig1 = extractFloat128Frac1( a );
   4531     aSig0 = extractFloat128Frac0( a );
   4532     aExp = extractFloat128Exp( a );
   4533     aSign = extractFloat128Sign( a );
   4534     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4535     shiftCount = aExp - 0x402F;
   4536     if ( 0 < shiftCount ) {
   4537         if ( 0x403E <= aExp ) {
   4538             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4539             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4540                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4541                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
   4542             }
   4543             else {
   4544                 float_raise( float_flag_invalid STATUS_VAR);
   4545                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4546                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4547                 }
   4548             }
   4549             return (sbits64) LIT64( 0x8000000000000000 );
   4550         }
   4551         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4552         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4553             STATUS(float_exception_flags) |= float_flag_inexact;
   4554         }
   4555     }
   4556     else {
   4557         if ( aExp < 0x3FFF ) {
   4558             if ( aExp | aSig0 | aSig1 ) {
   4559                 STATUS(float_exception_flags) |= float_flag_inexact;
   4560             }
   4561             return 0;
   4562         }
   4563         z = aSig0>>( - shiftCount );
   4564         if (    aSig1
   4565              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4566             STATUS(float_exception_flags) |= float_flag_inexact;
   4567         }
   4568     }
   4569     if ( aSign ) z = - z;
   4570     return z;
   4571 
   4572 }
   4573 
   4574 /*----------------------------------------------------------------------------
   4575 | Returns the result of converting the quadruple-precision floating-point
   4576 | value `a' to the single-precision floating-point format.  The conversion
   4577 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4578 | Arithmetic.
   4579 *----------------------------------------------------------------------------*/
   4580 
   4581 float32 float128_to_float32( float128 a STATUS_PARAM )
   4582 {
   4583     flag aSign;
   4584     int32 aExp;
   4585     bits64 aSig0, aSig1;
   4586     bits32 zSig;
   4587 
   4588     aSig1 = extractFloat128Frac1( a );
   4589     aSig0 = extractFloat128Frac0( a );
   4590     aExp = extractFloat128Exp( a );
   4591     aSign = extractFloat128Sign( a );
   4592     if ( aExp == 0x7FFF ) {
   4593         if ( aSig0 | aSig1 ) {
   4594             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
   4595         }
   4596         return packFloat32( aSign, 0xFF, 0 );
   4597     }
   4598     aSig0 |= ( aSig1 != 0 );
   4599     shift64RightJamming( aSig0, 18, &aSig0 );
   4600     zSig = aSig0;
   4601     if ( aExp || zSig ) {
   4602         zSig |= 0x40000000;
   4603         aExp -= 0x3F81;
   4604     }
   4605     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
   4606 
   4607 }
   4608 
   4609 /*----------------------------------------------------------------------------
   4610 | Returns the result of converting the quadruple-precision floating-point
   4611 | value `a' to the double-precision floating-point format.  The conversion
   4612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4613 | Arithmetic.
   4614 *----------------------------------------------------------------------------*/
   4615 
   4616 float64 float128_to_float64( float128 a STATUS_PARAM )
   4617 {
   4618     flag aSign;
   4619     int32 aExp;
   4620     bits64 aSig0, aSig1;
   4621 
   4622     aSig1 = extractFloat128Frac1( a );
   4623     aSig0 = extractFloat128Frac0( a );
   4624     aExp = extractFloat128Exp( a );
   4625     aSign = extractFloat128Sign( a );
   4626     if ( aExp == 0x7FFF ) {
   4627         if ( aSig0 | aSig1 ) {
   4628             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
   4629         }
   4630         return packFloat64( aSign, 0x7FF, 0 );
   4631     }
   4632     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4633     aSig0 |= ( aSig1 != 0 );
   4634     if ( aExp || aSig0 ) {
   4635         aSig0 |= LIT64( 0x4000000000000000 );
   4636         aExp -= 0x3C01;
   4637     }
   4638     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
   4639 
   4640 }
   4641 
   4642 #ifdef FLOATX80
   4643 
   4644 /*----------------------------------------------------------------------------
   4645 | Returns the result of converting the quadruple-precision floating-point
   4646 | value `a' to the extended double-precision floating-point format.  The
   4647 | conversion is performed according to the IEC/IEEE Standard for Binary
   4648 | Floating-Point Arithmetic.
   4649 *----------------------------------------------------------------------------*/
   4650 
   4651 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
   4652 {
   4653     flag aSign;
   4654     int32 aExp;
   4655     bits64 aSig0, aSig1;
   4656 
   4657     aSig1 = extractFloat128Frac1( a );
   4658     aSig0 = extractFloat128Frac0( a );
   4659     aExp = extractFloat128Exp( a );
   4660     aSign = extractFloat128Sign( a );
   4661     if ( aExp == 0x7FFF ) {
   4662         if ( aSig0 | aSig1 ) {
   4663             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
   4664         }
   4665         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4666     }
   4667     if ( aExp == 0 ) {
   4668         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4669         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4670     }
   4671     else {
   4672         aSig0 |= LIT64( 0x0001000000000000 );
   4673     }
   4674     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   4675     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
   4676 
   4677 }
   4678 
   4679 #endif
   4680 
   4681 /*----------------------------------------------------------------------------
   4682 | Rounds the quadruple-precision floating-point value `a' to an integer, and
   4683 | returns the result as a quadruple-precision floating-point value.  The
   4684 | operation is performed according to the IEC/IEEE Standard for Binary
   4685 | Floating-Point Arithmetic.
   4686 *----------------------------------------------------------------------------*/
   4687 
   4688 float128 float128_round_to_int( float128 a STATUS_PARAM )
   4689 {
   4690     flag aSign;
   4691     int32 aExp;
   4692     bits64 lastBitMask, roundBitsMask;
   4693     int8 roundingMode;
   4694     float128 z;
   4695 
   4696     aExp = extractFloat128Exp( a );
   4697     if ( 0x402F <= aExp ) {
   4698         if ( 0x406F <= aExp ) {
   4699             if (    ( aExp == 0x7FFF )
   4700                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4701                ) {
   4702                 return propagateFloat128NaN( a, a STATUS_VAR );
   4703             }
   4704             return a;
   4705         }
   4706         lastBitMask = 1;
   4707         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
   4708         roundBitsMask = lastBitMask - 1;
   4709         z = a;
   4710         roundingMode = STATUS(float_rounding_mode);
   4711         if ( roundingMode == float_round_nearest_even ) {
   4712             if ( lastBitMask ) {
   4713                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   4714                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4715             }
   4716             else {
   4717                 if ( (sbits64) z.low < 0 ) {
   4718                     ++z.high;
   4719                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
   4720                 }
   4721             }
   4722         }
   4723         else if ( roundingMode != float_round_to_zero ) {
   4724             if (   extractFloat128Sign( z )
   4725                  ^ ( roundingMode == float_round_up ) ) {
   4726                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   4727             }
   4728         }
   4729         z.low &= ~ roundBitsMask;
   4730     }
   4731     else {
   4732         if ( aExp < 0x3FFF ) {
   4733             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   4734             STATUS(float_exception_flags) |= float_flag_inexact;
   4735             aSign = extractFloat128Sign( a );
   4736             switch ( STATUS(float_rounding_mode) ) {
   4737              case float_round_nearest_even:
   4738                 if (    ( aExp == 0x3FFE )
   4739                      && (   extractFloat128Frac0( a )
   4740                           | extractFloat128Frac1( a ) )
   4741                    ) {
   4742                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4743                 }
   4744                 break;
   4745              case float_round_down:
   4746                 return
   4747                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4748                     : packFloat128( 0, 0, 0, 0 );
   4749              case float_round_up:
   4750                 return
   4751                       aSign ? packFloat128( 1, 0, 0, 0 )
   4752                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4753             }
   4754             return packFloat128( aSign, 0, 0, 0 );
   4755         }
   4756         lastBitMask = 1;
   4757         lastBitMask <<= 0x402F - aExp;
   4758         roundBitsMask = lastBitMask - 1;
   4759         z.low = 0;
   4760         z.high = a.high;
   4761         roundingMode = STATUS(float_rounding_mode);
   4762         if ( roundingMode == float_round_nearest_even ) {
   4763             z.high += lastBitMask>>1;
   4764             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   4765                 z.high &= ~ lastBitMask;
   4766             }
   4767         }
   4768         else if ( roundingMode != float_round_to_zero ) {
   4769             if (   extractFloat128Sign( z )
   4770                  ^ ( roundingMode == float_round_up ) ) {
   4771                 z.high |= ( a.low != 0 );
   4772                 z.high += roundBitsMask;
   4773             }
   4774         }
   4775         z.high &= ~ roundBitsMask;
   4776     }
   4777     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   4778         STATUS(float_exception_flags) |= float_flag_inexact;
   4779     }
   4780     return z;
   4781 
   4782 }
   4783 
   4784 /*----------------------------------------------------------------------------
   4785 | Returns the result of adding the absolute values of the quadruple-precision
   4786 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4787 | before being returned.  `zSign' is ignored if the result is a NaN.
   4788 | The addition is performed according to the IEC/IEEE Standard for Binary
   4789 | Floating-Point Arithmetic.
   4790 *----------------------------------------------------------------------------*/
   4791 
   4792 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
   4793 {
   4794     int32 aExp, bExp, zExp;
   4795     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4796     int32 expDiff;
   4797 
   4798     aSig1 = extractFloat128Frac1( a );
   4799     aSig0 = extractFloat128Frac0( a );
   4800     aExp = extractFloat128Exp( a );
   4801     bSig1 = extractFloat128Frac1( b );
   4802     bSig0 = extractFloat128Frac0( b );
   4803     bExp = extractFloat128Exp( b );
   4804     expDiff = aExp - bExp;
   4805     if ( 0 < expDiff ) {
   4806         if ( aExp == 0x7FFF ) {
   4807             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4808             return a;
   4809         }
   4810         if ( bExp == 0 ) {
   4811             --expDiff;
   4812         }
   4813         else {
   4814             bSig0 |= LIT64( 0x0001000000000000 );
   4815         }
   4816         shift128ExtraRightJamming(
   4817             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   4818         zExp = aExp;
   4819     }
   4820     else if ( expDiff < 0 ) {
   4821         if ( bExp == 0x7FFF ) {
   4822             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4823             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4824         }
   4825         if ( aExp == 0 ) {
   4826             ++expDiff;
   4827         }
   4828         else {
   4829             aSig0 |= LIT64( 0x0001000000000000 );
   4830         }
   4831         shift128ExtraRightJamming(
   4832             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4833         zExp = bExp;
   4834     }
   4835     else {
   4836         if ( aExp == 0x7FFF ) {
   4837             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4838                 return propagateFloat128NaN( a, b STATUS_VAR );
   4839             }
   4840             return a;
   4841         }
   4842         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4843         if ( aExp == 0 ) {
   4844             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
   4845             return packFloat128( zSign, 0, zSig0, zSig1 );
   4846         }
   4847         zSig2 = 0;
   4848         zSig0 |= LIT64( 0x0002000000000000 );
   4849         zExp = aExp;
   4850         goto shiftRight1;
   4851     }
   4852     aSig0 |= LIT64( 0x0001000000000000 );
   4853     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4854     --zExp;
   4855     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   4856     ++zExp;
   4857  shiftRight1:
   4858     shift128ExtraRightJamming(
   4859         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4860  roundAndPack:
   4861     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   4862 
   4863 }
   4864 
   4865 /*----------------------------------------------------------------------------
   4866 | Returns the result of subtracting the absolute values of the quadruple-
   4867 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4868 | difference is negated before being returned.  `zSign' is ignored if the
   4869 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4870 | Standard for Binary Floating-Point Arithmetic.
   4871 *----------------------------------------------------------------------------*/
   4872 
   4873 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
   4874 {
   4875     int32 aExp, bExp, zExp;
   4876     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4877     int32 expDiff;
   4878     float128 z;
   4879 
   4880     aSig1 = extractFloat128Frac1( a );
   4881     aSig0 = extractFloat128Frac0( a );
   4882     aExp = extractFloat128Exp( a );
   4883     bSig1 = extractFloat128Frac1( b );
   4884     bSig0 = extractFloat128Frac0( b );
   4885     bExp = extractFloat128Exp( b );
   4886     expDiff = aExp - bExp;
   4887     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4888     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4889     if ( 0 < expDiff ) goto aExpBigger;
   4890     if ( expDiff < 0 ) goto bExpBigger;
   4891     if ( aExp == 0x7FFF ) {
   4892         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4893             return propagateFloat128NaN( a, b STATUS_VAR );
   4894         }
   4895         float_raise( float_flag_invalid STATUS_VAR);
   4896         z.low = float128_default_nan_low;
   4897         z.high = float128_default_nan_high;
   4898         return z;
   4899     }
   4900     if ( aExp == 0 ) {
   4901         aExp = 1;
   4902         bExp = 1;
   4903     }
   4904     if ( bSig0 < aSig0 ) goto aBigger;
   4905     if ( aSig0 < bSig0 ) goto bBigger;
   4906     if ( bSig1 < aSig1 ) goto aBigger;
   4907     if ( aSig1 < bSig1 ) goto bBigger;
   4908     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
   4909  bExpBigger:
   4910     if ( bExp == 0x7FFF ) {
   4911         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4912         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   4913     }
   4914     if ( aExp == 0 ) {
   4915         ++expDiff;
   4916     }
   4917     else {
   4918         aSig0 |= LIT64( 0x4000000000000000 );
   4919     }
   4920     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4921     bSig0 |= LIT64( 0x4000000000000000 );
   4922  bBigger:
   4923     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4924     zExp = bExp;
   4925     zSign ^= 1;
   4926     goto normalizeRoundAndPack;
   4927  aExpBigger:
   4928     if ( aExp == 0x7FFF ) {
   4929         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   4930         return a;
   4931     }
   4932     if ( bExp == 0 ) {
   4933         --expDiff;
   4934     }
   4935     else {
   4936         bSig0 |= LIT64( 0x4000000000000000 );
   4937     }
   4938     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4939     aSig0 |= LIT64( 0x4000000000000000 );
   4940  aBigger:
   4941     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4942     zExp = aExp;
   4943  normalizeRoundAndPack:
   4944     --zExp;
   4945     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
   4946 
   4947 }
   4948 
   4949 /*----------------------------------------------------------------------------
   4950 | Returns the result of adding the quadruple-precision floating-point values
   4951 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4952 | for Binary Floating-Point Arithmetic.
   4953 *----------------------------------------------------------------------------*/
   4954 
   4955 float128 float128_add( float128 a, float128 b STATUS_PARAM )
   4956 {
   4957     flag aSign, bSign;
   4958 
   4959     aSign = extractFloat128Sign( a );
   4960     bSign = extractFloat128Sign( b );
   4961     if ( aSign == bSign ) {
   4962         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   4963     }
   4964     else {
   4965         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   4966     }
   4967 
   4968 }
   4969 
   4970 /*----------------------------------------------------------------------------
   4971 | Returns the result of subtracting the quadruple-precision floating-point
   4972 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4973 | Standard for Binary Floating-Point Arithmetic.
   4974 *----------------------------------------------------------------------------*/
   4975 
   4976 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
   4977 {
   4978     flag aSign, bSign;
   4979 
   4980     aSign = extractFloat128Sign( a );
   4981     bSign = extractFloat128Sign( b );
   4982     if ( aSign == bSign ) {
   4983         return subFloat128Sigs( a, b, aSign STATUS_VAR );
   4984     }
   4985     else {
   4986         return addFloat128Sigs( a, b, aSign STATUS_VAR );
   4987     }
   4988 
   4989 }
   4990 
   4991 /*----------------------------------------------------------------------------
   4992 | Returns the result of multiplying the quadruple-precision floating-point
   4993 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4994 | Standard for Binary Floating-Point Arithmetic.
   4995 *----------------------------------------------------------------------------*/
   4996 
   4997 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
   4998 {
   4999     flag aSign, bSign, zSign;
   5000     int32 aExp, bExp, zExp;
   5001     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   5002     float128 z;
   5003 
   5004     aSig1 = extractFloat128Frac1( a );
   5005     aSig0 = extractFloat128Frac0( a );
   5006     aExp = extractFloat128Exp( a );
   5007     aSign = extractFloat128Sign( a );
   5008     bSig1 = extractFloat128Frac1( b );
   5009     bSig0 = extractFloat128Frac0( b );
   5010     bExp = extractFloat128Exp( b );
   5011     bSign = extractFloat128Sign( b );
   5012     zSign = aSign ^ bSign;
   5013     if ( aExp == 0x7FFF ) {
   5014         if (    ( aSig0 | aSig1 )
   5015              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5016             return propagateFloat128NaN( a, b STATUS_VAR );
   5017         }
   5018         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   5019         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5020     }
   5021     if ( bExp == 0x7FFF ) {
   5022         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5023         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   5024  invalid:
   5025             float_raise( float_flag_invalid STATUS_VAR);
   5026             z.low = float128_default_nan_low;
   5027             z.high = float128_default_nan_high;
   5028             return z;
   5029         }
   5030         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5031     }
   5032     if ( aExp == 0 ) {
   5033         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5034         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5035     }
   5036     if ( bExp == 0 ) {
   5037         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5038         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5039     }
   5040     zExp = aExp + bExp - 0x4000;
   5041     aSig0 |= LIT64( 0x0001000000000000 );
   5042     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   5043     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   5044     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
   5045     zSig2 |= ( zSig3 != 0 );
   5046     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   5047         shift128ExtraRightJamming(
   5048             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   5049         ++zExp;
   5050     }
   5051     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   5052 
   5053 }
   5054 
   5055 /*----------------------------------------------------------------------------
   5056 | Returns the result of dividing the quadruple-precision floating-point value
   5057 | `a' by the corresponding value `b'.  The operation is performed according to
   5058 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5059 *----------------------------------------------------------------------------*/
   5060 
   5061 float128 float128_div( float128 a, float128 b STATUS_PARAM )
   5062 {
   5063     flag aSign, bSign, zSign;
   5064     int32 aExp, bExp, zExp;
   5065     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   5066     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5067     float128 z;
   5068 
   5069     aSig1 = extractFloat128Frac1( a );
   5070     aSig0 = extractFloat128Frac0( a );
   5071     aExp = extractFloat128Exp( a );
   5072     aSign = extractFloat128Sign( a );
   5073     bSig1 = extractFloat128Frac1( b );
   5074     bSig0 = extractFloat128Frac0( b );
   5075     bExp = extractFloat128Exp( b );
   5076     bSign = extractFloat128Sign( b );
   5077     zSign = aSign ^ bSign;
   5078     if ( aExp == 0x7FFF ) {
   5079         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5080         if ( bExp == 0x7FFF ) {
   5081             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5082             goto invalid;
   5083         }
   5084         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5085     }
   5086     if ( bExp == 0x7FFF ) {
   5087         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5088         return packFloat128( zSign, 0, 0, 0 );
   5089     }
   5090     if ( bExp == 0 ) {
   5091         if ( ( bSig0 | bSig1 ) == 0 ) {
   5092             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   5093  invalid:
   5094                 float_raise( float_flag_invalid STATUS_VAR);
   5095                 z.low = float128_default_nan_low;
   5096                 z.high = float128_default_nan_high;
   5097                 return z;
   5098             }
   5099             float_raise( float_flag_divbyzero STATUS_VAR);
   5100             return packFloat128( zSign, 0x7FFF, 0, 0 );
   5101         }
   5102         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5103     }
   5104     if ( aExp == 0 ) {
   5105         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5106         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5107     }
   5108     zExp = aExp - bExp + 0x3FFD;
   5109     shortShift128Left(
   5110         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
   5111     shortShift128Left(
   5112         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5113     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
   5114         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   5115         ++zExp;
   5116     }
   5117     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5118     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   5119     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   5120     while ( (sbits64) rem0 < 0 ) {
   5121         --zSig0;
   5122         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   5123     }
   5124     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
   5125     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
   5126         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   5127         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   5128         while ( (sbits64) rem1 < 0 ) {
   5129             --zSig1;
   5130             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   5131         }
   5132         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5133     }
   5134     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
   5135     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   5136 
   5137 }
   5138 
   5139 /*----------------------------------------------------------------------------
   5140 | Returns the remainder of the quadruple-precision floating-point value `a'
   5141 | with respect to the corresponding value `b'.  The operation is performed
   5142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5143 *----------------------------------------------------------------------------*/
   5144 
   5145 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
   5146 {
   5147     flag aSign, bSign, zSign;
   5148     int32 aExp, bExp, expDiff;
   5149     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   5150     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   5151     sbits64 sigMean0;
   5152     float128 z;
   5153 
   5154     aSig1 = extractFloat128Frac1( a );
   5155     aSig0 = extractFloat128Frac0( a );
   5156     aExp = extractFloat128Exp( a );
   5157     aSign = extractFloat128Sign( a );
   5158     bSig1 = extractFloat128Frac1( b );
   5159     bSig0 = extractFloat128Frac0( b );
   5160     bExp = extractFloat128Exp( b );
   5161     bSign = extractFloat128Sign( b );
   5162     if ( aExp == 0x7FFF ) {
   5163         if (    ( aSig0 | aSig1 )
   5164              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5165             return propagateFloat128NaN( a, b STATUS_VAR );
   5166         }
   5167         goto invalid;
   5168     }
   5169     if ( bExp == 0x7FFF ) {
   5170         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
   5171         return a;
   5172     }
   5173     if ( bExp == 0 ) {
   5174         if ( ( bSig0 | bSig1 ) == 0 ) {
   5175  invalid:
   5176             float_raise( float_flag_invalid STATUS_VAR);
   5177             z.low = float128_default_nan_low;
   5178             z.high = float128_default_nan_high;
   5179             return z;
   5180         }
   5181         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5182     }
   5183     if ( aExp == 0 ) {
   5184         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   5185         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5186     }
   5187     expDiff = aExp - bExp;
   5188     if ( expDiff < -1 ) return a;
   5189     shortShift128Left(
   5190         aSig0 | LIT64( 0x0001000000000000 ),
   5191         aSig1,
   5192         15 - ( expDiff < 0 ),
   5193         &aSig0,
   5194         &aSig1
   5195     );
   5196     shortShift128Left(
   5197         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5198     q = le128( bSig0, bSig1, aSig0, aSig1 );
   5199     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5200     expDiff -= 64;
   5201     while ( 0 < expDiff ) {
   5202         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5203         q = ( 4 < q ) ? q - 4 : 0;
   5204         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5205         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5206         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5207         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5208         expDiff -= 61;
   5209     }
   5210     if ( -64 < expDiff ) {
   5211         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5212         q = ( 4 < q ) ? q - 4 : 0;
   5213         q >>= - expDiff;
   5214         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5215         expDiff += 52;
   5216         if ( expDiff < 0 ) {
   5217             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5218         }
   5219         else {
   5220             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5221         }
   5222         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5223         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5224     }
   5225     else {
   5226         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5227         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5228     }
   5229     do {
   5230         alternateASig0 = aSig0;
   5231         alternateASig1 = aSig1;
   5232         ++q;
   5233         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5234     } while ( 0 <= (sbits64) aSig0 );
   5235     add128(
   5236         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
   5237     if (    ( sigMean0 < 0 )
   5238          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5239         aSig0 = alternateASig0;
   5240         aSig1 = alternateASig1;
   5241     }
   5242     zSign = ( (sbits64) aSig0 < 0 );
   5243     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5244     return
   5245         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
   5246 
   5247 }
   5248 
   5249 /*----------------------------------------------------------------------------
   5250 | Returns the square root of the quadruple-precision floating-point value `a'.
   5251 | The operation is performed according to the IEC/IEEE Standard for Binary
   5252 | Floating-Point Arithmetic.
   5253 *----------------------------------------------------------------------------*/
   5254 
   5255 float128 float128_sqrt( float128 a STATUS_PARAM )
   5256 {
   5257     flag aSign;
   5258     int32 aExp, zExp;
   5259     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5260     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5261     float128 z;
   5262 
   5263     aSig1 = extractFloat128Frac1( a );
   5264     aSig0 = extractFloat128Frac0( a );
   5265     aExp = extractFloat128Exp( a );
   5266     aSign = extractFloat128Sign( a );
   5267     if ( aExp == 0x7FFF ) {
   5268         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
   5269         if ( ! aSign ) return a;
   5270         goto invalid;
   5271     }
   5272     if ( aSign ) {
   5273         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5274  invalid:
   5275         float_raise( float_flag_invalid STATUS_VAR);
   5276         z.low = float128_default_nan_low;
   5277         z.high = float128_default_nan_high;
   5278         return z;
   5279     }
   5280     if ( aExp == 0 ) {
   5281         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5282         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5283     }
   5284     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
   5285     aSig0 |= LIT64( 0x0001000000000000 );
   5286     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
   5287     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5288     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5289     doubleZSig0 = zSig0<<1;
   5290     mul64To128( zSig0, zSig0, &term0, &term1 );
   5291     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5292     while ( (sbits64) rem0 < 0 ) {
   5293         --zSig0;
   5294         doubleZSig0 -= 2;
   5295         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5296     }
   5297     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5298     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5299         if ( zSig1 == 0 ) zSig1 = 1;
   5300         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5301         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5302         mul64To128( zSig1, zSig1, &term2, &term3 );
   5303         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5304         while ( (sbits64) rem1 < 0 ) {
   5305             --zSig1;
   5306             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5307             term3 |= 1;
   5308             term2 |= doubleZSig0;
   5309             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5310         }
   5311         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5312     }
   5313     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5314     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
   5315 
   5316 }
   5317 
   5318 /*----------------------------------------------------------------------------
   5319 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5320 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5322 *----------------------------------------------------------------------------*/
   5323 
   5324 int float128_eq( float128 a, float128 b STATUS_PARAM )
   5325 {
   5326 
   5327     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5328               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5329          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5330               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5331        ) {
   5332         if (    float128_is_signaling_nan( a )
   5333              || float128_is_signaling_nan( b ) ) {
   5334             float_raise( float_flag_invalid STATUS_VAR);
   5335         }
   5336         return 0;
   5337     }
   5338     return
   5339            ( a.low == b.low )
   5340         && (    ( a.high == b.high )
   5341              || (    ( a.low == 0 )
   5342                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5343            );
   5344 
   5345 }
   5346 
   5347 /*----------------------------------------------------------------------------
   5348 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5349 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5350 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5351 | Arithmetic.
   5352 *----------------------------------------------------------------------------*/
   5353 
   5354 int float128_le( float128 a, float128 b STATUS_PARAM )
   5355 {
   5356     flag aSign, bSign;
   5357 
   5358     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5359               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5360          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5361               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5362        ) {
   5363         float_raise( float_flag_invalid STATUS_VAR);
   5364         return 0;
   5365     }
   5366     aSign = extractFloat128Sign( a );
   5367     bSign = extractFloat128Sign( b );
   5368     if ( aSign != bSign ) {
   5369         return
   5370                aSign
   5371             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5372                  == 0 );
   5373     }
   5374     return
   5375           aSign ? le128( b.high, b.low, a.high, a.low )
   5376         : le128( a.high, a.low, b.high, b.low );
   5377 
   5378 }
   5379 
   5380 /*----------------------------------------------------------------------------
   5381 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5382 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5383 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5384 *----------------------------------------------------------------------------*/
   5385 
   5386 int float128_lt( float128 a, float128 b STATUS_PARAM )
   5387 {
   5388     flag aSign, bSign;
   5389 
   5390     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5391               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5392          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5393               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5394        ) {
   5395         float_raise( float_flag_invalid STATUS_VAR);
   5396         return 0;
   5397     }
   5398     aSign = extractFloat128Sign( a );
   5399     bSign = extractFloat128Sign( b );
   5400     if ( aSign != bSign ) {
   5401         return
   5402                aSign
   5403             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5404                  != 0 );
   5405     }
   5406     return
   5407           aSign ? lt128( b.high, b.low, a.high, a.low )
   5408         : lt128( a.high, a.low, b.high, b.low );
   5409 
   5410 }
   5411 
   5412 /*----------------------------------------------------------------------------
   5413 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5414 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   5415 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   5416 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5417 *----------------------------------------------------------------------------*/
   5418 
   5419 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
   5420 {
   5421 
   5422     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5423               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5424          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5425               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5426        ) {
   5427         float_raise( float_flag_invalid STATUS_VAR);
   5428         return 0;
   5429     }
   5430     return
   5431            ( a.low == b.low )
   5432         && (    ( a.high == b.high )
   5433              || (    ( a.low == 0 )
   5434                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5435            );
   5436 
   5437 }
   5438 
   5439 /*----------------------------------------------------------------------------
   5440 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5441 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5442 | cause an exception.  Otherwise, the comparison is performed according to the
   5443 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5444 *----------------------------------------------------------------------------*/
   5445 
   5446 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
   5447 {
   5448     flag aSign, bSign;
   5449 
   5450     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5451               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5452          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5453               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5454        ) {
   5455         if (    float128_is_signaling_nan( a )
   5456              || float128_is_signaling_nan( b ) ) {
   5457             float_raise( float_flag_invalid STATUS_VAR);
   5458         }
   5459         return 0;
   5460     }
   5461     aSign = extractFloat128Sign( a );
   5462     bSign = extractFloat128Sign( b );
   5463     if ( aSign != bSign ) {
   5464         return
   5465                aSign
   5466             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5467                  == 0 );
   5468     }
   5469     return
   5470           aSign ? le128( b.high, b.low, a.high, a.low )
   5471         : le128( a.high, a.low, b.high, b.low );
   5472 
   5473 }
   5474 
   5475 /*----------------------------------------------------------------------------
   5476 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5477 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5478 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5479 | Standard for Binary Floating-Point Arithmetic.
   5480 *----------------------------------------------------------------------------*/
   5481 
   5482 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
   5483 {
   5484     flag aSign, bSign;
   5485 
   5486     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5487               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5488          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5489               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5490        ) {
   5491         if (    float128_is_signaling_nan( a )
   5492              || float128_is_signaling_nan( b ) ) {
   5493             float_raise( float_flag_invalid STATUS_VAR);
   5494         }
   5495         return 0;
   5496     }
   5497     aSign = extractFloat128Sign( a );
   5498     bSign = extractFloat128Sign( b );
   5499     if ( aSign != bSign ) {
   5500         return
   5501                aSign
   5502             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5503                  != 0 );
   5504     }
   5505     return
   5506           aSign ? lt128( b.high, b.low, a.high, a.low )
   5507         : lt128( a.high, a.low, b.high, b.low );
   5508 
   5509 }
   5510 
   5511 #endif
   5512 
   5513 /* misc functions */
   5514 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
   5515 {
   5516     return int64_to_float32(a STATUS_VAR);
   5517 }
   5518 
   5519 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
   5520 {
   5521     return int64_to_float64(a STATUS_VAR);
   5522 }
   5523 
   5524 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
   5525 {
   5526     int64_t v;
   5527     unsigned int res;
   5528 
   5529     v = float32_to_int64(a STATUS_VAR);
   5530     if (v < 0) {
   5531         res = 0;
   5532         float_raise( float_flag_invalid STATUS_VAR);
   5533     } else if (v > 0xffffffff) {
   5534         res = 0xffffffff;
   5535         float_raise( float_flag_invalid STATUS_VAR);
   5536     } else {
   5537         res = v;
   5538     }
   5539     return res;
   5540 }
   5541 
   5542 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
   5543 {
   5544     int64_t v;
   5545     unsigned int res;
   5546 
   5547     v = float32_to_int64_round_to_zero(a STATUS_VAR);
   5548     if (v < 0) {
   5549         res = 0;
   5550         float_raise( float_flag_invalid STATUS_VAR);
   5551     } else if (v > 0xffffffff) {
   5552         res = 0xffffffff;
   5553         float_raise( float_flag_invalid STATUS_VAR);
   5554     } else {
   5555         res = v;
   5556     }
   5557     return res;
   5558 }
   5559 
   5560 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
   5561 {
   5562     int64_t v;
   5563     unsigned int res;
   5564 
   5565     v = float64_to_int64(a STATUS_VAR);
   5566     if (v < 0) {
   5567         res = 0;
   5568         float_raise( float_flag_invalid STATUS_VAR);
   5569     } else if (v > 0xffffffff) {
   5570         res = 0xffffffff;
   5571         float_raise( float_flag_invalid STATUS_VAR);
   5572     } else {
   5573         res = v;
   5574     }
   5575     return res;
   5576 }
   5577 
   5578 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
   5579 {
   5580     int64_t v;
   5581     unsigned int res;
   5582 
   5583     v = float64_to_int64_round_to_zero(a STATUS_VAR);
   5584     if (v < 0) {
   5585         res = 0;
   5586         float_raise( float_flag_invalid STATUS_VAR);
   5587     } else if (v > 0xffffffff) {
   5588         res = 0xffffffff;
   5589         float_raise( float_flag_invalid STATUS_VAR);
   5590     } else {
   5591         res = v;
   5592     }
   5593     return res;
   5594 }
   5595 
   5596 /* FIXME: This looks broken.  */
   5597 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
   5598 {
   5599     int64_t v;
   5600 
   5601     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   5602     v += float64_val(a);
   5603     v = float64_to_int64(make_float64(v) STATUS_VAR);
   5604 
   5605     return v - INT64_MIN;
   5606 }
   5607 
   5608 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
   5609 {
   5610     int64_t v;
   5611 
   5612     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
   5613     v += float64_val(a);
   5614     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
   5615 
   5616     return v - INT64_MIN;
   5617 }
   5618 
   5619 #define COMPARE(s, nan_exp)                                                  \
   5620 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
   5621                                       int is_quiet STATUS_PARAM )            \
   5622 {                                                                            \
   5623     flag aSign, bSign;                                                       \
   5624     bits ## s av, bv;                                                        \
   5625                                                                              \
   5626     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
   5627          extractFloat ## s ## Frac( a ) ) ||                                 \
   5628         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
   5629           extractFloat ## s ## Frac( b ) )) {                                \
   5630         if (!is_quiet ||                                                     \
   5631             float ## s ## _is_signaling_nan( a ) ||                          \
   5632             float ## s ## _is_signaling_nan( b ) ) {                         \
   5633             float_raise( float_flag_invalid STATUS_VAR);                     \
   5634         }                                                                    \
   5635         return float_relation_unordered;                                     \
   5636     }                                                                        \
   5637     aSign = extractFloat ## s ## Sign( a );                                  \
   5638     bSign = extractFloat ## s ## Sign( b );                                  \
   5639     av = float ## s ## _val(a);                                              \
   5640     bv = float ## s ## _val(b);                                              \
   5641     if ( aSign != bSign ) {                                                  \
   5642         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
   5643             /* zero case */                                                  \
   5644             return float_relation_equal;                                     \
   5645         } else {                                                             \
   5646             return 1 - (2 * aSign);                                          \
   5647         }                                                                    \
   5648     } else {                                                                 \
   5649         if (av == bv) {                                                      \
   5650             return float_relation_equal;                                     \
   5651         } else {                                                             \
   5652             return 1 - 2 * (aSign ^ ( av < bv ));                            \
   5653         }                                                                    \
   5654     }                                                                        \
   5655 }                                                                            \
   5656                                                                              \
   5657 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
   5658 {                                                                            \
   5659     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
   5660 }                                                                            \
   5661                                                                              \
   5662 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
   5663 {                                                                            \
   5664     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
   5665 }
   5666 
   5667 COMPARE(32, 0xff)
   5668 COMPARE(64, 0x7ff)
   5669 
   5670 INLINE int float128_compare_internal( float128 a, float128 b,
   5671                                       int is_quiet STATUS_PARAM )
   5672 {
   5673     flag aSign, bSign;
   5674 
   5675     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
   5676           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
   5677         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
   5678           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
   5679         if (!is_quiet ||
   5680             float128_is_signaling_nan( a ) ||
   5681             float128_is_signaling_nan( b ) ) {
   5682             float_raise( float_flag_invalid STATUS_VAR);
   5683         }
   5684         return float_relation_unordered;
   5685     }
   5686     aSign = extractFloat128Sign( a );
   5687     bSign = extractFloat128Sign( b );
   5688     if ( aSign != bSign ) {
   5689         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
   5690             /* zero case */
   5691             return float_relation_equal;
   5692         } else {
   5693             return 1 - (2 * aSign);
   5694         }
   5695     } else {
   5696         if (a.low == b.low && a.high == b.high) {
   5697             return float_relation_equal;
   5698         } else {
   5699             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
   5700         }
   5701     }
   5702 }
   5703 
   5704 int float128_compare( float128 a, float128 b STATUS_PARAM )
   5705 {
   5706     return float128_compare_internal(a, b, 0 STATUS_VAR);
   5707 }
   5708 
   5709 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
   5710 {
   5711     return float128_compare_internal(a, b, 1 STATUS_VAR);
   5712 }
   5713 
   5714 /* Multiply A by 2 raised to the power N.  */
   5715 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
   5716 {
   5717     flag aSign;
   5718     int16 aExp;
   5719     bits32 aSig;
   5720 
   5721     aSig = extractFloat32Frac( a );
   5722     aExp = extractFloat32Exp( a );
   5723     aSign = extractFloat32Sign( a );
   5724 
   5725     if ( aExp == 0xFF ) {
   5726         return a;
   5727     }
   5728     if ( aExp != 0 )
   5729         aSig |= 0x00800000;
   5730     else if ( aSig == 0 )
   5731         return a;
   5732 
   5733     aExp += n - 1;
   5734     aSig <<= 7;
   5735     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
   5736 }
   5737 
   5738 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
   5739 {
   5740     flag aSign;
   5741     int16 aExp;
   5742     bits64 aSig;
   5743 
   5744     aSig = extractFloat64Frac( a );
   5745     aExp = extractFloat64Exp( a );
   5746     aSign = extractFloat64Sign( a );
   5747 
   5748     if ( aExp == 0x7FF ) {
   5749         return a;
   5750     }
   5751     if ( aExp != 0 )
   5752         aSig |= LIT64( 0x0010000000000000 );
   5753     else if ( aSig == 0 )
   5754         return a;
   5755 
   5756     aExp += n - 1;
   5757     aSig <<= 10;
   5758     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
   5759 }
   5760 
   5761 #ifdef FLOATX80
   5762 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
   5763 {
   5764     flag aSign;
   5765     int16 aExp;
   5766     bits64 aSig;
   5767 
   5768     aSig = extractFloatx80Frac( a );
   5769     aExp = extractFloatx80Exp( a );
   5770     aSign = extractFloatx80Sign( a );
   5771 
   5772     if ( aExp == 0x7FF ) {
   5773         return a;
   5774     }
   5775     if (aExp == 0 && aSig == 0)
   5776         return a;
   5777 
   5778     aExp += n;
   5779     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
   5780                                           aSign, aExp, aSig, 0 STATUS_VAR );
   5781 }
   5782 #endif
   5783 
   5784 #ifdef FLOAT128
   5785 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
   5786 {
   5787     flag aSign;
   5788     int32 aExp;
   5789     bits64 aSig0, aSig1;
   5790 
   5791     aSig1 = extractFloat128Frac1( a );
   5792     aSig0 = extractFloat128Frac0( a );
   5793     aExp = extractFloat128Exp( a );
   5794     aSign = extractFloat128Sign( a );
   5795     if ( aExp == 0x7FFF ) {
   5796         return a;
   5797     }
   5798     if ( aExp != 0 )
   5799         aSig0 |= LIT64( 0x0001000000000000 );
   5800     else if ( aSig0 == 0 && aSig1 == 0 )
   5801         return a;
   5802 
   5803     aExp += n - 1;
   5804     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
   5805                                           STATUS_VAR );
   5806 
   5807 }
   5808 #endif
   5809