Home | History | Annotate | Download | only in priv
      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                              host_generic_maddf.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    Compute x * y + z as ternary operation.
      8    Copyright (C) 2010-2017 Free Software Foundation, Inc.
      9    This file is part of the GNU C Library.
     10    Contributed by Jakub Jelinek <jakub (at) redhat.com>, 2010.
     11 
     12    The GNU C Library is free software; you can redistribute it and/or
     13    modify it under the terms of the GNU Lesser General Public
     14    License as published by the Free Software Foundation; either
     15    version 2.1 of the License, or (at your option) any later version.
     16 
     17    The GNU C Library is distributed in the hope that it will be useful,
     18    but WITHOUT ANY WARRANTY; without even the implied warranty of
     19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20    Lesser General Public License for more details.
     21 
     22    You should have received a copy of the GNU Lesser General Public
     23    License along with the GNU C Library; if not, see
     24    <http://www.gnu.org/licenses/>.
     25 */
     26 
     27 /* Generic helper functions for doing FMA, i.e. compute x * y + z
     28    as ternary operation.
     29    These are purely back-end entities and cannot be seen/referenced
     30    from IR. */
     31 
     32 #include "libvex_basictypes.h"
     33 #include "host_generic_maddf.h"
     34 #include "main_util.h"
     35 
     36 /* This implementation relies on Double being more than twice as
     37    precise as Float and uses rounding to odd in order to avoid problems
     38    with double rounding.
     39    See a paper by Boldo and Melquiond:
     40    http://www.lri.fr/~melquion/doc/08-tc.pdf  */
     41 
     42 #define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
     43 
     44 #if defined(__x86_64__) && defined(__SSE2_MATH__)
     45 # define ENV_TYPE unsigned int
     46 /* Save current rounding mode into ENV, hold exceptions, set rounding
     47    mode to rounding toward zero.  */
     48 # define ROUNDTOZERO(env) \
     49    do {							\
     50       unsigned int mxcsr;				\
     51       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
     52       (env) = mxcsr;					\
     53       mxcsr = (mxcsr | 0x7f80) & ~0x3f;			\
     54       __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
     55    } while (0)
     56 /* Restore exceptions from ENV, return if inexact exception has been raised
     57    since ROUNDTOZERO.  */
     58 # define RESET_TESTINEXACT(env) \
     59    ({							\
     60       unsigned int mxcsr, ret;				\
     61       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
     62       ret = (mxcsr >> 5) & 1;				\
     63       mxcsr = (mxcsr & 0x3d) | (env);			\
     64       __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
     65       ret;						\
     66    })
     67 /* Return if inexact exception has been raised since ROUNDTOZERO.  */
     68 # define TESTINEXACT() \
     69    ({							\
     70       unsigned int mxcsr;				\
     71       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
     72       (mxcsr >> 5) & 1;					\
     73    })
     74 #endif
     75 
     76 #define DBL_MANT_DIG 53
     77 #define IEEE754_DOUBLE_BIAS 0x3ff
     78 
     79 union vg_ieee754_double {
     80    Double d;
     81 
     82    /* This is the IEEE 754 double-precision format.  */
     83    struct {
     84 #ifdef VKI_BIG_ENDIAN
     85       unsigned int negative:1;
     86       unsigned int exponent:11;
     87       unsigned int mantissa0:20;
     88       unsigned int mantissa1:32;
     89 #else
     90       unsigned int mantissa1:32;
     91       unsigned int mantissa0:20;
     92       unsigned int exponent:11;
     93       unsigned int negative:1;
     94 #endif
     95    } ieee;
     96 };
     97 
     98 void VEX_REGPARM(3)
     99      h_generic_calc_MAddF32 ( /*OUT*/Float* res,
    100                                Float* argX, Float* argY, Float* argZ )
    101 {
    102 #ifndef ENV_TYPE
    103    /* Lame fallback implementation.  */
    104    *res = *argX * *argY + *argZ;
    105 #else
    106    ENV_TYPE env;
    107    /* Multiplication is always exact.  */
    108    Double temp = (Double) *argX * (Double) *argY;
    109    union vg_ieee754_double u;
    110 
    111    ROUNDTOZERO (env);
    112 
    113    /* Perform addition with round to odd.  */
    114    u.d = temp + (Double) *argZ;
    115    /* Ensure the addition is not scheduled after fetestexcept call.  */
    116    FORCE_EVAL (u.d);
    117 
    118    /* Reset rounding mode and test for inexact simultaneously.  */
    119    int j = RESET_TESTINEXACT (env);
    120 
    121    if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
    122       u.ieee.mantissa1 |= j;
    123 
    124    /* And finally truncation with round to nearest.  */
    125    *res = (Float) u.d;
    126 #endif
    127 }
    128 
    129 
    130 void VEX_REGPARM(3)
    131      h_generic_calc_MAddF64 ( /*OUT*/Double* res,
    132                                Double* argX, Double* argY, Double* argZ )
    133 {
    134 #ifndef ENV_TYPE
    135    /* Lame fallback implementation.  */
    136    *res = *argX * *argY + *argZ;
    137 #else
    138    Double x = *argX, y = *argY, z = *argZ;
    139    union vg_ieee754_double u, v, w;
    140    int adjust = 0;
    141    u.d = x;
    142    v.d = y;
    143    w.d = z;
    144    if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
    145                  >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
    146        || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
    147        || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
    148        || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
    149        || UNLIKELY (u.ieee.exponent + v.ieee.exponent
    150                     <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
    151       /* If z is Inf, but x and y are finite, the result should be
    152          z rather than NaN.  */
    153       if (w.ieee.exponent == 0x7ff
    154           && u.ieee.exponent != 0x7ff
    155           && v.ieee.exponent != 0x7ff) {
    156          *res = (z + x) + y;
    157          return;
    158       }
    159       /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
    160          or if x * y is less than half of DBL_DENORM_MIN,
    161          compute as x * y + z.  */
    162       if (u.ieee.exponent == 0x7ff
    163           || v.ieee.exponent == 0x7ff
    164           || w.ieee.exponent == 0x7ff
    165           || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
    166           || u.ieee.exponent + v.ieee.exponent
    167              < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
    168          *res = x * y + z;
    169          return;
    170       }
    171       if (u.ieee.exponent + v.ieee.exponent
    172           >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
    173          /* Compute 1p-53 times smaller result and multiply
    174             at the end.  */
    175          if (u.ieee.exponent > v.ieee.exponent)
    176             u.ieee.exponent -= DBL_MANT_DIG;
    177          else
    178             v.ieee.exponent -= DBL_MANT_DIG;
    179          /* If x + y exponent is very large and z exponent is very small,
    180             it doesn't matter if we don't adjust it.  */
    181          if (w.ieee.exponent > DBL_MANT_DIG)
    182             w.ieee.exponent -= DBL_MANT_DIG;
    183          adjust = 1;
    184       } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
    185          /* Similarly.
    186             If z exponent is very large and x and y exponents are
    187             very small, it doesn't matter if we don't adjust it.  */
    188          if (u.ieee.exponent > v.ieee.exponent) {
    189             if (u.ieee.exponent > DBL_MANT_DIG)
    190                u.ieee.exponent -= DBL_MANT_DIG;
    191          } else if (v.ieee.exponent > DBL_MANT_DIG)
    192             v.ieee.exponent -= DBL_MANT_DIG;
    193          w.ieee.exponent -= DBL_MANT_DIG;
    194          adjust = 1;
    195       } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
    196          u.ieee.exponent -= DBL_MANT_DIG;
    197          if (v.ieee.exponent)
    198             v.ieee.exponent += DBL_MANT_DIG;
    199          else
    200             v.d *= 0x1p53;
    201       } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
    202          v.ieee.exponent -= DBL_MANT_DIG;
    203          if (u.ieee.exponent)
    204             u.ieee.exponent += DBL_MANT_DIG;
    205          else
    206             u.d *= 0x1p53;
    207       } else /* if (u.ieee.exponent + v.ieee.exponent
    208                     <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
    209          if (u.ieee.exponent > v.ieee.exponent)
    210             u.ieee.exponent += 2 * DBL_MANT_DIG;
    211          else
    212             v.ieee.exponent += 2 * DBL_MANT_DIG;
    213          if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
    214             if (w.ieee.exponent)
    215                w.ieee.exponent += 2 * DBL_MANT_DIG;
    216             else
    217                w.d *= 0x1p106;
    218             adjust = -1;
    219          }
    220          /* Otherwise x * y should just affect inexact
    221             and nothing else.  */
    222       }
    223       x = u.d;
    224       y = v.d;
    225       z = w.d;
    226    }
    227    /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
    228 #  define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
    229    Double x1 = x * C;
    230    Double y1 = y * C;
    231    Double m1 = x * y;
    232    x1 = (x - x1) + x1;
    233    y1 = (y - y1) + y1;
    234    Double x2 = x - x1;
    235    Double y2 = y - y1;
    236    Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
    237 #  undef C
    238 
    239    /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
    240    Double a1 = z + m1;
    241    Double t1 = a1 - z;
    242    Double t2 = a1 - t1;
    243    t1 = m1 - t1;
    244    t2 = z - t2;
    245    Double a2 = t1 + t2;
    246 
    247    ENV_TYPE env;
    248    ROUNDTOZERO (env);
    249 
    250    /* Perform m2 + a2 addition with round to odd.  */
    251    u.d = a2 + m2;
    252 
    253    if (UNLIKELY (adjust < 0)) {
    254       if ((u.ieee.mantissa1 & 1) == 0)
    255          u.ieee.mantissa1 |= TESTINEXACT ();
    256       v.d = a1 + u.d;
    257       /* Ensure the addition is not scheduled after fetestexcept call.  */
    258       FORCE_EVAL (v.d);
    259    }
    260 
    261    /* Reset rounding mode and test for inexact simultaneously.  */
    262    int j = RESET_TESTINEXACT (env) != 0;
    263 
    264    if (LIKELY (adjust == 0)) {
    265       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
    266          u.ieee.mantissa1 |= j;
    267       /* Result is a1 + u.d.  */
    268       *res = a1 + u.d;
    269    } else if (LIKELY (adjust > 0)) {
    270       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
    271          u.ieee.mantissa1 |= j;
    272       /* Result is a1 + u.d, scaled up.  */
    273       *res = (a1 + u.d) * 0x1p53;
    274    } else {
    275       /* If a1 + u.d is exact, the only rounding happens during
    276          scaling down.  */
    277       if (j == 0) {
    278          *res = v.d * 0x1p-106;
    279          return;
    280       }
    281       /* If result rounded to zero is not subnormal, no double
    282          rounding will occur.  */
    283       if (v.ieee.exponent > 106) {
    284          *res = (a1 + u.d) * 0x1p-106;
    285          return;
    286       }
    287       /* If v.d * 0x1p-106 with round to zero is a subnormal above
    288          or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
    289          down just by 1 bit, which means v.ieee.mantissa1 |= j would
    290          change the round bit, not sticky or guard bit.
    291          v.d * 0x1p-106 never normalizes by shifting up,
    292          so round bit plus sticky bit should be already enough
    293          for proper rounding.  */
    294       if (v.ieee.exponent == 106) {
    295          /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
    296             v.ieee.mantissa1 & 1 is the round bit and j is our sticky
    297             bit.  In round-to-nearest 001 rounds down like 00,
    298             011 rounds up, even though 01 rounds down (thus we need
    299             to adjust), 101 rounds down like 10 and 111 rounds up
    300             like 11.  */
    301          if ((v.ieee.mantissa1 & 3) == 1) {
    302             v.d *= 0x1p-106;
    303             if (v.ieee.negative)
    304                *res = v.d - 0x1p-1074;
    305             else
    306                *res = v.d + 0x1p-1074;
    307          } else
    308             *res = v.d * 0x1p-106;
    309          return;
    310       }
    311       v.ieee.mantissa1 |= j;
    312       *res = v.d * 0x1p-106;
    313       return;
    314     }
    315 #endif
    316 }
    317 
    318 /*---------------------------------------------------------------*/
    319 /*--- end                                 host_generic_maddf.c --*/
    320 /*---------------------------------------------------------------*/
    321