Home | History | Annotate | Download | only in big
      1 // Copyright 2014 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // This file implements multi-precision floating-point numbers.
      6 // Like in the GNU MPFR library (http://www.mpfr.org/), operands
      7 // can be of mixed precision. Unlike MPFR, the rounding mode is
      8 // not specified with each operation, but with each operand. The
      9 // rounding mode of the result operand determines the rounding
     10 // mode of an operation. This is a from-scratch implementation.
     11 
     12 package big
     13 
     14 import (
     15 	"fmt"
     16 	"math"
     17 )
     18 
     19 const debugFloat = false // enable for debugging
     20 
     21 // A nonzero finite Float represents a multi-precision floating point number
     22 //
     23 //   sign  mantissa  2**exponent
     24 //
     25 // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
     26 // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
     27 // All Floats are ordered, and the ordering of two Floats x and y
     28 // is defined by x.Cmp(y).
     29 //
     30 // Each Float value also has a precision, rounding mode, and accuracy.
     31 // The precision is the maximum number of mantissa bits available to
     32 // represent the value. The rounding mode specifies how a result should
     33 // be rounded to fit into the mantissa bits, and accuracy describes the
     34 // rounding error with respect to the exact result.
     35 //
     36 // Unless specified otherwise, all operations (including setters) that
     37 // specify a *Float variable for the result (usually via the receiver
     38 // with the exception of MantExp), round the numeric result according
     39 // to the precision and rounding mode of the result variable.
     40 //
     41 // If the provided result precision is 0 (see below), it is set to the
     42 // precision of the argument with the largest precision value before any
     43 // rounding takes place, and the rounding mode remains unchanged. Thus,
     44 // uninitialized Floats provided as result arguments will have their
     45 // precision set to a reasonable value determined by the operands and
     46 // their mode is the zero value for RoundingMode (ToNearestEven).
     47 //
     48 // By setting the desired precision to 24 or 53 and using matching rounding
     49 // mode (typically ToNearestEven), Float operations produce the same results
     50 // as the corresponding float32 or float64 IEEE-754 arithmetic for operands
     51 // that correspond to normal (i.e., not denormal) float32 or float64 numbers.
     52 // Exponent underflow and overflow lead to a 0 or an Infinity for different
     53 // values than IEEE-754 because Float exponents have a much larger range.
     54 //
     55 // The zero (uninitialized) value for a Float is ready to use and represents
     56 // the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
     57 //
     58 type Float struct {
     59 	prec uint32
     60 	mode RoundingMode
     61 	acc  Accuracy
     62 	form form
     63 	neg  bool
     64 	mant nat
     65 	exp  int32
     66 }
     67 
     68 // An ErrNaN panic is raised by a Float operation that would lead to
     69 // a NaN under IEEE-754 rules. An ErrNaN implements the error interface.
     70 type ErrNaN struct {
     71 	msg string
     72 }
     73 
     74 func (err ErrNaN) Error() string {
     75 	return err.msg
     76 }
     77 
     78 // NewFloat allocates and returns a new Float set to x,
     79 // with precision 53 and rounding mode ToNearestEven.
     80 // NewFloat panics with ErrNaN if x is a NaN.
     81 func NewFloat(x float64) *Float {
     82 	if math.IsNaN(x) {
     83 		panic(ErrNaN{"NewFloat(NaN)"})
     84 	}
     85 	return new(Float).SetFloat64(x)
     86 }
     87 
     88 // Exponent and precision limits.
     89 const (
     90 	MaxExp  = math.MaxInt32  // largest supported exponent
     91 	MinExp  = math.MinInt32  // smallest supported exponent
     92 	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
     93 )
     94 
     95 // Internal representation: The mantissa bits x.mant of a nonzero finite
     96 // Float x are stored in a nat slice long enough to hold up to x.prec bits;
     97 // the slice may (but doesn't have to) be shorter if the mantissa contains
     98 // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
     99 // the msb is shifted all the way "to the left"). Thus, if the mantissa has
    100 // trailing 0 bits or x.prec is not a multiple of the the Word size _W,
    101 // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
    102 // to the value 0.5; the exponent x.exp shifts the binary point as needed.
    103 //
    104 // A zero or non-finite Float x ignores x.mant and x.exp.
    105 //
    106 // x                 form      neg      mant         exp
    107 // ----------------------------------------------------------
    108 // 0                zero      sign     -            -
    109 // 0 < |x| < +Inf    finite    sign     mantissa     exponent
    110 // Inf              inf       sign     -            -
    111 
    112 // A form value describes the internal representation.
    113 type form byte
    114 
    115 // The form value order is relevant - do not change!
    116 const (
    117 	zero form = iota
    118 	finite
    119 	inf
    120 )
    121 
    122 // RoundingMode determines how a Float value is rounded to the
    123 // desired precision. Rounding may change the Float value; the
    124 // rounding error is described by the Float's Accuracy.
    125 type RoundingMode byte
    126 
    127 // The following rounding modes are supported.
    128 const (
    129 	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
    130 	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
    131 	ToZero                            // == IEEE 754-2008 roundTowardZero
    132 	AwayFromZero                      // no IEEE 754-2008 equivalent
    133 	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
    134 	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
    135 )
    136 
    137 //go:generate stringer -type=RoundingMode
    138 
    139 // Accuracy describes the rounding error produced by the most recent
    140 // operation that generated a Float value, relative to the exact value.
    141 type Accuracy int8
    142 
    143 // Constants describing the Accuracy of a Float.
    144 const (
    145 	Below Accuracy = -1
    146 	Exact Accuracy = 0
    147 	Above Accuracy = +1
    148 )
    149 
    150 //go:generate stringer -type=Accuracy
    151 
    152 // SetPrec sets z's precision to prec and returns the (possibly) rounded
    153 // value of z. Rounding occurs according to z's rounding mode if the mantissa
    154 // cannot be represented in prec bits without loss of precision.
    155 // SetPrec(0) maps all finite values to 0; infinite values remain unchanged.
    156 // If prec > MaxPrec, it is set to MaxPrec.
    157 func (z *Float) SetPrec(prec uint) *Float {
    158 	z.acc = Exact // optimistically assume no rounding is needed
    159 
    160 	// special case
    161 	if prec == 0 {
    162 		z.prec = 0
    163 		if z.form == finite {
    164 			// truncate z to 0
    165 			z.acc = makeAcc(z.neg)
    166 			z.form = zero
    167 		}
    168 		return z
    169 	}
    170 
    171 	// general case
    172 	if prec > MaxPrec {
    173 		prec = MaxPrec
    174 	}
    175 	old := z.prec
    176 	z.prec = uint32(prec)
    177 	if z.prec < old {
    178 		z.round(0)
    179 	}
    180 	return z
    181 }
    182 
    183 func makeAcc(above bool) Accuracy {
    184 	if above {
    185 		return Above
    186 	}
    187 	return Below
    188 }
    189 
    190 // SetMode sets z's rounding mode to mode and returns an exact z.
    191 // z remains unchanged otherwise.
    192 // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
    193 func (z *Float) SetMode(mode RoundingMode) *Float {
    194 	z.mode = mode
    195 	z.acc = Exact
    196 	return z
    197 }
    198 
    199 // Prec returns the mantissa precision of x in bits.
    200 // The result may be 0 for |x| == 0 and |x| == Inf.
    201 func (x *Float) Prec() uint {
    202 	return uint(x.prec)
    203 }
    204 
    205 // MinPrec returns the minimum precision required to represent x exactly
    206 // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
    207 // The result is 0 for |x| == 0 and |x| == Inf.
    208 func (x *Float) MinPrec() uint {
    209 	if x.form != finite {
    210 		return 0
    211 	}
    212 	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
    213 }
    214 
    215 // Mode returns the rounding mode of x.
    216 func (x *Float) Mode() RoundingMode {
    217 	return x.mode
    218 }
    219 
    220 // Acc returns the accuracy of x produced by the most recent operation.
    221 func (x *Float) Acc() Accuracy {
    222 	return x.acc
    223 }
    224 
    225 // Sign returns:
    226 //
    227 //	-1 if x <   0
    228 //	 0 if x is 0
    229 //	+1 if x >   0
    230 //
    231 func (x *Float) Sign() int {
    232 	if debugFloat {
    233 		x.validate()
    234 	}
    235 	if x.form == zero {
    236 		return 0
    237 	}
    238 	if x.neg {
    239 		return -1
    240 	}
    241 	return 1
    242 }
    243 
    244 // MantExp breaks x into its mantissa and exponent components
    245 // and returns the exponent. If a non-nil mant argument is
    246 // provided its value is set to the mantissa of x, with the
    247 // same precision and rounding mode as x. The components
    248 // satisfy x == mant  2**exp, with 0.5 <= |mant| < 1.0.
    249 // Calling MantExp with a nil argument is an efficient way to
    250 // get the exponent of the receiver.
    251 //
    252 // Special cases are:
    253 //
    254 //	(  0).MantExp(mant) = 0, with mant set to   0
    255 //	(Inf).MantExp(mant) = 0, with mant set to Inf
    256 //
    257 // x and mant may be the same in which case x is set to its
    258 // mantissa value.
    259 func (x *Float) MantExp(mant *Float) (exp int) {
    260 	if debugFloat {
    261 		x.validate()
    262 	}
    263 	if x.form == finite {
    264 		exp = int(x.exp)
    265 	}
    266 	if mant != nil {
    267 		mant.Copy(x)
    268 		if mant.form == finite {
    269 			mant.exp = 0
    270 		}
    271 	}
    272 	return
    273 }
    274 
    275 func (z *Float) setExpAndRound(exp int64, sbit uint) {
    276 	if exp < MinExp {
    277 		// underflow
    278 		z.acc = makeAcc(z.neg)
    279 		z.form = zero
    280 		return
    281 	}
    282 
    283 	if exp > MaxExp {
    284 		// overflow
    285 		z.acc = makeAcc(!z.neg)
    286 		z.form = inf
    287 		return
    288 	}
    289 
    290 	z.form = finite
    291 	z.exp = int32(exp)
    292 	z.round(sbit)
    293 }
    294 
    295 // SetMantExp sets z to mant  2**exp and and returns z.
    296 // The result z has the same precision and rounding mode
    297 // as mant. SetMantExp is an inverse of MantExp but does
    298 // not require 0.5 <= |mant| < 1.0. Specifically:
    299 //
    300 //	mant := new(Float)
    301 //	new(Float).SetMantExp(mant, x.SetMantExp(mant)).Cmp(x).Eql() is true
    302 //
    303 // Special cases are:
    304 //
    305 //	z.SetMantExp(  0, exp) =   0
    306 //	z.SetMantExp(Inf, exp) = Inf
    307 //
    308 // z and mant may be the same in which case z's exponent
    309 // is set to exp.
    310 func (z *Float) SetMantExp(mant *Float, exp int) *Float {
    311 	if debugFloat {
    312 		z.validate()
    313 		mant.validate()
    314 	}
    315 	z.Copy(mant)
    316 	if z.form != finite {
    317 		return z
    318 	}
    319 	z.setExpAndRound(int64(z.exp)+int64(exp), 0)
    320 	return z
    321 }
    322 
    323 // Signbit returns true if x is negative or negative zero.
    324 func (x *Float) Signbit() bool {
    325 	return x.neg
    326 }
    327 
    328 // IsInf reports whether x is +Inf or -Inf.
    329 func (x *Float) IsInf() bool {
    330 	return x.form == inf
    331 }
    332 
    333 // IsInt reports whether x is an integer.
    334 // Inf values are not integers.
    335 func (x *Float) IsInt() bool {
    336 	if debugFloat {
    337 		x.validate()
    338 	}
    339 	// special cases
    340 	if x.form != finite {
    341 		return x.form == zero
    342 	}
    343 	// x.form == finite
    344 	if x.exp <= 0 {
    345 		return false
    346 	}
    347 	// x.exp > 0
    348 	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
    349 }
    350 
    351 // debugging support
    352 func (x *Float) validate() {
    353 	if !debugFloat {
    354 		// avoid performance bugs
    355 		panic("validate called but debugFloat is not set")
    356 	}
    357 	if x.form != finite {
    358 		return
    359 	}
    360 	m := len(x.mant)
    361 	if m == 0 {
    362 		panic("nonzero finite number with empty mantissa")
    363 	}
    364 	const msb = 1 << (_W - 1)
    365 	if x.mant[m-1]&msb == 0 {
    366 		panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)))
    367 	}
    368 	if x.prec == 0 {
    369 		panic("zero precision finite number")
    370 	}
    371 }
    372 
    373 // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
    374 // sbit must be 0 or 1 and summarizes any "sticky bit" information one might
    375 // have before calling round. z's mantissa must be normalized (with the msb set)
    376 // or empty.
    377 //
    378 // CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
    379 // sign of z. For correct rounding, the sign of z must be set correctly before
    380 // calling round.
    381 func (z *Float) round(sbit uint) {
    382 	if debugFloat {
    383 		z.validate()
    384 	}
    385 
    386 	z.acc = Exact
    387 	if z.form != finite {
    388 		// 0 or Inf => nothing left to do
    389 		return
    390 	}
    391 	// z.form == finite && len(z.mant) > 0
    392 	// m > 0 implies z.prec > 0 (checked by validate)
    393 
    394 	m := uint32(len(z.mant)) // present mantissa length in words
    395 	bits := m * _W           // present mantissa bits
    396 	if bits <= z.prec {
    397 		// mantissa fits => nothing to do
    398 		return
    399 	}
    400 	// bits > z.prec
    401 
    402 	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
    403 
    404 	// Rounding is based on two bits: the rounding bit (rbit) and the
    405 	// sticky bit (sbit). The rbit is the bit immediately before the
    406 	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
    407 	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
    408 	//
    409 	//   rbit  sbit  => "fractional part"
    410 	//
    411 	//   0     0        == 0
    412 	//   0     1        >  0  , < 0.5
    413 	//   1     0        == 0.5
    414 	//   1     1        >  0.5, < 1.0
    415 
    416 	// bits > z.prec: mantissa too large => round
    417 	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
    418 	rbit := z.mant.bit(r)        // rounding bit
    419 	if sbit == 0 {
    420 		sbit = z.mant.sticky(r)
    421 	}
    422 	if debugFloat && sbit&^1 != 0 {
    423 		panic(fmt.Sprintf("invalid sbit %#x", sbit))
    424 	}
    425 
    426 	// convert ToXInf rounding modes
    427 	mode := z.mode
    428 	switch mode {
    429 	case ToNegativeInf:
    430 		mode = ToZero
    431 		if z.neg {
    432 			mode = AwayFromZero
    433 		}
    434 	case ToPositiveInf:
    435 		mode = AwayFromZero
    436 		if z.neg {
    437 			mode = ToZero
    438 		}
    439 	}
    440 
    441 	// cut off extra words
    442 	if m > n {
    443 		copy(z.mant, z.mant[m-n:]) // move n last words to front
    444 		z.mant = z.mant[:n]
    445 	}
    446 
    447 	// determine number of trailing zero bits t
    448 	t := n*_W - z.prec // 0 <= t < _W
    449 	lsb := Word(1) << t
    450 
    451 	// make rounding decision
    452 	// TODO(gri) This can be simplified (see Bits.round in bits_test.go).
    453 	switch mode {
    454 	case ToZero:
    455 		// nothing to do
    456 	case ToNearestEven, ToNearestAway:
    457 		if rbit == 0 {
    458 			// rounding bits == 0b0x
    459 			mode = ToZero
    460 		} else if sbit == 1 {
    461 			// rounding bits == 0b11
    462 			mode = AwayFromZero
    463 		}
    464 	case AwayFromZero:
    465 		if rbit|sbit == 0 {
    466 			mode = ToZero
    467 		}
    468 	default:
    469 		// ToXInf modes have been converted to ToZero or AwayFromZero
    470 		panic("unreachable")
    471 	}
    472 
    473 	// round and determine accuracy
    474 	switch mode {
    475 	case ToZero:
    476 		if rbit|sbit != 0 {
    477 			z.acc = Below
    478 		}
    479 
    480 	case ToNearestEven, ToNearestAway:
    481 		if debugFloat && rbit != 1 {
    482 			panic("internal error in rounding")
    483 		}
    484 		if mode == ToNearestEven && sbit == 0 && z.mant[0]&lsb == 0 {
    485 			z.acc = Below
    486 			break
    487 		}
    488 		// mode == ToNearestAway || sbit == 1 || z.mant[0]&lsb != 0
    489 		fallthrough
    490 
    491 	case AwayFromZero:
    492 		// add 1 to mantissa
    493 		if addVW(z.mant, z.mant, lsb) != 0 {
    494 			// overflow => shift mantissa right by 1 and add msb
    495 			shrVU(z.mant, z.mant, 1)
    496 			z.mant[n-1] |= 1 << (_W - 1)
    497 			// adjust exponent
    498 			if z.exp < MaxExp {
    499 				z.exp++
    500 			} else {
    501 				// exponent overflow
    502 				z.acc = makeAcc(!z.neg)
    503 				z.form = inf
    504 				return
    505 			}
    506 		}
    507 		z.acc = Above
    508 	}
    509 
    510 	// zero out trailing bits in least-significant word
    511 	z.mant[0] &^= lsb - 1
    512 
    513 	// update accuracy
    514 	if z.acc != Exact && z.neg {
    515 		z.acc = -z.acc
    516 	}
    517 
    518 	if debugFloat {
    519 		z.validate()
    520 	}
    521 
    522 	return
    523 }
    524 
    525 func (z *Float) setBits64(neg bool, x uint64) *Float {
    526 	if z.prec == 0 {
    527 		z.prec = 64
    528 	}
    529 	z.acc = Exact
    530 	z.neg = neg
    531 	if x == 0 {
    532 		z.form = zero
    533 		return z
    534 	}
    535 	// x != 0
    536 	z.form = finite
    537 	s := nlz64(x)
    538 	z.mant = z.mant.setUint64(x << s)
    539 	z.exp = int32(64 - s) // always fits
    540 	if z.prec < 64 {
    541 		z.round(0)
    542 	}
    543 	return z
    544 }
    545 
    546 // SetUint64 sets z to the (possibly rounded) value of x and returns z.
    547 // If z's precision is 0, it is changed to 64 (and rounding will have
    548 // no effect).
    549 func (z *Float) SetUint64(x uint64) *Float {
    550 	return z.setBits64(false, x)
    551 }
    552 
    553 // SetInt64 sets z to the (possibly rounded) value of x and returns z.
    554 // If z's precision is 0, it is changed to 64 (and rounding will have
    555 // no effect).
    556 func (z *Float) SetInt64(x int64) *Float {
    557 	u := x
    558 	if u < 0 {
    559 		u = -u
    560 	}
    561 	// We cannot simply call z.SetUint64(uint64(u)) and change
    562 	// the sign afterwards because the sign affects rounding.
    563 	return z.setBits64(x < 0, uint64(u))
    564 }
    565 
    566 // SetFloat64 sets z to the (possibly rounded) value of x and returns z.
    567 // If z's precision is 0, it is changed to 53 (and rounding will have
    568 // no effect). SetFloat64 panics with ErrNaN if x is a NaN.
    569 func (z *Float) SetFloat64(x float64) *Float {
    570 	if z.prec == 0 {
    571 		z.prec = 53
    572 	}
    573 	if math.IsNaN(x) {
    574 		panic(ErrNaN{"Float.SetFloat64(NaN)"})
    575 	}
    576 	z.acc = Exact
    577 	z.neg = math.Signbit(x) // handle -0, -Inf correctly
    578 	if x == 0 {
    579 		z.form = zero
    580 		return z
    581 	}
    582 	if math.IsInf(x, 0) {
    583 		z.form = inf
    584 		return z
    585 	}
    586 	// normalized x != 0
    587 	z.form = finite
    588 	fmant, exp := math.Frexp(x) // get normalized mantissa
    589 	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
    590 	z.exp = int32(exp) // always fits
    591 	if z.prec < 53 {
    592 		z.round(0)
    593 	}
    594 	return z
    595 }
    596 
    597 // fnorm normalizes mantissa m by shifting it to the left
    598 // such that the msb of the most-significant word (msw) is 1.
    599 // It returns the shift amount. It assumes that len(m) != 0.
    600 func fnorm(m nat) int64 {
    601 	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
    602 		panic("msw of mantissa is 0")
    603 	}
    604 	s := nlz(m[len(m)-1])
    605 	if s > 0 {
    606 		c := shlVU(m, m, s)
    607 		if debugFloat && c != 0 {
    608 			panic("nlz or shlVU incorrect")
    609 		}
    610 	}
    611 	return int64(s)
    612 }
    613 
    614 // SetInt sets z to the (possibly rounded) value of x and returns z.
    615 // If z's precision is 0, it is changed to the larger of x.BitLen()
    616 // or 64 (and rounding will have no effect).
    617 func (z *Float) SetInt(x *Int) *Float {
    618 	// TODO(gri) can be more efficient if z.prec > 0
    619 	// but small compared to the size of x, or if there
    620 	// are many trailing 0's.
    621 	bits := uint32(x.BitLen())
    622 	if z.prec == 0 {
    623 		z.prec = umax32(bits, 64)
    624 	}
    625 	z.acc = Exact
    626 	z.neg = x.neg
    627 	if len(x.abs) == 0 {
    628 		z.form = zero
    629 		return z
    630 	}
    631 	// x != 0
    632 	z.mant = z.mant.set(x.abs)
    633 	fnorm(z.mant)
    634 	z.setExpAndRound(int64(bits), 0)
    635 	return z
    636 }
    637 
    638 // SetRat sets z to the (possibly rounded) value of x and returns z.
    639 // If z's precision is 0, it is changed to the largest of a.BitLen(),
    640 // b.BitLen(), or 64; with x = a/b.
    641 func (z *Float) SetRat(x *Rat) *Float {
    642 	if x.IsInt() {
    643 		return z.SetInt(x.Num())
    644 	}
    645 	var a, b Float
    646 	a.SetInt(x.Num())
    647 	b.SetInt(x.Denom())
    648 	if z.prec == 0 {
    649 		z.prec = umax32(a.prec, b.prec)
    650 	}
    651 	return z.Quo(&a, &b)
    652 }
    653 
    654 // SetInf sets z to the infinite Float -Inf if signbit is
    655 // set, or +Inf if signbit is not set, and returns z. The
    656 // precision of z is unchanged and the result is always
    657 // Exact.
    658 func (z *Float) SetInf(signbit bool) *Float {
    659 	z.acc = Exact
    660 	z.form = inf
    661 	z.neg = signbit
    662 	return z
    663 }
    664 
    665 // Set sets z to the (possibly rounded) value of x and returns z.
    666 // If z's precision is 0, it is changed to the precision of x
    667 // before setting z (and rounding will have no effect).
    668 // Rounding is performed according to z's precision and rounding
    669 // mode; and z's accuracy reports the result error relative to the
    670 // exact (not rounded) result.
    671 func (z *Float) Set(x *Float) *Float {
    672 	if debugFloat {
    673 		x.validate()
    674 	}
    675 	z.acc = Exact
    676 	if z != x {
    677 		z.form = x.form
    678 		z.neg = x.neg
    679 		if x.form == finite {
    680 			z.exp = x.exp
    681 			z.mant = z.mant.set(x.mant)
    682 		}
    683 		if z.prec == 0 {
    684 			z.prec = x.prec
    685 		} else if z.prec < x.prec {
    686 			z.round(0)
    687 		}
    688 	}
    689 	return z
    690 }
    691 
    692 // Copy sets z to x, with the same precision, rounding mode, and
    693 // accuracy as x, and returns z. x is not changed even if z and
    694 // x are the same.
    695 func (z *Float) Copy(x *Float) *Float {
    696 	if debugFloat {
    697 		x.validate()
    698 	}
    699 	if z != x {
    700 		z.prec = x.prec
    701 		z.mode = x.mode
    702 		z.acc = x.acc
    703 		z.form = x.form
    704 		z.neg = x.neg
    705 		if z.form == finite {
    706 			z.mant = z.mant.set(x.mant)
    707 			z.exp = x.exp
    708 		}
    709 	}
    710 	return z
    711 }
    712 
    713 // msb32 returns the 32 most significant bits of x.
    714 func msb32(x nat) uint32 {
    715 	i := len(x) - 1
    716 	if i < 0 {
    717 		return 0
    718 	}
    719 	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
    720 		panic("x not normalized")
    721 	}
    722 	switch _W {
    723 	case 32:
    724 		return uint32(x[i])
    725 	case 64:
    726 		return uint32(x[i] >> 32)
    727 	}
    728 	panic("unreachable")
    729 }
    730 
    731 // msb64 returns the 64 most significant bits of x.
    732 func msb64(x nat) uint64 {
    733 	i := len(x) - 1
    734 	if i < 0 {
    735 		return 0
    736 	}
    737 	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
    738 		panic("x not normalized")
    739 	}
    740 	switch _W {
    741 	case 32:
    742 		v := uint64(x[i]) << 32
    743 		if i > 0 {
    744 			v |= uint64(x[i-1])
    745 		}
    746 		return v
    747 	case 64:
    748 		return uint64(x[i])
    749 	}
    750 	panic("unreachable")
    751 }
    752 
    753 // Uint64 returns the unsigned integer resulting from truncating x
    754 // towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
    755 // if x is an integer and Below otherwise.
    756 // The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
    757 // for x > math.MaxUint64.
    758 func (x *Float) Uint64() (uint64, Accuracy) {
    759 	if debugFloat {
    760 		x.validate()
    761 	}
    762 
    763 	switch x.form {
    764 	case finite:
    765 		if x.neg {
    766 			return 0, Above
    767 		}
    768 		// 0 < x < +Inf
    769 		if x.exp <= 0 {
    770 			// 0 < x < 1
    771 			return 0, Below
    772 		}
    773 		// 1 <= x < Inf
    774 		if x.exp <= 64 {
    775 			// u = trunc(x) fits into a uint64
    776 			u := msb64(x.mant) >> (64 - uint32(x.exp))
    777 			if x.MinPrec() <= 64 {
    778 				return u, Exact
    779 			}
    780 			return u, Below // x truncated
    781 		}
    782 		// x too large
    783 		return math.MaxUint64, Below
    784 
    785 	case zero:
    786 		return 0, Exact
    787 
    788 	case inf:
    789 		if x.neg {
    790 			return 0, Above
    791 		}
    792 		return math.MaxUint64, Below
    793 	}
    794 
    795 	panic("unreachable")
    796 }
    797 
    798 // Int64 returns the integer resulting from truncating x towards zero.
    799 // If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
    800 // an integer, and Above (x < 0) or Below (x > 0) otherwise.
    801 // The result is (math.MinInt64, Above) for x < math.MinInt64,
    802 // and (math.MaxInt64, Below) for x > math.MaxInt64.
    803 func (x *Float) Int64() (int64, Accuracy) {
    804 	if debugFloat {
    805 		x.validate()
    806 	}
    807 
    808 	switch x.form {
    809 	case finite:
    810 		// 0 < |x| < +Inf
    811 		acc := makeAcc(x.neg)
    812 		if x.exp <= 0 {
    813 			// 0 < |x| < 1
    814 			return 0, acc
    815 		}
    816 		// x.exp > 0
    817 
    818 		// 1 <= |x| < +Inf
    819 		if x.exp <= 63 {
    820 			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
    821 			i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
    822 			if x.neg {
    823 				i = -i
    824 			}
    825 			if x.MinPrec() <= uint(x.exp) {
    826 				return i, Exact
    827 			}
    828 			return i, acc // x truncated
    829 		}
    830 		if x.neg {
    831 			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
    832 			if x.exp == 64 && x.MinPrec() == 1 {
    833 				acc = Exact
    834 			}
    835 			return math.MinInt64, acc
    836 		}
    837 		// x too large
    838 		return math.MaxInt64, Below
    839 
    840 	case zero:
    841 		return 0, Exact
    842 
    843 	case inf:
    844 		if x.neg {
    845 			return math.MinInt64, Above
    846 		}
    847 		return math.MaxInt64, Below
    848 	}
    849 
    850 	panic("unreachable")
    851 }
    852 
    853 // Float32 returns the float32 value nearest to x. If x is too small to be
    854 // represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
    855 // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
    856 // If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
    857 // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
    858 func (x *Float) Float32() (float32, Accuracy) {
    859 	if debugFloat {
    860 		x.validate()
    861 	}
    862 
    863 	switch x.form {
    864 	case finite:
    865 		// 0 < |x| < +Inf
    866 
    867 		const (
    868 			fbits = 32                //        float size
    869 			mbits = 23                //        mantissa size (excluding implicit msb)
    870 			ebits = fbits - mbits - 1 //     8  exponent size
    871 			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
    872 			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
    873 			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
    874 			emax  = bias              //   127  largest unbiased exponent (normal)
    875 		)
    876 
    877 		// Float mantissa m is 0.5 <= m < 1.0; compute exponent for floatxx mantissa.
    878 		e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
    879 		p := mbits + 1 // precision of normal float
    880 
    881 		// If the exponent is too small, we may have a denormal number
    882 		// in which case we have fewer mantissa bits available: reduce
    883 		// precision accordingly.
    884 		if e < emin {
    885 			p -= emin - int(e)
    886 			// Make sure we have at least 1 bit so that we don't
    887 			// lose numbers rounded up to the smallest denormal.
    888 			if p < 1 {
    889 				p = 1
    890 			}
    891 		}
    892 
    893 		// round
    894 		var r Float
    895 		r.prec = uint32(p)
    896 		r.Set(x)
    897 		e = r.exp - 1
    898 
    899 		// Rounding may have caused r to overflow to Inf
    900 		// (rounding never causes underflows to 0).
    901 		if r.form == inf {
    902 			e = emax + 1 // cause overflow below
    903 		}
    904 
    905 		// If the exponent is too large, overflow to Inf.
    906 		if e > emax {
    907 			// overflow
    908 			if x.neg {
    909 				return float32(math.Inf(-1)), Below
    910 			}
    911 			return float32(math.Inf(+1)), Above
    912 		}
    913 		// e <= emax
    914 
    915 		// Determine sign, biased exponent, and mantissa.
    916 		var sign, bexp, mant uint32
    917 		if x.neg {
    918 			sign = 1 << (fbits - 1)
    919 		}
    920 
    921 		// Rounding may have caused a denormal number to
    922 		// become normal. Check again.
    923 		if e < emin {
    924 			// denormal number
    925 			if e < dmin {
    926 				// underflow to 0
    927 				if x.neg {
    928 					var z float32
    929 					return -z, Above
    930 				}
    931 				return 0.0, Below
    932 			}
    933 			// bexp = 0
    934 			mant = msb32(r.mant) >> (fbits - r.prec)
    935 		} else {
    936 			// normal number: emin <= e <= emax
    937 			bexp = uint32(e+bias) << mbits
    938 			mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
    939 		}
    940 
    941 		return math.Float32frombits(sign | bexp | mant), r.acc
    942 
    943 	case zero:
    944 		if x.neg {
    945 			var z float32
    946 			return -z, Exact
    947 		}
    948 		return 0.0, Exact
    949 
    950 	case inf:
    951 		if x.neg {
    952 			return float32(math.Inf(-1)), Exact
    953 		}
    954 		return float32(math.Inf(+1)), Exact
    955 	}
    956 
    957 	panic("unreachable")
    958 }
    959 
    960 // Float64 returns the float64 value nearest to x. If x is too small to be
    961 // represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
    962 // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
    963 // If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
    964 // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
    965 func (x *Float) Float64() (float64, Accuracy) {
    966 	if debugFloat {
    967 		x.validate()
    968 	}
    969 
    970 	switch x.form {
    971 	case finite:
    972 		// 0 < |x| < +Inf
    973 
    974 		const (
    975 			fbits = 64                //        float size
    976 			mbits = 52                //        mantissa size (excluding implicit msb)
    977 			ebits = fbits - mbits - 1 //    11  exponent size
    978 			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
    979 			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
    980 			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
    981 			emax  = bias              //  1023  largest unbiased exponent (normal)
    982 		)
    983 
    984 		// Float mantissa m is 0.5 <= m < 1.0; compute exponent for floatxx mantissa.
    985 		e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
    986 		p := mbits + 1 // precision of normal float
    987 
    988 		// If the exponent is too small, we may have a denormal number
    989 		// in which case we have fewer mantissa bits available: reduce
    990 		// precision accordingly.
    991 		if e < emin {
    992 			p -= emin - int(e)
    993 			// Make sure we have at least 1 bit so that we don't
    994 			// lose numbers rounded up to the smallest denormal.
    995 			if p < 1 {
    996 				p = 1
    997 			}
    998 		}
    999 
   1000 		// round
   1001 		var r Float
   1002 		r.prec = uint32(p)
   1003 		r.Set(x)
   1004 		e = r.exp - 1
   1005 
   1006 		// Rounding may have caused r to overflow to Inf
   1007 		// (rounding never causes underflows to 0).
   1008 		if r.form == inf {
   1009 			e = emax + 1 // cause overflow below
   1010 		}
   1011 
   1012 		// If the exponent is too large, overflow to Inf.
   1013 		if e > emax {
   1014 			// overflow
   1015 			if x.neg {
   1016 				return math.Inf(-1), Below
   1017 			}
   1018 			return math.Inf(+1), Above
   1019 		}
   1020 		// e <= emax
   1021 
   1022 		// Determine sign, biased exponent, and mantissa.
   1023 		var sign, bexp, mant uint64
   1024 		if x.neg {
   1025 			sign = 1 << (fbits - 1)
   1026 		}
   1027 
   1028 		// Rounding may have caused a denormal number to
   1029 		// become normal. Check again.
   1030 		if e < emin {
   1031 			// denormal number
   1032 			if e < dmin {
   1033 				// underflow to 0
   1034 				if x.neg {
   1035 					var z float64
   1036 					return -z, Above
   1037 				}
   1038 				return 0.0, Below
   1039 			}
   1040 			// bexp = 0
   1041 			mant = msb64(r.mant) >> (fbits - r.prec)
   1042 		} else {
   1043 			// normal number: emin <= e <= emax
   1044 			bexp = uint64(e+bias) << mbits
   1045 			mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
   1046 		}
   1047 
   1048 		return math.Float64frombits(sign | bexp | mant), r.acc
   1049 
   1050 	case zero:
   1051 		if x.neg {
   1052 			var z float64
   1053 			return -z, Exact
   1054 		}
   1055 		return 0.0, Exact
   1056 
   1057 	case inf:
   1058 		if x.neg {
   1059 			return math.Inf(-1), Exact
   1060 		}
   1061 		return math.Inf(+1), Exact
   1062 	}
   1063 
   1064 	panic("unreachable")
   1065 }
   1066 
   1067 // Int returns the result of truncating x towards zero;
   1068 // or nil if x is an infinity.
   1069 // The result is Exact if x.IsInt(); otherwise it is Below
   1070 // for x > 0, and Above for x < 0.
   1071 // If a non-nil *Int argument z is provided, Int stores
   1072 // the result in z instead of allocating a new Int.
   1073 func (x *Float) Int(z *Int) (*Int, Accuracy) {
   1074 	if debugFloat {
   1075 		x.validate()
   1076 	}
   1077 
   1078 	if z == nil && x.form <= finite {
   1079 		z = new(Int)
   1080 	}
   1081 
   1082 	switch x.form {
   1083 	case finite:
   1084 		// 0 < |x| < +Inf
   1085 		acc := makeAcc(x.neg)
   1086 		if x.exp <= 0 {
   1087 			// 0 < |x| < 1
   1088 			return z.SetInt64(0), acc
   1089 		}
   1090 		// x.exp > 0
   1091 
   1092 		// 1 <= |x| < +Inf
   1093 		// determine minimum required precision for x
   1094 		allBits := uint(len(x.mant)) * _W
   1095 		exp := uint(x.exp)
   1096 		if x.MinPrec() <= exp {
   1097 			acc = Exact
   1098 		}
   1099 		// shift mantissa as needed
   1100 		if z == nil {
   1101 			z = new(Int)
   1102 		}
   1103 		z.neg = x.neg
   1104 		switch {
   1105 		case exp > allBits:
   1106 			z.abs = z.abs.shl(x.mant, exp-allBits)
   1107 		default:
   1108 			z.abs = z.abs.set(x.mant)
   1109 		case exp < allBits:
   1110 			z.abs = z.abs.shr(x.mant, allBits-exp)
   1111 		}
   1112 		return z, acc
   1113 
   1114 	case zero:
   1115 		return z.SetInt64(0), Exact
   1116 
   1117 	case inf:
   1118 		return nil, makeAcc(x.neg)
   1119 	}
   1120 
   1121 	panic("unreachable")
   1122 }
   1123 
   1124 // Rat returns the rational number corresponding to x;
   1125 // or nil if x is an infinity.
   1126 // The result is Exact is x is not an Inf.
   1127 // If a non-nil *Rat argument z is provided, Rat stores
   1128 // the result in z instead of allocating a new Rat.
   1129 func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
   1130 	if debugFloat {
   1131 		x.validate()
   1132 	}
   1133 
   1134 	if z == nil && x.form <= finite {
   1135 		z = new(Rat)
   1136 	}
   1137 
   1138 	switch x.form {
   1139 	case finite:
   1140 		// 0 < |x| < +Inf
   1141 		allBits := int32(len(x.mant)) * _W
   1142 		// build up numerator and denominator
   1143 		z.a.neg = x.neg
   1144 		switch {
   1145 		case x.exp > allBits:
   1146 			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
   1147 			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
   1148 			// z already in normal form
   1149 		default:
   1150 			z.a.abs = z.a.abs.set(x.mant)
   1151 			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
   1152 			// z already in normal form
   1153 		case x.exp < allBits:
   1154 			z.a.abs = z.a.abs.set(x.mant)
   1155 			t := z.b.abs.setUint64(1)
   1156 			z.b.abs = t.shl(t, uint(allBits-x.exp))
   1157 			z.norm()
   1158 		}
   1159 		return z, Exact
   1160 
   1161 	case zero:
   1162 		return z.SetInt64(0), Exact
   1163 
   1164 	case inf:
   1165 		return nil, makeAcc(x.neg)
   1166 	}
   1167 
   1168 	panic("unreachable")
   1169 }
   1170 
   1171 // Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
   1172 // and returns z.
   1173 func (z *Float) Abs(x *Float) *Float {
   1174 	z.Set(x)
   1175 	z.neg = false
   1176 	return z
   1177 }
   1178 
   1179 // Neg sets z to the (possibly rounded) value of x with its sign negated,
   1180 // and returns z.
   1181 func (z *Float) Neg(x *Float) *Float {
   1182 	z.Set(x)
   1183 	z.neg = !z.neg
   1184 	return z
   1185 }
   1186 
   1187 func validateBinaryOperands(x, y *Float) {
   1188 	if !debugFloat {
   1189 		// avoid performance bugs
   1190 		panic("validateBinaryOperands called but debugFloat is not set")
   1191 	}
   1192 	if len(x.mant) == 0 {
   1193 		panic("empty mantissa for x")
   1194 	}
   1195 	if len(y.mant) == 0 {
   1196 		panic("empty mantissa for y")
   1197 	}
   1198 }
   1199 
   1200 // z = x + y, ignoring signs of x and y for the addition
   1201 // but using the sign of z for rounding the result.
   1202 // x and y must have a non-empty mantissa and valid exponent.
   1203 func (z *Float) uadd(x, y *Float) {
   1204 	// Note: This implementation requires 2 shifts most of the
   1205 	// time. It is also inefficient if exponents or precisions
   1206 	// differ by wide margins. The following article describes
   1207 	// an efficient (but much more complicated) implementation
   1208 	// compatible with the internal representation used here:
   1209 	//
   1210 	// Vincent Lefvre: "The Generic Multiple-Precision Floating-
   1211 	// Point Addition With Exact Rounding (as in the MPFR Library)"
   1212 	// http://www.vinc17.net/research/papers/rnc6.pdf
   1213 
   1214 	if debugFloat {
   1215 		validateBinaryOperands(x, y)
   1216 	}
   1217 
   1218 	// compute exponents ex, ey for mantissa with "binary point"
   1219 	// on the right (mantissa.0) - use int64 to avoid overflow
   1220 	ex := int64(x.exp) - int64(len(x.mant))*_W
   1221 	ey := int64(y.exp) - int64(len(y.mant))*_W
   1222 
   1223 	// TODO(gri) having a combined add-and-shift primitive
   1224 	//           could make this code significantly faster
   1225 	switch {
   1226 	case ex < ey:
   1227 		// cannot re-use z.mant w/o testing for aliasing
   1228 		t := nat(nil).shl(y.mant, uint(ey-ex))
   1229 		z.mant = z.mant.add(x.mant, t)
   1230 	default:
   1231 		// ex == ey, no shift needed
   1232 		z.mant = z.mant.add(x.mant, y.mant)
   1233 	case ex > ey:
   1234 		// cannot re-use z.mant w/o testing for aliasing
   1235 		t := nat(nil).shl(x.mant, uint(ex-ey))
   1236 		z.mant = z.mant.add(t, y.mant)
   1237 		ex = ey
   1238 	}
   1239 	// len(z.mant) > 0
   1240 
   1241 	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
   1242 }
   1243 
   1244 // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
   1245 // but using the sign of z for rounding the result.
   1246 // x and y must have a non-empty mantissa and valid exponent.
   1247 func (z *Float) usub(x, y *Float) {
   1248 	// This code is symmetric to uadd.
   1249 	// We have not factored the common code out because
   1250 	// eventually uadd (and usub) should be optimized
   1251 	// by special-casing, and the code will diverge.
   1252 
   1253 	if debugFloat {
   1254 		validateBinaryOperands(x, y)
   1255 	}
   1256 
   1257 	ex := int64(x.exp) - int64(len(x.mant))*_W
   1258 	ey := int64(y.exp) - int64(len(y.mant))*_W
   1259 
   1260 	switch {
   1261 	case ex < ey:
   1262 		// cannot re-use z.mant w/o testing for aliasing
   1263 		t := nat(nil).shl(y.mant, uint(ey-ex))
   1264 		z.mant = t.sub(x.mant, t)
   1265 	default:
   1266 		// ex == ey, no shift needed
   1267 		z.mant = z.mant.sub(x.mant, y.mant)
   1268 	case ex > ey:
   1269 		// cannot re-use z.mant w/o testing for aliasing
   1270 		t := nat(nil).shl(x.mant, uint(ex-ey))
   1271 		z.mant = t.sub(t, y.mant)
   1272 		ex = ey
   1273 	}
   1274 
   1275 	// operands may have cancelled each other out
   1276 	if len(z.mant) == 0 {
   1277 		z.acc = Exact
   1278 		z.form = zero
   1279 		z.neg = false
   1280 		return
   1281 	}
   1282 	// len(z.mant) > 0
   1283 
   1284 	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
   1285 }
   1286 
   1287 // z = x * y, ignoring signs of x and y for the multiplication
   1288 // but using the sign of z for rounding the result.
   1289 // x and y must have a non-empty mantissa and valid exponent.
   1290 func (z *Float) umul(x, y *Float) {
   1291 	if debugFloat {
   1292 		validateBinaryOperands(x, y)
   1293 	}
   1294 
   1295 	// Note: This is doing too much work if the precision
   1296 	// of z is less than the sum of the precisions of x
   1297 	// and y which is often the case (e.g., if all floats
   1298 	// have the same precision).
   1299 	// TODO(gri) Optimize this for the common case.
   1300 
   1301 	e := int64(x.exp) + int64(y.exp)
   1302 	z.mant = z.mant.mul(x.mant, y.mant)
   1303 
   1304 	z.setExpAndRound(e-fnorm(z.mant), 0)
   1305 }
   1306 
   1307 // z = x / y, ignoring signs of x and y for the division
   1308 // but using the sign of z for rounding the result.
   1309 // x and y must have a non-empty mantissa and valid exponent.
   1310 func (z *Float) uquo(x, y *Float) {
   1311 	if debugFloat {
   1312 		validateBinaryOperands(x, y)
   1313 	}
   1314 
   1315 	// mantissa length in words for desired result precision + 1
   1316 	// (at least one extra bit so we get the rounding bit after
   1317 	// the division)
   1318 	n := int(z.prec/_W) + 1
   1319 
   1320 	// compute adjusted x.mant such that we get enough result precision
   1321 	xadj := x.mant
   1322 	if d := n - len(x.mant) + len(y.mant); d > 0 {
   1323 		// d extra words needed => add d "0 digits" to x
   1324 		xadj = make(nat, len(x.mant)+d)
   1325 		copy(xadj[d:], x.mant)
   1326 	}
   1327 	// TODO(gri): If we have too many digits (d < 0), we should be able
   1328 	// to shorten x for faster division. But we must be extra careful
   1329 	// with rounding in that case.
   1330 
   1331 	// Compute d before division since there may be aliasing of x.mant
   1332 	// (via xadj) or y.mant with z.mant.
   1333 	d := len(xadj) - len(y.mant)
   1334 
   1335 	// divide
   1336 	var r nat
   1337 	z.mant, r = z.mant.div(nil, xadj, y.mant)
   1338 	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
   1339 
   1340 	// The result is long enough to include (at least) the rounding bit.
   1341 	// If there's a non-zero remainder, the corresponding fractional part
   1342 	// (if it were computed), would have a non-zero sticky bit (if it were
   1343 	// zero, it couldn't have a non-zero remainder).
   1344 	var sbit uint
   1345 	if len(r) > 0 {
   1346 		sbit = 1
   1347 	}
   1348 
   1349 	z.setExpAndRound(e-fnorm(z.mant), sbit)
   1350 }
   1351 
   1352 // ucmp returns -1, 0, or +1, depending on whether
   1353 // |x| < |y|, |x| == |y|, or |x| > |y|.
   1354 // x and y must have a non-empty mantissa and valid exponent.
   1355 func (x *Float) ucmp(y *Float) int {
   1356 	if debugFloat {
   1357 		validateBinaryOperands(x, y)
   1358 	}
   1359 
   1360 	switch {
   1361 	case x.exp < y.exp:
   1362 		return -1
   1363 	case x.exp > y.exp:
   1364 		return +1
   1365 	}
   1366 	// x.exp == y.exp
   1367 
   1368 	// compare mantissas
   1369 	i := len(x.mant)
   1370 	j := len(y.mant)
   1371 	for i > 0 || j > 0 {
   1372 		var xm, ym Word
   1373 		if i > 0 {
   1374 			i--
   1375 			xm = x.mant[i]
   1376 		}
   1377 		if j > 0 {
   1378 			j--
   1379 			ym = y.mant[j]
   1380 		}
   1381 		switch {
   1382 		case xm < ym:
   1383 			return -1
   1384 		case xm > ym:
   1385 			return +1
   1386 		}
   1387 	}
   1388 
   1389 	return 0
   1390 }
   1391 
   1392 // Handling of sign bit as defined by IEEE 754-2008, section 6.3:
   1393 //
   1394 // When neither the inputs nor result are NaN, the sign of a product or
   1395 // quotient is the exclusive OR of the operands signs; the sign of a sum,
   1396 // or of a difference xy regarded as a sum x+(y), differs from at most
   1397 // one of the addends signs; and the sign of the result of conversions,
   1398 // the quantize operation, the roundToIntegral operations, and the
   1399 // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
   1400 // These rules shall apply even when operands or results are zero or infinite.
   1401 //
   1402 // When the sum of two operands with opposite signs (or the difference of
   1403 // two operands with like signs) is exactly zero, the sign of that sum (or
   1404 // difference) shall be +0 in all rounding-direction attributes except
   1405 // roundTowardNegative; under that attribute, the sign of an exact zero
   1406 // sum (or difference) shall be 0. However, x+x = x(x) retains the same
   1407 // sign as x even when x is zero.
   1408 //
   1409 // See also: https://play.golang.org/p/RtH3UCt5IH
   1410 
   1411 // Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
   1412 // it is changed to the larger of x's or y's precision before the operation.
   1413 // Rounding is performed according to z's precision and rounding mode; and
   1414 // z's accuracy reports the result error relative to the exact (not rounded)
   1415 // result. Add panics with ErrNaN if x and y are infinities with opposite
   1416 // signs. The value of z is undefined in that case.
   1417 //
   1418 // BUG(gri) When rounding ToNegativeInf, the sign of Float values rounded to 0 is incorrect.
   1419 func (z *Float) Add(x, y *Float) *Float {
   1420 	if debugFloat {
   1421 		x.validate()
   1422 		y.validate()
   1423 	}
   1424 
   1425 	if z.prec == 0 {
   1426 		z.prec = umax32(x.prec, y.prec)
   1427 	}
   1428 
   1429 	if x.form == finite && y.form == finite {
   1430 		// x + y (commom case)
   1431 		z.neg = x.neg
   1432 		if x.neg == y.neg {
   1433 			// x + y == x + y
   1434 			// (-x) + (-y) == -(x + y)
   1435 			z.uadd(x, y)
   1436 		} else {
   1437 			// x + (-y) == x - y == -(y - x)
   1438 			// (-x) + y == y - x == -(x - y)
   1439 			if x.ucmp(y) > 0 {
   1440 				z.usub(x, y)
   1441 			} else {
   1442 				z.neg = !z.neg
   1443 				z.usub(y, x)
   1444 			}
   1445 		}
   1446 		return z
   1447 	}
   1448 
   1449 	if x.form == inf && y.form == inf && x.neg != y.neg {
   1450 		// +Inf + -Inf
   1451 		// -Inf + +Inf
   1452 		// value of z is undefined but make sure it's valid
   1453 		z.acc = Exact
   1454 		z.form = zero
   1455 		z.neg = false
   1456 		panic(ErrNaN{"addition of infinities with opposite signs"})
   1457 	}
   1458 
   1459 	if x.form == zero && y.form == zero {
   1460 		// 0 + 0
   1461 		z.acc = Exact
   1462 		z.form = zero
   1463 		z.neg = x.neg && y.neg // -0 + -0 == -0
   1464 		return z
   1465 	}
   1466 
   1467 	if x.form == inf || y.form == zero {
   1468 		// Inf + y
   1469 		// x + 0
   1470 		return z.Set(x)
   1471 	}
   1472 
   1473 	// 0 + y
   1474 	// x + Inf
   1475 	return z.Set(y)
   1476 }
   1477 
   1478 // Sub sets z to the rounded difference x-y and returns z.
   1479 // Precision, rounding, and accuracy reporting are as for Add.
   1480 // Sub panics with ErrNaN if x and y are infinities with equal
   1481 // signs. The value of z is undefined in that case.
   1482 func (z *Float) Sub(x, y *Float) *Float {
   1483 	if debugFloat {
   1484 		x.validate()
   1485 		y.validate()
   1486 	}
   1487 
   1488 	if z.prec == 0 {
   1489 		z.prec = umax32(x.prec, y.prec)
   1490 	}
   1491 
   1492 	if x.form == finite && y.form == finite {
   1493 		// x - y (common case)
   1494 		z.neg = x.neg
   1495 		if x.neg != y.neg {
   1496 			// x - (-y) == x + y
   1497 			// (-x) - y == -(x + y)
   1498 			z.uadd(x, y)
   1499 		} else {
   1500 			// x - y == x - y == -(y - x)
   1501 			// (-x) - (-y) == y - x == -(x - y)
   1502 			if x.ucmp(y) > 0 {
   1503 				z.usub(x, y)
   1504 			} else {
   1505 				z.neg = !z.neg
   1506 				z.usub(y, x)
   1507 			}
   1508 		}
   1509 		return z
   1510 	}
   1511 
   1512 	if x.form == inf && y.form == inf && x.neg == y.neg {
   1513 		// +Inf - +Inf
   1514 		// -Inf - -Inf
   1515 		// value of z is undefined but make sure it's valid
   1516 		z.acc = Exact
   1517 		z.form = zero
   1518 		z.neg = false
   1519 		panic(ErrNaN{"subtraction of infinities with equal signs"})
   1520 	}
   1521 
   1522 	if x.form == zero && y.form == zero {
   1523 		// 0 - 0
   1524 		z.acc = Exact
   1525 		z.form = zero
   1526 		z.neg = x.neg && !y.neg // -0 - +0 == -0
   1527 		return z
   1528 	}
   1529 
   1530 	if x.form == inf || y.form == zero {
   1531 		// Inf - y
   1532 		// x - 0
   1533 		return z.Set(x)
   1534 	}
   1535 
   1536 	// 0 - y
   1537 	// x - Inf
   1538 	return z.Neg(y)
   1539 }
   1540 
   1541 // Mul sets z to the rounded product x*y and returns z.
   1542 // Precision, rounding, and accuracy reporting are as for Add.
   1543 // Mul panics with ErrNaN if one operand is zero and the other
   1544 // operand an infinity. The value of z is undefined in that case.
   1545 func (z *Float) Mul(x, y *Float) *Float {
   1546 	if debugFloat {
   1547 		x.validate()
   1548 		y.validate()
   1549 	}
   1550 
   1551 	if z.prec == 0 {
   1552 		z.prec = umax32(x.prec, y.prec)
   1553 	}
   1554 
   1555 	z.neg = x.neg != y.neg
   1556 
   1557 	if x.form == finite && y.form == finite {
   1558 		// x * y (common case)
   1559 		z.umul(x, y)
   1560 		return z
   1561 	}
   1562 
   1563 	z.acc = Exact
   1564 	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
   1565 		// 0 * Inf
   1566 		// Inf * 0
   1567 		// value of z is undefined but make sure it's valid
   1568 		z.form = zero
   1569 		z.neg = false
   1570 		panic(ErrNaN{"multiplication of zero with infinity"})
   1571 	}
   1572 
   1573 	if x.form == inf || y.form == inf {
   1574 		// Inf * y
   1575 		// x * Inf
   1576 		z.form = inf
   1577 		return z
   1578 	}
   1579 
   1580 	// 0 * y
   1581 	// x * 0
   1582 	z.form = zero
   1583 	return z
   1584 }
   1585 
   1586 // Quo sets z to the rounded quotient x/y and returns z.
   1587 // Precision, rounding, and accuracy reporting are as for Add.
   1588 // Quo panics with ErrNaN if both operands are zero or infinities.
   1589 // The value of z is undefined in that case.
   1590 func (z *Float) Quo(x, y *Float) *Float {
   1591 	if debugFloat {
   1592 		x.validate()
   1593 		y.validate()
   1594 	}
   1595 
   1596 	if z.prec == 0 {
   1597 		z.prec = umax32(x.prec, y.prec)
   1598 	}
   1599 
   1600 	z.neg = x.neg != y.neg
   1601 
   1602 	if x.form == finite && y.form == finite {
   1603 		// x / y (common case)
   1604 		z.uquo(x, y)
   1605 		return z
   1606 	}
   1607 
   1608 	z.acc = Exact
   1609 	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
   1610 		// 0 / 0
   1611 		// Inf / Inf
   1612 		// value of z is undefined but make sure it's valid
   1613 		z.form = zero
   1614 		z.neg = false
   1615 		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
   1616 	}
   1617 
   1618 	if x.form == zero || y.form == inf {
   1619 		// 0 / y
   1620 		// x / Inf
   1621 		z.form = zero
   1622 		return z
   1623 	}
   1624 
   1625 	// x / 0
   1626 	// Inf / y
   1627 	z.form = inf
   1628 	return z
   1629 }
   1630 
   1631 // Cmp compares x and y and returns:
   1632 //
   1633 //   -1 if x <  y
   1634 //    0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
   1635 //   +1 if x >  y
   1636 //
   1637 func (x *Float) Cmp(y *Float) int {
   1638 	if debugFloat {
   1639 		x.validate()
   1640 		y.validate()
   1641 	}
   1642 
   1643 	mx := x.ord()
   1644 	my := y.ord()
   1645 	switch {
   1646 	case mx < my:
   1647 		return -1
   1648 	case mx > my:
   1649 		return +1
   1650 	}
   1651 	// mx == my
   1652 
   1653 	// only if |mx| == 1 we have to compare the mantissae
   1654 	switch mx {
   1655 	case -1:
   1656 		return y.ucmp(x)
   1657 	case +1:
   1658 		return x.ucmp(y)
   1659 	}
   1660 
   1661 	return 0
   1662 }
   1663 
   1664 // ord classifies x and returns:
   1665 //
   1666 //	-2 if -Inf == x
   1667 //	-1 if -Inf < x < 0
   1668 //	 0 if x == 0 (signed or unsigned)
   1669 //	+1 if 0 < x < +Inf
   1670 //	+2 if x == +Inf
   1671 //
   1672 func (x *Float) ord() int {
   1673 	var m int
   1674 	switch x.form {
   1675 	case finite:
   1676 		m = 1
   1677 	case zero:
   1678 		return 0
   1679 	case inf:
   1680 		m = 2
   1681 	}
   1682 	if x.neg {
   1683 		m = -m
   1684 	}
   1685 	return m
   1686 }
   1687 
   1688 func umax32(x, y uint32) uint32 {
   1689 	if x > y {
   1690 		return x
   1691 	}
   1692 	return y
   1693 }
   1694