Home | History | Annotate | Download | only in big
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // This file implements unsigned multi-precision integers (natural
      6 // numbers). They are the building blocks for the implementation
      7 // of signed integers, rationals, and floating-point numbers.
      8 
      9 package big
     10 
     11 import (
     12 	"math/rand"
     13 	"sync"
     14 )
     15 
     16 // An unsigned integer x of the form
     17 //
     18 //   x = x[n-1]*_B^(n-1) + x[n-2]*_B^(n-2) + ... + x[1]*_B + x[0]
     19 //
     20 // with 0 <= x[i] < _B and 0 <= i < n is stored in a slice of length n,
     21 // with the digits x[i] as the slice elements.
     22 //
     23 // A number is normalized if the slice contains no leading 0 digits.
     24 // During arithmetic operations, denormalized values may occur but are
     25 // always normalized before returning the final result. The normalized
     26 // representation of 0 is the empty or nil slice (length = 0).
     27 //
     28 type nat []Word
     29 
     30 var (
     31 	natOne = nat{1}
     32 	natTwo = nat{2}
     33 	natTen = nat{10}
     34 )
     35 
     36 func (z nat) clear() {
     37 	for i := range z {
     38 		z[i] = 0
     39 	}
     40 }
     41 
     42 func (z nat) norm() nat {
     43 	i := len(z)
     44 	for i > 0 && z[i-1] == 0 {
     45 		i--
     46 	}
     47 	return z[0:i]
     48 }
     49 
     50 func (z nat) make(n int) nat {
     51 	if n <= cap(z) {
     52 		return z[:n] // reuse z
     53 	}
     54 	// Choosing a good value for e has significant performance impact
     55 	// because it increases the chance that a value can be reused.
     56 	const e = 4 // extra capacity
     57 	return make(nat, n, n+e)
     58 }
     59 
     60 func (z nat) setWord(x Word) nat {
     61 	if x == 0 {
     62 		return z[:0]
     63 	}
     64 	z = z.make(1)
     65 	z[0] = x
     66 	return z
     67 }
     68 
     69 func (z nat) setUint64(x uint64) nat {
     70 	// single-digit values
     71 	if w := Word(x); uint64(w) == x {
     72 		return z.setWord(w)
     73 	}
     74 
     75 	// compute number of words n required to represent x
     76 	n := 0
     77 	for t := x; t > 0; t >>= _W {
     78 		n++
     79 	}
     80 
     81 	// split x into n words
     82 	z = z.make(n)
     83 	for i := range z {
     84 		z[i] = Word(x & _M)
     85 		x >>= _W
     86 	}
     87 
     88 	return z
     89 }
     90 
     91 func (z nat) set(x nat) nat {
     92 	z = z.make(len(x))
     93 	copy(z, x)
     94 	return z
     95 }
     96 
     97 func (z nat) add(x, y nat) nat {
     98 	m := len(x)
     99 	n := len(y)
    100 
    101 	switch {
    102 	case m < n:
    103 		return z.add(y, x)
    104 	case m == 0:
    105 		// n == 0 because m >= n; result is 0
    106 		return z[:0]
    107 	case n == 0:
    108 		// result is x
    109 		return z.set(x)
    110 	}
    111 	// m > 0
    112 
    113 	z = z.make(m + 1)
    114 	c := addVV(z[0:n], x, y)
    115 	if m > n {
    116 		c = addVW(z[n:m], x[n:], c)
    117 	}
    118 	z[m] = c
    119 
    120 	return z.norm()
    121 }
    122 
    123 func (z nat) sub(x, y nat) nat {
    124 	m := len(x)
    125 	n := len(y)
    126 
    127 	switch {
    128 	case m < n:
    129 		panic("underflow")
    130 	case m == 0:
    131 		// n == 0 because m >= n; result is 0
    132 		return z[:0]
    133 	case n == 0:
    134 		// result is x
    135 		return z.set(x)
    136 	}
    137 	// m > 0
    138 
    139 	z = z.make(m)
    140 	c := subVV(z[0:n], x, y)
    141 	if m > n {
    142 		c = subVW(z[n:], x[n:], c)
    143 	}
    144 	if c != 0 {
    145 		panic("underflow")
    146 	}
    147 
    148 	return z.norm()
    149 }
    150 
    151 func (x nat) cmp(y nat) (r int) {
    152 	m := len(x)
    153 	n := len(y)
    154 	if m != n || m == 0 {
    155 		switch {
    156 		case m < n:
    157 			r = -1
    158 		case m > n:
    159 			r = 1
    160 		}
    161 		return
    162 	}
    163 
    164 	i := m - 1
    165 	for i > 0 && x[i] == y[i] {
    166 		i--
    167 	}
    168 
    169 	switch {
    170 	case x[i] < y[i]:
    171 		r = -1
    172 	case x[i] > y[i]:
    173 		r = 1
    174 	}
    175 	return
    176 }
    177 
    178 func (z nat) mulAddWW(x nat, y, r Word) nat {
    179 	m := len(x)
    180 	if m == 0 || y == 0 {
    181 		return z.setWord(r) // result is r
    182 	}
    183 	// m > 0
    184 
    185 	z = z.make(m + 1)
    186 	z[m] = mulAddVWW(z[0:m], x, y, r)
    187 
    188 	return z.norm()
    189 }
    190 
    191 // basicMul multiplies x and y and leaves the result in z.
    192 // The (non-normalized) result is placed in z[0 : len(x) + len(y)].
    193 func basicMul(z, x, y nat) {
    194 	z[0 : len(x)+len(y)].clear() // initialize z
    195 	for i, d := range y {
    196 		if d != 0 {
    197 			z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d)
    198 		}
    199 	}
    200 }
    201 
    202 // montgomery computes z mod m = x*y*2**(-n*_W) mod m,
    203 // assuming k = -1/m mod 2**_W.
    204 // z is used for storing the result which is returned;
    205 // z must not alias x, y or m.
    206 // See Gueron, "Efficient Software Implementations of Modular Exponentiation".
    207 // https://eprint.iacr.org/2011/239.pdf
    208 // In the terminology of that paper, this is an "Almost Montgomery Multiplication":
    209 // x and y are required to satisfy 0 <= z < 2**(n*_W) and then the result
    210 // z is guaranteed to satisfy 0 <= z < 2**(n*_W), but it may not be < m.
    211 func (z nat) montgomery(x, y, m nat, k Word, n int) nat {
    212 	// This code assumes x, y, m are all the same length, n.
    213 	// (required by addMulVVW and the for loop).
    214 	// It also assumes that x, y are already reduced mod m,
    215 	// or else the result will not be properly reduced.
    216 	if len(x) != n || len(y) != n || len(m) != n {
    217 		panic("math/big: mismatched montgomery number lengths")
    218 	}
    219 	z = z.make(n)
    220 	z.clear()
    221 	var c Word
    222 	for i := 0; i < n; i++ {
    223 		d := y[i]
    224 		c2 := addMulVVW(z, x, d)
    225 		t := z[0] * k
    226 		c3 := addMulVVW(z, m, t)
    227 		copy(z, z[1:])
    228 		cx := c + c2
    229 		cy := cx + c3
    230 		z[n-1] = cy
    231 		if cx < c2 || cy < c3 {
    232 			c = 1
    233 		} else {
    234 			c = 0
    235 		}
    236 	}
    237 	if c != 0 {
    238 		subVV(z, z, m)
    239 	}
    240 	return z
    241 }
    242 
    243 // Fast version of z[0:n+n>>1].add(z[0:n+n>>1], x[0:n]) w/o bounds checks.
    244 // Factored out for readability - do not use outside karatsuba.
    245 func karatsubaAdd(z, x nat, n int) {
    246 	if c := addVV(z[0:n], z, x); c != 0 {
    247 		addVW(z[n:n+n>>1], z[n:], c)
    248 	}
    249 }
    250 
    251 // Like karatsubaAdd, but does subtract.
    252 func karatsubaSub(z, x nat, n int) {
    253 	if c := subVV(z[0:n], z, x); c != 0 {
    254 		subVW(z[n:n+n>>1], z[n:], c)
    255 	}
    256 }
    257 
    258 // Operands that are shorter than karatsubaThreshold are multiplied using
    259 // "grade school" multiplication; for longer operands the Karatsuba algorithm
    260 // is used.
    261 var karatsubaThreshold int = 40 // computed by calibrate.go
    262 
    263 // karatsuba multiplies x and y and leaves the result in z.
    264 // Both x and y must have the same length n and n must be a
    265 // power of 2. The result vector z must have len(z) >= 6*n.
    266 // The (non-normalized) result is placed in z[0 : 2*n].
    267 func karatsuba(z, x, y nat) {
    268 	n := len(y)
    269 
    270 	// Switch to basic multiplication if numbers are odd or small.
    271 	// (n is always even if karatsubaThreshold is even, but be
    272 	// conservative)
    273 	if n&1 != 0 || n < karatsubaThreshold || n < 2 {
    274 		basicMul(z, x, y)
    275 		return
    276 	}
    277 	// n&1 == 0 && n >= karatsubaThreshold && n >= 2
    278 
    279 	// Karatsuba multiplication is based on the observation that
    280 	// for two numbers x and y with:
    281 	//
    282 	//   x = x1*b + x0
    283 	//   y = y1*b + y0
    284 	//
    285 	// the product x*y can be obtained with 3 products z2, z1, z0
    286 	// instead of 4:
    287 	//
    288 	//   x*y = x1*y1*b*b + (x1*y0 + x0*y1)*b + x0*y0
    289 	//       =    z2*b*b +              z1*b +    z0
    290 	//
    291 	// with:
    292 	//
    293 	//   xd = x1 - x0
    294 	//   yd = y0 - y1
    295 	//
    296 	//   z1 =      xd*yd                    + z2 + z0
    297 	//      = (x1-x0)*(y0 - y1)             + z2 + z0
    298 	//      = x1*y0 - x1*y1 - x0*y0 + x0*y1 + z2 + z0
    299 	//      = x1*y0 -    z2 -    z0 + x0*y1 + z2 + z0
    300 	//      = x1*y0                 + x0*y1
    301 
    302 	// split x, y into "digits"
    303 	n2 := n >> 1              // n2 >= 1
    304 	x1, x0 := x[n2:], x[0:n2] // x = x1*b + y0
    305 	y1, y0 := y[n2:], y[0:n2] // y = y1*b + y0
    306 
    307 	// z is used for the result and temporary storage:
    308 	//
    309 	//   6*n     5*n     4*n     3*n     2*n     1*n     0*n
    310 	// z = [z2 copy|z0 copy| xd*yd | yd:xd | x1*y1 | x0*y0 ]
    311 	//
    312 	// For each recursive call of karatsuba, an unused slice of
    313 	// z is passed in that has (at least) half the length of the
    314 	// caller's z.
    315 
    316 	// compute z0 and z2 with the result "in place" in z
    317 	karatsuba(z, x0, y0)     // z0 = x0*y0
    318 	karatsuba(z[n:], x1, y1) // z2 = x1*y1
    319 
    320 	// compute xd (or the negative value if underflow occurs)
    321 	s := 1 // sign of product xd*yd
    322 	xd := z[2*n : 2*n+n2]
    323 	if subVV(xd, x1, x0) != 0 { // x1-x0
    324 		s = -s
    325 		subVV(xd, x0, x1) // x0-x1
    326 	}
    327 
    328 	// compute yd (or the negative value if underflow occurs)
    329 	yd := z[2*n+n2 : 3*n]
    330 	if subVV(yd, y0, y1) != 0 { // y0-y1
    331 		s = -s
    332 		subVV(yd, y1, y0) // y1-y0
    333 	}
    334 
    335 	// p = (x1-x0)*(y0-y1) == x1*y0 - x1*y1 - x0*y0 + x0*y1 for s > 0
    336 	// p = (x0-x1)*(y0-y1) == x0*y0 - x0*y1 - x1*y0 + x1*y1 for s < 0
    337 	p := z[n*3:]
    338 	karatsuba(p, xd, yd)
    339 
    340 	// save original z2:z0
    341 	// (ok to use upper half of z since we're done recursing)
    342 	r := z[n*4:]
    343 	copy(r, z[:n*2])
    344 
    345 	// add up all partial products
    346 	//
    347 	//   2*n     n     0
    348 	// z = [ z2  | z0  ]
    349 	//   +    [ z0  ]
    350 	//   +    [ z2  ]
    351 	//   +    [  p  ]
    352 	//
    353 	karatsubaAdd(z[n2:], r, n)
    354 	karatsubaAdd(z[n2:], r[n:], n)
    355 	if s > 0 {
    356 		karatsubaAdd(z[n2:], p, n)
    357 	} else {
    358 		karatsubaSub(z[n2:], p, n)
    359 	}
    360 }
    361 
    362 // alias reports whether x and y share the same base array.
    363 func alias(x, y nat) bool {
    364 	return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1]
    365 }
    366 
    367 // addAt implements z += x<<(_W*i); z must be long enough.
    368 // (we don't use nat.add because we need z to stay the same
    369 // slice, and we don't need to normalize z after each addition)
    370 func addAt(z, x nat, i int) {
    371 	if n := len(x); n > 0 {
    372 		if c := addVV(z[i:i+n], z[i:], x); c != 0 {
    373 			j := i + n
    374 			if j < len(z) {
    375 				addVW(z[j:], z[j:], c)
    376 			}
    377 		}
    378 	}
    379 }
    380 
    381 func max(x, y int) int {
    382 	if x > y {
    383 		return x
    384 	}
    385 	return y
    386 }
    387 
    388 // karatsubaLen computes an approximation to the maximum k <= n such that
    389 // k = p<<i for a number p <= karatsubaThreshold and an i >= 0. Thus, the
    390 // result is the largest number that can be divided repeatedly by 2 before
    391 // becoming about the value of karatsubaThreshold.
    392 func karatsubaLen(n int) int {
    393 	i := uint(0)
    394 	for n > karatsubaThreshold {
    395 		n >>= 1
    396 		i++
    397 	}
    398 	return n << i
    399 }
    400 
    401 func (z nat) mul(x, y nat) nat {
    402 	m := len(x)
    403 	n := len(y)
    404 
    405 	switch {
    406 	case m < n:
    407 		return z.mul(y, x)
    408 	case m == 0 || n == 0:
    409 		return z[:0]
    410 	case n == 1:
    411 		return z.mulAddWW(x, y[0], 0)
    412 	}
    413 	// m >= n > 1
    414 
    415 	// determine if z can be reused
    416 	if alias(z, x) || alias(z, y) {
    417 		z = nil // z is an alias for x or y - cannot reuse
    418 	}
    419 
    420 	// use basic multiplication if the numbers are small
    421 	if n < karatsubaThreshold {
    422 		z = z.make(m + n)
    423 		basicMul(z, x, y)
    424 		return z.norm()
    425 	}
    426 	// m >= n && n >= karatsubaThreshold && n >= 2
    427 
    428 	// determine Karatsuba length k such that
    429 	//
    430 	//   x = xh*b + x0  (0 <= x0 < b)
    431 	//   y = yh*b + y0  (0 <= y0 < b)
    432 	//   b = 1<<(_W*k)  ("base" of digits xi, yi)
    433 	//
    434 	k := karatsubaLen(n)
    435 	// k <= n
    436 
    437 	// multiply x0 and y0 via Karatsuba
    438 	x0 := x[0:k]              // x0 is not normalized
    439 	y0 := y[0:k]              // y0 is not normalized
    440 	z = z.make(max(6*k, m+n)) // enough space for karatsuba of x0*y0 and full result of x*y
    441 	karatsuba(z, x0, y0)
    442 	z = z[0 : m+n]  // z has final length but may be incomplete
    443 	z[2*k:].clear() // upper portion of z is garbage (and 2*k <= m+n since k <= n <= m)
    444 
    445 	// If xh != 0 or yh != 0, add the missing terms to z. For
    446 	//
    447 	//   xh = xi*b^i + ... + x2*b^2 + x1*b (0 <= xi < b)
    448 	//   yh =                         y1*b (0 <= y1 < b)
    449 	//
    450 	// the missing terms are
    451 	//
    452 	//   x0*y1*b and xi*y0*b^i, xi*y1*b^(i+1) for i > 0
    453 	//
    454 	// since all the yi for i > 1 are 0 by choice of k: If any of them
    455 	// were > 0, then yh >= b^2 and thus y >= b^2. Then k' = k*2 would
    456 	// be a larger valid threshold contradicting the assumption about k.
    457 	//
    458 	if k < n || m != n {
    459 		var t nat
    460 
    461 		// add x0*y1*b
    462 		x0 := x0.norm()
    463 		y1 := y[k:]       // y1 is normalized because y is
    464 		t = t.mul(x0, y1) // update t so we don't lose t's underlying array
    465 		addAt(z, t, k)
    466 
    467 		// add xi*y0<<i, xi*y1*b<<(i+k)
    468 		y0 := y0.norm()
    469 		for i := k; i < len(x); i += k {
    470 			xi := x[i:]
    471 			if len(xi) > k {
    472 				xi = xi[:k]
    473 			}
    474 			xi = xi.norm()
    475 			t = t.mul(xi, y0)
    476 			addAt(z, t, i)
    477 			t = t.mul(xi, y1)
    478 			addAt(z, t, i+k)
    479 		}
    480 	}
    481 
    482 	return z.norm()
    483 }
    484 
    485 // mulRange computes the product of all the unsigned integers in the
    486 // range [a, b] inclusively. If a > b (empty range), the result is 1.
    487 func (z nat) mulRange(a, b uint64) nat {
    488 	switch {
    489 	case a == 0:
    490 		// cut long ranges short (optimization)
    491 		return z.setUint64(0)
    492 	case a > b:
    493 		return z.setUint64(1)
    494 	case a == b:
    495 		return z.setUint64(a)
    496 	case a+1 == b:
    497 		return z.mul(nat(nil).setUint64(a), nat(nil).setUint64(b))
    498 	}
    499 	m := (a + b) / 2
    500 	return z.mul(nat(nil).mulRange(a, m), nat(nil).mulRange(m+1, b))
    501 }
    502 
    503 // q = (x-r)/y, with 0 <= r < y
    504 func (z nat) divW(x nat, y Word) (q nat, r Word) {
    505 	m := len(x)
    506 	switch {
    507 	case y == 0:
    508 		panic("division by zero")
    509 	case y == 1:
    510 		q = z.set(x) // result is x
    511 		return
    512 	case m == 0:
    513 		q = z[:0] // result is 0
    514 		return
    515 	}
    516 	// m > 0
    517 	z = z.make(m)
    518 	r = divWVW(z, 0, x, y)
    519 	q = z.norm()
    520 	return
    521 }
    522 
    523 func (z nat) div(z2, u, v nat) (q, r nat) {
    524 	if len(v) == 0 {
    525 		panic("division by zero")
    526 	}
    527 
    528 	if u.cmp(v) < 0 {
    529 		q = z[:0]
    530 		r = z2.set(u)
    531 		return
    532 	}
    533 
    534 	if len(v) == 1 {
    535 		var r2 Word
    536 		q, r2 = z.divW(u, v[0])
    537 		r = z2.setWord(r2)
    538 		return
    539 	}
    540 
    541 	q, r = z.divLarge(z2, u, v)
    542 	return
    543 }
    544 
    545 // getNat returns a *nat of len n. The contents may not be zero.
    546 // The pool holds *nat to avoid allocation when converting to interface{}.
    547 func getNat(n int) *nat {
    548 	var z *nat
    549 	if v := natPool.Get(); v != nil {
    550 		z = v.(*nat)
    551 	}
    552 	if z == nil {
    553 		z = new(nat)
    554 	}
    555 	*z = z.make(n)
    556 	return z
    557 }
    558 
    559 func putNat(x *nat) {
    560 	natPool.Put(x)
    561 }
    562 
    563 var natPool sync.Pool
    564 
    565 // q = (uIn-r)/v, with 0 <= r < y
    566 // Uses z as storage for q, and u as storage for r if possible.
    567 // See Knuth, Volume 2, section 4.3.1, Algorithm D.
    568 // Preconditions:
    569 //    len(v) >= 2
    570 //    len(uIn) >= len(v)
    571 func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
    572 	n := len(v)
    573 	m := len(uIn) - n
    574 
    575 	// determine if z can be reused
    576 	// TODO(gri) should find a better solution - this if statement
    577 	//           is very costly (see e.g. time pidigits -s -n 10000)
    578 	if alias(z, uIn) || alias(z, v) {
    579 		z = nil // z is an alias for uIn or v - cannot reuse
    580 	}
    581 	q = z.make(m + 1)
    582 
    583 	qhatvp := getNat(n + 1)
    584 	qhatv := *qhatvp
    585 	if alias(u, uIn) || alias(u, v) {
    586 		u = nil // u is an alias for uIn or v - cannot reuse
    587 	}
    588 	u = u.make(len(uIn) + 1)
    589 	u.clear() // TODO(gri) no need to clear if we allocated a new u
    590 
    591 	// D1.
    592 	var v1p *nat
    593 	shift := nlz(v[n-1])
    594 	if shift > 0 {
    595 		// do not modify v, it may be used by another goroutine simultaneously
    596 		v1p = getNat(n)
    597 		v1 := *v1p
    598 		shlVU(v1, v, shift)
    599 		v = v1
    600 	}
    601 	u[len(uIn)] = shlVU(u[0:len(uIn)], uIn, shift)
    602 
    603 	// D2.
    604 	vn1 := v[n-1]
    605 	for j := m; j >= 0; j-- {
    606 		// D3.
    607 		qhat := Word(_M)
    608 		if ujn := u[j+n]; ujn != vn1 {
    609 			var rhat Word
    610 			qhat, rhat = divWW(ujn, u[j+n-1], vn1)
    611 
    612 			// x1 | x2 = qv_{n-2}
    613 			vn2 := v[n-2]
    614 			x1, x2 := mulWW(qhat, vn2)
    615 			// test if qv_{n-2} > br + u_{j+n-2}
    616 			ujn2 := u[j+n-2]
    617 			for greaterThan(x1, x2, rhat, ujn2) {
    618 				qhat--
    619 				prevRhat := rhat
    620 				rhat += vn1
    621 				// v[n-1] >= 0, so this tests for overflow.
    622 				if rhat < prevRhat {
    623 					break
    624 				}
    625 				x1, x2 = mulWW(qhat, vn2)
    626 			}
    627 		}
    628 
    629 		// D4.
    630 		qhatv[n] = mulAddVWW(qhatv[0:n], v, qhat, 0)
    631 
    632 		c := subVV(u[j:j+len(qhatv)], u[j:], qhatv)
    633 		if c != 0 {
    634 			c := addVV(u[j:j+n], u[j:], v)
    635 			u[j+n] += c
    636 			qhat--
    637 		}
    638 
    639 		q[j] = qhat
    640 	}
    641 	if v1p != nil {
    642 		putNat(v1p)
    643 	}
    644 	putNat(qhatvp)
    645 
    646 	q = q.norm()
    647 	shrVU(u, u, shift)
    648 	r = u.norm()
    649 
    650 	return q, r
    651 }
    652 
    653 // Length of x in bits. x must be normalized.
    654 func (x nat) bitLen() int {
    655 	if i := len(x) - 1; i >= 0 {
    656 		return i*_W + bitLen(x[i])
    657 	}
    658 	return 0
    659 }
    660 
    661 const deBruijn32 = 0x077CB531
    662 
    663 var deBruijn32Lookup = [...]byte{
    664 	0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    665 	31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9,
    666 }
    667 
    668 const deBruijn64 = 0x03f79d71b4ca8b09
    669 
    670 var deBruijn64Lookup = [...]byte{
    671 	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
    672 	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
    673 	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
    674 	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
    675 }
    676 
    677 // trailingZeroBits returns the number of consecutive least significant zero
    678 // bits of x.
    679 func trailingZeroBits(x Word) uint {
    680 	// x & -x leaves only the right-most bit set in the word. Let k be the
    681 	// index of that bit. Since only a single bit is set, the value is two
    682 	// to the power of k. Multiplying by a power of two is equivalent to
    683 	// left shifting, in this case by k bits. The de Bruijn constant is
    684 	// such that all six bit, consecutive substrings are distinct.
    685 	// Therefore, if we have a left shifted version of this constant we can
    686 	// find by how many bits it was shifted by looking at which six bit
    687 	// substring ended up at the top of the word.
    688 	// (Knuth, volume 4, section 7.3.1)
    689 	switch _W {
    690 	case 32:
    691 		return uint(deBruijn32Lookup[((x&-x)*deBruijn32)>>27])
    692 	case 64:
    693 		return uint(deBruijn64Lookup[((x&-x)*(deBruijn64&_M))>>58])
    694 	default:
    695 		panic("unknown word size")
    696 	}
    697 }
    698 
    699 // trailingZeroBits returns the number of consecutive least significant zero
    700 // bits of x.
    701 func (x nat) trailingZeroBits() uint {
    702 	if len(x) == 0 {
    703 		return 0
    704 	}
    705 	var i uint
    706 	for x[i] == 0 {
    707 		i++
    708 	}
    709 	// x[i] != 0
    710 	return i*_W + trailingZeroBits(x[i])
    711 }
    712 
    713 // z = x << s
    714 func (z nat) shl(x nat, s uint) nat {
    715 	m := len(x)
    716 	if m == 0 {
    717 		return z[:0]
    718 	}
    719 	// m > 0
    720 
    721 	n := m + int(s/_W)
    722 	z = z.make(n + 1)
    723 	z[n] = shlVU(z[n-m:n], x, s%_W)
    724 	z[0 : n-m].clear()
    725 
    726 	return z.norm()
    727 }
    728 
    729 // z = x >> s
    730 func (z nat) shr(x nat, s uint) nat {
    731 	m := len(x)
    732 	n := m - int(s/_W)
    733 	if n <= 0 {
    734 		return z[:0]
    735 	}
    736 	// n > 0
    737 
    738 	z = z.make(n)
    739 	shrVU(z, x[m-n:], s%_W)
    740 
    741 	return z.norm()
    742 }
    743 
    744 func (z nat) setBit(x nat, i uint, b uint) nat {
    745 	j := int(i / _W)
    746 	m := Word(1) << (i % _W)
    747 	n := len(x)
    748 	switch b {
    749 	case 0:
    750 		z = z.make(n)
    751 		copy(z, x)
    752 		if j >= n {
    753 			// no need to grow
    754 			return z
    755 		}
    756 		z[j] &^= m
    757 		return z.norm()
    758 	case 1:
    759 		if j >= n {
    760 			z = z.make(j + 1)
    761 			z[n:].clear()
    762 		} else {
    763 			z = z.make(n)
    764 		}
    765 		copy(z, x)
    766 		z[j] |= m
    767 		// no need to normalize
    768 		return z
    769 	}
    770 	panic("set bit is not 0 or 1")
    771 }
    772 
    773 // bit returns the value of the i'th bit, with lsb == bit 0.
    774 func (x nat) bit(i uint) uint {
    775 	j := i / _W
    776 	if j >= uint(len(x)) {
    777 		return 0
    778 	}
    779 	// 0 <= j < len(x)
    780 	return uint(x[j] >> (i % _W) & 1)
    781 }
    782 
    783 // sticky returns 1 if there's a 1 bit within the
    784 // i least significant bits, otherwise it returns 0.
    785 func (x nat) sticky(i uint) uint {
    786 	j := i / _W
    787 	if j >= uint(len(x)) {
    788 		if len(x) == 0 {
    789 			return 0
    790 		}
    791 		return 1
    792 	}
    793 	// 0 <= j < len(x)
    794 	for _, x := range x[:j] {
    795 		if x != 0 {
    796 			return 1
    797 		}
    798 	}
    799 	if x[j]<<(_W-i%_W) != 0 {
    800 		return 1
    801 	}
    802 	return 0
    803 }
    804 
    805 func (z nat) and(x, y nat) nat {
    806 	m := len(x)
    807 	n := len(y)
    808 	if m > n {
    809 		m = n
    810 	}
    811 	// m <= n
    812 
    813 	z = z.make(m)
    814 	for i := 0; i < m; i++ {
    815 		z[i] = x[i] & y[i]
    816 	}
    817 
    818 	return z.norm()
    819 }
    820 
    821 func (z nat) andNot(x, y nat) nat {
    822 	m := len(x)
    823 	n := len(y)
    824 	if n > m {
    825 		n = m
    826 	}
    827 	// m >= n
    828 
    829 	z = z.make(m)
    830 	for i := 0; i < n; i++ {
    831 		z[i] = x[i] &^ y[i]
    832 	}
    833 	copy(z[n:m], x[n:m])
    834 
    835 	return z.norm()
    836 }
    837 
    838 func (z nat) or(x, y nat) nat {
    839 	m := len(x)
    840 	n := len(y)
    841 	s := x
    842 	if m < n {
    843 		n, m = m, n
    844 		s = y
    845 	}
    846 	// m >= n
    847 
    848 	z = z.make(m)
    849 	for i := 0; i < n; i++ {
    850 		z[i] = x[i] | y[i]
    851 	}
    852 	copy(z[n:m], s[n:m])
    853 
    854 	return z.norm()
    855 }
    856 
    857 func (z nat) xor(x, y nat) nat {
    858 	m := len(x)
    859 	n := len(y)
    860 	s := x
    861 	if m < n {
    862 		n, m = m, n
    863 		s = y
    864 	}
    865 	// m >= n
    866 
    867 	z = z.make(m)
    868 	for i := 0; i < n; i++ {
    869 		z[i] = x[i] ^ y[i]
    870 	}
    871 	copy(z[n:m], s[n:m])
    872 
    873 	return z.norm()
    874 }
    875 
    876 // greaterThan reports whether (x1<<_W + x2) > (y1<<_W + y2)
    877 func greaterThan(x1, x2, y1, y2 Word) bool {
    878 	return x1 > y1 || x1 == y1 && x2 > y2
    879 }
    880 
    881 // modW returns x % d.
    882 func (x nat) modW(d Word) (r Word) {
    883 	// TODO(agl): we don't actually need to store the q value.
    884 	var q nat
    885 	q = q.make(len(x))
    886 	return divWVW(q, 0, x, d)
    887 }
    888 
    889 // random creates a random integer in [0..limit), using the space in z if
    890 // possible. n is the bit length of limit.
    891 func (z nat) random(rand *rand.Rand, limit nat, n int) nat {
    892 	if alias(z, limit) {
    893 		z = nil // z is an alias for limit - cannot reuse
    894 	}
    895 	z = z.make(len(limit))
    896 
    897 	bitLengthOfMSW := uint(n % _W)
    898 	if bitLengthOfMSW == 0 {
    899 		bitLengthOfMSW = _W
    900 	}
    901 	mask := Word((1 << bitLengthOfMSW) - 1)
    902 
    903 	for {
    904 		switch _W {
    905 		case 32:
    906 			for i := range z {
    907 				z[i] = Word(rand.Uint32())
    908 			}
    909 		case 64:
    910 			for i := range z {
    911 				z[i] = Word(rand.Uint32()) | Word(rand.Uint32())<<32
    912 			}
    913 		default:
    914 			panic("unknown word size")
    915 		}
    916 		z[len(limit)-1] &= mask
    917 		if z.cmp(limit) < 0 {
    918 			break
    919 		}
    920 	}
    921 
    922 	return z.norm()
    923 }
    924 
    925 // If m != 0 (i.e., len(m) != 0), expNN sets z to x**y mod m;
    926 // otherwise it sets z to x**y. The result is the value of z.
    927 func (z nat) expNN(x, y, m nat) nat {
    928 	if alias(z, x) || alias(z, y) {
    929 		// We cannot allow in-place modification of x or y.
    930 		z = nil
    931 	}
    932 
    933 	// x**y mod 1 == 0
    934 	if len(m) == 1 && m[0] == 1 {
    935 		return z.setWord(0)
    936 	}
    937 	// m == 0 || m > 1
    938 
    939 	// x**0 == 1
    940 	if len(y) == 0 {
    941 		return z.setWord(1)
    942 	}
    943 	// y > 0
    944 
    945 	// x**1 mod m == x mod m
    946 	if len(y) == 1 && y[0] == 1 && len(m) != 0 {
    947 		_, z = z.div(z, x, m)
    948 		return z
    949 	}
    950 	// y > 1
    951 
    952 	if len(m) != 0 {
    953 		// We likely end up being as long as the modulus.
    954 		z = z.make(len(m))
    955 	}
    956 	z = z.set(x)
    957 
    958 	// If the base is non-trivial and the exponent is large, we use
    959 	// 4-bit, windowed exponentiation. This involves precomputing 14 values
    960 	// (x^2...x^15) but then reduces the number of multiply-reduces by a
    961 	// third. Even for a 32-bit exponent, this reduces the number of
    962 	// operations. Uses Montgomery method for odd moduli.
    963 	if x.cmp(natOne) > 0 && len(y) > 1 && len(m) > 0 {
    964 		if m[0]&1 == 1 {
    965 			return z.expNNMontgomery(x, y, m)
    966 		}
    967 		return z.expNNWindowed(x, y, m)
    968 	}
    969 
    970 	v := y[len(y)-1] // v > 0 because y is normalized and y > 0
    971 	shift := nlz(v) + 1
    972 	v <<= shift
    973 	var q nat
    974 
    975 	const mask = 1 << (_W - 1)
    976 
    977 	// We walk through the bits of the exponent one by one. Each time we
    978 	// see a bit, we square, thus doubling the power. If the bit is a one,
    979 	// we also multiply by x, thus adding one to the power.
    980 
    981 	w := _W - int(shift)
    982 	// zz and r are used to avoid allocating in mul and div as
    983 	// otherwise the arguments would alias.
    984 	var zz, r nat
    985 	for j := 0; j < w; j++ {
    986 		zz = zz.mul(z, z)
    987 		zz, z = z, zz
    988 
    989 		if v&mask != 0 {
    990 			zz = zz.mul(z, x)
    991 			zz, z = z, zz
    992 		}
    993 
    994 		if len(m) != 0 {
    995 			zz, r = zz.div(r, z, m)
    996 			zz, r, q, z = q, z, zz, r
    997 		}
    998 
    999 		v <<= 1
   1000 	}
   1001 
   1002 	for i := len(y) - 2; i >= 0; i-- {
   1003 		v = y[i]
   1004 
   1005 		for j := 0; j < _W; j++ {
   1006 			zz = zz.mul(z, z)
   1007 			zz, z = z, zz
   1008 
   1009 			if v&mask != 0 {
   1010 				zz = zz.mul(z, x)
   1011 				zz, z = z, zz
   1012 			}
   1013 
   1014 			if len(m) != 0 {
   1015 				zz, r = zz.div(r, z, m)
   1016 				zz, r, q, z = q, z, zz, r
   1017 			}
   1018 
   1019 			v <<= 1
   1020 		}
   1021 	}
   1022 
   1023 	return z.norm()
   1024 }
   1025 
   1026 // expNNWindowed calculates x**y mod m using a fixed, 4-bit window.
   1027 func (z nat) expNNWindowed(x, y, m nat) nat {
   1028 	// zz and r are used to avoid allocating in mul and div as otherwise
   1029 	// the arguments would alias.
   1030 	var zz, r nat
   1031 
   1032 	const n = 4
   1033 	// powers[i] contains x^i.
   1034 	var powers [1 << n]nat
   1035 	powers[0] = natOne
   1036 	powers[1] = x
   1037 	for i := 2; i < 1<<n; i += 2 {
   1038 		p2, p, p1 := &powers[i/2], &powers[i], &powers[i+1]
   1039 		*p = p.mul(*p2, *p2)
   1040 		zz, r = zz.div(r, *p, m)
   1041 		*p, r = r, *p
   1042 		*p1 = p1.mul(*p, x)
   1043 		zz, r = zz.div(r, *p1, m)
   1044 		*p1, r = r, *p1
   1045 	}
   1046 
   1047 	z = z.setWord(1)
   1048 
   1049 	for i := len(y) - 1; i >= 0; i-- {
   1050 		yi := y[i]
   1051 		for j := 0; j < _W; j += n {
   1052 			if i != len(y)-1 || j != 0 {
   1053 				// Unrolled loop for significant performance
   1054 				// gain. Use go test -bench=".*" in crypto/rsa
   1055 				// to check performance before making changes.
   1056 				zz = zz.mul(z, z)
   1057 				zz, z = z, zz
   1058 				zz, r = zz.div(r, z, m)
   1059 				z, r = r, z
   1060 
   1061 				zz = zz.mul(z, z)
   1062 				zz, z = z, zz
   1063 				zz, r = zz.div(r, z, m)
   1064 				z, r = r, z
   1065 
   1066 				zz = zz.mul(z, z)
   1067 				zz, z = z, zz
   1068 				zz, r = zz.div(r, z, m)
   1069 				z, r = r, z
   1070 
   1071 				zz = zz.mul(z, z)
   1072 				zz, z = z, zz
   1073 				zz, r = zz.div(r, z, m)
   1074 				z, r = r, z
   1075 			}
   1076 
   1077 			zz = zz.mul(z, powers[yi>>(_W-n)])
   1078 			zz, z = z, zz
   1079 			zz, r = zz.div(r, z, m)
   1080 			z, r = r, z
   1081 
   1082 			yi <<= n
   1083 		}
   1084 	}
   1085 
   1086 	return z.norm()
   1087 }
   1088 
   1089 // expNNMontgomery calculates x**y mod m using a fixed, 4-bit window.
   1090 // Uses Montgomery representation.
   1091 func (z nat) expNNMontgomery(x, y, m nat) nat {
   1092 	numWords := len(m)
   1093 
   1094 	// We want the lengths of x and m to be equal.
   1095 	// It is OK if x >= m as long as len(x) == len(m).
   1096 	if len(x) > numWords {
   1097 		_, x = nat(nil).div(nil, x, m)
   1098 		// Note: now len(x) <= numWords, not guaranteed ==.
   1099 	}
   1100 	if len(x) < numWords {
   1101 		rr := make(nat, numWords)
   1102 		copy(rr, x)
   1103 		x = rr
   1104 	}
   1105 
   1106 	// Ideally the precomputations would be performed outside, and reused
   1107 	// k0 = -m**-1 mod 2**_W. Algorithm from: Dumas, J.G. "On NewtonRaphson
   1108 	// Iteration for Multiplicative Inverses Modulo Prime Powers".
   1109 	k0 := 2 - m[0]
   1110 	t := m[0] - 1
   1111 	for i := 1; i < _W; i <<= 1 {
   1112 		t *= t
   1113 		k0 *= (t + 1)
   1114 	}
   1115 	k0 = -k0
   1116 
   1117 	// RR = 2**(2*_W*len(m)) mod m
   1118 	RR := nat(nil).setWord(1)
   1119 	zz := nat(nil).shl(RR, uint(2*numWords*_W))
   1120 	_, RR = RR.div(RR, zz, m)
   1121 	if len(RR) < numWords {
   1122 		zz = zz.make(numWords)
   1123 		copy(zz, RR)
   1124 		RR = zz
   1125 	}
   1126 	// one = 1, with equal length to that of m
   1127 	one := make(nat, numWords)
   1128 	one[0] = 1
   1129 
   1130 	const n = 4
   1131 	// powers[i] contains x^i
   1132 	var powers [1 << n]nat
   1133 	powers[0] = powers[0].montgomery(one, RR, m, k0, numWords)
   1134 	powers[1] = powers[1].montgomery(x, RR, m, k0, numWords)
   1135 	for i := 2; i < 1<<n; i++ {
   1136 		powers[i] = powers[i].montgomery(powers[i-1], powers[1], m, k0, numWords)
   1137 	}
   1138 
   1139 	// initialize z = 1 (Montgomery 1)
   1140 	z = z.make(numWords)
   1141 	copy(z, powers[0])
   1142 
   1143 	zz = zz.make(numWords)
   1144 
   1145 	// same windowed exponent, but with Montgomery multiplications
   1146 	for i := len(y) - 1; i >= 0; i-- {
   1147 		yi := y[i]
   1148 		for j := 0; j < _W; j += n {
   1149 			if i != len(y)-1 || j != 0 {
   1150 				zz = zz.montgomery(z, z, m, k0, numWords)
   1151 				z = z.montgomery(zz, zz, m, k0, numWords)
   1152 				zz = zz.montgomery(z, z, m, k0, numWords)
   1153 				z = z.montgomery(zz, zz, m, k0, numWords)
   1154 			}
   1155 			zz = zz.montgomery(z, powers[yi>>(_W-n)], m, k0, numWords)
   1156 			z, zz = zz, z
   1157 			yi <<= n
   1158 		}
   1159 	}
   1160 	// convert to regular number
   1161 	zz = zz.montgomery(z, one, m, k0, numWords)
   1162 
   1163 	// One last reduction, just in case.
   1164 	// See golang.org/issue/13907.
   1165 	if zz.cmp(m) >= 0 {
   1166 		// Common case is m has high bit set; in that case,
   1167 		// since zz is the same length as m, there can be just
   1168 		// one multiple of m to remove. Just subtract.
   1169 		// We think that the subtract should be sufficient in general,
   1170 		// so do that unconditionally, but double-check,
   1171 		// in case our beliefs are wrong.
   1172 		// The div is not expected to be reached.
   1173 		zz = zz.sub(zz, m)
   1174 		if zz.cmp(m) >= 0 {
   1175 			_, zz = nat(nil).div(nil, zz, m)
   1176 		}
   1177 	}
   1178 
   1179 	return zz.norm()
   1180 }
   1181 
   1182 // bytes writes the value of z into buf using big-endian encoding.
   1183 // len(buf) must be >= len(z)*_S. The value of z is encoded in the
   1184 // slice buf[i:]. The number i of unused bytes at the beginning of
   1185 // buf is returned as result.
   1186 func (z nat) bytes(buf []byte) (i int) {
   1187 	i = len(buf)
   1188 	for _, d := range z {
   1189 		for j := 0; j < _S; j++ {
   1190 			i--
   1191 			buf[i] = byte(d)
   1192 			d >>= 8
   1193 		}
   1194 	}
   1195 
   1196 	for i < len(buf) && buf[i] == 0 {
   1197 		i++
   1198 	}
   1199 
   1200 	return
   1201 }
   1202 
   1203 // setBytes interprets buf as the bytes of a big-endian unsigned
   1204 // integer, sets z to that value, and returns z.
   1205 func (z nat) setBytes(buf []byte) nat {
   1206 	z = z.make((len(buf) + _S - 1) / _S)
   1207 
   1208 	k := 0
   1209 	s := uint(0)
   1210 	var d Word
   1211 	for i := len(buf); i > 0; i-- {
   1212 		d |= Word(buf[i-1]) << s
   1213 		if s += 8; s == _S*8 {
   1214 			z[k] = d
   1215 			k++
   1216 			s = 0
   1217 			d = 0
   1218 		}
   1219 	}
   1220 	if k < len(z) {
   1221 		z[k] = d
   1222 	}
   1223 
   1224 	return z.norm()
   1225 }
   1226 
   1227 // sqrt sets z = x
   1228 func (z nat) sqrt(x nat) nat {
   1229 	if x.cmp(natOne) <= 0 {
   1230 		return z.set(x)
   1231 	}
   1232 	if alias(z, x) {
   1233 		z = nil
   1234 	}
   1235 
   1236 	// Start with value known to be too large and repeat "z = (z + x/z)/2" until it stops getting smaller.
   1237 	// See Brent and Zimmermann, Modern Computer Arithmetic, Algorithm 1.13 (SqrtInt).
   1238 	// https://members.loria.fr/PZimmermann/mca/pub226.html
   1239 	// If x is one less than a perfect square, the sequence oscillates between the correct z and z+1;
   1240 	// otherwise it converges to the correct z and stays there.
   1241 	var z1, z2 nat
   1242 	z1 = z
   1243 	z1 = z1.setUint64(1)
   1244 	z1 = z1.shl(z1, uint(x.bitLen()/2+1)) // must be  x
   1245 	for n := 0; ; n++ {
   1246 		z2, _ = z2.div(nil, x, z1)
   1247 		z2 = z2.add(z2, z1)
   1248 		z2 = z2.shr(z2, 1)
   1249 		if z2.cmp(z1) >= 0 {
   1250 			// z1 is answer.
   1251 			// Figure out whether z1 or z2 is currently aliased to z by looking at loop count.
   1252 			if n&1 == 0 {
   1253 				return z1
   1254 			}
   1255 			return z.set(z1)
   1256 		}
   1257 		z1, z2 = z2, z1
   1258 	}
   1259 }
   1260