Home | History | Annotate | Download | only in elliptic
      1 // Copyright 2013 The Go Authors.  All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package elliptic
      6 
      7 // This file contains a constant-time, 32-bit implementation of P256.
      8 
      9 import (
     10 	"math/big"
     11 )
     12 
     13 type p256Curve struct {
     14 	*CurveParams
     15 }
     16 
     17 var (
     18 	p256 p256Curve
     19 	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
     20 	// (2**257).
     21 	p256RInverse *big.Int
     22 )
     23 
     24 func initP256() {
     25 	// See FIPS 186-3, section D.2.3
     26 	p256.CurveParams = &CurveParams{Name: "P-256"}
     27 	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
     28 	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
     29 	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
     30 	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
     31 	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
     32 	p256.BitSize = 256
     33 
     34 	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
     35 }
     36 
     37 func (curve p256Curve) Params() *CurveParams {
     38 	return curve.CurveParams
     39 }
     40 
     41 // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
     42 // to out. If the scalar is equal or greater than the order of the group, it's
     43 // reduced modulo that order.
     44 func p256GetScalar(out *[32]byte, in []byte) {
     45 	n := new(big.Int).SetBytes(in)
     46 	var scalarBytes []byte
     47 
     48 	if n.Cmp(p256.N) >= 0 {
     49 		n.Mod(n, p256.N)
     50 		scalarBytes = n.Bytes()
     51 	} else {
     52 		scalarBytes = in
     53 	}
     54 
     55 	for i, v := range scalarBytes {
     56 		out[len(scalarBytes)-(1+i)] = v
     57 	}
     58 }
     59 
     60 func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
     61 	var scalarReversed [32]byte
     62 	p256GetScalar(&scalarReversed, scalar)
     63 
     64 	var x1, y1, z1 [p256Limbs]uint32
     65 	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
     66 	return p256ToAffine(&x1, &y1, &z1)
     67 }
     68 
     69 func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
     70 	var scalarReversed [32]byte
     71 	p256GetScalar(&scalarReversed, scalar)
     72 
     73 	var px, py, x1, y1, z1 [p256Limbs]uint32
     74 	p256FromBig(&px, bigX)
     75 	p256FromBig(&py, bigY)
     76 	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
     77 	return p256ToAffine(&x1, &y1, &z1)
     78 }
     79 
     80 // Field elements are represented as nine, unsigned 32-bit words.
     81 //
     82 // The value of an field element is:
     83 //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
     84 //
     85 // That is, each limb is alternately 29 or 28-bits wide in little-endian
     86 // order.
     87 //
     88 // This means that a field element hits 2**257, rather than 2**256 as we would
     89 // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
     90 // problems when multiplying as terms end up one bit short of a limb which
     91 // would require much bit-shifting to correct.
     92 //
     93 // Finally, the values stored in a field element are in Montgomery form. So the
     94 // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
     95 // 2**257.
     96 
     97 const (
     98 	p256Limbs    = 9
     99 	bottom29Bits = 0x1fffffff
    100 )
    101 
    102 var (
    103 	// p256One is the number 1 as a field element.
    104 	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
    105 	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
    106 	// p256P is the prime modulus as a field element.
    107 	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
    108 	// p2562P is the twice prime modulus as a field element.
    109 	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
    110 )
    111 
    112 // p256Precomputed contains precomputed values to aid the calculation of scalar
    113 // multiples of the base point, G. It's actually two, equal length, tables
    114 // concatenated.
    115 //
    116 // The first table contains (x,y) field element pairs for 16 multiples of the
    117 // base point, G.
    118 //
    119 //   Index  |  Index (binary) | Value
    120 //       0  |           0000  | 0G (all zeros, omitted)
    121 //       1  |           0001  | G
    122 //       2  |           0010  | 2**64G
    123 //       3  |           0011  | 2**64G + G
    124 //       4  |           0100  | 2**128G
    125 //       5  |           0101  | 2**128G + G
    126 //       6  |           0110  | 2**128G + 2**64G
    127 //       7  |           0111  | 2**128G + 2**64G + G
    128 //       8  |           1000  | 2**192G
    129 //       9  |           1001  | 2**192G + G
    130 //      10  |           1010  | 2**192G + 2**64G
    131 //      11  |           1011  | 2**192G + 2**64G + G
    132 //      12  |           1100  | 2**192G + 2**128G
    133 //      13  |           1101  | 2**192G + 2**128G + G
    134 //      14  |           1110  | 2**192G + 2**128G + 2**64G
    135 //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
    136 //
    137 // The second table follows the same style, but the terms are 2**32G,
    138 // 2**96G, 2**160G, 2**224G.
    139 //
    140 // This is ~2KB of data.
    141 var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
    142 	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
    143 	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
    144 	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
    145 	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
    146 	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
    147 	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
    148 	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
    149 	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
    150 	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
    151 	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
    152 	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
    153 	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
    154 	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
    155 	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
    156 	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
    157 	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
    158 	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
    159 	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
    160 	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
    161 	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
    162 	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
    163 	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
    164 	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
    165 	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
    166 	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
    167 	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
    168 	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
    169 	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
    170 	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
    171 	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
    172 	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
    173 	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
    174 	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
    175 	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
    176 	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
    177 	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
    178 	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
    179 	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
    180 	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
    181 	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
    182 	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
    183 	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
    184 	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
    185 	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
    186 	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
    187 	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
    188 	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
    189 	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
    190 	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
    191 	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
    192 	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
    193 	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
    194 	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
    195 	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
    196 	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
    197 	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
    198 	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
    199 	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
    200 	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
    201 	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
    202 }
    203 
    204 // Field element operations:
    205 
    206 // nonZeroToAllOnes returns:
    207 //   0xffffffff for 0 < x <= 2**31
    208 //   0 for x == 0 or x > 2**31.
    209 func nonZeroToAllOnes(x uint32) uint32 {
    210 	return ((x - 1) >> 31) - 1
    211 }
    212 
    213 // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
    214 // which is a term at 2**257.
    215 //
    216 // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
    217 // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
    218 func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
    219 	carry_mask := nonZeroToAllOnes(carry)
    220 
    221 	inout[0] += carry << 1
    222 	inout[3] += 0x10000000 & carry_mask
    223 	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
    224 	// previous line therefore this doesn't underflow.
    225 	inout[3] -= carry << 11
    226 	inout[4] += (0x20000000 - 1) & carry_mask
    227 	inout[5] += (0x10000000 - 1) & carry_mask
    228 	inout[6] += (0x20000000 - 1) & carry_mask
    229 	inout[6] -= carry << 22
    230 	// This may underflow if carry is non-zero but, if so, we'll fix it in the
    231 	// next line.
    232 	inout[7] -= 1 & carry_mask
    233 	inout[7] += carry << 25
    234 }
    235 
    236 // p256Sum sets out = in+in2.
    237 //
    238 // On entry, in[i]+in2[i] must not overflow a 32-bit word.
    239 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
    240 func p256Sum(out, in, in2 *[p256Limbs]uint32) {
    241 	carry := uint32(0)
    242 	for i := 0; ; i++ {
    243 		out[i] = in[i] + in2[i]
    244 		out[i] += carry
    245 		carry = out[i] >> 29
    246 		out[i] &= bottom29Bits
    247 
    248 		i++
    249 		if i == p256Limbs {
    250 			break
    251 		}
    252 
    253 		out[i] = in[i] + in2[i]
    254 		out[i] += carry
    255 		carry = out[i] >> 28
    256 		out[i] &= bottom28Bits
    257 	}
    258 
    259 	p256ReduceCarry(out, carry)
    260 }
    261 
    262 const (
    263 	two30m2    = 1<<30 - 1<<2
    264 	two30p13m2 = 1<<30 + 1<<13 - 1<<2
    265 	two31m2    = 1<<31 - 1<<2
    266 	two31p24m2 = 1<<31 + 1<<24 - 1<<2
    267 	two30m27m2 = 1<<30 - 1<<27 - 1<<2
    268 )
    269 
    270 // p256Zero31 is 0 mod p.
    271 var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
    272 
    273 // p256Diff sets out = in-in2.
    274 //
    275 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
    276 //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
    277 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    278 func p256Diff(out, in, in2 *[p256Limbs]uint32) {
    279 	var carry uint32
    280 
    281 	for i := 0; ; i++ {
    282 		out[i] = in[i] - in2[i]
    283 		out[i] += p256Zero31[i]
    284 		out[i] += carry
    285 		carry = out[i] >> 29
    286 		out[i] &= bottom29Bits
    287 
    288 		i++
    289 		if i == p256Limbs {
    290 			break
    291 		}
    292 
    293 		out[i] = in[i] - in2[i]
    294 		out[i] += p256Zero31[i]
    295 		out[i] += carry
    296 		carry = out[i] >> 28
    297 		out[i] &= bottom28Bits
    298 	}
    299 
    300 	p256ReduceCarry(out, carry)
    301 }
    302 
    303 // p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
    304 // the same 29,28,... bit positions as an field element.
    305 //
    306 // The values in field elements are in Montgomery form: x*R mod p where R =
    307 // 2**257. Since we just multiplied two Montgomery values together, the result
    308 // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
    309 // in Montgomery form.
    310 //
    311 // On entry: tmp[i] < 2**64
    312 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
    313 func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
    314 	// The following table may be helpful when reading this code:
    315 	//
    316 	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
    317 	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
    318 	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
    319 	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
    320 	var tmp2 [18]uint32
    321 	var carry, x, xMask uint32
    322 
    323 	// tmp contains 64-bit words with the same 29,28,29-bit positions as an
    324 	// field element. So the top of an element of tmp might overlap with
    325 	// another element two positions down. The following loop eliminates
    326 	// this overlap.
    327 	tmp2[0] = uint32(tmp[0]) & bottom29Bits
    328 
    329 	tmp2[1] = uint32(tmp[0]) >> 29
    330 	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
    331 	tmp2[1] += uint32(tmp[1]) & bottom28Bits
    332 	carry = tmp2[1] >> 28
    333 	tmp2[1] &= bottom28Bits
    334 
    335 	for i := 2; i < 17; i++ {
    336 		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
    337 		tmp2[i] += (uint32(tmp[i-1])) >> 28
    338 		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
    339 		tmp2[i] += uint32(tmp[i]) & bottom29Bits
    340 		tmp2[i] += carry
    341 		carry = tmp2[i] >> 29
    342 		tmp2[i] &= bottom29Bits
    343 
    344 		i++
    345 		if i == 17 {
    346 			break
    347 		}
    348 		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
    349 		tmp2[i] += uint32(tmp[i-1]) >> 29
    350 		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
    351 		tmp2[i] += uint32(tmp[i]) & bottom28Bits
    352 		tmp2[i] += carry
    353 		carry = tmp2[i] >> 28
    354 		tmp2[i] &= bottom28Bits
    355 	}
    356 
    357 	tmp2[17] = uint32(tmp[15]>>32) >> 25
    358 	tmp2[17] += uint32(tmp[16]) >> 29
    359 	tmp2[17] += uint32(tmp[16]>>32) << 3
    360 	tmp2[17] += carry
    361 
    362 	// Montgomery elimination of terms:
    363 	//
    364 	// Since R is 2**257, we can divide by R with a bitwise shift if we can
    365 	// ensure that the right-most 257 bits are all zero. We can make that true
    366 	// by adding multiplies of p without affecting the value.
    367 	//
    368 	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
    369 	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
    370 	// We can do that for 8 further limbs and then right shift to eliminate the
    371 	// extra factor of R.
    372 	for i := 0; ; i += 2 {
    373 		tmp2[i+1] += tmp2[i] >> 29
    374 		x = tmp2[i] & bottom29Bits
    375 		xMask = nonZeroToAllOnes(x)
    376 		tmp2[i] = 0
    377 
    378 		// The bounds calculations for this loop are tricky. Each iteration of
    379 		// the loop eliminates two words by adding values to words to their
    380 		// right.
    381 		//
    382 		// The following table contains the amounts added to each word (as an
    383 		// offset from the value of i at the top of the loop). The amounts are
    384 		// accounted for from the first and second half of the loop separately
    385 		// and are written as, for example, 28 to mean a value <2**28.
    386 		//
    387 		// Word:                   3   4   5   6   7   8   9   10
    388 		// Added in top half:     28  11      29  21  29  28
    389 		//                                        28  29
    390 		//                                            29
    391 		// Added in bottom half:      29  10      28  21  28   28
    392 		//                                            29
    393 		//
    394 		// The value that is currently offset 7 will be offset 5 for the next
    395 		// iteration and then offset 3 for the iteration after that. Therefore
    396 		// the total value added will be the values added at 7, 5 and 3.
    397 		//
    398 		// The following table accumulates these values. The sums at the bottom
    399 		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
    400 		//
    401 		// Word:                   3   4   5   6   7   8   9  10  11  12  13
    402 		//                        28  11  10  29  21  29  28  28  28  28  28
    403 		//                            29  28  11  28  29  28  29  28  29  28
    404 		//                                    29  28  21  21  29  21  29  21
    405 		//                                        10  29  28  21  28  21  28
    406 		//                                        28  29  28  29  28  29  28
    407 		//                                            11  10  29  10  29  10
    408 		//                                            29  28  11  28  11
    409 		//                                                    29      29
    410 		//                        --------------------------------------------
    411 		//                                                30+ 31+ 30+ 31+ 30+
    412 		//                                                28+ 29+ 28+ 29+ 21+
    413 		//                                                21+ 28+ 21+ 28+ 10
    414 		//                                                10  21+ 10  21+
    415 		//                                                    11      11
    416 		//
    417 		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
    418 		// tmp2[10/12] has an initial value of <2**29, then the maximum value
    419 		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
    420 		// as required.
    421 		tmp2[i+3] += (x << 10) & bottom28Bits
    422 		tmp2[i+4] += (x >> 18)
    423 
    424 		tmp2[i+6] += (x << 21) & bottom29Bits
    425 		tmp2[i+7] += x >> 8
    426 
    427 		// At position 200, which is the starting bit position for word 7, we
    428 		// have a factor of 0xf000000 = 2**28 - 2**24.
    429 		tmp2[i+7] += 0x10000000 & xMask
    430 		tmp2[i+8] += (x - 1) & xMask
    431 		tmp2[i+7] -= (x << 24) & bottom28Bits
    432 		tmp2[i+8] -= x >> 4
    433 
    434 		tmp2[i+8] += 0x20000000 & xMask
    435 		tmp2[i+8] -= x
    436 		tmp2[i+8] += (x << 28) & bottom29Bits
    437 		tmp2[i+9] += ((x >> 1) - 1) & xMask
    438 
    439 		if i+1 == p256Limbs {
    440 			break
    441 		}
    442 		tmp2[i+2] += tmp2[i+1] >> 28
    443 		x = tmp2[i+1] & bottom28Bits
    444 		xMask = nonZeroToAllOnes(x)
    445 		tmp2[i+1] = 0
    446 
    447 		tmp2[i+4] += (x << 11) & bottom29Bits
    448 		tmp2[i+5] += (x >> 18)
    449 
    450 		tmp2[i+7] += (x << 21) & bottom28Bits
    451 		tmp2[i+8] += x >> 7
    452 
    453 		// At position 199, which is the starting bit of the 8th word when
    454 		// dealing with a context starting on an odd word, we have a factor of
    455 		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
    456 		// word from i+1 is i+8.
    457 		tmp2[i+8] += 0x20000000 & xMask
    458 		tmp2[i+9] += (x - 1) & xMask
    459 		tmp2[i+8] -= (x << 25) & bottom29Bits
    460 		tmp2[i+9] -= x >> 4
    461 
    462 		tmp2[i+9] += 0x10000000 & xMask
    463 		tmp2[i+9] -= x
    464 		tmp2[i+10] += (x - 1) & xMask
    465 	}
    466 
    467 	// We merge the right shift with a carry chain. The words above 2**257 have
    468 	// widths of 28,29,... which we need to correct when copying them down.
    469 	carry = 0
    470 	for i := 0; i < 8; i++ {
    471 		// The maximum value of tmp2[i + 9] occurs on the first iteration and
    472 		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
    473 		// therefore safe.
    474 		out[i] = tmp2[i+9]
    475 		out[i] += carry
    476 		out[i] += (tmp2[i+10] << 28) & bottom29Bits
    477 		carry = out[i] >> 29
    478 		out[i] &= bottom29Bits
    479 
    480 		i++
    481 		out[i] = tmp2[i+9] >> 1
    482 		out[i] += carry
    483 		carry = out[i] >> 28
    484 		out[i] &= bottom28Bits
    485 	}
    486 
    487 	out[8] = tmp2[17]
    488 	out[8] += carry
    489 	carry = out[8] >> 29
    490 	out[8] &= bottom29Bits
    491 
    492 	p256ReduceCarry(out, carry)
    493 }
    494 
    495 // p256Square sets out=in*in.
    496 //
    497 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
    498 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    499 func p256Square(out, in *[p256Limbs]uint32) {
    500 	var tmp [17]uint64
    501 
    502 	tmp[0] = uint64(in[0]) * uint64(in[0])
    503 	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
    504 	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
    505 		uint64(in[1])*(uint64(in[1])<<1)
    506 	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
    507 		uint64(in[1])*(uint64(in[2])<<1)
    508 	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
    509 		uint64(in[1])*(uint64(in[3])<<2) +
    510 		uint64(in[2])*uint64(in[2])
    511 	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
    512 		uint64(in[1])*(uint64(in[4])<<1) +
    513 		uint64(in[2])*(uint64(in[3])<<1)
    514 	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
    515 		uint64(in[1])*(uint64(in[5])<<2) +
    516 		uint64(in[2])*(uint64(in[4])<<1) +
    517 		uint64(in[3])*(uint64(in[3])<<1)
    518 	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
    519 		uint64(in[1])*(uint64(in[6])<<1) +
    520 		uint64(in[2])*(uint64(in[5])<<1) +
    521 		uint64(in[3])*(uint64(in[4])<<1)
    522 	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
    523 	// which is < 2**64 as required.
    524 	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
    525 		uint64(in[1])*(uint64(in[7])<<2) +
    526 		uint64(in[2])*(uint64(in[6])<<1) +
    527 		uint64(in[3])*(uint64(in[5])<<2) +
    528 		uint64(in[4])*uint64(in[4])
    529 	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
    530 		uint64(in[2])*(uint64(in[7])<<1) +
    531 		uint64(in[3])*(uint64(in[6])<<1) +
    532 		uint64(in[4])*(uint64(in[5])<<1)
    533 	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
    534 		uint64(in[3])*(uint64(in[7])<<2) +
    535 		uint64(in[4])*(uint64(in[6])<<1) +
    536 		uint64(in[5])*(uint64(in[5])<<1)
    537 	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
    538 		uint64(in[4])*(uint64(in[7])<<1) +
    539 		uint64(in[5])*(uint64(in[6])<<1)
    540 	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
    541 		uint64(in[5])*(uint64(in[7])<<2) +
    542 		uint64(in[6])*uint64(in[6])
    543 	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
    544 		uint64(in[6])*(uint64(in[7])<<1)
    545 	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
    546 		uint64(in[7])*(uint64(in[7])<<1)
    547 	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
    548 	tmp[16] = uint64(in[8]) * uint64(in[8])
    549 
    550 	p256ReduceDegree(out, tmp)
    551 }
    552 
    553 // p256Mul sets out=in*in2.
    554 //
    555 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
    556 //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
    557 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    558 func p256Mul(out, in, in2 *[p256Limbs]uint32) {
    559 	var tmp [17]uint64
    560 
    561 	tmp[0] = uint64(in[0]) * uint64(in2[0])
    562 	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
    563 		uint64(in[1])*(uint64(in2[0])<<0)
    564 	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
    565 		uint64(in[1])*(uint64(in2[1])<<1) +
    566 		uint64(in[2])*(uint64(in2[0])<<0)
    567 	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
    568 		uint64(in[1])*(uint64(in2[2])<<0) +
    569 		uint64(in[2])*(uint64(in2[1])<<0) +
    570 		uint64(in[3])*(uint64(in2[0])<<0)
    571 	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
    572 		uint64(in[1])*(uint64(in2[3])<<1) +
    573 		uint64(in[2])*(uint64(in2[2])<<0) +
    574 		uint64(in[3])*(uint64(in2[1])<<1) +
    575 		uint64(in[4])*(uint64(in2[0])<<0)
    576 	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
    577 		uint64(in[1])*(uint64(in2[4])<<0) +
    578 		uint64(in[2])*(uint64(in2[3])<<0) +
    579 		uint64(in[3])*(uint64(in2[2])<<0) +
    580 		uint64(in[4])*(uint64(in2[1])<<0) +
    581 		uint64(in[5])*(uint64(in2[0])<<0)
    582 	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
    583 		uint64(in[1])*(uint64(in2[5])<<1) +
    584 		uint64(in[2])*(uint64(in2[4])<<0) +
    585 		uint64(in[3])*(uint64(in2[3])<<1) +
    586 		uint64(in[4])*(uint64(in2[2])<<0) +
    587 		uint64(in[5])*(uint64(in2[1])<<1) +
    588 		uint64(in[6])*(uint64(in2[0])<<0)
    589 	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
    590 		uint64(in[1])*(uint64(in2[6])<<0) +
    591 		uint64(in[2])*(uint64(in2[5])<<0) +
    592 		uint64(in[3])*(uint64(in2[4])<<0) +
    593 		uint64(in[4])*(uint64(in2[3])<<0) +
    594 		uint64(in[5])*(uint64(in2[2])<<0) +
    595 		uint64(in[6])*(uint64(in2[1])<<0) +
    596 		uint64(in[7])*(uint64(in2[0])<<0)
    597 	// tmp[8] has the greatest value but doesn't overflow. See logic in
    598 	// p256Square.
    599 	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
    600 		uint64(in[1])*(uint64(in2[7])<<1) +
    601 		uint64(in[2])*(uint64(in2[6])<<0) +
    602 		uint64(in[3])*(uint64(in2[5])<<1) +
    603 		uint64(in[4])*(uint64(in2[4])<<0) +
    604 		uint64(in[5])*(uint64(in2[3])<<1) +
    605 		uint64(in[6])*(uint64(in2[2])<<0) +
    606 		uint64(in[7])*(uint64(in2[1])<<1) +
    607 		uint64(in[8])*(uint64(in2[0])<<0)
    608 	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
    609 		uint64(in[2])*(uint64(in2[7])<<0) +
    610 		uint64(in[3])*(uint64(in2[6])<<0) +
    611 		uint64(in[4])*(uint64(in2[5])<<0) +
    612 		uint64(in[5])*(uint64(in2[4])<<0) +
    613 		uint64(in[6])*(uint64(in2[3])<<0) +
    614 		uint64(in[7])*(uint64(in2[2])<<0) +
    615 		uint64(in[8])*(uint64(in2[1])<<0)
    616 	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
    617 		uint64(in[3])*(uint64(in2[7])<<1) +
    618 		uint64(in[4])*(uint64(in2[6])<<0) +
    619 		uint64(in[5])*(uint64(in2[5])<<1) +
    620 		uint64(in[6])*(uint64(in2[4])<<0) +
    621 		uint64(in[7])*(uint64(in2[3])<<1) +
    622 		uint64(in[8])*(uint64(in2[2])<<0)
    623 	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
    624 		uint64(in[4])*(uint64(in2[7])<<0) +
    625 		uint64(in[5])*(uint64(in2[6])<<0) +
    626 		uint64(in[6])*(uint64(in2[5])<<0) +
    627 		uint64(in[7])*(uint64(in2[4])<<0) +
    628 		uint64(in[8])*(uint64(in2[3])<<0)
    629 	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
    630 		uint64(in[5])*(uint64(in2[7])<<1) +
    631 		uint64(in[6])*(uint64(in2[6])<<0) +
    632 		uint64(in[7])*(uint64(in2[5])<<1) +
    633 		uint64(in[8])*(uint64(in2[4])<<0)
    634 	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
    635 		uint64(in[6])*(uint64(in2[7])<<0) +
    636 		uint64(in[7])*(uint64(in2[6])<<0) +
    637 		uint64(in[8])*(uint64(in2[5])<<0)
    638 	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
    639 		uint64(in[7])*(uint64(in2[7])<<1) +
    640 		uint64(in[8])*(uint64(in2[6])<<0)
    641 	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
    642 		uint64(in[8])*(uint64(in2[7])<<0)
    643 	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
    644 
    645 	p256ReduceDegree(out, tmp)
    646 }
    647 
    648 func p256Assign(out, in *[p256Limbs]uint32) {
    649 	*out = *in
    650 }
    651 
    652 // p256Invert calculates |out| = |in|^{-1}
    653 //
    654 // Based on Fermat's Little Theorem:
    655 //   a^p = a (mod p)
    656 //   a^{p-1} = 1 (mod p)
    657 //   a^{p-2} = a^{-1} (mod p)
    658 func p256Invert(out, in *[p256Limbs]uint32) {
    659 	var ftmp, ftmp2 [p256Limbs]uint32
    660 
    661 	// each e_I will hold |in|^{2^I - 1}
    662 	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
    663 
    664 	p256Square(&ftmp, in)     // 2^1
    665 	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
    666 	p256Assign(&e2, &ftmp)
    667 	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
    668 	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
    669 	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
    670 	p256Assign(&e4, &ftmp)
    671 	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
    672 	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
    673 	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
    674 	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
    675 	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
    676 	p256Assign(&e8, &ftmp)
    677 	for i := 0; i < 8; i++ {
    678 		p256Square(&ftmp, &ftmp)
    679 	} // 2^16 - 2^8
    680 	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
    681 	p256Assign(&e16, &ftmp)
    682 	for i := 0; i < 16; i++ {
    683 		p256Square(&ftmp, &ftmp)
    684 	} // 2^32 - 2^16
    685 	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
    686 	p256Assign(&e32, &ftmp)
    687 	for i := 0; i < 32; i++ {
    688 		p256Square(&ftmp, &ftmp)
    689 	} // 2^64 - 2^32
    690 	p256Assign(&e64, &ftmp)
    691 	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
    692 	for i := 0; i < 192; i++ {
    693 		p256Square(&ftmp, &ftmp)
    694 	} // 2^256 - 2^224 + 2^192
    695 
    696 	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
    697 	for i := 0; i < 16; i++ {
    698 		p256Square(&ftmp2, &ftmp2)
    699 	} // 2^80 - 2^16
    700 	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
    701 	for i := 0; i < 8; i++ {
    702 		p256Square(&ftmp2, &ftmp2)
    703 	} // 2^88 - 2^8
    704 	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
    705 	for i := 0; i < 4; i++ {
    706 		p256Square(&ftmp2, &ftmp2)
    707 	} // 2^92 - 2^4
    708 	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
    709 	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
    710 	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
    711 	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
    712 	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
    713 	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
    714 	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
    715 
    716 	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
    717 }
    718 
    719 // p256Scalar3 sets out=3*out.
    720 //
    721 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    722 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    723 func p256Scalar3(out *[p256Limbs]uint32) {
    724 	var carry uint32
    725 
    726 	for i := 0; ; i++ {
    727 		out[i] *= 3
    728 		out[i] += carry
    729 		carry = out[i] >> 29
    730 		out[i] &= bottom29Bits
    731 
    732 		i++
    733 		if i == p256Limbs {
    734 			break
    735 		}
    736 
    737 		out[i] *= 3
    738 		out[i] += carry
    739 		carry = out[i] >> 28
    740 		out[i] &= bottom28Bits
    741 	}
    742 
    743 	p256ReduceCarry(out, carry)
    744 }
    745 
    746 // p256Scalar4 sets out=4*out.
    747 //
    748 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    749 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    750 func p256Scalar4(out *[p256Limbs]uint32) {
    751 	var carry, nextCarry uint32
    752 
    753 	for i := 0; ; i++ {
    754 		nextCarry = out[i] >> 27
    755 		out[i] <<= 2
    756 		out[i] &= bottom29Bits
    757 		out[i] += carry
    758 		carry = nextCarry + (out[i] >> 29)
    759 		out[i] &= bottom29Bits
    760 
    761 		i++
    762 		if i == p256Limbs {
    763 			break
    764 		}
    765 		nextCarry = out[i] >> 26
    766 		out[i] <<= 2
    767 		out[i] &= bottom28Bits
    768 		out[i] += carry
    769 		carry = nextCarry + (out[i] >> 28)
    770 		out[i] &= bottom28Bits
    771 	}
    772 
    773 	p256ReduceCarry(out, carry)
    774 }
    775 
    776 // p256Scalar8 sets out=8*out.
    777 //
    778 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    779 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    780 func p256Scalar8(out *[p256Limbs]uint32) {
    781 	var carry, nextCarry uint32
    782 
    783 	for i := 0; ; i++ {
    784 		nextCarry = out[i] >> 26
    785 		out[i] <<= 3
    786 		out[i] &= bottom29Bits
    787 		out[i] += carry
    788 		carry = nextCarry + (out[i] >> 29)
    789 		out[i] &= bottom29Bits
    790 
    791 		i++
    792 		if i == p256Limbs {
    793 			break
    794 		}
    795 		nextCarry = out[i] >> 25
    796 		out[i] <<= 3
    797 		out[i] &= bottom28Bits
    798 		out[i] += carry
    799 		carry = nextCarry + (out[i] >> 28)
    800 		out[i] &= bottom28Bits
    801 	}
    802 
    803 	p256ReduceCarry(out, carry)
    804 }
    805 
    806 // Group operations:
    807 //
    808 // Elements of the elliptic curve group are represented in Jacobian
    809 // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
    810 // Jacobian form.
    811 
    812 // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
    813 //
    814 // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
    815 func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
    816 	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
    817 
    818 	p256Square(&delta, z)
    819 	p256Square(&gamma, y)
    820 	p256Mul(&beta, x, &gamma)
    821 
    822 	p256Sum(&tmp, x, &delta)
    823 	p256Diff(&tmp2, x, &delta)
    824 	p256Mul(&alpha, &tmp, &tmp2)
    825 	p256Scalar3(&alpha)
    826 
    827 	p256Sum(&tmp, y, z)
    828 	p256Square(&tmp, &tmp)
    829 	p256Diff(&tmp, &tmp, &gamma)
    830 	p256Diff(zOut, &tmp, &delta)
    831 
    832 	p256Scalar4(&beta)
    833 	p256Square(xOut, &alpha)
    834 	p256Diff(xOut, xOut, &beta)
    835 	p256Diff(xOut, xOut, &beta)
    836 
    837 	p256Diff(&tmp, &beta, xOut)
    838 	p256Mul(&tmp, &alpha, &tmp)
    839 	p256Square(&tmp2, &gamma)
    840 	p256Scalar8(&tmp2)
    841 	p256Diff(yOut, &tmp, &tmp2)
    842 }
    843 
    844 // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
    845 // (i.e. the second point is affine.)
    846 //
    847 // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
    848 //
    849 // Note that this function does not handle P+P, infinity+P nor P+infinity
    850 // correctly.
    851 func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
    852 	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
    853 
    854 	p256Square(&z1z1, z1)
    855 	p256Sum(&tmp, z1, z1)
    856 
    857 	p256Mul(&u2, x2, &z1z1)
    858 	p256Mul(&z1z1z1, z1, &z1z1)
    859 	p256Mul(&s2, y2, &z1z1z1)
    860 	p256Diff(&h, &u2, x1)
    861 	p256Sum(&i, &h, &h)
    862 	p256Square(&i, &i)
    863 	p256Mul(&j, &h, &i)
    864 	p256Diff(&r, &s2, y1)
    865 	p256Sum(&r, &r, &r)
    866 	p256Mul(&v, x1, &i)
    867 
    868 	p256Mul(zOut, &tmp, &h)
    869 	p256Square(&rr, &r)
    870 	p256Diff(xOut, &rr, &j)
    871 	p256Diff(xOut, xOut, &v)
    872 	p256Diff(xOut, xOut, &v)
    873 
    874 	p256Diff(&tmp, &v, xOut)
    875 	p256Mul(yOut, &tmp, &r)
    876 	p256Mul(&tmp, y1, &j)
    877 	p256Diff(yOut, yOut, &tmp)
    878 	p256Diff(yOut, yOut, &tmp)
    879 }
    880 
    881 // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
    882 //
    883 // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
    884 //
    885 // Note that this function does not handle P+P, infinity+P nor P+infinity
    886 // correctly.
    887 func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
    888 	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
    889 
    890 	p256Square(&z1z1, z1)
    891 	p256Square(&z2z2, z2)
    892 	p256Mul(&u1, x1, &z2z2)
    893 
    894 	p256Sum(&tmp, z1, z2)
    895 	p256Square(&tmp, &tmp)
    896 	p256Diff(&tmp, &tmp, &z1z1)
    897 	p256Diff(&tmp, &tmp, &z2z2)
    898 
    899 	p256Mul(&z2z2z2, z2, &z2z2)
    900 	p256Mul(&s1, y1, &z2z2z2)
    901 
    902 	p256Mul(&u2, x2, &z1z1)
    903 	p256Mul(&z1z1z1, z1, &z1z1)
    904 	p256Mul(&s2, y2, &z1z1z1)
    905 	p256Diff(&h, &u2, &u1)
    906 	p256Sum(&i, &h, &h)
    907 	p256Square(&i, &i)
    908 	p256Mul(&j, &h, &i)
    909 	p256Diff(&r, &s2, &s1)
    910 	p256Sum(&r, &r, &r)
    911 	p256Mul(&v, &u1, &i)
    912 
    913 	p256Mul(zOut, &tmp, &h)
    914 	p256Square(&rr, &r)
    915 	p256Diff(xOut, &rr, &j)
    916 	p256Diff(xOut, xOut, &v)
    917 	p256Diff(xOut, xOut, &v)
    918 
    919 	p256Diff(&tmp, &v, xOut)
    920 	p256Mul(yOut, &tmp, &r)
    921 	p256Mul(&tmp, &s1, &j)
    922 	p256Diff(yOut, yOut, &tmp)
    923 	p256Diff(yOut, yOut, &tmp)
    924 }
    925 
    926 // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
    927 //
    928 // On entry: mask is either 0 or 0xffffffff.
    929 func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
    930 	for i := 0; i < p256Limbs; i++ {
    931 		tmp := mask & (in[i] ^ out[i])
    932 		out[i] ^= tmp
    933 	}
    934 }
    935 
    936 // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
    937 // On entry: index < 16, table[0] must be zero.
    938 func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
    939 	for i := range xOut {
    940 		xOut[i] = 0
    941 	}
    942 	for i := range yOut {
    943 		yOut[i] = 0
    944 	}
    945 
    946 	for i := uint32(1); i < 16; i++ {
    947 		mask := i ^ index
    948 		mask |= mask >> 2
    949 		mask |= mask >> 1
    950 		mask &= 1
    951 		mask--
    952 		for j := range xOut {
    953 			xOut[j] |= table[0] & mask
    954 			table = table[1:]
    955 		}
    956 		for j := range yOut {
    957 			yOut[j] |= table[0] & mask
    958 			table = table[1:]
    959 		}
    960 	}
    961 }
    962 
    963 // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
    964 // table.
    965 // On entry: index < 16, table[0] must be zero.
    966 func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
    967 	for i := range xOut {
    968 		xOut[i] = 0
    969 	}
    970 	for i := range yOut {
    971 		yOut[i] = 0
    972 	}
    973 	for i := range zOut {
    974 		zOut[i] = 0
    975 	}
    976 
    977 	// The implicit value at index 0 is all zero. We don't need to perform that
    978 	// iteration of the loop because we already set out_* to zero.
    979 	for i := uint32(1); i < 16; i++ {
    980 		mask := i ^ index
    981 		mask |= mask >> 2
    982 		mask |= mask >> 1
    983 		mask &= 1
    984 		mask--
    985 		for j := range xOut {
    986 			xOut[j] |= table[i][0][j] & mask
    987 		}
    988 		for j := range yOut {
    989 			yOut[j] |= table[i][1][j] & mask
    990 		}
    991 		for j := range zOut {
    992 			zOut[j] |= table[i][2][j] & mask
    993 		}
    994 	}
    995 }
    996 
    997 // p256GetBit returns the bit'th bit of scalar.
    998 func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
    999 	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
   1000 }
   1001 
   1002 // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
   1003 // little-endian number. Note that the value of scalar must be less than the
   1004 // order of the group.
   1005 func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
   1006 	nIsInfinityMask := ^uint32(0)
   1007 	var pIsNoninfiniteMask, mask, tableOffset uint32
   1008 	var px, py, tx, ty, tz [p256Limbs]uint32
   1009 
   1010 	for i := range xOut {
   1011 		xOut[i] = 0
   1012 	}
   1013 	for i := range yOut {
   1014 		yOut[i] = 0
   1015 	}
   1016 	for i := range zOut {
   1017 		zOut[i] = 0
   1018 	}
   1019 
   1020 	// The loop adds bits at positions 0, 64, 128 and 192, followed by
   1021 	// positions 32,96,160 and 224 and does this 32 times.
   1022 	for i := uint(0); i < 32; i++ {
   1023 		if i != 0 {
   1024 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1025 		}
   1026 		tableOffset = 0
   1027 		for j := uint(0); j <= 32; j += 32 {
   1028 			bit0 := p256GetBit(scalar, 31-i+j)
   1029 			bit1 := p256GetBit(scalar, 95-i+j)
   1030 			bit2 := p256GetBit(scalar, 159-i+j)
   1031 			bit3 := p256GetBit(scalar, 223-i+j)
   1032 			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
   1033 
   1034 			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
   1035 			tableOffset += 30 * p256Limbs
   1036 
   1037 			// Since scalar is less than the order of the group, we know that
   1038 			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
   1039 			// below.
   1040 			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
   1041 			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
   1042 			// (a.k.a.  the point at infinity). We handle that situation by
   1043 			// copying the point from the table.
   1044 			p256CopyConditional(xOut, &px, nIsInfinityMask)
   1045 			p256CopyConditional(yOut, &py, nIsInfinityMask)
   1046 			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
   1047 
   1048 			// Equally, the result is also wrong if the point from the table is
   1049 			// zero, which happens when the index is zero. We handle that by
   1050 			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
   1051 			pIsNoninfiniteMask = nonZeroToAllOnes(index)
   1052 			mask = pIsNoninfiniteMask & ^nIsInfinityMask
   1053 			p256CopyConditional(xOut, &tx, mask)
   1054 			p256CopyConditional(yOut, &ty, mask)
   1055 			p256CopyConditional(zOut, &tz, mask)
   1056 			// If p was not zero, then n is now non-zero.
   1057 			nIsInfinityMask &= ^pIsNoninfiniteMask
   1058 		}
   1059 	}
   1060 }
   1061 
   1062 // p256PointToAffine converts a Jacobian point to an affine point. If the input
   1063 // is the point at infinity then it returns (0, 0) in constant time.
   1064 func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
   1065 	var zInv, zInvSq [p256Limbs]uint32
   1066 
   1067 	p256Invert(&zInv, z)
   1068 	p256Square(&zInvSq, &zInv)
   1069 	p256Mul(xOut, x, &zInvSq)
   1070 	p256Mul(&zInv, &zInv, &zInvSq)
   1071 	p256Mul(yOut, y, &zInv)
   1072 }
   1073 
   1074 // p256ToAffine returns a pair of *big.Int containing the affine representation
   1075 // of {x,y,z}.
   1076 func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
   1077 	var xx, yy [p256Limbs]uint32
   1078 	p256PointToAffine(&xx, &yy, x, y, z)
   1079 	return p256ToBig(&xx), p256ToBig(&yy)
   1080 }
   1081 
   1082 // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
   1083 func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
   1084 	var px, py, pz, tx, ty, tz [p256Limbs]uint32
   1085 	var precomp [16][3][p256Limbs]uint32
   1086 	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
   1087 
   1088 	// We precompute 0,1,2,... times {x,y}.
   1089 	precomp[1][0] = *x
   1090 	precomp[1][1] = *y
   1091 	precomp[1][2] = p256One
   1092 
   1093 	for i := 2; i < 16; i += 2 {
   1094 		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
   1095 		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
   1096 	}
   1097 
   1098 	for i := range xOut {
   1099 		xOut[i] = 0
   1100 	}
   1101 	for i := range yOut {
   1102 		yOut[i] = 0
   1103 	}
   1104 	for i := range zOut {
   1105 		zOut[i] = 0
   1106 	}
   1107 	nIsInfinityMask = ^uint32(0)
   1108 
   1109 	// We add in a window of four bits each iteration and do this 64 times.
   1110 	for i := 0; i < 64; i++ {
   1111 		if i != 0 {
   1112 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1113 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1114 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1115 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1116 		}
   1117 
   1118 		index = uint32(scalar[31-i/2])
   1119 		if (i & 1) == 1 {
   1120 			index &= 15
   1121 		} else {
   1122 			index >>= 4
   1123 		}
   1124 
   1125 		// See the comments in scalarBaseMult about handling infinities.
   1126 		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
   1127 		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
   1128 		p256CopyConditional(xOut, &px, nIsInfinityMask)
   1129 		p256CopyConditional(yOut, &py, nIsInfinityMask)
   1130 		p256CopyConditional(zOut, &pz, nIsInfinityMask)
   1131 
   1132 		pIsNoninfiniteMask = nonZeroToAllOnes(index)
   1133 		mask = pIsNoninfiniteMask & ^nIsInfinityMask
   1134 		p256CopyConditional(xOut, &tx, mask)
   1135 		p256CopyConditional(yOut, &ty, mask)
   1136 		p256CopyConditional(zOut, &tz, mask)
   1137 		nIsInfinityMask &= ^pIsNoninfiniteMask
   1138 	}
   1139 }
   1140 
   1141 // p256FromBig sets out = R*in.
   1142 func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
   1143 	tmp := new(big.Int).Lsh(in, 257)
   1144 	tmp.Mod(tmp, p256.P)
   1145 
   1146 	for i := 0; i < p256Limbs; i++ {
   1147 		if bits := tmp.Bits(); len(bits) > 0 {
   1148 			out[i] = uint32(bits[0]) & bottom29Bits
   1149 		} else {
   1150 			out[i] = 0
   1151 		}
   1152 		tmp.Rsh(tmp, 29)
   1153 
   1154 		i++
   1155 		if i == p256Limbs {
   1156 			break
   1157 		}
   1158 
   1159 		if bits := tmp.Bits(); len(bits) > 0 {
   1160 			out[i] = uint32(bits[0]) & bottom28Bits
   1161 		} else {
   1162 			out[i] = 0
   1163 		}
   1164 		tmp.Rsh(tmp, 28)
   1165 	}
   1166 }
   1167 
   1168 // p256ToBig returns a *big.Int containing the value of in.
   1169 func p256ToBig(in *[p256Limbs]uint32) *big.Int {
   1170 	result, tmp := new(big.Int), new(big.Int)
   1171 
   1172 	result.SetInt64(int64(in[p256Limbs-1]))
   1173 	for i := p256Limbs - 2; i >= 0; i-- {
   1174 		if (i & 1) == 0 {
   1175 			result.Lsh(result, 29)
   1176 		} else {
   1177 			result.Lsh(result, 28)
   1178 		}
   1179 		tmp.SetInt64(int64(in[i]))
   1180 		result.Add(result, tmp)
   1181 	}
   1182 
   1183 	result.Mul(result, p256RInverse)
   1184 	result.Mod(result, p256.P)
   1185 	return result
   1186 }
   1187