Home | History | Annotate | Download | only in elliptic
      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build !amd64
      6 
      7 package elliptic
      8 
      9 // This file contains a constant-time, 32-bit implementation of P256.
     10 
     11 import (
     12 	"math/big"
     13 )
     14 
     15 type p256Curve struct {
     16 	*CurveParams
     17 }
     18 
     19 var (
     20 	p256Params *CurveParams
     21 
     22 	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
     23 	// (2**257).
     24 	p256RInverse *big.Int
     25 )
     26 
     27 func initP256() {
     28 	// See FIPS 186-3, section D.2.3
     29 	p256Params = &CurveParams{Name: "P-256"}
     30 	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
     31 	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
     32 	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
     33 	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
     34 	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
     35 	p256Params.BitSize = 256
     36 
     37 	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
     38 
     39 	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
     40 	initP256Arch()
     41 }
     42 
     43 func (curve p256Curve) Params() *CurveParams {
     44 	return curve.CurveParams
     45 }
     46 
     47 // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
     48 // to out. If the scalar is equal or greater than the order of the group, it's
     49 // reduced modulo that order.
     50 func p256GetScalar(out *[32]byte, in []byte) {
     51 	n := new(big.Int).SetBytes(in)
     52 	var scalarBytes []byte
     53 
     54 	if n.Cmp(p256Params.N) >= 0 {
     55 		n.Mod(n, p256Params.N)
     56 		scalarBytes = n.Bytes()
     57 	} else {
     58 		scalarBytes = in
     59 	}
     60 
     61 	for i, v := range scalarBytes {
     62 		out[len(scalarBytes)-(1+i)] = v
     63 	}
     64 }
     65 
     66 func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
     67 	var scalarReversed [32]byte
     68 	p256GetScalar(&scalarReversed, scalar)
     69 
     70 	var x1, y1, z1 [p256Limbs]uint32
     71 	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
     72 	return p256ToAffine(&x1, &y1, &z1)
     73 }
     74 
     75 func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
     76 	var scalarReversed [32]byte
     77 	p256GetScalar(&scalarReversed, scalar)
     78 
     79 	var px, py, x1, y1, z1 [p256Limbs]uint32
     80 	p256FromBig(&px, bigX)
     81 	p256FromBig(&py, bigY)
     82 	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
     83 	return p256ToAffine(&x1, &y1, &z1)
     84 }
     85 
     86 // Field elements are represented as nine, unsigned 32-bit words.
     87 //
     88 // The value of an field element is:
     89 //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
     90 //
     91 // That is, each limb is alternately 29 or 28-bits wide in little-endian
     92 // order.
     93 //
     94 // This means that a field element hits 2**257, rather than 2**256 as we would
     95 // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
     96 // problems when multiplying as terms end up one bit short of a limb which
     97 // would require much bit-shifting to correct.
     98 //
     99 // Finally, the values stored in a field element are in Montgomery form. So the
    100 // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
    101 // 2**257.
    102 
    103 const (
    104 	p256Limbs    = 9
    105 	bottom29Bits = 0x1fffffff
    106 )
    107 
    108 var (
    109 	// p256One is the number 1 as a field element.
    110 	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
    111 	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
    112 	// p256P is the prime modulus as a field element.
    113 	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
    114 	// p2562P is the twice prime modulus as a field element.
    115 	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
    116 )
    117 
    118 // p256Precomputed contains precomputed values to aid the calculation of scalar
    119 // multiples of the base point, G. It's actually two, equal length, tables
    120 // concatenated.
    121 //
    122 // The first table contains (x,y) field element pairs for 16 multiples of the
    123 // base point, G.
    124 //
    125 //   Index  |  Index (binary) | Value
    126 //       0  |           0000  | 0G (all zeros, omitted)
    127 //       1  |           0001  | G
    128 //       2  |           0010  | 2**64G
    129 //       3  |           0011  | 2**64G + G
    130 //       4  |           0100  | 2**128G
    131 //       5  |           0101  | 2**128G + G
    132 //       6  |           0110  | 2**128G + 2**64G
    133 //       7  |           0111  | 2**128G + 2**64G + G
    134 //       8  |           1000  | 2**192G
    135 //       9  |           1001  | 2**192G + G
    136 //      10  |           1010  | 2**192G + 2**64G
    137 //      11  |           1011  | 2**192G + 2**64G + G
    138 //      12  |           1100  | 2**192G + 2**128G
    139 //      13  |           1101  | 2**192G + 2**128G + G
    140 //      14  |           1110  | 2**192G + 2**128G + 2**64G
    141 //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
    142 //
    143 // The second table follows the same style, but the terms are 2**32G,
    144 // 2**96G, 2**160G, 2**224G.
    145 //
    146 // This is ~2KB of data.
    147 var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
    148 	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
    149 	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
    150 	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
    151 	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
    152 	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
    153 	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
    154 	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
    155 	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
    156 	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
    157 	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
    158 	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
    159 	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
    160 	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
    161 	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
    162 	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
    163 	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
    164 	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
    165 	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
    166 	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
    167 	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
    168 	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
    169 	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
    170 	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
    171 	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
    172 	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
    173 	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
    174 	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
    175 	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
    176 	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
    177 	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
    178 	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
    179 	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
    180 	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
    181 	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
    182 	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
    183 	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
    184 	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
    185 	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
    186 	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
    187 	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
    188 	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
    189 	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
    190 	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
    191 	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
    192 	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
    193 	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
    194 	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
    195 	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
    196 	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
    197 	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
    198 	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
    199 	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
    200 	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
    201 	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
    202 	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
    203 	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
    204 	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
    205 	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
    206 	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
    207 	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
    208 }
    209 
    210 // Field element operations:
    211 
    212 // nonZeroToAllOnes returns:
    213 //   0xffffffff for 0 < x <= 2**31
    214 //   0 for x == 0 or x > 2**31.
    215 func nonZeroToAllOnes(x uint32) uint32 {
    216 	return ((x - 1) >> 31) - 1
    217 }
    218 
    219 // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
    220 // which is a term at 2**257.
    221 //
    222 // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
    223 // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
    224 func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
    225 	carry_mask := nonZeroToAllOnes(carry)
    226 
    227 	inout[0] += carry << 1
    228 	inout[3] += 0x10000000 & carry_mask
    229 	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
    230 	// previous line therefore this doesn't underflow.
    231 	inout[3] -= carry << 11
    232 	inout[4] += (0x20000000 - 1) & carry_mask
    233 	inout[5] += (0x10000000 - 1) & carry_mask
    234 	inout[6] += (0x20000000 - 1) & carry_mask
    235 	inout[6] -= carry << 22
    236 	// This may underflow if carry is non-zero but, if so, we'll fix it in the
    237 	// next line.
    238 	inout[7] -= 1 & carry_mask
    239 	inout[7] += carry << 25
    240 }
    241 
    242 // p256Sum sets out = in+in2.
    243 //
    244 // On entry, in[i]+in2[i] must not overflow a 32-bit word.
    245 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
    246 func p256Sum(out, in, in2 *[p256Limbs]uint32) {
    247 	carry := uint32(0)
    248 	for i := 0; ; i++ {
    249 		out[i] = in[i] + in2[i]
    250 		out[i] += carry
    251 		carry = out[i] >> 29
    252 		out[i] &= bottom29Bits
    253 
    254 		i++
    255 		if i == p256Limbs {
    256 			break
    257 		}
    258 
    259 		out[i] = in[i] + in2[i]
    260 		out[i] += carry
    261 		carry = out[i] >> 28
    262 		out[i] &= bottom28Bits
    263 	}
    264 
    265 	p256ReduceCarry(out, carry)
    266 }
    267 
    268 const (
    269 	two30m2    = 1<<30 - 1<<2
    270 	two30p13m2 = 1<<30 + 1<<13 - 1<<2
    271 	two31m2    = 1<<31 - 1<<2
    272 	two31p24m2 = 1<<31 + 1<<24 - 1<<2
    273 	two30m27m2 = 1<<30 - 1<<27 - 1<<2
    274 )
    275 
    276 // p256Zero31 is 0 mod p.
    277 var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
    278 
    279 // p256Diff sets out = in-in2.
    280 //
    281 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
    282 //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
    283 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    284 func p256Diff(out, in, in2 *[p256Limbs]uint32) {
    285 	var carry uint32
    286 
    287 	for i := 0; ; i++ {
    288 		out[i] = in[i] - in2[i]
    289 		out[i] += p256Zero31[i]
    290 		out[i] += carry
    291 		carry = out[i] >> 29
    292 		out[i] &= bottom29Bits
    293 
    294 		i++
    295 		if i == p256Limbs {
    296 			break
    297 		}
    298 
    299 		out[i] = in[i] - in2[i]
    300 		out[i] += p256Zero31[i]
    301 		out[i] += carry
    302 		carry = out[i] >> 28
    303 		out[i] &= bottom28Bits
    304 	}
    305 
    306 	p256ReduceCarry(out, carry)
    307 }
    308 
    309 // p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
    310 // the same 29,28,... bit positions as an field element.
    311 //
    312 // The values in field elements are in Montgomery form: x*R mod p where R =
    313 // 2**257. Since we just multiplied two Montgomery values together, the result
    314 // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
    315 // in Montgomery form.
    316 //
    317 // On entry: tmp[i] < 2**64
    318 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
    319 func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
    320 	// The following table may be helpful when reading this code:
    321 	//
    322 	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
    323 	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
    324 	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
    325 	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
    326 	var tmp2 [18]uint32
    327 	var carry, x, xMask uint32
    328 
    329 	// tmp contains 64-bit words with the same 29,28,29-bit positions as an
    330 	// field element. So the top of an element of tmp might overlap with
    331 	// another element two positions down. The following loop eliminates
    332 	// this overlap.
    333 	tmp2[0] = uint32(tmp[0]) & bottom29Bits
    334 
    335 	tmp2[1] = uint32(tmp[0]) >> 29
    336 	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
    337 	tmp2[1] += uint32(tmp[1]) & bottom28Bits
    338 	carry = tmp2[1] >> 28
    339 	tmp2[1] &= bottom28Bits
    340 
    341 	for i := 2; i < 17; i++ {
    342 		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
    343 		tmp2[i] += (uint32(tmp[i-1])) >> 28
    344 		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
    345 		tmp2[i] += uint32(tmp[i]) & bottom29Bits
    346 		tmp2[i] += carry
    347 		carry = tmp2[i] >> 29
    348 		tmp2[i] &= bottom29Bits
    349 
    350 		i++
    351 		if i == 17 {
    352 			break
    353 		}
    354 		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
    355 		tmp2[i] += uint32(tmp[i-1]) >> 29
    356 		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
    357 		tmp2[i] += uint32(tmp[i]) & bottom28Bits
    358 		tmp2[i] += carry
    359 		carry = tmp2[i] >> 28
    360 		tmp2[i] &= bottom28Bits
    361 	}
    362 
    363 	tmp2[17] = uint32(tmp[15]>>32) >> 25
    364 	tmp2[17] += uint32(tmp[16]) >> 29
    365 	tmp2[17] += uint32(tmp[16]>>32) << 3
    366 	tmp2[17] += carry
    367 
    368 	// Montgomery elimination of terms:
    369 	//
    370 	// Since R is 2**257, we can divide by R with a bitwise shift if we can
    371 	// ensure that the right-most 257 bits are all zero. We can make that true
    372 	// by adding multiplies of p without affecting the value.
    373 	//
    374 	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
    375 	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
    376 	// We can do that for 8 further limbs and then right shift to eliminate the
    377 	// extra factor of R.
    378 	for i := 0; ; i += 2 {
    379 		tmp2[i+1] += tmp2[i] >> 29
    380 		x = tmp2[i] & bottom29Bits
    381 		xMask = nonZeroToAllOnes(x)
    382 		tmp2[i] = 0
    383 
    384 		// The bounds calculations for this loop are tricky. Each iteration of
    385 		// the loop eliminates two words by adding values to words to their
    386 		// right.
    387 		//
    388 		// The following table contains the amounts added to each word (as an
    389 		// offset from the value of i at the top of the loop). The amounts are
    390 		// accounted for from the first and second half of the loop separately
    391 		// and are written as, for example, 28 to mean a value <2**28.
    392 		//
    393 		// Word:                   3   4   5   6   7   8   9   10
    394 		// Added in top half:     28  11      29  21  29  28
    395 		//                                        28  29
    396 		//                                            29
    397 		// Added in bottom half:      29  10      28  21  28   28
    398 		//                                            29
    399 		//
    400 		// The value that is currently offset 7 will be offset 5 for the next
    401 		// iteration and then offset 3 for the iteration after that. Therefore
    402 		// the total value added will be the values added at 7, 5 and 3.
    403 		//
    404 		// The following table accumulates these values. The sums at the bottom
    405 		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
    406 		//
    407 		// Word:                   3   4   5   6   7   8   9  10  11  12  13
    408 		//                        28  11  10  29  21  29  28  28  28  28  28
    409 		//                            29  28  11  28  29  28  29  28  29  28
    410 		//                                    29  28  21  21  29  21  29  21
    411 		//                                        10  29  28  21  28  21  28
    412 		//                                        28  29  28  29  28  29  28
    413 		//                                            11  10  29  10  29  10
    414 		//                                            29  28  11  28  11
    415 		//                                                    29      29
    416 		//                        --------------------------------------------
    417 		//                                                30+ 31+ 30+ 31+ 30+
    418 		//                                                28+ 29+ 28+ 29+ 21+
    419 		//                                                21+ 28+ 21+ 28+ 10
    420 		//                                                10  21+ 10  21+
    421 		//                                                    11      11
    422 		//
    423 		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
    424 		// tmp2[10/12] has an initial value of <2**29, then the maximum value
    425 		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
    426 		// as required.
    427 		tmp2[i+3] += (x << 10) & bottom28Bits
    428 		tmp2[i+4] += (x >> 18)
    429 
    430 		tmp2[i+6] += (x << 21) & bottom29Bits
    431 		tmp2[i+7] += x >> 8
    432 
    433 		// At position 200, which is the starting bit position for word 7, we
    434 		// have a factor of 0xf000000 = 2**28 - 2**24.
    435 		tmp2[i+7] += 0x10000000 & xMask
    436 		tmp2[i+8] += (x - 1) & xMask
    437 		tmp2[i+7] -= (x << 24) & bottom28Bits
    438 		tmp2[i+8] -= x >> 4
    439 
    440 		tmp2[i+8] += 0x20000000 & xMask
    441 		tmp2[i+8] -= x
    442 		tmp2[i+8] += (x << 28) & bottom29Bits
    443 		tmp2[i+9] += ((x >> 1) - 1) & xMask
    444 
    445 		if i+1 == p256Limbs {
    446 			break
    447 		}
    448 		tmp2[i+2] += tmp2[i+1] >> 28
    449 		x = tmp2[i+1] & bottom28Bits
    450 		xMask = nonZeroToAllOnes(x)
    451 		tmp2[i+1] = 0
    452 
    453 		tmp2[i+4] += (x << 11) & bottom29Bits
    454 		tmp2[i+5] += (x >> 18)
    455 
    456 		tmp2[i+7] += (x << 21) & bottom28Bits
    457 		tmp2[i+8] += x >> 7
    458 
    459 		// At position 199, which is the starting bit of the 8th word when
    460 		// dealing with a context starting on an odd word, we have a factor of
    461 		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
    462 		// word from i+1 is i+8.
    463 		tmp2[i+8] += 0x20000000 & xMask
    464 		tmp2[i+9] += (x - 1) & xMask
    465 		tmp2[i+8] -= (x << 25) & bottom29Bits
    466 		tmp2[i+9] -= x >> 4
    467 
    468 		tmp2[i+9] += 0x10000000 & xMask
    469 		tmp2[i+9] -= x
    470 		tmp2[i+10] += (x - 1) & xMask
    471 	}
    472 
    473 	// We merge the right shift with a carry chain. The words above 2**257 have
    474 	// widths of 28,29,... which we need to correct when copying them down.
    475 	carry = 0
    476 	for i := 0; i < 8; i++ {
    477 		// The maximum value of tmp2[i + 9] occurs on the first iteration and
    478 		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
    479 		// therefore safe.
    480 		out[i] = tmp2[i+9]
    481 		out[i] += carry
    482 		out[i] += (tmp2[i+10] << 28) & bottom29Bits
    483 		carry = out[i] >> 29
    484 		out[i] &= bottom29Bits
    485 
    486 		i++
    487 		out[i] = tmp2[i+9] >> 1
    488 		out[i] += carry
    489 		carry = out[i] >> 28
    490 		out[i] &= bottom28Bits
    491 	}
    492 
    493 	out[8] = tmp2[17]
    494 	out[8] += carry
    495 	carry = out[8] >> 29
    496 	out[8] &= bottom29Bits
    497 
    498 	p256ReduceCarry(out, carry)
    499 }
    500 
    501 // p256Square sets out=in*in.
    502 //
    503 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
    504 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    505 func p256Square(out, in *[p256Limbs]uint32) {
    506 	var tmp [17]uint64
    507 
    508 	tmp[0] = uint64(in[0]) * uint64(in[0])
    509 	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
    510 	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
    511 		uint64(in[1])*(uint64(in[1])<<1)
    512 	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
    513 		uint64(in[1])*(uint64(in[2])<<1)
    514 	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
    515 		uint64(in[1])*(uint64(in[3])<<2) +
    516 		uint64(in[2])*uint64(in[2])
    517 	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
    518 		uint64(in[1])*(uint64(in[4])<<1) +
    519 		uint64(in[2])*(uint64(in[3])<<1)
    520 	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
    521 		uint64(in[1])*(uint64(in[5])<<2) +
    522 		uint64(in[2])*(uint64(in[4])<<1) +
    523 		uint64(in[3])*(uint64(in[3])<<1)
    524 	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
    525 		uint64(in[1])*(uint64(in[6])<<1) +
    526 		uint64(in[2])*(uint64(in[5])<<1) +
    527 		uint64(in[3])*(uint64(in[4])<<1)
    528 	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
    529 	// which is < 2**64 as required.
    530 	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
    531 		uint64(in[1])*(uint64(in[7])<<2) +
    532 		uint64(in[2])*(uint64(in[6])<<1) +
    533 		uint64(in[3])*(uint64(in[5])<<2) +
    534 		uint64(in[4])*uint64(in[4])
    535 	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
    536 		uint64(in[2])*(uint64(in[7])<<1) +
    537 		uint64(in[3])*(uint64(in[6])<<1) +
    538 		uint64(in[4])*(uint64(in[5])<<1)
    539 	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
    540 		uint64(in[3])*(uint64(in[7])<<2) +
    541 		uint64(in[4])*(uint64(in[6])<<1) +
    542 		uint64(in[5])*(uint64(in[5])<<1)
    543 	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
    544 		uint64(in[4])*(uint64(in[7])<<1) +
    545 		uint64(in[5])*(uint64(in[6])<<1)
    546 	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
    547 		uint64(in[5])*(uint64(in[7])<<2) +
    548 		uint64(in[6])*uint64(in[6])
    549 	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
    550 		uint64(in[6])*(uint64(in[7])<<1)
    551 	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
    552 		uint64(in[7])*(uint64(in[7])<<1)
    553 	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
    554 	tmp[16] = uint64(in[8]) * uint64(in[8])
    555 
    556 	p256ReduceDegree(out, tmp)
    557 }
    558 
    559 // p256Mul sets out=in*in2.
    560 //
    561 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
    562 //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
    563 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    564 func p256Mul(out, in, in2 *[p256Limbs]uint32) {
    565 	var tmp [17]uint64
    566 
    567 	tmp[0] = uint64(in[0]) * uint64(in2[0])
    568 	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
    569 		uint64(in[1])*(uint64(in2[0])<<0)
    570 	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
    571 		uint64(in[1])*(uint64(in2[1])<<1) +
    572 		uint64(in[2])*(uint64(in2[0])<<0)
    573 	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
    574 		uint64(in[1])*(uint64(in2[2])<<0) +
    575 		uint64(in[2])*(uint64(in2[1])<<0) +
    576 		uint64(in[3])*(uint64(in2[0])<<0)
    577 	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
    578 		uint64(in[1])*(uint64(in2[3])<<1) +
    579 		uint64(in[2])*(uint64(in2[2])<<0) +
    580 		uint64(in[3])*(uint64(in2[1])<<1) +
    581 		uint64(in[4])*(uint64(in2[0])<<0)
    582 	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
    583 		uint64(in[1])*(uint64(in2[4])<<0) +
    584 		uint64(in[2])*(uint64(in2[3])<<0) +
    585 		uint64(in[3])*(uint64(in2[2])<<0) +
    586 		uint64(in[4])*(uint64(in2[1])<<0) +
    587 		uint64(in[5])*(uint64(in2[0])<<0)
    588 	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
    589 		uint64(in[1])*(uint64(in2[5])<<1) +
    590 		uint64(in[2])*(uint64(in2[4])<<0) +
    591 		uint64(in[3])*(uint64(in2[3])<<1) +
    592 		uint64(in[4])*(uint64(in2[2])<<0) +
    593 		uint64(in[5])*(uint64(in2[1])<<1) +
    594 		uint64(in[6])*(uint64(in2[0])<<0)
    595 	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
    596 		uint64(in[1])*(uint64(in2[6])<<0) +
    597 		uint64(in[2])*(uint64(in2[5])<<0) +
    598 		uint64(in[3])*(uint64(in2[4])<<0) +
    599 		uint64(in[4])*(uint64(in2[3])<<0) +
    600 		uint64(in[5])*(uint64(in2[2])<<0) +
    601 		uint64(in[6])*(uint64(in2[1])<<0) +
    602 		uint64(in[7])*(uint64(in2[0])<<0)
    603 	// tmp[8] has the greatest value but doesn't overflow. See logic in
    604 	// p256Square.
    605 	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
    606 		uint64(in[1])*(uint64(in2[7])<<1) +
    607 		uint64(in[2])*(uint64(in2[6])<<0) +
    608 		uint64(in[3])*(uint64(in2[5])<<1) +
    609 		uint64(in[4])*(uint64(in2[4])<<0) +
    610 		uint64(in[5])*(uint64(in2[3])<<1) +
    611 		uint64(in[6])*(uint64(in2[2])<<0) +
    612 		uint64(in[7])*(uint64(in2[1])<<1) +
    613 		uint64(in[8])*(uint64(in2[0])<<0)
    614 	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
    615 		uint64(in[2])*(uint64(in2[7])<<0) +
    616 		uint64(in[3])*(uint64(in2[6])<<0) +
    617 		uint64(in[4])*(uint64(in2[5])<<0) +
    618 		uint64(in[5])*(uint64(in2[4])<<0) +
    619 		uint64(in[6])*(uint64(in2[3])<<0) +
    620 		uint64(in[7])*(uint64(in2[2])<<0) +
    621 		uint64(in[8])*(uint64(in2[1])<<0)
    622 	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
    623 		uint64(in[3])*(uint64(in2[7])<<1) +
    624 		uint64(in[4])*(uint64(in2[6])<<0) +
    625 		uint64(in[5])*(uint64(in2[5])<<1) +
    626 		uint64(in[6])*(uint64(in2[4])<<0) +
    627 		uint64(in[7])*(uint64(in2[3])<<1) +
    628 		uint64(in[8])*(uint64(in2[2])<<0)
    629 	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
    630 		uint64(in[4])*(uint64(in2[7])<<0) +
    631 		uint64(in[5])*(uint64(in2[6])<<0) +
    632 		uint64(in[6])*(uint64(in2[5])<<0) +
    633 		uint64(in[7])*(uint64(in2[4])<<0) +
    634 		uint64(in[8])*(uint64(in2[3])<<0)
    635 	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
    636 		uint64(in[5])*(uint64(in2[7])<<1) +
    637 		uint64(in[6])*(uint64(in2[6])<<0) +
    638 		uint64(in[7])*(uint64(in2[5])<<1) +
    639 		uint64(in[8])*(uint64(in2[4])<<0)
    640 	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
    641 		uint64(in[6])*(uint64(in2[7])<<0) +
    642 		uint64(in[7])*(uint64(in2[6])<<0) +
    643 		uint64(in[8])*(uint64(in2[5])<<0)
    644 	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
    645 		uint64(in[7])*(uint64(in2[7])<<1) +
    646 		uint64(in[8])*(uint64(in2[6])<<0)
    647 	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
    648 		uint64(in[8])*(uint64(in2[7])<<0)
    649 	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
    650 
    651 	p256ReduceDegree(out, tmp)
    652 }
    653 
    654 func p256Assign(out, in *[p256Limbs]uint32) {
    655 	*out = *in
    656 }
    657 
    658 // p256Invert calculates |out| = |in|^{-1}
    659 //
    660 // Based on Fermat's Little Theorem:
    661 //   a^p = a (mod p)
    662 //   a^{p-1} = 1 (mod p)
    663 //   a^{p-2} = a^{-1} (mod p)
    664 func p256Invert(out, in *[p256Limbs]uint32) {
    665 	var ftmp, ftmp2 [p256Limbs]uint32
    666 
    667 	// each e_I will hold |in|^{2^I - 1}
    668 	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
    669 
    670 	p256Square(&ftmp, in)     // 2^1
    671 	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
    672 	p256Assign(&e2, &ftmp)
    673 	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
    674 	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
    675 	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
    676 	p256Assign(&e4, &ftmp)
    677 	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
    678 	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
    679 	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
    680 	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
    681 	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
    682 	p256Assign(&e8, &ftmp)
    683 	for i := 0; i < 8; i++ {
    684 		p256Square(&ftmp, &ftmp)
    685 	} // 2^16 - 2^8
    686 	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
    687 	p256Assign(&e16, &ftmp)
    688 	for i := 0; i < 16; i++ {
    689 		p256Square(&ftmp, &ftmp)
    690 	} // 2^32 - 2^16
    691 	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
    692 	p256Assign(&e32, &ftmp)
    693 	for i := 0; i < 32; i++ {
    694 		p256Square(&ftmp, &ftmp)
    695 	} // 2^64 - 2^32
    696 	p256Assign(&e64, &ftmp)
    697 	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
    698 	for i := 0; i < 192; i++ {
    699 		p256Square(&ftmp, &ftmp)
    700 	} // 2^256 - 2^224 + 2^192
    701 
    702 	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
    703 	for i := 0; i < 16; i++ {
    704 		p256Square(&ftmp2, &ftmp2)
    705 	} // 2^80 - 2^16
    706 	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
    707 	for i := 0; i < 8; i++ {
    708 		p256Square(&ftmp2, &ftmp2)
    709 	} // 2^88 - 2^8
    710 	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
    711 	for i := 0; i < 4; i++ {
    712 		p256Square(&ftmp2, &ftmp2)
    713 	} // 2^92 - 2^4
    714 	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
    715 	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
    716 	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
    717 	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
    718 	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
    719 	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
    720 	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
    721 
    722 	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
    723 }
    724 
    725 // p256Scalar3 sets out=3*out.
    726 //
    727 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    728 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    729 func p256Scalar3(out *[p256Limbs]uint32) {
    730 	var carry uint32
    731 
    732 	for i := 0; ; i++ {
    733 		out[i] *= 3
    734 		out[i] += carry
    735 		carry = out[i] >> 29
    736 		out[i] &= bottom29Bits
    737 
    738 		i++
    739 		if i == p256Limbs {
    740 			break
    741 		}
    742 
    743 		out[i] *= 3
    744 		out[i] += carry
    745 		carry = out[i] >> 28
    746 		out[i] &= bottom28Bits
    747 	}
    748 
    749 	p256ReduceCarry(out, carry)
    750 }
    751 
    752 // p256Scalar4 sets out=4*out.
    753 //
    754 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    755 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    756 func p256Scalar4(out *[p256Limbs]uint32) {
    757 	var carry, nextCarry uint32
    758 
    759 	for i := 0; ; i++ {
    760 		nextCarry = out[i] >> 27
    761 		out[i] <<= 2
    762 		out[i] &= bottom29Bits
    763 		out[i] += carry
    764 		carry = nextCarry + (out[i] >> 29)
    765 		out[i] &= bottom29Bits
    766 
    767 		i++
    768 		if i == p256Limbs {
    769 			break
    770 		}
    771 		nextCarry = out[i] >> 26
    772 		out[i] <<= 2
    773 		out[i] &= bottom28Bits
    774 		out[i] += carry
    775 		carry = nextCarry + (out[i] >> 28)
    776 		out[i] &= bottom28Bits
    777 	}
    778 
    779 	p256ReduceCarry(out, carry)
    780 }
    781 
    782 // p256Scalar8 sets out=8*out.
    783 //
    784 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    785 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
    786 func p256Scalar8(out *[p256Limbs]uint32) {
    787 	var carry, nextCarry uint32
    788 
    789 	for i := 0; ; i++ {
    790 		nextCarry = out[i] >> 26
    791 		out[i] <<= 3
    792 		out[i] &= bottom29Bits
    793 		out[i] += carry
    794 		carry = nextCarry + (out[i] >> 29)
    795 		out[i] &= bottom29Bits
    796 
    797 		i++
    798 		if i == p256Limbs {
    799 			break
    800 		}
    801 		nextCarry = out[i] >> 25
    802 		out[i] <<= 3
    803 		out[i] &= bottom28Bits
    804 		out[i] += carry
    805 		carry = nextCarry + (out[i] >> 28)
    806 		out[i] &= bottom28Bits
    807 	}
    808 
    809 	p256ReduceCarry(out, carry)
    810 }
    811 
    812 // Group operations:
    813 //
    814 // Elements of the elliptic curve group are represented in Jacobian
    815 // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
    816 // Jacobian form.
    817 
    818 // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
    819 //
    820 // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
    821 func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
    822 	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
    823 
    824 	p256Square(&delta, z)
    825 	p256Square(&gamma, y)
    826 	p256Mul(&beta, x, &gamma)
    827 
    828 	p256Sum(&tmp, x, &delta)
    829 	p256Diff(&tmp2, x, &delta)
    830 	p256Mul(&alpha, &tmp, &tmp2)
    831 	p256Scalar3(&alpha)
    832 
    833 	p256Sum(&tmp, y, z)
    834 	p256Square(&tmp, &tmp)
    835 	p256Diff(&tmp, &tmp, &gamma)
    836 	p256Diff(zOut, &tmp, &delta)
    837 
    838 	p256Scalar4(&beta)
    839 	p256Square(xOut, &alpha)
    840 	p256Diff(xOut, xOut, &beta)
    841 	p256Diff(xOut, xOut, &beta)
    842 
    843 	p256Diff(&tmp, &beta, xOut)
    844 	p256Mul(&tmp, &alpha, &tmp)
    845 	p256Square(&tmp2, &gamma)
    846 	p256Scalar8(&tmp2)
    847 	p256Diff(yOut, &tmp, &tmp2)
    848 }
    849 
    850 // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
    851 // (i.e. the second point is affine.)
    852 //
    853 // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
    854 //
    855 // Note that this function does not handle P+P, infinity+P nor P+infinity
    856 // correctly.
    857 func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
    858 	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
    859 
    860 	p256Square(&z1z1, z1)
    861 	p256Sum(&tmp, z1, z1)
    862 
    863 	p256Mul(&u2, x2, &z1z1)
    864 	p256Mul(&z1z1z1, z1, &z1z1)
    865 	p256Mul(&s2, y2, &z1z1z1)
    866 	p256Diff(&h, &u2, x1)
    867 	p256Sum(&i, &h, &h)
    868 	p256Square(&i, &i)
    869 	p256Mul(&j, &h, &i)
    870 	p256Diff(&r, &s2, y1)
    871 	p256Sum(&r, &r, &r)
    872 	p256Mul(&v, x1, &i)
    873 
    874 	p256Mul(zOut, &tmp, &h)
    875 	p256Square(&rr, &r)
    876 	p256Diff(xOut, &rr, &j)
    877 	p256Diff(xOut, xOut, &v)
    878 	p256Diff(xOut, xOut, &v)
    879 
    880 	p256Diff(&tmp, &v, xOut)
    881 	p256Mul(yOut, &tmp, &r)
    882 	p256Mul(&tmp, y1, &j)
    883 	p256Diff(yOut, yOut, &tmp)
    884 	p256Diff(yOut, yOut, &tmp)
    885 }
    886 
    887 // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
    888 //
    889 // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
    890 //
    891 // Note that this function does not handle P+P, infinity+P nor P+infinity
    892 // correctly.
    893 func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
    894 	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
    895 
    896 	p256Square(&z1z1, z1)
    897 	p256Square(&z2z2, z2)
    898 	p256Mul(&u1, x1, &z2z2)
    899 
    900 	p256Sum(&tmp, z1, z2)
    901 	p256Square(&tmp, &tmp)
    902 	p256Diff(&tmp, &tmp, &z1z1)
    903 	p256Diff(&tmp, &tmp, &z2z2)
    904 
    905 	p256Mul(&z2z2z2, z2, &z2z2)
    906 	p256Mul(&s1, y1, &z2z2z2)
    907 
    908 	p256Mul(&u2, x2, &z1z1)
    909 	p256Mul(&z1z1z1, z1, &z1z1)
    910 	p256Mul(&s2, y2, &z1z1z1)
    911 	p256Diff(&h, &u2, &u1)
    912 	p256Sum(&i, &h, &h)
    913 	p256Square(&i, &i)
    914 	p256Mul(&j, &h, &i)
    915 	p256Diff(&r, &s2, &s1)
    916 	p256Sum(&r, &r, &r)
    917 	p256Mul(&v, &u1, &i)
    918 
    919 	p256Mul(zOut, &tmp, &h)
    920 	p256Square(&rr, &r)
    921 	p256Diff(xOut, &rr, &j)
    922 	p256Diff(xOut, xOut, &v)
    923 	p256Diff(xOut, xOut, &v)
    924 
    925 	p256Diff(&tmp, &v, xOut)
    926 	p256Mul(yOut, &tmp, &r)
    927 	p256Mul(&tmp, &s1, &j)
    928 	p256Diff(yOut, yOut, &tmp)
    929 	p256Diff(yOut, yOut, &tmp)
    930 }
    931 
    932 // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
    933 //
    934 // On entry: mask is either 0 or 0xffffffff.
    935 func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
    936 	for i := 0; i < p256Limbs; i++ {
    937 		tmp := mask & (in[i] ^ out[i])
    938 		out[i] ^= tmp
    939 	}
    940 }
    941 
    942 // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
    943 // On entry: index < 16, table[0] must be zero.
    944 func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
    945 	for i := range xOut {
    946 		xOut[i] = 0
    947 	}
    948 	for i := range yOut {
    949 		yOut[i] = 0
    950 	}
    951 
    952 	for i := uint32(1); i < 16; i++ {
    953 		mask := i ^ index
    954 		mask |= mask >> 2
    955 		mask |= mask >> 1
    956 		mask &= 1
    957 		mask--
    958 		for j := range xOut {
    959 			xOut[j] |= table[0] & mask
    960 			table = table[1:]
    961 		}
    962 		for j := range yOut {
    963 			yOut[j] |= table[0] & mask
    964 			table = table[1:]
    965 		}
    966 	}
    967 }
    968 
    969 // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
    970 // table.
    971 // On entry: index < 16, table[0] must be zero.
    972 func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
    973 	for i := range xOut {
    974 		xOut[i] = 0
    975 	}
    976 	for i := range yOut {
    977 		yOut[i] = 0
    978 	}
    979 	for i := range zOut {
    980 		zOut[i] = 0
    981 	}
    982 
    983 	// The implicit value at index 0 is all zero. We don't need to perform that
    984 	// iteration of the loop because we already set out_* to zero.
    985 	for i := uint32(1); i < 16; i++ {
    986 		mask := i ^ index
    987 		mask |= mask >> 2
    988 		mask |= mask >> 1
    989 		mask &= 1
    990 		mask--
    991 		for j := range xOut {
    992 			xOut[j] |= table[i][0][j] & mask
    993 		}
    994 		for j := range yOut {
    995 			yOut[j] |= table[i][1][j] & mask
    996 		}
    997 		for j := range zOut {
    998 			zOut[j] |= table[i][2][j] & mask
    999 		}
   1000 	}
   1001 }
   1002 
   1003 // p256GetBit returns the bit'th bit of scalar.
   1004 func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
   1005 	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
   1006 }
   1007 
   1008 // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
   1009 // little-endian number. Note that the value of scalar must be less than the
   1010 // order of the group.
   1011 func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
   1012 	nIsInfinityMask := ^uint32(0)
   1013 	var pIsNoninfiniteMask, mask, tableOffset uint32
   1014 	var px, py, tx, ty, tz [p256Limbs]uint32
   1015 
   1016 	for i := range xOut {
   1017 		xOut[i] = 0
   1018 	}
   1019 	for i := range yOut {
   1020 		yOut[i] = 0
   1021 	}
   1022 	for i := range zOut {
   1023 		zOut[i] = 0
   1024 	}
   1025 
   1026 	// The loop adds bits at positions 0, 64, 128 and 192, followed by
   1027 	// positions 32,96,160 and 224 and does this 32 times.
   1028 	for i := uint(0); i < 32; i++ {
   1029 		if i != 0 {
   1030 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1031 		}
   1032 		tableOffset = 0
   1033 		for j := uint(0); j <= 32; j += 32 {
   1034 			bit0 := p256GetBit(scalar, 31-i+j)
   1035 			bit1 := p256GetBit(scalar, 95-i+j)
   1036 			bit2 := p256GetBit(scalar, 159-i+j)
   1037 			bit3 := p256GetBit(scalar, 223-i+j)
   1038 			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
   1039 
   1040 			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
   1041 			tableOffset += 30 * p256Limbs
   1042 
   1043 			// Since scalar is less than the order of the group, we know that
   1044 			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
   1045 			// below.
   1046 			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
   1047 			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
   1048 			// (a.k.a.  the point at infinity). We handle that situation by
   1049 			// copying the point from the table.
   1050 			p256CopyConditional(xOut, &px, nIsInfinityMask)
   1051 			p256CopyConditional(yOut, &py, nIsInfinityMask)
   1052 			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
   1053 
   1054 			// Equally, the result is also wrong if the point from the table is
   1055 			// zero, which happens when the index is zero. We handle that by
   1056 			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
   1057 			pIsNoninfiniteMask = nonZeroToAllOnes(index)
   1058 			mask = pIsNoninfiniteMask & ^nIsInfinityMask
   1059 			p256CopyConditional(xOut, &tx, mask)
   1060 			p256CopyConditional(yOut, &ty, mask)
   1061 			p256CopyConditional(zOut, &tz, mask)
   1062 			// If p was not zero, then n is now non-zero.
   1063 			nIsInfinityMask &^= pIsNoninfiniteMask
   1064 		}
   1065 	}
   1066 }
   1067 
   1068 // p256PointToAffine converts a Jacobian point to an affine point. If the input
   1069 // is the point at infinity then it returns (0, 0) in constant time.
   1070 func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
   1071 	var zInv, zInvSq [p256Limbs]uint32
   1072 
   1073 	p256Invert(&zInv, z)
   1074 	p256Square(&zInvSq, &zInv)
   1075 	p256Mul(xOut, x, &zInvSq)
   1076 	p256Mul(&zInv, &zInv, &zInvSq)
   1077 	p256Mul(yOut, y, &zInv)
   1078 }
   1079 
   1080 // p256ToAffine returns a pair of *big.Int containing the affine representation
   1081 // of {x,y,z}.
   1082 func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
   1083 	var xx, yy [p256Limbs]uint32
   1084 	p256PointToAffine(&xx, &yy, x, y, z)
   1085 	return p256ToBig(&xx), p256ToBig(&yy)
   1086 }
   1087 
   1088 // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
   1089 func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
   1090 	var px, py, pz, tx, ty, tz [p256Limbs]uint32
   1091 	var precomp [16][3][p256Limbs]uint32
   1092 	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
   1093 
   1094 	// We precompute 0,1,2,... times {x,y}.
   1095 	precomp[1][0] = *x
   1096 	precomp[1][1] = *y
   1097 	precomp[1][2] = p256One
   1098 
   1099 	for i := 2; i < 16; i += 2 {
   1100 		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
   1101 		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
   1102 	}
   1103 
   1104 	for i := range xOut {
   1105 		xOut[i] = 0
   1106 	}
   1107 	for i := range yOut {
   1108 		yOut[i] = 0
   1109 	}
   1110 	for i := range zOut {
   1111 		zOut[i] = 0
   1112 	}
   1113 	nIsInfinityMask = ^uint32(0)
   1114 
   1115 	// We add in a window of four bits each iteration and do this 64 times.
   1116 	for i := 0; i < 64; i++ {
   1117 		if i != 0 {
   1118 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1119 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1120 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1121 			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
   1122 		}
   1123 
   1124 		index = uint32(scalar[31-i/2])
   1125 		if (i & 1) == 1 {
   1126 			index &= 15
   1127 		} else {
   1128 			index >>= 4
   1129 		}
   1130 
   1131 		// See the comments in scalarBaseMult about handling infinities.
   1132 		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
   1133 		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
   1134 		p256CopyConditional(xOut, &px, nIsInfinityMask)
   1135 		p256CopyConditional(yOut, &py, nIsInfinityMask)
   1136 		p256CopyConditional(zOut, &pz, nIsInfinityMask)
   1137 
   1138 		pIsNoninfiniteMask = nonZeroToAllOnes(index)
   1139 		mask = pIsNoninfiniteMask & ^nIsInfinityMask
   1140 		p256CopyConditional(xOut, &tx, mask)
   1141 		p256CopyConditional(yOut, &ty, mask)
   1142 		p256CopyConditional(zOut, &tz, mask)
   1143 		nIsInfinityMask &^= pIsNoninfiniteMask
   1144 	}
   1145 }
   1146 
   1147 // p256FromBig sets out = R*in.
   1148 func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
   1149 	tmp := new(big.Int).Lsh(in, 257)
   1150 	tmp.Mod(tmp, p256Params.P)
   1151 
   1152 	for i := 0; i < p256Limbs; i++ {
   1153 		if bits := tmp.Bits(); len(bits) > 0 {
   1154 			out[i] = uint32(bits[0]) & bottom29Bits
   1155 		} else {
   1156 			out[i] = 0
   1157 		}
   1158 		tmp.Rsh(tmp, 29)
   1159 
   1160 		i++
   1161 		if i == p256Limbs {
   1162 			break
   1163 		}
   1164 
   1165 		if bits := tmp.Bits(); len(bits) > 0 {
   1166 			out[i] = uint32(bits[0]) & bottom28Bits
   1167 		} else {
   1168 			out[i] = 0
   1169 		}
   1170 		tmp.Rsh(tmp, 28)
   1171 	}
   1172 }
   1173 
   1174 // p256ToBig returns a *big.Int containing the value of in.
   1175 func p256ToBig(in *[p256Limbs]uint32) *big.Int {
   1176 	result, tmp := new(big.Int), new(big.Int)
   1177 
   1178 	result.SetInt64(int64(in[p256Limbs-1]))
   1179 	for i := p256Limbs - 2; i >= 0; i-- {
   1180 		if (i & 1) == 0 {
   1181 			result.Lsh(result, 29)
   1182 		} else {
   1183 			result.Lsh(result, 28)
   1184 		}
   1185 		tmp.SetInt64(int64(in[i]))
   1186 		result.Add(result, tmp)
   1187 	}
   1188 
   1189 	result.Mul(result, p256RInverse)
   1190 	result.Mod(result, p256Params.P)
   1191 	return result
   1192 }
   1193