/* p256.c — P-256 field arithmetic (libconstrainedcrypto). */
      1 /*
      2  * Copyright 2013 The Android Open Source Project
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are met:
      6  *     * Redistributions of source code must retain the above copyright
      7  *       notice, this list of conditions and the following disclaimer.
      8  *     * Redistributions in binary form must reproduce the above copyright
      9  *       notice, this list of conditions and the following disclaimer in the
     10  *       documentation and/or other materials provided with the distribution.
     11  *     * Neither the name of Google Inc. nor the names of its contributors may
     12  *       be used to endorse or promote products derived from this software
     13  *       without specific prior written permission.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
     16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
     17  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
     18  * EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     19  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
     21  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
     22  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
     23  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
     24  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 // This is an implementation of the P256 elliptic curve group. It's written to
     28 // be portable 32-bit, although it's still constant-time.
     29 //
     30 // WARNING: Implementing these functions in a constant-time manner is far from
     31 //          obvious. Be careful when touching this code.
     32 //
     33 // See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
     34 
     35 #include <stdint.h>
     36 #include <stdio.h>
     37 
     38 #include <string.h>
     39 #include <stdlib.h>
     40 
     41 #include "constrainedcrypto/p256.h"
     42 
/* Short aliases for the fixed-width integer types used throughout this file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint64_t u64;
     47 
/* Our field elements are represented as nine 32-bit limbs.
 *
 * The value of an felem (field element) is:
 *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
 *
 * That is, each limb is alternately 29 or 28-bits wide in little-endian
 * order.
 *
 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
 * when multiplying as terms end up one bit short of a limb which would require
 * much bit-shifting to correct.
 *
 * Finally, the values stored in an felem are in Montgomery form. So the value
 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
 */
typedef u32 limb;
#define NLIMBS 9
typedef limb felem[NLIMBS];

/* Masks for the low 28 / 29 bits of a limb; even-indexed limbs are 29 bits
 * wide, odd-indexed limbs 28 bits (see the representation comment above). */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;
     70 
/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words — i.e. 1 in Montgomery form (1*R mod p, R = 2**257). */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 (0 is its own Montgomery form). */
static const felem kZero = {0};
/* kP is the P-256 prime p itself, in the 29/28-bit limb representation
 * described above (not in Montgomery form). */
static const felem kP = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p in the same representation. */
static const felem k2P = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
/* kPrecomputed contains precomputed values to aid the calculation of scalar
 * multiples of the base point, G. It's actually two, equal length, tables
 * concatenated.
 *
 * The first table contains (x,y) felem pairs for 16 multiples of the base
 * point, G.
 *
 *   Index  |  Index (binary) | Value
 *       0  |           0000  | 0G (all zeros, omitted)
 *       1  |           0001  | G
 *       2  |           0010  | 2**64G
 *       3  |           0011  | 2**64G + G
 *       4  |           0100  | 2**128G
 *       5  |           0101  | 2**128G + G
 *       6  |           0110  | 2**128G + 2**64G
 *       7  |           0111  | 2**128G + 2**64G + G
 *       8  |           1000  | 2**192G
 *       9  |           1001  | 2**192G + G
 *      10  |           1010  | 2**192G + 2**64G
 *      11  |           1011  | 2**192G + 2**64G + G
 *      12  |           1100  | 2**192G + 2**128G
 *      13  |           1101  | 2**192G + 2**128G + G
 *      14  |           1110  | 2**192G + 2**128G + 2**64G
 *      15  |           1111  | 2**192G + 2**128G + 2**64G + G
 *
 * The second table follows the same style, but the terms are 2**32G,
 * 2**96G, 2**160G, 2**224G.
 *
 * Each entry is an (x, y) pair of felems — NLIMBS limbs each — which is
 * where the NLIMBS * 2 * 15 * 2 sizing comes from (15 non-zero entries per
 * table, two tables).
 *
 * This is ~2KB of data. */
static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
    0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
    0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
    0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
    0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
    0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
    0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
    0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
    0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
    0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
    0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
    0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
    0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
    0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
    0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
    0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
    0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
    0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
    0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
    0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
    0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
    0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
    0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
    0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
    0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
    0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
    0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
    0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
    0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
    0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
    0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
    0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
    0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
    0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
    0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
    0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
    0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
    0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
    0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
    0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
    0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
    0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
    0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
    0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
    0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
    0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
    0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
    0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
    0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
    0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
    0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
    0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
    0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
    0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
    0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
    0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
    0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
    0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
    0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
    0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
    0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
};
    180 
    181 
    182 /* Field element operations: */
    183 
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * x must be a u32 or an equivalent type such as limb.
 *
 * The expression is branch-free, which keeps callers constant-time:
 * (x - 1) has its top bit set exactly when x == 0 or x > 2**31, so the
 * shift produces 0 or 1 and the final subtraction maps that to all-ones
 * or all-zeros respectively. */
#define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
    190 
/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
 * which is a term at 2**257.
 *
 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
static void felem_reduce_carry(felem inout, limb carry) {
  /* carry_mask is all-ones when carry != 0 and all-zeros otherwise, so the
   * masked additions below take effect exactly when there is a carry to
   * cancel — without any data-dependent branch (constant-time). */
  const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

  inout[0] += carry << 1;
  inout[3] += 0x10000000 & carry_mask;
  /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   * previous line therefore this doesn't underflow. */
  inout[3] -= carry << 11;
  inout[4] += (0x20000000 - 1) & carry_mask;
  inout[5] += (0x10000000 - 1) & carry_mask;
  inout[6] += (0x20000000 - 1) & carry_mask;
  inout[6] -= carry << 22;
  /* This may underflow if carry is non-zero but, if so, we'll fix it in the
   * next line. */
  inout[7] -= 1 & carry_mask;
  inout[7] += carry << 25;
}
    213 
    214 /* felem_sum sets out = in+in2.
    215  *
    216  * On entry, in[i]+in2[i] must not overflow a 32-bit word.
    217  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
    218 static void felem_sum(felem out, const felem in, const felem in2) {
    219   limb carry = 0;
    220   unsigned i;
    221 
    222   for (i = 0;; i++) {
    223     out[i] = in[i] + in2[i];
    224     out[i] += carry;
    225     carry = out[i] >> 29;
    226     out[i] &= kBottom29Bits;
    227 
    228     i++;
    229     if (i == NLIMBS)
    230       break;
    231 
    232     out[i] = in[i] + in2[i];
    233     out[i] += carry;
    234     carry = out[i] >> 28;
    235     out[i] &= kBottom28Bits;
    236   }
    237 
    238   felem_reduce_carry(out, carry);
    239 }
    240 
    241 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
    242 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
    243 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
    244 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
    245 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
    246 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
    247 
    248 /* zero31 is 0 mod p. */
    249 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
    250 
    251 /* felem_diff sets out = in-in2.
    252  *
    253  * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
    254  *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
    255  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
    256 static void felem_diff(felem out, const felem in, const felem in2) {
    257   limb carry = 0;
    258   unsigned i;
    259 
    260    for (i = 0;; i++) {
    261     out[i] = in[i] - in2[i];
    262     out[i] += zero31[i];
    263     out[i] += carry;
    264     carry = out[i] >> 29;
    265     out[i] &= kBottom29Bits;
    266 
    267     i++;
    268     if (i == NLIMBS)
    269       break;
    270 
    271     out[i] = in[i] - in2[i];
    272     out[i] += zero31[i];
    273     out[i] += carry;
    274     carry = out[i] >> 28;
    275     out[i] &= kBottom28Bits;
    276   }
    277 
    278   felem_reduce_carry(out, carry);
    279 }
    280 
/* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
 * with the same 29,28,... bit positions as an felem.
 *
 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
 * Since we just multiplied two Montgomery values together, the result is
 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
 * in Montgomery form.
 *
 * On entry: tmp[i] < 2**64
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
static void felem_reduce_degree(felem out, u64 tmp[17]) {
   /* The following table may be helpful when reading this code:
    *
    * Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
    * Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
    * Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
    *   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */
  limb tmp2[18], carry, x, xMask;
  unsigned i;

  /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
   * felem. So the top of an element of tmp might overlap with another
   * element two positions down. The following loop eliminates this
   * overlap. */
  tmp2[0] = (limb)(tmp[0] & kBottom29Bits);

  /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
   * and hint to the compiler that it can do a single-word shift by selecting
   * the right register rather than doing a double-word shift and truncating
   * afterwards. */
  tmp2[1] = ((limb) tmp[0]) >> 29;
  tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
  tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
  carry = tmp2[1] >> 28;
  tmp2[1] &= kBottom28Bits;

  /* The loop body alternates between the even phase (29-bit limbs) and the
   * odd phase (28-bit limbs); i is bumped mid-body to handle both. */
  for (i = 2; i < 17; i++) {
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 29;
    tmp2[i] &= kBottom29Bits;

    i++;
    if (i == 17)
      break;
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 28;
    tmp2[i] &= kBottom28Bits;
  }

  tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
  tmp2[17] += ((limb)(tmp[16])) >> 29;
  tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
  tmp2[17] += carry;

  /* Montgomery elimination of terms.
   *
   * Since R is 2**257, we can divide by R with a bitwise shift if we can
   * ensure that the right-most 257 bits are all zero. We can make that true by
   * adding multiplies of p without affecting the value.
   *
   * So we eliminate limbs from right to left. Since the bottom 29 bits of p
   * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   * We can do that for 8 further limbs and then right shift to eliminate the
   * extra factor of R. */
  for (i = 0;; i += 2) {
    tmp2[i + 1] += tmp2[i] >> 29;
    x = tmp2[i] & kBottom29Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i] = 0;

    /* The bounds calculations for this loop are tricky. Each iteration of
     * the loop eliminates two words by adding values to words to their
     * right.
     *
     * The following table contains the amounts added to each word (as an
     * offset from the value of i at the top of the loop). The amounts are
     * accounted for from the first and second half of the loop separately
     * and are written as, for example, 28 to mean a value <2**28.
     *
     * Word:                   3   4   5   6   7   8   9   10
     * Added in top half:     28  11      29  21  29  28
     *                                        28  29
     *                                            29
     * Added in bottom half:      29  10      28  21  28   28
     *                                            29
     *
     * The value that is currently offset 7 will be offset 5 for the next
     * iteration and then offset 3 for the iteration after that. Therefore
     * the total value added will be the values added at 7, 5 and 3.
     *
     * The following table accumulates these values. The sums at the bottom
     * are written as, for example, 29+28, to mean a value < 2**29+2**28.
     *
     * Word:                   3   4   5   6   7   8   9  10  11  12  13
     *                        28  11  10  29  21  29  28  28  28  28  28
     *                            29  28  11  28  29  28  29  28  29  28
     *                                    29  28  21  21  29  21  29  21
     *                                        10  29  28  21  28  21  28
     *                                        28  29  28  29  28  29  28
     *                                            11  10  29  10  29  10
     *                                            29  28  11  28  11
     *                                                    29      29
     *                        --------------------------------------------
     *                                                30+ 31+ 30+ 31+ 30+
     *                                                28+ 29+ 28+ 29+ 21+
     *                                                21+ 28+ 21+ 28+ 10
     *                                                10  21+ 10  21+
     *                                                    11      11
     *
     * So the greatest amount is added to tmp2[10] and tmp2[12]. If
     * tmp2[10/12] has an initial value of <2**29, then the maximum value
     * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
     * as required. */
    tmp2[i + 3] += (x << 10) & kBottom28Bits;
    tmp2[i + 4] += (x >> 18);

    tmp2[i + 6] += (x << 21) & kBottom29Bits;
    tmp2[i + 7] += x >> 8;

    /* At position 200, which is the starting bit position for word 7, we
     * have a factor of 0xf000000 = 2**28 - 2**24. */
    tmp2[i + 7] += 0x10000000 & xMask;
    /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
    tmp2[i + 8] += (x - 1) & xMask;
    tmp2[i + 7] -= (x << 24) & kBottom28Bits;
    tmp2[i + 8] -= x >> 4;

    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 8] -= x;
    tmp2[i + 8] += (x << 28) & kBottom29Bits;
    tmp2[i + 9] += ((x >> 1) - 1) & xMask;

    /* After the even phase with i == 8 all nine low limbs (tmp2[0..8]) have
     * been zeroed, so the odd phase is skipped on the final pass. */
    if (i+1 == NLIMBS)
      break;
    tmp2[i + 2] += tmp2[i + 1] >> 28;
    x = tmp2[i + 1] & kBottom28Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i + 1] = 0;

    tmp2[i + 4] += (x << 11) & kBottom29Bits;
    tmp2[i + 5] += (x >> 18);

    tmp2[i + 7] += (x << 21) & kBottom28Bits;
    tmp2[i + 8] += x >> 7;

    /* At position 199, which is the starting bit of the 8th word when
     * dealing with a context starting on an odd word, we have a factor of
     * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
     * word from i+1 is i+8. */
    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 9] += (x - 1) & xMask;
    tmp2[i + 8] -= (x << 25) & kBottom29Bits;
    tmp2[i + 9] -= x >> 4;

    tmp2[i + 9] += 0x10000000 & xMask;
    tmp2[i + 9] -= x;
    tmp2[i + 10] += (x - 1) & xMask;
  }

  /* We merge the right shift with a carry chain. The words above 2**257 have
   * widths of 28,29,... which we need to correct when copying them down.  */
  carry = 0;
  for (i = 0; i < 8; i++) {
    /* The maximum value of tmp2[i + 9] occurs on the first iteration and
     * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
     * therefore safe. */
    out[i] = tmp2[i + 9];
    out[i] += carry;
    out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    out[i] = tmp2[i + 9] >> 1;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  /* tmp2[17] is the top word; after adding the carry, keep the low 29 bits
   * and hand any remaining carry to felem_reduce_carry. */
  out[8] = tmp2[17];
  out[8] += carry;
  carry = out[8] >> 29;
  out[8] &= kBottom29Bits;

  felem_reduce_carry(out, carry);
}
    475 
/* felem_square sets out=in*in.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_square(felem out, const felem in) {
  u64 tmp[17];

  /* Schoolbook squaring, fully unrolled. The (in[x] << 1) / (in[x] << 2)
   * factors pre-double the symmetric cross terms; given the entry bounds
   * (even limbs < 2**30, odd limbs < 2**29, and << 2 only ever applied to
   * odd limbs) none of these shifts can overflow a 32-bit word. The result
   * is reduced back to nine limbs by felem_reduce_degree below. */
  tmp[0] = ((u64) in[0]) * in[0];
  tmp[1] = ((u64) in[0]) * (in[1] << 1);
  tmp[2] = ((u64) in[0]) * (in[2] << 1) +
           ((u64) in[1]) * (in[1] << 1);
  tmp[3] = ((u64) in[0]) * (in[3] << 1) +
           ((u64) in[1]) * (in[2] << 1);
  tmp[4] = ((u64) in[0]) * (in[4] << 1) +
           ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
  tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
           (in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
  tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
           (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
           ((u64) in[3]) * (in[3] << 1);
  tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
           (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
           ((u64) in[3]) * (in[4] << 1);
  /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   * which is < 2**64 as required. */
  tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
           (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
           ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
  tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
           (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
           ((u64) in[4]) * (in[5] << 1);
  tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
            (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
            ((u64) in[5]) * (in[5] << 1);
  tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
            (in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
  tmp[12] = ((u64) in[4]) * (in[8] << 1) +
            ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
  tmp[13] = ((u64) in[5]) * (in[8] << 1) +
            ((u64) in[6]) * (in[7] << 1);
  tmp[14] = ((u64) in[6]) * (in[8] << 1) +
            ((u64) in[7]) * (in[7] << 1);
  tmp[15] = ((u64) in[7]) * (in[8] << 1);
  tmp[16] = ((u64) in[8]) * in[8];

  felem_reduce_degree(out, tmp);
}
    523 
/* felem_mul sets out=in*in2.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_mul(felem out, const felem in, const felem in2) {
  u64 tmp[17];

  /* Schoolbook multiplication, fully unrolled. The (in2[x] << 0) terms are
   * no-ops kept so every product line has the same shape as the doubled
   * (in2[x] << 1) terms; the result is reduced back to nine limbs by
   * felem_reduce_degree below. */
  tmp[0] = ((u64) in[0]) * in2[0];
  tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
           ((u64) in[1]) * (in2[0] << 0);
  tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
           (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
  tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
           (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
           ((u64) in[3]) * (in2[0] << 0);
  tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
           (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
           ((u64) in[3]) * (in2[1] << 1) +
           ((u64) in[4]) * (in2[0] << 0);
  tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
           (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
           ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
           (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
  tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
           (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
           ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
           (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
           ((u64) in[6]) * (in2[0] << 0);
  tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
           (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
           ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
           (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
           ((u64) in[6]) * (in2[1] << 0) +
           ((u64) in[7]) * (in2[0] << 0);
  /* tmp[8] has the greatest value but doesn't overflow. See logic in
   * felem_square. */
  tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
           (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
           ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
           (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
           ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
           (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
  tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
           (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
           ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
           (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
           ((u64) in[7]) * (in2[2] << 0) +
           ((u64) in[8]) * (in2[1] << 0);
  tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
            (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
            ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
            (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
            ((u64) in[8]) * (in2[2] << 0);
  tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
            (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
            ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
            (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
  tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
            (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
            ((u64) in[7]) * (in2[5] << 1) +
            ((u64) in[8]) * (in2[4] << 0);
  tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
            (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
            ((u64) in[8]) * (in2[5] << 0);
  tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
            (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
  tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
            ((u64) in[8]) * (in2[7] << 0);
  tmp[16] = ((u64) in[8]) * (in2[8] << 0);

  felem_reduce_degree(out, tmp);
}
    597 
    598 static void felem_assign(felem out, const felem in) {
    599   memcpy(out, in, sizeof(felem));
    600 }
    601 
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 is fixed, so this is a fixed addition-chain
 * exponentiation: the comment after each statement tracks the exponent
 * of |in| held by the temporary at that point, e.g. "2^4 - 2^0" means
 * the value is |in|^(2^4 - 2^0). Squaring doubles the exponent;
 * multiplying adds the two exponents. */
static void felem_inv(felem out, const felem in) {
  felem ftmp, ftmp2;
  /* each e_I will hold |in|^{2^I - 1} */
  felem e2, e4, e8, e16, e32, e64;
  unsigned i;

  felem_square(ftmp, in); /* 2^1 */
  felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
  felem_assign(e2, ftmp);
  felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
  felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
  felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
  felem_assign(e4, ftmp);
  felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
  felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
  felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
  felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
  felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
  felem_assign(e8, ftmp);
  for (i = 0; i < 8; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^16 - 2^8 */
  felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
  felem_assign(e16, ftmp);
  for (i = 0; i < 16; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^32 - 2^16 */
  felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
  felem_assign(e32, ftmp);
  for (i = 0; i < 32; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^64 - 2^32 */
  felem_assign(e64, ftmp);
  felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
  for (i = 0; i < 192; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^256 - 2^224 + 2^192 */

  /* Build the low part of the exponent in a second temporary. */
  felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
  for (i = 0; i < 16; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^80 - 2^16 */
  felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
  for (i = 0; i < 8; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^88 - 2^8 */
  felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
  for (i = 0; i < 4; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^92 - 2^4 */
  felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
  felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
  felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
  felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
  felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
  felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
  felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */

  /* Combine the two halves: the full exponent is p - 2. */
  felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
}
    669 
/* felem_scalar_3 sets out=3*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_3(felem out) {
  limb carry = 0;
  unsigned i;

  /* Limbs alternate between a 29-bit window (even index) and a 28-bit
   * window (odd index), so the loop handles two limbs per iteration. */
  for (i = 0;; i++) {
    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  /* Fold the carry out of the top limb back into the element. */
  felem_reduce_carry(out, carry);
}
    696 
/* felem_scalar_4 sets out=4*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_4(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  /* Multiply by 4 via a two-bit left shift per limb. The bits that will
   * leave the limb's window are captured in next_carry *before* the
   * shift; adding the incoming carry can overflow the window again, so
   * the post-add overflow is folded into the outgoing carry too. */
  for (i = 0;; i++) {
    next_carry = out[i] >> 27;
    out[i] <<= 2;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    /* Odd-indexed limbs use a 28-bit window. */
    next_carry = out[i] >> 26;
    out[i] <<= 2;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
    727 
/* felem_scalar_8 sets out=8*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_8(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  /* Same structure as felem_scalar_4, but shifting by three bits: the
   * top three bits of each limb's window are saved before the shift and
   * combined with any overflow caused by adding the incoming carry. */
  for (i = 0;; i++) {
    next_carry = out[i] >> 26;
    out[i] <<= 3;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    /* Odd-indexed limbs use a 28-bit window. */
    next_carry = out[i] >> 25;
    out[i] <<= 3;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
    758 
/* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
 * time depending on the value of |in|. */
static char felem_is_zero_vartime(const felem in) {
  limb carry;
  int i;
  limb tmp[NLIMBS];

  /* Work on a copy so |in| is left untouched. */
  felem_assign(tmp, in);

  /* First, reduce tmp to a minimal form. Carries are propagated through
   * the alternating 29/28-bit limbs until a full pass produces none. */
  do {
    carry = 0;
    for (i = 0;; i++) {
      tmp[i] += carry;
      carry = tmp[i] >> 29;
      tmp[i] &= kBottom29Bits;

      i++;
      if (i == NLIMBS)
        break;

      tmp[i] += carry;
      carry = tmp[i] >> 28;
      tmp[i] &= kBottom28Bits;
    }

    felem_reduce_carry(tmp, carry);
  } while (carry);

  /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
  return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
         memcmp(tmp, kP, sizeof(tmp)) == 0 ||
         memcmp(tmp, k2P, sizeof(tmp)) == 0;
}
    793 
    794 
    795 /* Group operations:
    796  *
    797  * Elements of the elliptic curve group are represented in Jacobian
    798  * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
    799  * Jacobian form. */
    800 
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 *
 * The variable names below follow the dbl-2009-l formulas; the per-step
 * comments give the value each temporary takes. */
static void point_double(felem x_out, felem y_out, felem z_out, const felem x,
                         const felem y, const felem z) {
  felem delta, gamma, alpha, beta, tmp, tmp2;

  felem_square(delta, z);    /* delta = z^2 */
  felem_square(gamma, y);    /* gamma = y^2 */
  felem_mul(beta, x, gamma); /* beta = x*gamma */

  /* alpha = 3*(x - delta)*(x + delta) */
  felem_sum(tmp, x, delta);
  felem_diff(tmp2, x, delta);
  felem_mul(alpha, tmp, tmp2);
  felem_scalar_3(alpha);

  /* z_out = (y + z)^2 - gamma - delta */
  felem_sum(tmp, y, z);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, gamma);
  felem_diff(z_out, tmp, delta);

  /* x_out = alpha^2 - 8*beta */
  felem_scalar_4(beta);
  felem_square(x_out, alpha);
  felem_diff(x_out, x_out, beta);
  felem_diff(x_out, x_out, beta);

  /* y_out = alpha*(4*beta - x_out) - 8*gamma^2 */
  felem_diff(tmp, beta, x_out);
  felem_mul(tmp, alpha, tmp);
  felem_square(tmp2, gamma);
  felem_scalar_8(tmp2);
  felem_diff(y_out, tmp, tmp2);
}
    833 
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2) {
  felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

  felem_square(z1z1, z1);      /* z1z1 = z1^2 */
  felem_sum(tmp, z1, z1);      /* tmp = 2*z1 */

  felem_mul(u2, x2, z1z1);     /* u2 = x2*z1^2 */
  felem_mul(z1z1z1, z1, z1z1); /* z1z1z1 = z1^3 */
  felem_mul(s2, y2, z1z1z1);   /* s2 = y2*z1^3 */
  felem_diff(h, u2, x1);       /* h = u2 - x1 */
  felem_sum(i, h, h);
  felem_square(i, i);          /* i = (2*h)^2 */
  felem_mul(j, h, i);          /* j = h*i */
  felem_diff(r, s2, y1);
  felem_sum(r, r, r);          /* r = 2*(s2 - y1) */
  felem_mul(v, x1, i);         /* v = x1*i */

  felem_mul(z_out, tmp, h);    /* z_out = 2*z1*h */
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v); /* x_out = r^2 - j - 2*v */

  /* y_out = r*(v - x_out) - 2*y1*j */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, y1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
    872 
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add(felem x_out, felem y_out, felem z_out, const felem x1,
                      const felem y1, const felem z1, const felem x2,
                      const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

  felem_square(z1z1, z1);   /* z1z1 = z1^2 */
  felem_square(z2z2, z2);   /* z2z2 = z2^2 */
  felem_mul(u1, x1, z2z2);  /* u1 = x1*z2^2 */

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 = 2*z1*z2 */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  felem_mul(z2z2z2, z2, z2z2); /* z2z2z2 = z2^3 */
  felem_mul(s1, y1, z2z2z2);   /* s1 = y1*z2^3 */

  felem_mul(u2, x2, z1z1);     /* u2 = x2*z1^2 */
  felem_mul(z1z1z1, z1, z1z1); /* z1z1z1 = z1^3 */
  felem_mul(s2, y2, z1z1z1);   /* s2 = y2*z1^3 */
  felem_diff(h, u2, u1);       /* h = u2 - u1 */
  felem_sum(i, h, h);
  felem_square(i, i);          /* i = (2*h)^2 */
  felem_mul(j, h, i);          /* j = h*i */
  felem_diff(r, s2, s1);
  felem_sum(r, r, r);          /* r = 2*(s2 - s1) */
  felem_mul(v, u1, i);         /* v = u1*i */

  felem_mul(z_out, tmp, h);    /* z_out = 2*z1*z2*h */
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v); /* x_out = r^2 - j - 2*v */

  /* y_out = r*(v - x_out) - 2*s1*j */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
    919 
/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
 *                                                        {x2,y2,z2}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * This function handles the case where {x1,y1,z1}={x2,y2,z2} by detecting
 * equal x and y coordinates (h == 0 and r == 0) and falling back to
 * point_double. As the name says, it runs in variable time and so must
 * only be used with public inputs. */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out, const felem x1, const felem y1,
    const felem z1, const felem x2, const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
  char x_equal, y_equal;

  felem_square(z1z1, z1);  /* z1z1 = z1^2 */
  felem_square(z2z2, z2);  /* z2z2 = z2^2 */
  felem_mul(u1, x1, z2z2); /* u1 = x1*z2^2 */

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 = 2*z1*z2 */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  felem_mul(z2z2z2, z2, z2z2); /* z2z2z2 = z2^3 */
  felem_mul(s1, y1, z2z2z2);   /* s1 = y1*z2^3 */

  felem_mul(u2, x2, z1z1);     /* u2 = x2*z1^2 */
  felem_mul(z1z1z1, z1, z1z1); /* z1z1z1 = z1^3 */
  felem_mul(s2, y2, z1z1z1);   /* s2 = y2*z1^3 */
  felem_diff(h, u2, u1);       /* h == 0 iff the x coordinates match */
  x_equal = felem_is_zero_vartime(h);
  felem_sum(i, h, h);
  felem_square(i, i);          /* i = (2*h)^2 */
  felem_mul(j, h, i);          /* j = h*i */
  felem_diff(r, s2, s1);       /* r/2 == 0 iff the y coordinates match */
  y_equal = felem_is_zero_vartime(r);
  if (x_equal && y_equal) {
    /* The points are equal, so the addition formula would degenerate;
     * double instead. */
    point_double(x_out, y_out, z_out, x1, y1, z1);
    return;
  }
  felem_sum(r, r, r);          /* r = 2*(s2 - s1) */
  felem_mul(v, u1, i);         /* v = u1*i */

  felem_mul(z_out, tmp, h);    /* z_out = 2*z1*z2*h */
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v); /* x_out = r^2 - j - 2*v */

  /* y_out = r*(v - x_out) - 2*s1*j */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
    973 
    974 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
    975  *
    976  * On entry: mask is either 0 or 0xffffffff. */
    977 static void copy_conditional(felem out, const felem in, limb mask) {
    978   int i;
    979 
    980   for (i = 0; i < NLIMBS; i++) {
    981     const limb tmp = mask & (in[i] ^ out[i]);
    982     out[i] ^= tmp;
    983   }
    984 }
    985 
/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
 * On entry: index < 16, table[0] must be zero. */
static void select_affine_point(felem out_x, felem out_y, const limb* table,
                                limb index) {
  limb i, j;

  memset(out_x, 0, sizeof(felem));
  memset(out_y, 0, sizeof(felem));

  /* Touch every entry so the memory access pattern is independent of
   * |index|. Entry 0 is implicitly all-zero and is covered by the
   * memsets above. */
  for (i = 1; i < 16; i++) {
    /* Fold the (at most four) bits of i ^ index down to bit 0, so that
     * mask becomes all ones when i == index and zero otherwise. */
    limb mask = i ^ index;
    mask |= mask >> 2;
    mask |= mask >> 1;
    mask &= 1;
    mask--;
    /* Each table entry is one felem of x followed by one felem of y. */
    for (j = 0; j < NLIMBS; j++, table++) {
      out_x[j] |= *table & mask;
    }
    for (j = 0; j < NLIMBS; j++, table++) {
      out_y[j] |= *table & mask;
    }
  }
}
   1009 
/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
 * table. On entry: index < 16, table[0] must be zero. */
static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
                                  const limb* table, limb index) {
  limb i, j;

  memset(out_x, 0, sizeof(felem));
  memset(out_y, 0, sizeof(felem));
  memset(out_z, 0, sizeof(felem));

  /* The implicit value at index 0 is all zero. We don't need to perform that
   * iteration of the loop because we already set out_* to zero. */
  table += 3 * NLIMBS;

  // Hit all entries to obscure cache profiling.
  for (i = 1; i < 16; i++) {
    /* mask is all ones when i == index and zero otherwise (the bits of
     * i ^ index are folded down to bit 0, then decremented). */
    limb mask = i ^ index;
    mask |= mask >> 2;
    mask |= mask >> 1;
    mask &= 1;
    mask--;
    /* Each table entry is three consecutive felems: x, y, z. */
    for (j = 0; j < NLIMBS; j++, table++) {
      out_x[j] |= *table & mask;
    }
    for (j = 0; j < NLIMBS; j++, table++) {
      out_y[j] |= *table & mask;
    }
    for (j = 0; j < NLIMBS; j++, table++) {
      out_z[j] |= *table & mask;
    }
  }
}
   1042 
/* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
 * number. Note that the value of scalar must be less than the order of the
 * group. */
static void scalar_base_mult(felem nx, felem ny, felem nz,
                             const p256_int* scalar) {
  int i, j;
  /* n_is_infinity_mask is all ones while {nx,ny,nz} is still the point at
   * infinity, and becomes zero once a non-zero point has been added. */
  limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
  u32 table_offset;

  felem px, py;
  felem tx, ty, tz;

  /* The accumulator starts at the point at infinity (all-zero felems). */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));

  /* The loop adds bits at positions 0, 64, 128 and 192, followed by
   * positions 32,96,160 and 224 and does this 32 times. */
  for (i = 0; i < 32; i++) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
    }
    table_offset = 0;
    for (j = 0; j <= 32; j += 32) {
      /* Combine one scalar bit from each of four positions 64 bits apart
       * into a 4-bit index into the precomputed table. */
      char bit0 = p256_get_bit(scalar, 31 - i + j);
      char bit1 = p256_get_bit(scalar, 95 - i + j);
      char bit2 = p256_get_bit(scalar, 159 - i + j);
      char bit3 = p256_get_bit(scalar, 223 - i + j);
      limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);

      select_affine_point(px, py, kPrecomputed + table_offset, index);
      /* Each sub-table holds 15 affine points (indices 1..15) of
       * 2*NLIMBS limbs each, i.e. 30*NLIMBS limbs. */
      table_offset += 30 * NLIMBS;

      /* Since scalar is less than the order of the group, we know that
       * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
       * below. */
      point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
      /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
       * (a.k.a.  the point at infinity). We handle that situation by
       * copying the point from the table. */
      copy_conditional(nx, px, n_is_infinity_mask);
      copy_conditional(ny, py, n_is_infinity_mask);
      copy_conditional(nz, kOne, n_is_infinity_mask);

      /* Equally, the result is also wrong if the point from the table is
       * zero, which happens when the index is zero. We handle that by
       * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
      p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
      mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
      copy_conditional(nx, tx, mask);
      copy_conditional(ny, ty, mask);
      copy_conditional(nz, tz, mask);
      /* If p was not zero, then n is now non-zero. */
      n_is_infinity_mask &= ~p_is_noninfinite_mask;
    }
  }
}
   1100 
   1101 /* point_to_affine converts a Jacobian point to an affine point. If the input
   1102  * is the point at infinity then it returns (0, 0) in constant time. */
   1103 static void point_to_affine(felem x_out, felem y_out, const felem nx,
   1104                             const felem ny, const felem nz) {
   1105   felem z_inv, z_inv_sq;
   1106   felem_inv(z_inv, nz);
   1107   felem_square(z_inv_sq, z_inv);
   1108   felem_mul(x_out, nx, z_inv_sq);
   1109   felem_mul(z_inv, z_inv, z_inv_sq);
   1110   felem_mul(y_out, ny, z_inv);
   1111 }
   1112 
/* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
static void scalar_mult(felem nx, felem ny, felem nz, const felem x,
                        const felem y, const p256_int* scalar) {
  int i;
  felem px, py, pz, tx, ty, tz;
  /* precomp[k] will hold k*{x,y} in Jacobian form, for k in [0, 15]. */
  felem precomp[16][3];
  limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

  /* We precompute 0,1,2,... times {x,y}. Only precomp[0] (the point at
   * infinity) is zeroed here; every other entry is written below. */
  memset(precomp, 0, sizeof(felem) * 3);
  memcpy(&precomp[1][0], x, sizeof(felem));
  memcpy(&precomp[1][1], y, sizeof(felem));
  memcpy(&precomp[1][2], kOne, sizeof(felem));

  /* Even entries are doublings of smaller entries; odd entries add the
   * base point {x,y} to the preceding even entry. */
  for (i = 2; i < 16; i += 2) {
    point_double(precomp[i][0], precomp[i][1], precomp[i][2],
                 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

    point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
                    precomp[i][0], precomp[i][1], precomp[i][2], x, y);
  }

  /* The accumulator starts at the point at infinity. */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));
  n_is_infinity_mask = -1;

  /* We add in a window of four bits each iteration and do this 64 times. */
  for (i = 0; i < 256; i += 4) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
    }

    /* Take the next four scalar bits, most-significant first. */
    index = (p256_get_bit(scalar, 255 - i - 0) << 3) |
            (p256_get_bit(scalar, 255 - i - 1) << 2) |
            (p256_get_bit(scalar, 255 - i - 2) << 1) |
            p256_get_bit(scalar, 255 - i - 3);

    /* See the comments in scalar_base_mult about handling infinities. */
    select_jacobian_point(px, py, pz, precomp[0][0], index);
    point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
    copy_conditional(nx, px, n_is_infinity_mask);
    copy_conditional(ny, py, n_is_infinity_mask);
    copy_conditional(nz, pz, n_is_infinity_mask);

    p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
    mask = p_is_noninfinite_mask & ~n_is_infinity_mask;

    copy_conditional(nx, tx, mask);
    copy_conditional(ny, ty, mask);
    copy_conditional(nz, tz, mask);
    n_is_infinity_mask &= ~p_is_noninfinite_mask;
  }
}
   1170 
/* Montgomery constants: the felem code below keeps values multiplied by
 * R = 2^257; kR and kRInv convert into and out of that form. */
#define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p

#define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff}  // 1 / 2^257 mod p256.p

static const p256_int kR = { kRDigits };
static const p256_int kRInv = { kRInvDigits };
   1177 
/* to_montgomery sets out = R*in. */
static void to_montgomery(felem out, const p256_int* in) {
  p256_int in_shifted;
  int i;

  p256_init(&in_shifted);
  /* in_shifted = in * R mod p, computed with the generic p256 routines. */
  p256_modmul(&SECP256r1_p, in, 0, &kR, &in_shifted);

  /* Repack in_shifted into felem limbs: even-indexed limbs hold 29 bits,
   * odd-indexed limbs hold 28 bits. */
  for (i = 0; i < NLIMBS; i++) {
    if ((i & 1) == 0) {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
      p256_shr(&in_shifted, 29, &in_shifted);
    } else {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
      p256_shr(&in_shifted, 28, &in_shifted);
    }
  }

  /* Scrub the intermediate value. */
  p256_clear(&in_shifted);
}
   1198 
/* from_montgomery sets out=in/R. */
static void from_montgomery(p256_int* out, const felem in) {
  p256_int result, tmp;
  int i, top;

  p256_init(&result);
  p256_init(&tmp);

  /* Reassemble the felem limbs into a single p256_int, starting from the
   * most-significant limb. The shift widths alternate 29/28 bits to
   * match the limb windows; |top| accumulates bits shifted past 2^256. */
  p256_add_d(&tmp, in[NLIMBS - 1], &result);
  for (i = NLIMBS - 2; i >= 0; i--) {
    if ((i & 1) == 0) {
      top = p256_shl(&result, 29, &tmp);
    } else {
      top = p256_shl(&result, 28, &tmp);
    }
    top |= p256_add_d(&tmp, in[i], &result);
  }

  /* Multiply by 1/R (mod p) to strip the Montgomery factor. */
  p256_modmul(&SECP256r1_p, &kRInv, top, &result, out);

  /* Scrub the intermediates. */
  p256_clear(&result);
  p256_clear(&tmp);
}
   1222 
   1223 /* p256_base_point_mul sets {out_x,out_y} = nG, where n is < the
   1224  * order of the group. */
   1225 void p256_base_point_mul(const p256_int* n, p256_int* out_x, p256_int* out_y) {
   1226   felem x, y, z;
   1227 
   1228   scalar_base_mult(x, y, z, n);
   1229 
   1230   {
   1231     felem x_affine, y_affine;
   1232 
   1233     point_to_affine(x_affine, y_affine, x, y, z);
   1234     from_montgomery(out_x, x_affine);
   1235     from_montgomery(out_y, y_affine);
   1236   }
   1237 }
   1238 
/* p256_points_mul_vartime sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
 * n1 and n2 are < the order of the group.
 *
 * As indicated by the name, this function operates in variable time. This
 * is safe because it's used for signature validation which doesn't deal
 * with secrets. */
void p256_points_mul_vartime(
    const p256_int* n1, const p256_int* n2, const p256_int* in_x,
    const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
  felem x1, y1, z1, x2, y2, z2, px, py;

  /* If both scalars are zero, then the result is the point at infinity. */
  if (p256_is_zero(n1) != 0 && p256_is_zero(n2) != 0) {
    p256_clear(out_x);
    p256_clear(out_y);
    return;
  }

  /* Convert the input point into Montgomery form and compute both scalar
   * multiplications: {x1,y1,z1} = n1*G and {x2,y2,z2} = n2*{in_x,in_y}. */
  to_montgomery(px, in_x);
  to_montgomery(py, in_y);
  scalar_base_mult(x1, y1, z1, n1);
  scalar_mult(x2, y2, z2, px, py, n2);

  if (p256_is_zero(n2) != 0) {
    /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
     * {x1,y1,z1}. */
  } else if (p256_is_zero(n1) != 0) {
    /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
     * {x2,y2,z2}. */
    memcpy(x1, x2, sizeof(x2));
    memcpy(y1, y2, sizeof(y2));
    memcpy(z1, z2, sizeof(z2));
  } else {
    /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
    point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
  }

  /* Convert the Jacobian sum back to affine, non-Montgomery form. */
  point_to_affine(px, py, x1, y1, z1);
  from_montgomery(out_x, px);
  from_montgomery(out_y, py);
}
   1280