1 /* 2 * Copyright 2013 The Android Open Source Project 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are met: 6 * * Redistributions of source code must retain the above copyright 7 * notice, this list of conditions and the following disclaimer. 8 * * Redistributions in binary form must reproduce the above copyright 9 * notice, this list of conditions and the following disclaimer in the 10 * documentation and/or other materials provided with the distribution. 11 * * Neither the name of Google Inc. nor the names of its contributors may 12 * be used to endorse or promote products derived from this software 13 * without specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 17 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 18 * EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 21 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 23 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 24 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 // This is an implementation of the P256 elliptic curve group. It's written to 28 // be portable 32-bit, although it's still constant-time. 29 // 30 // WARNING: Implementing these functions in a constant-time manner is far from 31 // obvious. Be careful when touching this code. 32 // 33 // See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background. 34 35 #include <stdint.h> 36 #include <stdio.h> 37 38 #include <string.h> 39 #include <stdlib.h> 40 41 #include "constrainedcrypto/p256.h" 42 43 typedef uint8_t u8; 44 typedef uint32_t u32; 45 typedef int32_t s32; 46 typedef uint64_t u64; 47 48 /* Our field elements are represented as nine 32-bit limbs. 49 * 50 * The value of an felem (field element) is: 51 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228) 52 * 53 * That is, each limb is alternately 29 or 28-bits wide in little-endian 54 * order. 55 * 56 * This means that an felem hits 2**257, rather than 2**256 as we would like. A 57 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems 58 * when multiplying as terms end up one bit short of a limb which would require 59 * much bit-shifting to correct. 60 * 61 * Finally, the values stored in an felem are in Montgomery form. So the value 62 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257. 63 */ 64 typedef u32 limb; 65 #define NLIMBS 9 66 typedef limb felem[NLIMBS]; 67 68 static const limb kBottom28Bits = 0xfffffff; 69 static const limb kBottom29Bits = 0x1fffffff; 70 71 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and 72 * 28-bit words. */ 73 static const felem kOne = { 74 2, 0, 0, 0xffff800, 75 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 76 0 77 }; 78 static const felem kZero = {0}; 79 static const felem kP = { 80 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 81 0, 0, 0x200000, 0xf000000, 82 0xfffffff 83 }; 84 static const felem k2P = { 85 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 86 0, 0, 0x400000, 0xe000000, 87 0x1fffffff 88 }; 89 /* kPrecomputed contains precomputed values to aid the calculation of scalar 90 * multiples of the base point, G. It's actually two, equal length, tables 91 * concatenated. 92 * 93 * The first table contains (x,y) felem pairs for 16 multiples of the base 94 * point, G. 95 * 96 * Index | Index (binary) | Value 97 * 0 | 0000 | 0G (all zeros, omitted) 98 * 1 | 0001 | G 99 * 2 | 0010 | 2**64G 100 * 3 | 0011 | 2**64G + G 101 * 4 | 0100 | 2**128G 102 * 5 | 0101 | 2**128G + G 103 * 6 | 0110 | 2**128G + 2**64G 104 * 7 | 0111 | 2**128G + 2**64G + G 105 * 8 | 1000 | 2**192G 106 * 9 | 1001 | 2**192G + G 107 * 10 | 1010 | 2**192G + 2**64G 108 * 11 | 1011 | 2**192G + 2**64G + G 109 * 12 | 1100 | 2**192G + 2**128G 110 * 13 | 1101 | 2**192G + 2**128G + G 111 * 14 | 1110 | 2**192G + 2**128G + 2**64G 112 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G 113 * 114 * The second table follows the same style, but the terms are 2**32G, 115 * 2**96G, 2**160G, 2**224G. 116 * 117 * This is ~2KB of data. */ 118 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = { 119 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee, 120 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3, 121 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c, 122 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22, 123 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050, 124 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b, 125 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa, 126 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2, 127 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609, 128 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581, 129 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca, 130 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33, 131 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6, 132 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd, 133 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0, 134 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881, 135 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a, 136 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26, 137 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b, 138 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023, 139 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133, 140 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa, 141 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29, 142 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc, 143 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8, 144 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59, 145 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39, 146 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689, 147 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa, 148 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3, 149 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1, 150 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f, 151 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72, 152 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d, 153 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b, 154 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a, 155 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a, 156 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f, 157 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb, 158 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc, 159 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9, 160 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce, 161 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2, 162 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca, 163 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229, 164 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57, 165 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c, 166 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa, 167 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651, 168 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec, 169 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7, 170 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c, 171 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927, 172 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298, 173 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8, 174 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2, 175 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d, 176 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4, 177 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8, 178 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78, 179 }; 180 181 182 /* Field element operations: */ 183 184 /* NON_ZERO_TO_ALL_ONES returns: 185 * 0xffffffff for 0 < x <= 2**31 186 * 0 for x == 0 or x > 2**31. 187 * 188 * x must be a u32 or an equivalent type such as limb. */ 189 #define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1) 190 191 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|, 192 * which is a term at 2**257. 193 * 194 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28. 195 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */ 196 static void felem_reduce_carry(felem inout, limb carry) { 197 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry); 198 199 inout[0] += carry << 1; 200 inout[3] += 0x10000000 & carry_mask; 201 /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the 202 * previous line therefore this doesn't underflow. */ 203 inout[3] -= carry << 11; 204 inout[4] += (0x20000000 - 1) & carry_mask; 205 inout[5] += (0x10000000 - 1) & carry_mask; 206 inout[6] += (0x20000000 - 1) & carry_mask; 207 inout[6] -= carry << 22; 208 /* This may underflow if carry is non-zero but, if so, we'll fix it in the 209 * next line. */ 210 inout[7] -= 1 & carry_mask; 211 inout[7] += carry << 25; 212 } 213 214 /* felem_sum sets out = in+in2. 215 * 216 * On entry, in[i]+in2[i] must not overflow a 32-bit word. 217 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */ 218 static void felem_sum(felem out, const felem in, const felem in2) { 219 limb carry = 0; 220 unsigned i; 221 222 for (i = 0;; i++) { 223 out[i] = in[i] + in2[i]; 224 out[i] += carry; 225 carry = out[i] >> 29; 226 out[i] &= kBottom29Bits; 227 228 i++; 229 if (i == NLIMBS) 230 break; 231 232 out[i] = in[i] + in2[i]; 233 out[i] += carry; 234 carry = out[i] >> 28; 235 out[i] &= kBottom28Bits; 236 } 237 238 felem_reduce_carry(out, carry); 239 } 240 241 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3) 242 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2) 243 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2) 244 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2) 245 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2) 246 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2) 247 248 /* zero31 is 0 mod p. */ 249 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 }; 250 251 /* felem_diff sets out = in-in2. 252 * 253 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and 254 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29. 255 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ 256 static void felem_diff(felem out, const felem in, const felem in2) { 257 limb carry = 0; 258 unsigned i; 259 260 for (i = 0;; i++) { 261 out[i] = in[i] - in2[i]; 262 out[i] += zero31[i]; 263 out[i] += carry; 264 carry = out[i] >> 29; 265 out[i] &= kBottom29Bits; 266 267 i++; 268 if (i == NLIMBS) 269 break; 270 271 out[i] = in[i] - in2[i]; 272 out[i] += zero31[i]; 273 out[i] += carry; 274 carry = out[i] >> 28; 275 out[i] &= kBottom28Bits; 276 } 277 278 felem_reduce_carry(out, carry); 279 } 280 281 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words 282 * with the same 29,28,... bit positions as an felem. 283 * 284 * The values in felems are in Montgomery form: x*R mod p where R = 2**257. 285 * Since we just multiplied two Montgomery values together, the result is 286 * x*y*R*R mod p. We wish to divide by R in order for the result also to be 287 * in Montgomery form. 288 * 289 * On entry: tmp[i] < 2**64 290 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */ 291 static void felem_reduce_degree(felem out, u64 tmp[17]) { 292 /* The following table may be helpful when reading this code: 293 * 294 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10... 295 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29 296 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285 297 * (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */ 298 limb tmp2[18], carry, x, xMask; 299 unsigned i; 300 301 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an 302 * felem. So the top of an element of tmp might overlap with another 303 * element two positions down. The following loop eliminates this 304 * overlap. */ 305 tmp2[0] = (limb)(tmp[0] & kBottom29Bits); 306 307 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try 308 * and hint to the compiler that it can do a single-word shift by selecting 309 * the right register rather than doing a double-word shift and truncating 310 * afterwards. */ 311 tmp2[1] = ((limb) tmp[0]) >> 29; 312 tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits; 313 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits; 314 carry = tmp2[1] >> 28; 315 tmp2[1] &= kBottom28Bits; 316 317 for (i = 2; i < 17; i++) { 318 tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25; 319 tmp2[i] += ((limb)(tmp[i - 1])) >> 28; 320 tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits; 321 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits; 322 tmp2[i] += carry; 323 carry = tmp2[i] >> 29; 324 tmp2[i] &= kBottom29Bits; 325 326 i++; 327 if (i == 17) 328 break; 329 tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25; 330 tmp2[i] += ((limb)(tmp[i - 1])) >> 29; 331 tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits; 332 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits; 333 tmp2[i] += carry; 334 carry = tmp2[i] >> 28; 335 tmp2[i] &= kBottom28Bits; 336 } 337 338 tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25; 339 tmp2[17] += ((limb)(tmp[16])) >> 29; 340 tmp2[17] += (((limb)(tmp[16] >> 32)) << 3); 341 tmp2[17] += carry; 342 343 /* Montgomery elimination of terms. 344 * 345 * Since R is 2**257, we can divide by R with a bitwise shift if we can 346 * ensure that the right-most 257 bits are all zero. We can make that true by 347 * adding multiplies of p without affecting the value. 348 * 349 * So we eliminate limbs from right to left. Since the bottom 29 bits of p 350 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0. 351 * We can do that for 8 further limbs and then right shift to eliminate the 352 * extra factor of R. */ 353 for (i = 0;; i += 2) { 354 tmp2[i + 1] += tmp2[i] >> 29; 355 x = tmp2[i] & kBottom29Bits; 356 xMask = NON_ZERO_TO_ALL_ONES(x); 357 tmp2[i] = 0; 358 359 /* The bounds calculations for this loop are tricky. Each iteration of 360 * the loop eliminates two words by adding values to words to their 361 * right. 362 * 363 * The following table contains the amounts added to each word (as an 364 * offset from the value of i at the top of the loop). The amounts are 365 * accounted for from the first and second half of the loop separately 366 * and are written as, for example, 28 to mean a value <2**28. 367 * 368 * Word: 3 4 5 6 7 8 9 10 369 * Added in top half: 28 11 29 21 29 28 370 * 28 29 371 * 29 372 * Added in bottom half: 29 10 28 21 28 28 373 * 29 374 * 375 * The value that is currently offset 7 will be offset 5 for the next 376 * iteration and then offset 3 for the iteration after that. Therefore 377 * the total value added will be the values added at 7, 5 and 3. 378 * 379 * The following table accumulates these values. The sums at the bottom 380 * are written as, for example, 29+28, to mean a value < 2**29+2**28. 381 * 382 * Word: 3 4 5 6 7 8 9 10 11 12 13 383 * 28 11 10 29 21 29 28 28 28 28 28 384 * 29 28 11 28 29 28 29 28 29 28 385 * 29 28 21 21 29 21 29 21 386 * 10 29 28 21 28 21 28 387 * 28 29 28 29 28 29 28 388 * 11 10 29 10 29 10 389 * 29 28 11 28 11 390 * 29 29 391 * -------------------------------------------- 392 * 30+ 31+ 30+ 31+ 30+ 393 * 28+ 29+ 28+ 29+ 21+ 394 * 21+ 28+ 21+ 28+ 10 395 * 10 21+ 10 21+ 396 * 11 11 397 * 398 * So the greatest amount is added to tmp2[10] and tmp2[12]. If 399 * tmp2[10/12] has an initial value of <2**29, then the maximum value 400 * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32, 401 * as required. */ 402 tmp2[i + 3] += (x << 10) & kBottom28Bits; 403 tmp2[i + 4] += (x >> 18); 404 405 tmp2[i + 6] += (x << 21) & kBottom29Bits; 406 tmp2[i + 7] += x >> 8; 407 408 /* At position 200, which is the starting bit position for word 7, we 409 * have a factor of 0xf000000 = 2**28 - 2**24. */ 410 tmp2[i + 7] += 0x10000000 & xMask; 411 /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */ 412 tmp2[i + 8] += (x - 1) & xMask; 413 tmp2[i + 7] -= (x << 24) & kBottom28Bits; 414 tmp2[i + 8] -= x >> 4; 415 416 tmp2[i + 8] += 0x20000000 & xMask; 417 tmp2[i + 8] -= x; 418 tmp2[i + 8] += (x << 28) & kBottom29Bits; 419 tmp2[i + 9] += ((x >> 1) - 1) & xMask; 420 421 if (i+1 == NLIMBS) 422 break; 423 tmp2[i + 2] += tmp2[i + 1] >> 28; 424 x = tmp2[i + 1] & kBottom28Bits; 425 xMask = NON_ZERO_TO_ALL_ONES(x); 426 tmp2[i + 1] = 0; 427 428 tmp2[i + 4] += (x << 11) & kBottom29Bits; 429 tmp2[i + 5] += (x >> 18); 430 431 tmp2[i + 7] += (x << 21) & kBottom28Bits; 432 tmp2[i + 8] += x >> 7; 433 434 /* At position 199, which is the starting bit of the 8th word when 435 * dealing with a context starting on an odd word, we have a factor of 436 * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th 437 * word from i+1 is i+8. */ 438 tmp2[i + 8] += 0x20000000 & xMask; 439 tmp2[i + 9] += (x - 1) & xMask; 440 tmp2[i + 8] -= (x << 25) & kBottom29Bits; 441 tmp2[i + 9] -= x >> 4; 442 443 tmp2[i + 9] += 0x10000000 & xMask; 444 tmp2[i + 9] -= x; 445 tmp2[i + 10] += (x - 1) & xMask; 446 } 447 448 /* We merge the right shift with a carry chain. The words above 2**257 have 449 * widths of 28,29,... which we need to correct when copying them down. */ 450 carry = 0; 451 for (i = 0; i < 8; i++) { 452 /* The maximum value of tmp2[i + 9] occurs on the first iteration and 453 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is 454 * therefore safe. */ 455 out[i] = tmp2[i + 9]; 456 out[i] += carry; 457 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits; 458 carry = out[i] >> 29; 459 out[i] &= kBottom29Bits; 460 461 i++; 462 out[i] = tmp2[i + 9] >> 1; 463 out[i] += carry; 464 carry = out[i] >> 28; 465 out[i] &= kBottom28Bits; 466 } 467 468 out[8] = tmp2[17]; 469 out[8] += carry; 470 carry = out[8] >> 29; 471 out[8] &= kBottom29Bits; 472 473 felem_reduce_carry(out, carry); 474 } 475 476 /* felem_square sets out=in*in. 477 * 478 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29. 479 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ 480 static void felem_square(felem out, const felem in) { 481 u64 tmp[17]; 482 483 tmp[0] = ((u64) in[0]) * in[0]; 484 tmp[1] = ((u64) in[0]) * (in[1] << 1); 485 tmp[2] = ((u64) in[0]) * (in[2] << 1) + 486 ((u64) in[1]) * (in[1] << 1); 487 tmp[3] = ((u64) in[0]) * (in[3] << 1) + 488 ((u64) in[1]) * (in[2] << 1); 489 tmp[4] = ((u64) in[0]) * (in[4] << 1) + 490 ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2]; 491 tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) * 492 (in[4] << 1) + ((u64) in[2]) * (in[3] << 1); 493 tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) * 494 (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) + 495 ((u64) in[3]) * (in[3] << 1); 496 tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) * 497 (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) + 498 ((u64) in[3]) * (in[4] << 1); 499 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60, 500 * which is < 2**64 as required. */ 501 tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) * 502 (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) + 503 ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4]; 504 tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) * 505 (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) + 506 ((u64) in[4]) * (in[5] << 1); 507 tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) * 508 (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) + 509 ((u64) in[5]) * (in[5] << 1); 510 tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) * 511 (in[7] << 1) + ((u64) in[5]) * (in[6] << 1); 512 tmp[12] = ((u64) in[4]) * (in[8] << 1) + 513 ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6]; 514 tmp[13] = ((u64) in[5]) * (in[8] << 1) + 515 ((u64) in[6]) * (in[7] << 1); 516 tmp[14] = ((u64) in[6]) * (in[8] << 1) + 517 ((u64) in[7]) * (in[7] << 1); 518 tmp[15] = ((u64) in[7]) * (in[8] << 1); 519 tmp[16] = ((u64) in[8]) * in[8]; 520 521 felem_reduce_degree(out, tmp); 522 } 523 524 /* felem_mul sets out=in*in2. 525 * 526 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and 527 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29. 528 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ 529 static void felem_mul(felem out, const felem in, const felem in2) { 530 u64 tmp[17]; 531 532 tmp[0] = ((u64) in[0]) * in2[0]; 533 tmp[1] = ((u64) in[0]) * (in2[1] << 0) + 534 ((u64) in[1]) * (in2[0] << 0); 535 tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) * 536 (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0); 537 tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) * 538 (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) + 539 ((u64) in[3]) * (in2[0] << 0); 540 tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) * 541 (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) + 542 ((u64) in[3]) * (in2[1] << 1) + 543 ((u64) in[4]) * (in2[0] << 0); 544 tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) * 545 (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) + 546 ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) * 547 (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0); 548 tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) * 549 (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) + 550 ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) * 551 (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) + 552 ((u64) in[6]) * (in2[0] << 0); 553 tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) * 554 (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) + 555 ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) * 556 (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) + 557 ((u64) in[6]) * (in2[1] << 0) + 558 ((u64) in[7]) * (in2[0] << 0); 559 /* tmp[8] has the greatest value but doesn't overflow. See logic in 560 * felem_square. */ 561 tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) * 562 (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) + 563 ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) * 564 (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) + 565 ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) * 566 (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0); 567 tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) * 568 (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) + 569 ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) * 570 (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) + 571 ((u64) in[7]) * (in2[2] << 0) + 572 ((u64) in[8]) * (in2[1] << 0); 573 tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) * 574 (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) + 575 ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) * 576 (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) + 577 ((u64) in[8]) * (in2[2] << 0); 578 tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) * 579 (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) + 580 ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) * 581 (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0); 582 tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) * 583 (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) + 584 ((u64) in[7]) * (in2[5] << 1) + 585 ((u64) in[8]) * (in2[4] << 0); 586 tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) * 587 (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) + 588 ((u64) in[8]) * (in2[5] << 0); 589 tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) * 590 (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0); 591 tmp[15] = ((u64) in[7]) * (in2[8] << 0) + 592 ((u64) in[8]) * (in2[7] << 0); 593 tmp[16] = ((u64) in[8]) * (in2[8] << 0); 594 595 felem_reduce_degree(out, tmp); 596 } 597 598 static void felem_assign(felem out, const felem in) { 599 memcpy(out, in, sizeof(felem)); 600 } 601 602 /* felem_inv calculates |out| = |in|^{-1} 603 * 604 * Based on Fermat's Little Theorem: 605 * a^p = a (mod p) 606 * a^{p-1} = 1 (mod p) 607 * a^{p-2} = a^{-1} (mod p) 608 */ 609 static void felem_inv(felem out, const felem in) { 610 felem ftmp, ftmp2; 611 /* each e_I will hold |in|^{2^I - 1} */ 612 felem e2, e4, e8, e16, e32, e64; 613 unsigned i; 614 615 felem_square(ftmp, in); /* 2^1 */ 616 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */ 617 felem_assign(e2, ftmp); 618 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */ 619 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */ 620 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */ 621 felem_assign(e4, ftmp); 622 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */ 623 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */ 624 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */ 625 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */ 626 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */ 627 felem_assign(e8, ftmp); 628 for (i = 0; i < 8; i++) { 629 felem_square(ftmp, ftmp); 630 } /* 2^16 - 2^8 */ 631 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */ 632 felem_assign(e16, ftmp); 633 for (i = 0; i < 16; i++) { 634 felem_square(ftmp, ftmp); 635 } /* 2^32 - 2^16 */ 636 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */ 637 felem_assign(e32, ftmp); 638 for (i = 0; i < 32; i++) { 639 felem_square(ftmp, ftmp); 640 } /* 2^64 - 2^32 */ 641 felem_assign(e64, ftmp); 642 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */ 643 for (i = 0; i < 192; i++) { 644 felem_square(ftmp, ftmp); 645 } /* 2^256 - 2^224 + 2^192 */ 646 647 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */ 648 for (i = 0; i < 16; i++) { 649 felem_square(ftmp2, ftmp2); 650 } /* 2^80 - 2^16 */ 651 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */ 652 for (i = 0; i < 8; i++) { 653 felem_square(ftmp2, ftmp2); 654 } /* 2^88 - 2^8 */ 655 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */ 656 for (i = 0; i < 4; i++) { 657 felem_square(ftmp2, ftmp2); 658 } /* 2^92 - 2^4 */ 659 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */ 660 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */ 661 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */ 662 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */ 663 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */ 664 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */ 665 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */ 666 667 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */ 668 } 669 670 /* felem_scalar_3 sets out=3*out. 671 * 672 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 673 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ 674 static void felem_scalar_3(felem out) { 675 limb carry = 0; 676 unsigned i; 677 678 for (i = 0;; i++) { 679 out[i] *= 3; 680 out[i] += carry; 681 carry = out[i] >> 29; 682 out[i] &= kBottom29Bits; 683 684 i++; 685 if (i == NLIMBS) 686 break; 687 688 out[i] *= 3; 689 out[i] += carry; 690 carry = out[i] >> 28; 691 out[i] &= kBottom28Bits; 692 } 693 694 felem_reduce_carry(out, carry); 695 } 696 697 /* felem_scalar_4 sets out=4*out. 698 * 699 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 700 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ 701 static void felem_scalar_4(felem out) { 702 limb carry = 0, next_carry; 703 unsigned i; 704 705 for (i = 0;; i++) { 706 next_carry = out[i] >> 27; 707 out[i] <<= 2; 708 out[i] &= kBottom29Bits; 709 out[i] += carry; 710 carry = next_carry + (out[i] >> 29); 711 out[i] &= kBottom29Bits; 712 713 i++; 714 if (i == NLIMBS) 715 break; 716 717 next_carry = out[i] >> 26; 718 out[i] <<= 2; 719 out[i] &= kBottom28Bits; 720 out[i] += carry; 721 carry = next_carry + (out[i] >> 28); 722 out[i] &= kBottom28Bits; 723 } 724 725 felem_reduce_carry(out, carry); 726 } 727 728 /* felem_scalar_8 sets out=8*out. 729 * 730 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 731 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ 732 static void felem_scalar_8(felem out) { 733 limb carry = 0, next_carry; 734 unsigned i; 735 736 for (i = 0;; i++) { 737 next_carry = out[i] >> 26; 738 out[i] <<= 3; 739 out[i] &= kBottom29Bits; 740 out[i] += carry; 741 carry = next_carry + (out[i] >> 29); 742 out[i] &= kBottom29Bits; 743 744 i++; 745 if (i == NLIMBS) 746 break; 747 748 next_carry = out[i] >> 25; 749 out[i] <<= 3; 750 out[i] &= kBottom28Bits; 751 out[i] += carry; 752 carry = next_carry + (out[i] >> 28); 753 out[i] &= kBottom28Bits; 754 } 755 756 felem_reduce_carry(out, carry); 757 } 758 759 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of 760 * time depending on the value of |in|. */ 761 static char felem_is_zero_vartime(const felem in) { 762 limb carry; 763 int i; 764 limb tmp[NLIMBS]; 765 766 felem_assign(tmp, in); 767 768 /* First, reduce tmp to a minimal form. */ 769 do { 770 carry = 0; 771 for (i = 0;; i++) { 772 tmp[i] += carry; 773 carry = tmp[i] >> 29; 774 tmp[i] &= kBottom29Bits; 775 776 i++; 777 if (i == NLIMBS) 778 break; 779 780 tmp[i] += carry; 781 carry = tmp[i] >> 28; 782 tmp[i] &= kBottom28Bits; 783 } 784 785 felem_reduce_carry(tmp, carry); 786 } while (carry); 787 788 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */ 789 return memcmp(tmp, kZero, sizeof(tmp)) == 0 || 790 memcmp(tmp, kP, sizeof(tmp)) == 0 || 791 memcmp(tmp, k2P, sizeof(tmp)) == 0; 792 } 793 794 795 /* Group operations: 796 * 797 * Elements of the elliptic curve group are represented in Jacobian 798 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in 799 * Jacobian form. */ 800 801 /* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}. 802 * 803 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */ 804 static void point_double(felem x_out, felem y_out, felem z_out, const felem x, 805 const felem y, const felem z) { 806 felem delta, gamma, alpha, beta, tmp, tmp2; 807 808 felem_square(delta, z); 809 felem_square(gamma, y); 810 felem_mul(beta, x, gamma); 811 812 felem_sum(tmp, x, delta); 813 felem_diff(tmp2, x, delta); 814 felem_mul(alpha, tmp, tmp2); 815 felem_scalar_3(alpha); 816 817 felem_sum(tmp, y, z); 818 felem_square(tmp, tmp); 819 felem_diff(tmp, tmp, gamma); 820 felem_diff(z_out, tmp, delta); 821 822 felem_scalar_4(beta); 823 felem_square(x_out, alpha); 824 felem_diff(x_out, x_out, beta); 825 felem_diff(x_out, x_out, beta); 826 827 felem_diff(tmp, beta, x_out); 828 felem_mul(tmp, alpha, tmp); 829 felem_square(tmp2, gamma); 830 felem_scalar_8(tmp2); 831 felem_diff(y_out, tmp, tmp2); 832 } 833 834 /* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}. 835 * (i.e. the second point is affine.) 836 * 837 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl 838 * 839 * Note that this function does not handle P+P, infinity+P nor P+infinity 840 * correctly. */ 841 static void point_add_mixed(felem x_out, felem y_out, felem z_out, 842 const felem x1, const felem y1, const felem z1, 843 const felem x2, const felem y2) { 844 felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp; 845 846 felem_square(z1z1, z1); 847 felem_sum(tmp, z1, z1); 848 849 felem_mul(u2, x2, z1z1); 850 felem_mul(z1z1z1, z1, z1z1); 851 felem_mul(s2, y2, z1z1z1); 852 felem_diff(h, u2, x1); 853 felem_sum(i, h, h); 854 felem_square(i, i); 855 felem_mul(j, h, i); 856 felem_diff(r, s2, y1); 857 felem_sum(r, r, r); 858 felem_mul(v, x1, i); 859 860 felem_mul(z_out, tmp, h); 861 felem_square(rr, r); 862 felem_diff(x_out, rr, j); 863 felem_diff(x_out, x_out, v); 864 felem_diff(x_out, x_out, v); 865 866 felem_diff(tmp, v, x_out); 867 felem_mul(y_out, tmp, r); 868 felem_mul(tmp, y1, j); 869 felem_diff(y_out, y_out, tmp); 870 felem_diff(y_out, y_out, tmp); 871 } 872 873 /* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}. 874 * 875 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl 876 * 877 * Note that this function does not handle P+P, infinity+P nor P+infinity 878 * correctly. */ 879 static void point_add(felem x_out, felem y_out, felem z_out, const felem x1, 880 const felem y1, const felem z1, const felem x2, 881 const felem y2, const felem z2) { 882 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp; 883 884 felem_square(z1z1, z1); 885 felem_square(z2z2, z2); 886 felem_mul(u1, x1, z2z2); 887 888 felem_sum(tmp, z1, z2); 889 felem_square(tmp, tmp); 890 felem_diff(tmp, tmp, z1z1); 891 felem_diff(tmp, tmp, z2z2); 892 893 felem_mul(z2z2z2, z2, z2z2); 894 felem_mul(s1, y1, z2z2z2); 895 896 felem_mul(u2, x2, z1z1); 897 felem_mul(z1z1z1, z1, z1z1); 898 felem_mul(s2, y2, z1z1z1); 899 felem_diff(h, u2, u1); 900 felem_sum(i, h, h); 901 felem_square(i, i); 902 felem_mul(j, h, i); 903 felem_diff(r, s2, s1); 904 felem_sum(r, r, r); 905 felem_mul(v, u1, i); 906 907 felem_mul(z_out, tmp, h); 908 felem_square(rr, r); 909 felem_diff(x_out, rr, j); 910 felem_diff(x_out, x_out, v); 911 felem_diff(x_out, x_out, v); 912 913 felem_diff(tmp, v, x_out); 914 felem_mul(y_out, tmp, r); 915 felem_mul(tmp, s1, j); 916 felem_diff(y_out, y_out, tmp); 917 felem_diff(y_out, y_out, tmp); 918 } 919 920 /* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} + 921 * {x2,y2,z2}. 922 * 923 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl 924 * 925 * This function handles the case where {x1,y1,z1}={x2,y2,z2}. */ 926 static void point_add_or_double_vartime( 927 felem x_out, felem y_out, felem z_out, const felem x1, const felem y1, 928 const felem z1, const felem x2, const felem y2, const felem z2) { 929 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp; 930 char x_equal, y_equal; 931 932 felem_square(z1z1, z1); 933 felem_square(z2z2, z2); 934 felem_mul(u1, x1, z2z2); 935 936 felem_sum(tmp, z1, z2); 937 felem_square(tmp, tmp); 938 felem_diff(tmp, tmp, z1z1); 939 felem_diff(tmp, tmp, z2z2); 940 941 felem_mul(z2z2z2, z2, z2z2); 942 felem_mul(s1, y1, z2z2z2); 943 944 felem_mul(u2, x2, z1z1); 945 felem_mul(z1z1z1, z1, z1z1); 946 felem_mul(s2, y2, z1z1z1); 947 felem_diff(h, u2, u1); 948 x_equal = felem_is_zero_vartime(h); 949 felem_sum(i, h, h); 950 felem_square(i, i); 951 felem_mul(j, h, i); 952 felem_diff(r, s2, s1); 953 y_equal = felem_is_zero_vartime(r); 954 if (x_equal && y_equal) { 955 point_double(x_out, y_out, z_out, x1, y1, z1); 956 return; 957 } 958 felem_sum(r, r, r); 959 felem_mul(v, u1, i); 960 961 felem_mul(z_out, tmp, h); 962 felem_square(rr, r); 963 felem_diff(x_out, rr, j); 964 felem_diff(x_out, x_out, v); 965 felem_diff(x_out, x_out, v); 966 967 felem_diff(tmp, v, x_out); 968 felem_mul(y_out, tmp, r); 969 felem_mul(tmp, s1, j); 970 felem_diff(y_out, y_out, tmp); 971 felem_diff(y_out, y_out, tmp); 972 } 973 974 /* copy_conditional sets out=in if mask = 0xffffffff in constant time. 975 * 976 * On entry: mask is either 0 or 0xffffffff. */ 977 static void copy_conditional(felem out, const felem in, limb mask) { 978 int i; 979 980 for (i = 0; i < NLIMBS; i++) { 981 const limb tmp = mask & (in[i] ^ out[i]); 982 out[i] ^= tmp; 983 } 984 } 985 986 /* select_affine_point sets {out_x,out_y} to the index'th entry of table. 987 * On entry: index < 16, table[0] must be zero. */ 988 static void select_affine_point(felem out_x, felem out_y, const limb* table, 989 limb index) { 990 limb i, j; 991 992 memset(out_x, 0, sizeof(felem)); 993 memset(out_y, 0, sizeof(felem)); 994 995 for (i = 1; i < 16; i++) { 996 limb mask = i ^ index; 997 mask |= mask >> 2; 998 mask |= mask >> 1; 999 mask &= 1; 1000 mask--; 1001 for (j = 0; j < NLIMBS; j++, table++) { 1002 out_x[j] |= *table & mask; 1003 } 1004 for (j = 0; j < NLIMBS; j++, table++) { 1005 out_y[j] |= *table & mask; 1006 } 1007 } 1008 } 1009 1010 /* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of 1011 * table. On entry: index < 16, table[0] must be zero. */ 1012 static void select_jacobian_point(felem out_x, felem out_y, felem out_z, 1013 const limb* table, limb index) { 1014 limb i, j; 1015 1016 memset(out_x, 0, sizeof(felem)); 1017 memset(out_y, 0, sizeof(felem)); 1018 memset(out_z, 0, sizeof(felem)); 1019 1020 /* The implicit value at index 0 is all zero. We don't need to perform that 1021 * iteration of the loop because we already set out_* to zero. */ 1022 table += 3 * NLIMBS; 1023 1024 // Hit all entries to obscure cache profiling. 1025 for (i = 1; i < 16; i++) { 1026 limb mask = i ^ index; 1027 mask |= mask >> 2; 1028 mask |= mask >> 1; 1029 mask &= 1; 1030 mask--; 1031 for (j = 0; j < NLIMBS; j++, table++) { 1032 out_x[j] |= *table & mask; 1033 } 1034 for (j = 0; j < NLIMBS; j++, table++) { 1035 out_y[j] |= *table & mask; 1036 } 1037 for (j = 0; j < NLIMBS; j++, table++) { 1038 out_z[j] |= *table & mask; 1039 } 1040 } 1041 } 1042 1043 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian 1044 * number. Note that the value of scalar must be less than the order of the 1045 * group. */ 1046 static void scalar_base_mult(felem nx, felem ny, felem nz, 1047 const p256_int* scalar) { 1048 int i, j; 1049 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask; 1050 u32 table_offset; 1051 1052 felem px, py; 1053 felem tx, ty, tz; 1054 1055 memset(nx, 0, sizeof(felem)); 1056 memset(ny, 0, sizeof(felem)); 1057 memset(nz, 0, sizeof(felem)); 1058 1059 /* The loop adds bits at positions 0, 64, 128 and 192, followed by 1060 * positions 32,96,160 and 224 and does this 32 times. */ 1061 for (i = 0; i < 32; i++) { 1062 if (i) { 1063 point_double(nx, ny, nz, nx, ny, nz); 1064 } 1065 table_offset = 0; 1066 for (j = 0; j <= 32; j += 32) { 1067 char bit0 = p256_get_bit(scalar, 31 - i + j); 1068 char bit1 = p256_get_bit(scalar, 95 - i + j); 1069 char bit2 = p256_get_bit(scalar, 159 - i + j); 1070 char bit3 = p256_get_bit(scalar, 223 - i + j); 1071 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3); 1072 1073 select_affine_point(px, py, kPrecomputed + table_offset, index); 1074 table_offset += 30 * NLIMBS; 1075 1076 /* Since scalar is less than the order of the group, we know that 1077 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle 1078 * below. */ 1079 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py); 1080 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero 1081 * (a.k.a. the point at infinity). We handle that situation by 1082 * copying the point from the table. */ 1083 copy_conditional(nx, px, n_is_infinity_mask); 1084 copy_conditional(ny, py, n_is_infinity_mask); 1085 copy_conditional(nz, kOne, n_is_infinity_mask); 1086 1087 /* Equally, the result is also wrong if the point from the table is 1088 * zero, which happens when the index is zero. We handle that by 1089 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */ 1090 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index); 1091 mask = p_is_noninfinite_mask & ~n_is_infinity_mask; 1092 copy_conditional(nx, tx, mask); 1093 copy_conditional(ny, ty, mask); 1094 copy_conditional(nz, tz, mask); 1095 /* If p was not zero, then n is now non-zero. */ 1096 n_is_infinity_mask &= ~p_is_noninfinite_mask; 1097 } 1098 } 1099 } 1100 1101 /* point_to_affine converts a Jacobian point to an affine point. If the input 1102 * is the point at infinity then it returns (0, 0) in constant time. */ 1103 static void point_to_affine(felem x_out, felem y_out, const felem nx, 1104 const felem ny, const felem nz) { 1105 felem z_inv, z_inv_sq; 1106 felem_inv(z_inv, nz); 1107 felem_square(z_inv_sq, z_inv); 1108 felem_mul(x_out, nx, z_inv_sq); 1109 felem_mul(z_inv, z_inv, z_inv_sq); 1110 felem_mul(y_out, ny, z_inv); 1111 } 1112 1113 /* scalar_base_mult sets {nx,ny,nz} = scalar*{x,y}. */ 1114 static void scalar_mult(felem nx, felem ny, felem nz, const felem x, 1115 const felem y, const p256_int* scalar) { 1116 int i; 1117 felem px, py, pz, tx, ty, tz; 1118 felem precomp[16][3]; 1119 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask; 1120 1121 /* We precompute 0,1,2,... times {x,y}. */ 1122 memset(precomp, 0, sizeof(felem) * 3); 1123 memcpy(&precomp[1][0], x, sizeof(felem)); 1124 memcpy(&precomp[1][1], y, sizeof(felem)); 1125 memcpy(&precomp[1][2], kOne, sizeof(felem)); 1126 1127 for (i = 2; i < 16; i += 2) { 1128 point_double(precomp[i][0], precomp[i][1], precomp[i][2], 1129 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]); 1130 1131 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2], 1132 precomp[i][0], precomp[i][1], precomp[i][2], x, y); 1133 } 1134 1135 memset(nx, 0, sizeof(felem)); 1136 memset(ny, 0, sizeof(felem)); 1137 memset(nz, 0, sizeof(felem)); 1138 n_is_infinity_mask = -1; 1139 1140 /* We add in a window of four bits each iteration and do this 64 times. */ 1141 for (i = 0; i < 256; i += 4) { 1142 if (i) { 1143 point_double(nx, ny, nz, nx, ny, nz); 1144 point_double(nx, ny, nz, nx, ny, nz); 1145 point_double(nx, ny, nz, nx, ny, nz); 1146 point_double(nx, ny, nz, nx, ny, nz); 1147 } 1148 1149 index = (p256_get_bit(scalar, 255 - i - 0) << 3) | 1150 (p256_get_bit(scalar, 255 - i - 1) << 2) | 1151 (p256_get_bit(scalar, 255 - i - 2) << 1) | 1152 p256_get_bit(scalar, 255 - i - 3); 1153 1154 /* See the comments in scalar_base_mult about handling infinities. */ 1155 select_jacobian_point(px, py, pz, precomp[0][0], index); 1156 point_add(tx, ty, tz, nx, ny, nz, px, py, pz); 1157 copy_conditional(nx, px, n_is_infinity_mask); 1158 copy_conditional(ny, py, n_is_infinity_mask); 1159 copy_conditional(nz, pz, n_is_infinity_mask); 1160 1161 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index); 1162 mask = p_is_noninfinite_mask & ~n_is_infinity_mask; 1163 1164 copy_conditional(nx, tx, mask); 1165 copy_conditional(ny, ty, mask); 1166 copy_conditional(nz, tz, mask); 1167 n_is_infinity_mask &= ~p_is_noninfinite_mask; 1168 } 1169 } 1170 1171 #define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p 1172 1173 #define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff} // 1 / 2^257 mod p256.p 1174 1175 static const p256_int kR = { kRDigits }; 1176 static const p256_int kRInv = { kRInvDigits }; 1177 1178 /* to_montgomery sets out = R*in. */ 1179 static void to_montgomery(felem out, const p256_int* in) { 1180 p256_int in_shifted; 1181 int i; 1182 1183 p256_init(&in_shifted); 1184 p256_modmul(&SECP256r1_p, in, 0, &kR, &in_shifted); 1185 1186 for (i = 0; i < NLIMBS; i++) { 1187 if ((i & 1) == 0) { 1188 out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits; 1189 p256_shr(&in_shifted, 29, &in_shifted); 1190 } else { 1191 out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits; 1192 p256_shr(&in_shifted, 28, &in_shifted); 1193 } 1194 } 1195 1196 p256_clear(&in_shifted); 1197 } 1198 1199 /* from_montgomery sets out=in/R. */ 1200 static void from_montgomery(p256_int* out, const felem in) { 1201 p256_int result, tmp; 1202 int i, top; 1203 1204 p256_init(&result); 1205 p256_init(&tmp); 1206 1207 p256_add_d(&tmp, in[NLIMBS - 1], &result); 1208 for (i = NLIMBS - 2; i >= 0; i--) { 1209 if ((i & 1) == 0) { 1210 top = p256_shl(&result, 29, &tmp); 1211 } else { 1212 top = p256_shl(&result, 28, &tmp); 1213 } 1214 top |= p256_add_d(&tmp, in[i], &result); 1215 } 1216 1217 p256_modmul(&SECP256r1_p, &kRInv, top, &result, out); 1218 1219 p256_clear(&result); 1220 p256_clear(&tmp); 1221 } 1222 1223 /* p256_base_point_mul sets {out_x,out_y} = nG, where n is < the 1224 * order of the group. */ 1225 void p256_base_point_mul(const p256_int* n, p256_int* out_x, p256_int* out_y) { 1226 felem x, y, z; 1227 1228 scalar_base_mult(x, y, z, n); 1229 1230 { 1231 felem x_affine, y_affine; 1232 1233 point_to_affine(x_affine, y_affine, x, y, z); 1234 from_montgomery(out_x, x_affine); 1235 from_montgomery(out_y, y_affine); 1236 } 1237 } 1238 1239 /* p256_points_mul_vartime sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where 1240 * n1 and n2 are < the order of the group. 1241 * 1242 * As indicated by the name, this function operates in variable time. This 1243 * is safe because it's used for signature validation which doesn't deal 1244 * with secrets. */ 1245 void p256_points_mul_vartime( 1246 const p256_int* n1, const p256_int* n2, const p256_int* in_x, 1247 const p256_int* in_y, p256_int* out_x, p256_int* out_y) { 1248 felem x1, y1, z1, x2, y2, z2, px, py; 1249 1250 /* If both scalars are zero, then the result is the point at infinity. */ 1251 if (p256_is_zero(n1) != 0 && p256_is_zero(n2) != 0) { 1252 p256_clear(out_x); 1253 p256_clear(out_y); 1254 return; 1255 } 1256 1257 to_montgomery(px, in_x); 1258 to_montgomery(py, in_y); 1259 scalar_base_mult(x1, y1, z1, n1); 1260 scalar_mult(x2, y2, z2, px, py, n2); 1261 1262 if (p256_is_zero(n2) != 0) { 1263 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just 1264 * {x1,y1,z1}. */ 1265 } else if (p256_is_zero(n1) != 0) { 1266 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just 1267 * {x2,y2,z2}. */ 1268 memcpy(x1, x2, sizeof(x2)); 1269 memcpy(y1, y2, sizeof(y2)); 1270 memcpy(z1, z2, sizeof(z2)); 1271 } else { 1272 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */ 1273 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2); 1274 } 1275 1276 point_to_affine(px, py, x1, y1, z1); 1277 from_montgomery(out_x, px); 1278 from_montgomery(out_y, py); 1279 } 1280