1 // The MIT License (MIT) 2 // 3 // Copyright (c) 2015-2016 the fiat-crypto authors (see the AUTHORS file). 4 // 5 // Permission is hereby granted, free of charge, to any person obtaining a copy 6 // of this software and associated documentation files (the "Software"), to deal 7 // in the Software without restriction, including without limitation the rights 8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 // copies of the Software, and to permit persons to whom the Software is 10 // furnished to do so, subject to the following conditions: 11 // 12 // The above copyright notice and this permission notice shall be included in all 13 // copies or substantial portions of the Software. 14 // 15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 // SOFTWARE. 22 23 // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP 24 // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as 25 // public domain but parts have been replaced with code generated by Fiat 26 // (https://github.com/mit-plv/fiat-crypto), which is MIT licensed. 27 // 28 // The field functions are shared by Ed25519 and X25519 where possible. 29 30 #include <openssl/curve25519.h> 31 32 #include <assert.h> 33 #include <string.h> 34 35 #include <openssl/cpu.h> 36 #include <openssl/mem.h> 37 #include <openssl/rand.h> 38 #include <openssl/sha.h> 39 #include <openssl/type_check.h> 40 41 #include "internal.h" 42 #include "../../crypto/internal.h" 43 44 45 // Various pre-computed constants. 
#include "./curve25519_tables.h"

// The fiat-crypto generated field arithmetic. The 64-bit build uses five
// 51-bit limbs per field element; the 32-bit build uses ten alternating
// 26/25-bit limbs.
#if defined(BORINGSSL_CURVE25519_64BIT)
#include "./curve25519_64.h"
#else
#include "./curve25519_32.h"
#endif  // BORINGSSL_CURVE25519_64BIT


// Low-level intrinsic operations

// load_3 returns the 24-bit little-endian value at |in|.
static uint64_t load_3(const uint8_t *in) {
  uint64_t result;
  result = (uint64_t)in[0];
  result |= ((uint64_t)in[1]) << 8;
  result |= ((uint64_t)in[2]) << 16;
  return result;
}

// load_4 returns the 32-bit little-endian value at |in|.
static uint64_t load_4(const uint8_t *in) {
  uint64_t result;
  result = (uint64_t)in[0];
  result |= ((uint64_t)in[1]) << 8;
  result |= ((uint64_t)in[2]) << 16;
  result |= ((uint64_t)in[3]) << 24;
  return result;
}


// Field operations.

#if defined(BORINGSSL_CURVE25519_64BIT)

typedef uint64_t fe_limb_t;
#define FE_NUM_LIMBS 5

// assert_fe asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc]]
//
// See comments in curve25519_64.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe(f)                                                    \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));             \
    }                                                                   \
  } while (0)

// assert_fe_loose asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664]]
//
// See comments in curve25519_64.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe_loose(f)                                              \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664));            \
    }                                                                   \
  } while (0)

#else

typedef uint32_t fe_limb_t;
#define FE_NUM_LIMBS 10

// assert_fe asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
//
// See comments in curve25519_32.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe(f)                                                     \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <=                                          \
             ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
    }                                                                    \
  } while (0)

// assert_fe_loose asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
//
// See comments in curve25519_32.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe_loose(f)                                               \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <=                                          \
             ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
    }                                                                    \
  } while (0)

#endif  // BORINGSSL_CURVE25519_64BIT

OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
                      "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");

// fe_frombytes_strict decodes the 255-bit little-endian encoding |s| into |h|.
// The caller must have already cleared the top bit of |s[31]|.
static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
  // |fiat_25519_from_bytes| requires the top-most bit be clear.
  assert((s[31] & 0x80) == 0);
  fiat_25519_from_bytes(h->v, s);
  assert_fe(h->v);
}

// fe_frombytes decodes |s| into |h|, ignoring the top bit of |s[31]|.
static void fe_frombytes(fe *h, const uint8_t s[32]) {
  uint8_t s_copy[32];
  OPENSSL_memcpy(s_copy, s, 32);
  s_copy[31] &= 0x7f;
  fe_frombytes_strict(h, s_copy);
}

// fe_tobytes encodes |f| as a 32-byte little-endian value in |s|.
static void fe_tobytes(uint8_t s[32], const fe *f) {
  assert_fe(f->v);
  fiat_25519_to_bytes(s, f->v);
}

// h = 0
static void fe_0(fe *h) {
  OPENSSL_memset(h, 0, sizeof(fe));
}

static void fe_loose_0(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(fe_loose));
}

// h = 1
static void fe_1(fe *h) {
  OPENSSL_memset(h, 0, sizeof(fe));
  h->v[0] = 1;
}

static void fe_loose_1(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(fe_loose));
  h->v[0] = 1;
}

// h = f + g
// Can overlap h with f or g.
static void fe_add(fe_loose *h, const fe *f, const fe *g) {
  assert_fe(f->v);
  assert_fe(g->v);
  fiat_25519_add(h->v, f->v, g->v);
  assert_fe_loose(h->v);
}

// h = f - g
// Can overlap h with f or g.
static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
  assert_fe(f->v);
  assert_fe(g->v);
  fiat_25519_sub(h->v, f->v, g->v);
  assert_fe_loose(h->v);
}

// fe_carry reduces a loosely-bounded |f| to the tight bounds of |fe|.
static void fe_carry(fe *h, const fe_loose* f) {
  assert_fe_loose(f->v);
  fiat_25519_carry(h->v, f->v);
  assert_fe(h->v);
}

// fe_mul_impl multiplies two (possibly loosely-bounded) limb vectors and
// carries the result to tight bounds.
static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
                        const fe_limb_t in1[FE_NUM_LIMBS],
                        const fe_limb_t in2[FE_NUM_LIMBS]) {
  assert_fe_loose(in1);
  assert_fe_loose(in2);
  fiat_25519_carry_mul(out, in1, in2);
  assert_fe(out);
}

// The fe_mul_* wrappers below name the output and input types in their
// suffixes: 't' for tight (|fe|) and 'l' for loose (|fe_loose|), output first.

static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

// h = f^2, accepting a loose input.
static void fe_sq_tl(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_square(h->v, f->v);
  assert_fe(h->v);
}

// h = f^2, for a tight input (tight bounds also satisfy the loose ones).
static void fe_sq_tt(fe *h, const fe *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_square(h->v, f->v);
  assert_fe(h->v);
}

// Replace (f,g) with (g,f) if b == 1;
// replace (f,g) with (f,g) if b == 0.
//
// Preconditions: b in {0,1}.
static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
  // Turn b (0 or 1) into an all-zeros or all-ones mask so the swap is
  // constant-time: no branch depends on |b|.
  b = 0-b;
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
    fe_limb_t x = f->v[i] ^ g->v[i];
    x &= b;
    f->v[i] ^= x;
    g->v[i] ^= x;
  }
}

// h = f * 121666 (the curve constant (A+2)/4 used by the X25519 ladder).
static void fe_mul121666(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_scmul_121666(h->v, f->v);
  assert_fe(h->v);
}

// h = -f
static void fe_neg(fe_loose *h, const fe *f) {
  assert_fe(f->v);
  fiat_25519_opp(h->v, f->v);
  assert_fe_loose(h->v);
}

// Replace (f,g) with (g,g) if b == 1;
// replace (f,g) with (f,g) if b == 0.
//
// Preconditions: b in {0,1}.
static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
  // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
  // calling convention the rest of this code wants, so implement it by hand.
  //
  // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
  // different one.
  (void)fiat_25519_selectznz;

  // As in |fe_cswap|, expand b into a constant-time mask.
  b = 0-b;
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
    fe_limb_t x = f->v[i] ^ g->v[i];
    x &= b;
    f->v[i] ^= x;
  }
}

// h = f
static void fe_copy(fe *h, const fe *f) {
  OPENSSL_memmove(h, f, sizeof(fe));
}

// h = f, converting a tight element to a loose one (always valid since tight
// bounds are a subset of loose bounds).
static void fe_copy_lt(fe_loose *h, const fe *f) {
  OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
                        "fe and fe_loose mismatch");
  OPENSSL_memmove(h, f, sizeof(fe));
}
#if !defined(OPENSSL_SMALL)
static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
  OPENSSL_memmove(h, f, sizeof(fe_loose));
}
#endif // !defined(OPENSSL_SMALL)

// fe_loose_invert sets |out| to the multiplicative inverse of |z|, computed
// with a fixed square-and-multiply addition chain (Fermat's little theorem;
// exponent 2^255 - 21 = p - 2). Zero maps to zero.
static void fe_loose_invert(fe *out, const fe_loose *z) {
  fe t0;
  fe t1;
  fe t2;
  fe t3;
  int i;

  fe_sq_tl(&t0, z);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_tlt(&t1, z, &t1);
  fe_mul_ttt(&t0, &t0, &t1);
  fe_sq_tt(&t2, &t0);
  fe_mul_ttt(&t1, &t1, &t2);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t2, &t2, &t1);
  fe_sq_tt(&t3, &t2);
  for (i = 1; i < 20; ++i) {
    fe_sq_tt(&t3, &t3);
  }
  fe_mul_ttt(&t2, &t3, &t2);
  fe_sq_tt(&t2, &t2);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t2, &t2, &t1);
  fe_sq_tt(&t3, &t2);
  for (i = 1; i < 100; ++i) {
    fe_sq_tt(&t3, &t3);
  }
  fe_mul_ttt(&t2, &t3, &t2);
  fe_sq_tt(&t2, &t2);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(out, &t1, &t0);
}

// fe_invert is |fe_loose_invert| for a tightly-bounded input.
static void fe_invert(fe *out, const fe *z) {
  fe_loose l;
  fe_copy_lt(&l, z);
  fe_loose_invert(out, &l);
}

// return 0 if f == 0
// return 1 if f != 0
static int fe_isnonzero(const fe_loose *f) {
  fe tight;
  fe_carry(&tight, f);
  uint8_t s[32];
  fe_tobytes(s, &tight);

  // Compare the canonical encoding against zero in constant time.
  static const uint8_t zero[32] = {0};
  return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
}

// return 1 if f is in {1,3,5,...,q-2}
// return 0 if f is in {0,2,4,...,q-1}
static int fe_isnegative(const fe *f) {
  uint8_t s[32];
  fe_tobytes(s, f);
  return s[0] & 1;
}

// h = 2 * f^2
static void fe_sq2_tt(fe *h, const fe *f) {
  // h = f^2
  fe_sq_tt(h, f);

  // h = h + h
  fe_loose tmp;
  fe_add(&tmp, h, h);
  fe_carry(h, &tmp);
}

// fe_pow22523 raises |z| to the power (q-5)/8 with a fixed addition chain;
// used below to compute square roots when decoding points.
static void fe_pow22523(fe *out, const fe *z) {
  fe t0;
  fe t1;
  fe t2;
  int i;

  fe_sq_tt(&t0, z);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t1, z, &t1);
  fe_mul_ttt(&t0, &t0, &t1);
  fe_sq_tt(&t0, &t0);
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t1, &t1, &t0);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 20; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t1, &t1, &t0);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 100; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t0, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t0, &t0);
  }
  fe_mul_ttt(out, &t0, z);
}


// Group operations.

// x25519_ge_tobytes encodes the projective point |h| as a 32-byte compressed
// Edwards point: the y-coordinate with the sign of x in the top bit.
void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
  fe recip;
  fe x;
  fe y;

  fe_invert(&recip, &h->Z);
  fe_mul_ttt(&x, &h->X, &recip);
  fe_mul_ttt(&y, &h->Y, &recip);
  fe_tobytes(s, &y);
  s[31] ^= fe_isnegative(&x) << 7;
}

// ge_p3_tobytes is |x25519_ge_tobytes| for the extended representation.
static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
  fe recip;
  fe x;
  fe y;

  fe_invert(&recip, &h->Z);
  fe_mul_ttt(&x, &h->X, &recip);
  fe_mul_ttt(&y, &h->Y, &recip);
  fe_tobytes(s, &y);
  s[31] ^= fe_isnegative(&x) << 7;
}

// x25519_ge_frombytes_vartime decodes a compressed point |s| into |h|,
// recovering x from y via x^2 = (y^2-1)/(dy^2+1). Returns 1 on success and 0
// if |s| does not encode a curve point. Not constant-time.
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
  fe u;
  fe_loose v;
  fe v3;
  fe vxx;
  fe_loose check;

  fe_frombytes(&h->Y, s);
  fe_1(&h->Z);
  fe_sq_tt(&v3, &h->Y);
  fe_mul_ttt(&vxx, &v3, &d);
  fe_sub(&v, &v3, &h->Z);  // u = y^2-1
  fe_carry(&u, &v);
  fe_add(&v, &vxx, &h->Z);  // v = dy^2+1

  fe_sq_tl(&v3, &v);
  fe_mul_ttl(&v3, &v3, &v);  // v3 = v^3
  fe_sq_tt(&h->X, &v3);
  fe_mul_ttl(&h->X, &h->X, &v);
  fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^7

  fe_pow22523(&h->X, &h->X);  // x = (uv^7)^((q-5)/8)
  fe_mul_ttt(&h->X, &h->X, &v3);
  fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^3(uv^7)^((q-5)/8)

  // Check vx^2 == u, or vx^2 == -u, in which case multiply x by sqrt(-1).
  fe_sq_tt(&vxx, &h->X);
  fe_mul_ttl(&vxx, &vxx, &v);
  fe_sub(&check, &vxx, &u);
  if (fe_isnonzero(&check)) {
    fe_add(&check, &vxx, &u);
    if (fe_isnonzero(&check)) {
      return 0;
    }
    fe_mul_ttt(&h->X, &h->X, &sqrtm1);
  }

  // Select the root whose sign matches the encoded sign bit.
  if (fe_isnegative(&h->X) != (s[31] >> 7)) {
    fe_loose t;
    fe_neg(&t, &h->X);
    fe_carry(&h->X, &t);
  }

  fe_mul_ttt(&h->T, &h->X, &h->Y);
  return 1;
}

// ge_p2_0 sets |h| to the identity element (0 : 1 : 1).
static void ge_p2_0(ge_p2 *h) {
  fe_0(&h->X);
  fe_1(&h->Y);
  fe_1(&h->Z);
}

// ge_p3_0 sets |h| to the identity element in extended coordinates.
static void ge_p3_0(ge_p3 *h) {
  fe_0(&h->X);
  fe_1(&h->Y);
  fe_1(&h->Z);
  fe_0(&h->T);
}

// ge_cached_0 sets |h| to the identity in cached form.
static void ge_cached_0(ge_cached *h) {
  fe_loose_1(&h->YplusX);
  fe_loose_1(&h->YminusX);
  fe_loose_1(&h->Z);
  fe_loose_0(&h->T2d);
}

// ge_precomp_0 sets |h| to the identity in precomputed (affine) form.
static void ge_precomp_0(ge_precomp *h) {
  fe_loose_1(&h->yplusx);
  fe_loose_1(&h->yminusx);
  fe_loose_0(&h->xy2d);
}

// r = p
static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
  fe_copy(&r->X, &p->X);
  fe_copy(&r->Y, &p->Y);
  fe_copy(&r->Z, &p->Z);
}

// r = p
void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
  fe_add(&r->YplusX, &p->Y, &p->X);
  fe_sub(&r->YminusX, &p->Y, &p->X);
  fe_copy_lt(&r->Z, &p->Z);
  fe_mul_ltt(&r->T2d, &p->T, &d2);
}

// r = p
void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
  fe_mul_tll(&r->X, &p->X, &p->T);
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
  fe_mul_tll(&r->Z, &p->Z, &p->T);
}

// r = p
void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
  fe_mul_tll(&r->X, &p->X, &p->T);
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
  fe_mul_tll(&r->Z, &p->Z, &p->T);
  fe_mul_tll(&r->T, &p->X, &p->Y);
}

// r = p
static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
  ge_p3 t;
  x25519_ge_p1p1_to_p3(&t, p);
  x25519_ge_p3_to_cached(r, &t);
}

// r = 2 * p
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
  fe trX, trZ, trT;
  fe t0;

  fe_sq_tt(&trX, &p->X);
  fe_sq_tt(&trZ, &p->Y);
  fe_sq2_tt(&trT, &p->Z);
  fe_add(&r->Y, &p->X, &p->Y);
  fe_sq_tl(&t0, &r->Y);

  fe_add(&r->Y, &trZ, &trX);
  fe_sub(&r->Z, &trZ, &trX);
  fe_carry(&trZ, &r->Y);
  fe_sub(&r->X, &t0, &trZ);
  fe_carry(&trZ, &r->Z);
  fe_sub(&r->T, &trT, &trZ);
}

// r = 2 * p
static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
  ge_p2 q;
  ge_p3_to_p2(&q, p);
  ge_p2_dbl(r, &q);
}

// r = p + q
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
  fe trY, trZ, trT;

  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->yplusx);
  fe_mul_tll(&trY, &r->Y, &q->yminusx);
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
  fe_add(&r->T, &p->Z, &p->Z);
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_add(&r->Z, &trZ, &trT);
  fe_sub(&r->T, &trZ, &trT);
}

// r = p - q
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
  fe trY, trZ, trT;

  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->yminusx);
  fe_mul_tll(&trY, &r->Y, &q->yplusx);
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
  fe_add(&r->T, &p->Z, &p->Z);
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_sub(&r->Z, &trZ, &trT);
  fe_add(&r->T, &trZ, &trT);
}

// r = p + q
void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe trX, trY, trZ, trT;

  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->YplusX);
  fe_mul_tll(&trY, &r->Y, &q->YminusX);
  fe_mul_tlt(&trT, &q->T2d, &p->T);
  fe_mul_ttl(&trX, &p->Z, &q->Z);
  fe_add(&r->T, &trX, &trX);
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_add(&r->Z, &trZ, &trT);
  fe_sub(&r->T, &trZ, &trT);
}

// r = p - q
void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe trX, trY, trZ, trT;

  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->YminusX);
  fe_mul_tll(&trY, &r->Y, &q->YplusX);
  fe_mul_tlt(&trT, &q->T2d, &p->T);
  fe_mul_ttl(&trX, &p->Z, &q->Z);
  fe_add(&r->T, &trX, &trX);
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_sub(&r->Z, &trZ, &trT);
  fe_add(&r->T, &trZ, &trT);
}

// equal returns 1 if b == c and 0 otherwise, in constant time.
static uint8_t equal(signed char b, signed char c) {
  uint8_t ub = b;
  uint8_t uc = c;
  uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
  uint32_t y = x;       // 0: yes; 1..255: no
  y -= 1;               // 4294967295: yes; 0..254: no
  y >>= 31;             // 1: yes; 0: no
  return y;
}

// cmov conditionally copies |u| into |t|, in constant time.
// Preconditions: b in {0,1}.
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
  fe_cmov(&t->yplusx, &u->yplusx, b);
  fe_cmov(&t->yminusx, &u->yminusx, b);
  fe_cmov(&t->xy2d, &u->xy2d, b);
}

void x25519_ge_scalarmult_small_precomp(
    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
  // precomp_table is first expanded into matching |ge_precomp|
  // elements.
  ge_precomp multiples[15];

  unsigned i;
  for (i = 0; i < 15; i++) {
    // The precomputed table is assumed to already clear the top bit, so
    // |fe_frombytes_strict| may be used directly.
    const uint8_t *bytes = &precomp_table[i*(2 * 32)];
    fe x, y;
    fe_frombytes_strict(&x, bytes);
    fe_frombytes_strict(&y, bytes + 32);

    ge_precomp *out = &multiples[i];
    fe_add(&out->yplusx, &y, &x);
    fe_sub(&out->yminusx, &y, &x);
    fe_mul_ltt(&out->xy2d, &x, &y);
    fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
  }

  // See the comment above |k25519SmallPrecomp| about the structure of the
  // precomputed elements. This loop does 64 additions and 64 doublings to
  // calculate the result.
  ge_p3_0(h);

  // i is unsigned, so the loop runs i = 63 down to 0 and stops when i wraps.
  for (i = 63; i < 64; i--) {
    unsigned j;
    signed char index = 0;

    // Gather one bit from each of the four scalar quarters into a 4-bit index.
    for (j = 0; j < 4; j++) {
      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
      index |= (bit << j);
    }

    // Constant-time table lookup of multiples[index - 1] (identity if 0).
    ge_precomp e;
    ge_precomp_0(&e);

    for (j = 1; j < 16; j++) {
      cmov(&e, &multiples[j-1], equal(index, j));
    }

    ge_cached cached;
    ge_p1p1 r;
    x25519_ge_p3_to_cached(&cached, h);
    x25519_ge_add(&r, h, &cached);
    x25519_ge_p1p1_to_p3(h, &r);

    ge_madd(&r, h, &e);
    x25519_ge_p1p1_to_p3(h, &r);
  }
}

#if defined(OPENSSL_SMALL)

void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
}

#else

// negative returns 1 if b < 0 and 0 otherwise, in constant time.
static uint8_t negative(signed char b) {
  uint32_t x = b;
  x >>= 31;  // 1: yes; 0: no
  return x;
}

// table_select sets |t| to |b| * B(pos) from the precomputed table, where
// b may be negative, in constant time.
static void table_select(ge_precomp *t, int pos, signed char b) {
  ge_precomp minust;
  uint8_t bnegative = negative(b);
  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);

  // Constant-time lookup of the |babs| entry (identity when babs == 0).
  ge_precomp_0(t);
  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
  cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
  cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
  cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
  cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
  cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
  cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
  cmov(t, &k25519Precomp[pos][7], equal(babs, 8));

  // Negating a precomputed point swaps y+x with y-x and negates xy2d.
  fe_copy_ll(&minust.yplusx, &t->yminusx);
  fe_copy_ll(&minust.yminusx, &t->yplusx);

  // NOTE: the input table is canonical, but types don't encode it
  fe tmp;
  fe_carry(&tmp, &t->xy2d);
  fe_neg(&minust.xy2d, &tmp);

  cmov(t, &minust, bnegative);
}

// h = a * B
// where a = a[0]+256*a[1]+...+256^31 a[31]
// B is the Ed25519 base point (x,4/5) with x positive.
//
// Preconditions:
//   a[31] <= 127
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
  signed char e[64];
  signed char carry;
  ge_p1p1 r;
  ge_p2 s;
  ge_precomp t;
  int i;

  // Split the scalar into 64 radix-16 digits.
  for (i = 0; i < 32; ++i) {
    e[2 * i + 0] = (a[i] >> 0) & 15;
    e[2 * i + 1] = (a[i] >> 4) & 15;
  }
  // each e[i] is between 0 and 15
  // e[63] is between 0 and 7

  // Recode into signed digits so the table only needs multiples 1..8.
  carry = 0;
  for (i = 0; i < 63; ++i) {
    e[i] += carry;
    carry = e[i] + 8;
    carry >>= 4;
    e[i] -= carry << 4;
  }
  e[63] += carry;
  // each e[i] is between -8 and 8

  // Accumulate the odd digits first; the even digits are scaled by 16 via
  // four doublings below and then accumulated.
  ge_p3_0(h);
  for (i = 1; i < 64; i += 2) {
    table_select(&t, i / 2, e[i]);
    ge_madd(&r, h, &t);
    x25519_ge_p1p1_to_p3(h, &r);
  }

  ge_p3_dbl(&r, h);
  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
  x25519_ge_p1p1_to_p3(h, &r);

  for (i = 0; i < 64; i += 2) {
    table_select(&t, i / 2, e[i]);
    ge_madd(&r, h, &t);
    x25519_ge_p1p1_to_p3(h, &r);
  }
}

#endif

// cmov_cached conditionally copies |u| into |t|, in constant time.
// Preconditions: b in {0,1}.
static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
  fe_cmov(&t->YplusX, &u->YplusX, b);
  fe_cmov(&t->YminusX, &u->YminusX, b);
  fe_cmov(&t->Z, &u->Z, b);
  fe_cmov(&t->T2d, &u->T2d, b);
}

// r = scalar * A.
// where a = a[0]+256*a[1]+...+256^31 a[31].
void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
  ge_p2 Ai_p2[8];
  ge_cached Ai[16];
  ge_p1p1 t;

  // Build the table Ai[j] = j * A for j = 0..15.
  ge_cached_0(&Ai[0]);
  x25519_ge_p3_to_cached(&Ai[1], A);
  ge_p3_to_p2(&Ai_p2[1], A);

  unsigned i;
  for (i = 2; i < 16; i += 2) {
    // Ai[i] = 2 * Ai[i/2]; Ai[i+1] = Ai[i] + A.
    ge_p2_dbl(&t, &Ai_p2[i / 2]);
    ge_p1p1_to_cached(&Ai[i], &t);
    if (i < 8) {
      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
    }
    x25519_ge_add(&t, A, &Ai[i]);
    ge_p1p1_to_cached(&Ai[i + 1], &t);
    if (i < 7) {
      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
    }
  }

  ge_p2_0(r);
  ge_p3 u;

  // Process the scalar four bits at a time, most-significant nibble first.
  for (i = 0; i < 256; i += 4) {
    ge_p2_dbl(&t, r);
    x25519_ge_p1p1_to_p2(r, &t);
    ge_p2_dbl(&t, r);
    x25519_ge_p1p1_to_p2(r, &t);
    ge_p2_dbl(&t, r);
    x25519_ge_p1p1_to_p2(r, &t);
    ge_p2_dbl(&t, r);
    x25519_ge_p1p1_to_p3(&u, &t);

    uint8_t index = scalar[31 - i/8];
    index >>= 4 - (i & 4);
    index &= 0xf;

    // Constant-time lookup of Ai[index].
    unsigned j;
    ge_cached selected;
    ge_cached_0(&selected);
    for (j = 0; j < 16; j++) {
      cmov_cached(&selected, &Ai[j], equal(j, index));
    }

    x25519_ge_add(&t, &u, &selected);
    x25519_ge_p1p1_to_p2(r, &t);
  }
}

// slide recodes the 32-byte scalar |a| into 256 signed, odd digits in
// [-15, 15] (non-adjacent-form style), written to |r|. Not constant-time.
static void slide(signed char *r, const uint8_t *a) {
  int i;
  int b;
  int k;

  // Start with the plain bit decomposition.
  for (i = 0; i < 256; ++i) {
    r[i] = 1 & (a[i >> 3] >> (i & 7));
  }

  // Merge runs of nearby set bits into single odd digits, carrying upward
  // when subtraction is used.
  for (i = 0; i < 256; ++i) {
    if (r[i]) {
      for (b = 1; b <= 6 && i + b < 256; ++b) {
        if (r[i + b]) {
          if (r[i] + (r[i + b] << b) <= 15) {
            r[i] += r[i + b] << b;
            r[i + b] = 0;
          } else if (r[i] - (r[i + b] << b) >= -15) {
            r[i] -= r[i + b] << b;
            for (k = i + b; k < 256; ++k) {
              if (!r[k]) {
                r[k] = 1;
                break;
              }
              r[k] = 0;
            }
          } else {
            break;
          }
        }
      }
    }
  }
}

// r = a * A + b * B
// where a = a[0]+256*a[1]+...+256^31 a[31].
// and b = b[0]+256*b[1]+...+256^31 b[31].
// B is the Ed25519 base point (x,4/5) with x positive.
//
// This is variable-time: execution depends on the digits of |a| and |b|, so
// it must only be used with public values (e.g. signature verification).
static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
                                         const ge_p3 *A, const uint8_t *b) {
  signed char aslide[256];
  signed char bslide[256];
  ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
  ge_p1p1 t;
  ge_p3 u;
  ge_p3 A2;
  int i;

  slide(aslide, a);
  slide(bslide, b);

  // Build the table of odd multiples of A: Ai[k] = (2k+1) * A.
  x25519_ge_p3_to_cached(&Ai[0], A);
  ge_p3_dbl(&t, A);
  x25519_ge_p1p1_to_p3(&A2, &t);
  x25519_ge_add(&t, &A2, &Ai[0]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[1], &u);
  x25519_ge_add(&t, &A2, &Ai[1]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[2], &u);
  x25519_ge_add(&t, &A2, &Ai[2]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[3], &u);
  x25519_ge_add(&t, &A2, &Ai[3]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[4], &u);
  x25519_ge_add(&t, &A2, &Ai[4]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[5], &u);
  x25519_ge_add(&t, &A2, &Ai[5]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[6], &u);
  x25519_ge_add(&t, &A2, &Ai[6]);
  x25519_ge_p1p1_to_p3(&u, &t);
  x25519_ge_p3_to_cached(&Ai[7], &u);

  ge_p2_0(r);

  // Skip the leading zero digits of both scalars.
  for (i = 255; i >= 0; --i) {
    if (aslide[i] || bslide[i]) {
      break;
    }
  }

  for (; i >= 0; --i) {
    ge_p2_dbl(&t, r);

    if (aslide[i] > 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
    } else if (aslide[i] < 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
    }

    // |Bi| is the precomputed table of odd multiples of the base point.
    if (bslide[i] > 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      ge_madd(&t, &u, &Bi[bslide[i] / 2]);
    } else if (bslide[i] < 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
    }

    x25519_ge_p1p1_to_p2(r, &t);
  }
}

// int64_lshift21
returns |a << 21| but is defined when shifting bits into the 1050 // sign bit. This works around a language flaw in C. 1051 static inline int64_t int64_lshift21(int64_t a) { 1052 return (int64_t)((uint64_t)a << 21); 1053 } 1054 1055 // The set of scalars is \Z/l 1056 // where l = 2^252 + 27742317777372353535851937790883648493. 1057 1058 // Input: 1059 // s[0]+256*s[1]+...+256^63*s[63] = s 1060 // 1061 // Output: 1062 // s[0]+256*s[1]+...+256^31*s[31] = s mod l 1063 // where l = 2^252 + 27742317777372353535851937790883648493. 1064 // Overwrites s in place. 1065 void x25519_sc_reduce(uint8_t s[64]) { 1066 int64_t s0 = 2097151 & load_3(s); 1067 int64_t s1 = 2097151 & (load_4(s + 2) >> 5); 1068 int64_t s2 = 2097151 & (load_3(s + 5) >> 2); 1069 int64_t s3 = 2097151 & (load_4(s + 7) >> 7); 1070 int64_t s4 = 2097151 & (load_4(s + 10) >> 4); 1071 int64_t s5 = 2097151 & (load_3(s + 13) >> 1); 1072 int64_t s6 = 2097151 & (load_4(s + 15) >> 6); 1073 int64_t s7 = 2097151 & (load_3(s + 18) >> 3); 1074 int64_t s8 = 2097151 & load_3(s + 21); 1075 int64_t s9 = 2097151 & (load_4(s + 23) >> 5); 1076 int64_t s10 = 2097151 & (load_3(s + 26) >> 2); 1077 int64_t s11 = 2097151 & (load_4(s + 28) >> 7); 1078 int64_t s12 = 2097151 & (load_4(s + 31) >> 4); 1079 int64_t s13 = 2097151 & (load_3(s + 34) >> 1); 1080 int64_t s14 = 2097151 & (load_4(s + 36) >> 6); 1081 int64_t s15 = 2097151 & (load_3(s + 39) >> 3); 1082 int64_t s16 = 2097151 & load_3(s + 42); 1083 int64_t s17 = 2097151 & (load_4(s + 44) >> 5); 1084 int64_t s18 = 2097151 & (load_3(s + 47) >> 2); 1085 int64_t s19 = 2097151 & (load_4(s + 49) >> 7); 1086 int64_t s20 = 2097151 & (load_4(s + 52) >> 4); 1087 int64_t s21 = 2097151 & (load_3(s + 55) >> 1); 1088 int64_t s22 = 2097151 & (load_4(s + 57) >> 6); 1089 int64_t s23 = (load_4(s + 60) >> 3); 1090 int64_t carry0; 1091 int64_t carry1; 1092 int64_t carry2; 1093 int64_t carry3; 1094 int64_t carry4; 1095 int64_t carry5; 1096 int64_t carry6; 1097 int64_t carry7; 1098 int64_t carry8; 1099 
int64_t carry9; 1100 int64_t carry10; 1101 int64_t carry11; 1102 int64_t carry12; 1103 int64_t carry13; 1104 int64_t carry14; 1105 int64_t carry15; 1106 int64_t carry16; 1107 1108 s11 += s23 * 666643; 1109 s12 += s23 * 470296; 1110 s13 += s23 * 654183; 1111 s14 -= s23 * 997805; 1112 s15 += s23 * 136657; 1113 s16 -= s23 * 683901; 1114 s23 = 0; 1115 1116 s10 += s22 * 666643; 1117 s11 += s22 * 470296; 1118 s12 += s22 * 654183; 1119 s13 -= s22 * 997805; 1120 s14 += s22 * 136657; 1121 s15 -= s22 * 683901; 1122 s22 = 0; 1123 1124 s9 += s21 * 666643; 1125 s10 += s21 * 470296; 1126 s11 += s21 * 654183; 1127 s12 -= s21 * 997805; 1128 s13 += s21 * 136657; 1129 s14 -= s21 * 683901; 1130 s21 = 0; 1131 1132 s8 += s20 * 666643; 1133 s9 += s20 * 470296; 1134 s10 += s20 * 654183; 1135 s11 -= s20 * 997805; 1136 s12 += s20 * 136657; 1137 s13 -= s20 * 683901; 1138 s20 = 0; 1139 1140 s7 += s19 * 666643; 1141 s8 += s19 * 470296; 1142 s9 += s19 * 654183; 1143 s10 -= s19 * 997805; 1144 s11 += s19 * 136657; 1145 s12 -= s19 * 683901; 1146 s19 = 0; 1147 1148 s6 += s18 * 666643; 1149 s7 += s18 * 470296; 1150 s8 += s18 * 654183; 1151 s9 -= s18 * 997805; 1152 s10 += s18 * 136657; 1153 s11 -= s18 * 683901; 1154 s18 = 0; 1155 1156 carry6 = (s6 + (1 << 20)) >> 21; 1157 s7 += carry6; 1158 s6 -= int64_lshift21(carry6); 1159 carry8 = (s8 + (1 << 20)) >> 21; 1160 s9 += carry8; 1161 s8 -= int64_lshift21(carry8); 1162 carry10 = (s10 + (1 << 20)) >> 21; 1163 s11 += carry10; 1164 s10 -= int64_lshift21(carry10); 1165 carry12 = (s12 + (1 << 20)) >> 21; 1166 s13 += carry12; 1167 s12 -= int64_lshift21(carry12); 1168 carry14 = (s14 + (1 << 20)) >> 21; 1169 s15 += carry14; 1170 s14 -= int64_lshift21(carry14); 1171 carry16 = (s16 + (1 << 20)) >> 21; 1172 s17 += carry16; 1173 s16 -= int64_lshift21(carry16); 1174 1175 carry7 = (s7 + (1 << 20)) >> 21; 1176 s8 += carry7; 1177 s7 -= int64_lshift21(carry7); 1178 carry9 = (s9 + (1 << 20)) >> 21; 1179 s10 += carry9; 1180 s9 -= int64_lshift21(carry9); 1181 carry11 = 
(s11 + (1 << 20)) >> 21; 1182 s12 += carry11; 1183 s11 -= int64_lshift21(carry11); 1184 carry13 = (s13 + (1 << 20)) >> 21; 1185 s14 += carry13; 1186 s13 -= int64_lshift21(carry13); 1187 carry15 = (s15 + (1 << 20)) >> 21; 1188 s16 += carry15; 1189 s15 -= int64_lshift21(carry15); 1190 1191 s5 += s17 * 666643; 1192 s6 += s17 * 470296; 1193 s7 += s17 * 654183; 1194 s8 -= s17 * 997805; 1195 s9 += s17 * 136657; 1196 s10 -= s17 * 683901; 1197 s17 = 0; 1198 1199 s4 += s16 * 666643; 1200 s5 += s16 * 470296; 1201 s6 += s16 * 654183; 1202 s7 -= s16 * 997805; 1203 s8 += s16 * 136657; 1204 s9 -= s16 * 683901; 1205 s16 = 0; 1206 1207 s3 += s15 * 666643; 1208 s4 += s15 * 470296; 1209 s5 += s15 * 654183; 1210 s6 -= s15 * 997805; 1211 s7 += s15 * 136657; 1212 s8 -= s15 * 683901; 1213 s15 = 0; 1214 1215 s2 += s14 * 666643; 1216 s3 += s14 * 470296; 1217 s4 += s14 * 654183; 1218 s5 -= s14 * 997805; 1219 s6 += s14 * 136657; 1220 s7 -= s14 * 683901; 1221 s14 = 0; 1222 1223 s1 += s13 * 666643; 1224 s2 += s13 * 470296; 1225 s3 += s13 * 654183; 1226 s4 -= s13 * 997805; 1227 s5 += s13 * 136657; 1228 s6 -= s13 * 683901; 1229 s13 = 0; 1230 1231 s0 += s12 * 666643; 1232 s1 += s12 * 470296; 1233 s2 += s12 * 654183; 1234 s3 -= s12 * 997805; 1235 s4 += s12 * 136657; 1236 s5 -= s12 * 683901; 1237 s12 = 0; 1238 1239 carry0 = (s0 + (1 << 20)) >> 21; 1240 s1 += carry0; 1241 s0 -= int64_lshift21(carry0); 1242 carry2 = (s2 + (1 << 20)) >> 21; 1243 s3 += carry2; 1244 s2 -= int64_lshift21(carry2); 1245 carry4 = (s4 + (1 << 20)) >> 21; 1246 s5 += carry4; 1247 s4 -= int64_lshift21(carry4); 1248 carry6 = (s6 + (1 << 20)) >> 21; 1249 s7 += carry6; 1250 s6 -= int64_lshift21(carry6); 1251 carry8 = (s8 + (1 << 20)) >> 21; 1252 s9 += carry8; 1253 s8 -= int64_lshift21(carry8); 1254 carry10 = (s10 + (1 << 20)) >> 21; 1255 s11 += carry10; 1256 s10 -= int64_lshift21(carry10); 1257 1258 carry1 = (s1 + (1 << 20)) >> 21; 1259 s2 += carry1; 1260 s1 -= int64_lshift21(carry1); 1261 carry3 = (s3 + (1 << 20)) >> 21; 1262 s4 
+= carry3; 1263 s3 -= int64_lshift21(carry3); 1264 carry5 = (s5 + (1 << 20)) >> 21; 1265 s6 += carry5; 1266 s5 -= int64_lshift21(carry5); 1267 carry7 = (s7 + (1 << 20)) >> 21; 1268 s8 += carry7; 1269 s7 -= int64_lshift21(carry7); 1270 carry9 = (s9 + (1 << 20)) >> 21; 1271 s10 += carry9; 1272 s9 -= int64_lshift21(carry9); 1273 carry11 = (s11 + (1 << 20)) >> 21; 1274 s12 += carry11; 1275 s11 -= int64_lshift21(carry11); 1276 1277 s0 += s12 * 666643; 1278 s1 += s12 * 470296; 1279 s2 += s12 * 654183; 1280 s3 -= s12 * 997805; 1281 s4 += s12 * 136657; 1282 s5 -= s12 * 683901; 1283 s12 = 0; 1284 1285 carry0 = s0 >> 21; 1286 s1 += carry0; 1287 s0 -= int64_lshift21(carry0); 1288 carry1 = s1 >> 21; 1289 s2 += carry1; 1290 s1 -= int64_lshift21(carry1); 1291 carry2 = s2 >> 21; 1292 s3 += carry2; 1293 s2 -= int64_lshift21(carry2); 1294 carry3 = s3 >> 21; 1295 s4 += carry3; 1296 s3 -= int64_lshift21(carry3); 1297 carry4 = s4 >> 21; 1298 s5 += carry4; 1299 s4 -= int64_lshift21(carry4); 1300 carry5 = s5 >> 21; 1301 s6 += carry5; 1302 s5 -= int64_lshift21(carry5); 1303 carry6 = s6 >> 21; 1304 s7 += carry6; 1305 s6 -= int64_lshift21(carry6); 1306 carry7 = s7 >> 21; 1307 s8 += carry7; 1308 s7 -= int64_lshift21(carry7); 1309 carry8 = s8 >> 21; 1310 s9 += carry8; 1311 s8 -= int64_lshift21(carry8); 1312 carry9 = s9 >> 21; 1313 s10 += carry9; 1314 s9 -= int64_lshift21(carry9); 1315 carry10 = s10 >> 21; 1316 s11 += carry10; 1317 s10 -= int64_lshift21(carry10); 1318 carry11 = s11 >> 21; 1319 s12 += carry11; 1320 s11 -= int64_lshift21(carry11); 1321 1322 s0 += s12 * 666643; 1323 s1 += s12 * 470296; 1324 s2 += s12 * 654183; 1325 s3 -= s12 * 997805; 1326 s4 += s12 * 136657; 1327 s5 -= s12 * 683901; 1328 s12 = 0; 1329 1330 carry0 = s0 >> 21; 1331 s1 += carry0; 1332 s0 -= int64_lshift21(carry0); 1333 carry1 = s1 >> 21; 1334 s2 += carry1; 1335 s1 -= int64_lshift21(carry1); 1336 carry2 = s2 >> 21; 1337 s3 += carry2; 1338 s2 -= int64_lshift21(carry2); 1339 carry3 = s3 >> 21; 1340 s4 += carry3; 1341 
s3 -= int64_lshift21(carry3); 1342 carry4 = s4 >> 21; 1343 s5 += carry4; 1344 s4 -= int64_lshift21(carry4); 1345 carry5 = s5 >> 21; 1346 s6 += carry5; 1347 s5 -= int64_lshift21(carry5); 1348 carry6 = s6 >> 21; 1349 s7 += carry6; 1350 s6 -= int64_lshift21(carry6); 1351 carry7 = s7 >> 21; 1352 s8 += carry7; 1353 s7 -= int64_lshift21(carry7); 1354 carry8 = s8 >> 21; 1355 s9 += carry8; 1356 s8 -= int64_lshift21(carry8); 1357 carry9 = s9 >> 21; 1358 s10 += carry9; 1359 s9 -= int64_lshift21(carry9); 1360 carry10 = s10 >> 21; 1361 s11 += carry10; 1362 s10 -= int64_lshift21(carry10); 1363 1364 s[0] = s0 >> 0; 1365 s[1] = s0 >> 8; 1366 s[2] = (s0 >> 16) | (s1 << 5); 1367 s[3] = s1 >> 3; 1368 s[4] = s1 >> 11; 1369 s[5] = (s1 >> 19) | (s2 << 2); 1370 s[6] = s2 >> 6; 1371 s[7] = (s2 >> 14) | (s3 << 7); 1372 s[8] = s3 >> 1; 1373 s[9] = s3 >> 9; 1374 s[10] = (s3 >> 17) | (s4 << 4); 1375 s[11] = s4 >> 4; 1376 s[12] = s4 >> 12; 1377 s[13] = (s4 >> 20) | (s5 << 1); 1378 s[14] = s5 >> 7; 1379 s[15] = (s5 >> 15) | (s6 << 6); 1380 s[16] = s6 >> 2; 1381 s[17] = s6 >> 10; 1382 s[18] = (s6 >> 18) | (s7 << 3); 1383 s[19] = s7 >> 5; 1384 s[20] = s7 >> 13; 1385 s[21] = s8 >> 0; 1386 s[22] = s8 >> 8; 1387 s[23] = (s8 >> 16) | (s9 << 5); 1388 s[24] = s9 >> 3; 1389 s[25] = s9 >> 11; 1390 s[26] = (s9 >> 19) | (s10 << 2); 1391 s[27] = s10 >> 6; 1392 s[28] = (s10 >> 14) | (s11 << 7); 1393 s[29] = s11 >> 1; 1394 s[30] = s11 >> 9; 1395 s[31] = s11 >> 17; 1396 } 1397 1398 // Input: 1399 // a[0]+256*a[1]+...+256^31*a[31] = a 1400 // b[0]+256*b[1]+...+256^31*b[31] = b 1401 // c[0]+256*c[1]+...+256^31*c[31] = c 1402 // 1403 // Output: 1404 // s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l 1405 // where l = 2^252 + 27742317777372353535851937790883648493. 
1406 static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, 1407 const uint8_t *c) { 1408 int64_t a0 = 2097151 & load_3(a); 1409 int64_t a1 = 2097151 & (load_4(a + 2) >> 5); 1410 int64_t a2 = 2097151 & (load_3(a + 5) >> 2); 1411 int64_t a3 = 2097151 & (load_4(a + 7) >> 7); 1412 int64_t a4 = 2097151 & (load_4(a + 10) >> 4); 1413 int64_t a5 = 2097151 & (load_3(a + 13) >> 1); 1414 int64_t a6 = 2097151 & (load_4(a + 15) >> 6); 1415 int64_t a7 = 2097151 & (load_3(a + 18) >> 3); 1416 int64_t a8 = 2097151 & load_3(a + 21); 1417 int64_t a9 = 2097151 & (load_4(a + 23) >> 5); 1418 int64_t a10 = 2097151 & (load_3(a + 26) >> 2); 1419 int64_t a11 = (load_4(a + 28) >> 7); 1420 int64_t b0 = 2097151 & load_3(b); 1421 int64_t b1 = 2097151 & (load_4(b + 2) >> 5); 1422 int64_t b2 = 2097151 & (load_3(b + 5) >> 2); 1423 int64_t b3 = 2097151 & (load_4(b + 7) >> 7); 1424 int64_t b4 = 2097151 & (load_4(b + 10) >> 4); 1425 int64_t b5 = 2097151 & (load_3(b + 13) >> 1); 1426 int64_t b6 = 2097151 & (load_4(b + 15) >> 6); 1427 int64_t b7 = 2097151 & (load_3(b + 18) >> 3); 1428 int64_t b8 = 2097151 & load_3(b + 21); 1429 int64_t b9 = 2097151 & (load_4(b + 23) >> 5); 1430 int64_t b10 = 2097151 & (load_3(b + 26) >> 2); 1431 int64_t b11 = (load_4(b + 28) >> 7); 1432 int64_t c0 = 2097151 & load_3(c); 1433 int64_t c1 = 2097151 & (load_4(c + 2) >> 5); 1434 int64_t c2 = 2097151 & (load_3(c + 5) >> 2); 1435 int64_t c3 = 2097151 & (load_4(c + 7) >> 7); 1436 int64_t c4 = 2097151 & (load_4(c + 10) >> 4); 1437 int64_t c5 = 2097151 & (load_3(c + 13) >> 1); 1438 int64_t c6 = 2097151 & (load_4(c + 15) >> 6); 1439 int64_t c7 = 2097151 & (load_3(c + 18) >> 3); 1440 int64_t c8 = 2097151 & load_3(c + 21); 1441 int64_t c9 = 2097151 & (load_4(c + 23) >> 5); 1442 int64_t c10 = 2097151 & (load_3(c + 26) >> 2); 1443 int64_t c11 = (load_4(c + 28) >> 7); 1444 int64_t s0; 1445 int64_t s1; 1446 int64_t s2; 1447 int64_t s3; 1448 int64_t s4; 1449 int64_t s5; 1450 int64_t s6; 1451 int64_t s7; 1452 int64_t s8; 
1453 int64_t s9; 1454 int64_t s10; 1455 int64_t s11; 1456 int64_t s12; 1457 int64_t s13; 1458 int64_t s14; 1459 int64_t s15; 1460 int64_t s16; 1461 int64_t s17; 1462 int64_t s18; 1463 int64_t s19; 1464 int64_t s20; 1465 int64_t s21; 1466 int64_t s22; 1467 int64_t s23; 1468 int64_t carry0; 1469 int64_t carry1; 1470 int64_t carry2; 1471 int64_t carry3; 1472 int64_t carry4; 1473 int64_t carry5; 1474 int64_t carry6; 1475 int64_t carry7; 1476 int64_t carry8; 1477 int64_t carry9; 1478 int64_t carry10; 1479 int64_t carry11; 1480 int64_t carry12; 1481 int64_t carry13; 1482 int64_t carry14; 1483 int64_t carry15; 1484 int64_t carry16; 1485 int64_t carry17; 1486 int64_t carry18; 1487 int64_t carry19; 1488 int64_t carry20; 1489 int64_t carry21; 1490 int64_t carry22; 1491 1492 s0 = c0 + a0 * b0; 1493 s1 = c1 + a0 * b1 + a1 * b0; 1494 s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; 1495 s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; 1496 s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; 1497 s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; 1498 s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; 1499 s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + 1500 a6 * b1 + a7 * b0; 1501 s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + 1502 a6 * b2 + a7 * b1 + a8 * b0; 1503 s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + 1504 a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; 1505 s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + 1506 a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; 1507 s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + 1508 a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; 1509 s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + 1510 a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; 1511 s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + 1512 a9 * b4 + a10 * b3 + a11 * 
b2; 1513 s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + 1514 a10 * b4 + a11 * b3; 1515 s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + 1516 a11 * b4; 1517 s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; 1518 s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; 1519 s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; 1520 s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; 1521 s20 = a9 * b11 + a10 * b10 + a11 * b9; 1522 s21 = a10 * b11 + a11 * b10; 1523 s22 = a11 * b11; 1524 s23 = 0; 1525 1526 carry0 = (s0 + (1 << 20)) >> 21; 1527 s1 += carry0; 1528 s0 -= int64_lshift21(carry0); 1529 carry2 = (s2 + (1 << 20)) >> 21; 1530 s3 += carry2; 1531 s2 -= int64_lshift21(carry2); 1532 carry4 = (s4 + (1 << 20)) >> 21; 1533 s5 += carry4; 1534 s4 -= int64_lshift21(carry4); 1535 carry6 = (s6 + (1 << 20)) >> 21; 1536 s7 += carry6; 1537 s6 -= int64_lshift21(carry6); 1538 carry8 = (s8 + (1 << 20)) >> 21; 1539 s9 += carry8; 1540 s8 -= int64_lshift21(carry8); 1541 carry10 = (s10 + (1 << 20)) >> 21; 1542 s11 += carry10; 1543 s10 -= int64_lshift21(carry10); 1544 carry12 = (s12 + (1 << 20)) >> 21; 1545 s13 += carry12; 1546 s12 -= int64_lshift21(carry12); 1547 carry14 = (s14 + (1 << 20)) >> 21; 1548 s15 += carry14; 1549 s14 -= int64_lshift21(carry14); 1550 carry16 = (s16 + (1 << 20)) >> 21; 1551 s17 += carry16; 1552 s16 -= int64_lshift21(carry16); 1553 carry18 = (s18 + (1 << 20)) >> 21; 1554 s19 += carry18; 1555 s18 -= int64_lshift21(carry18); 1556 carry20 = (s20 + (1 << 20)) >> 21; 1557 s21 += carry20; 1558 s20 -= int64_lshift21(carry20); 1559 carry22 = (s22 + (1 << 20)) >> 21; 1560 s23 += carry22; 1561 s22 -= int64_lshift21(carry22); 1562 1563 carry1 = (s1 + (1 << 20)) >> 21; 1564 s2 += carry1; 1565 s1 -= int64_lshift21(carry1); 1566 carry3 = (s3 + (1 << 20)) >> 21; 1567 s4 += carry3; 1568 s3 -= int64_lshift21(carry3); 1569 carry5 = (s5 + (1 << 20)) >> 21; 1570 s6 += carry5; 
1571 s5 -= int64_lshift21(carry5); 1572 carry7 = (s7 + (1 << 20)) >> 21; 1573 s8 += carry7; 1574 s7 -= int64_lshift21(carry7); 1575 carry9 = (s9 + (1 << 20)) >> 21; 1576 s10 += carry9; 1577 s9 -= int64_lshift21(carry9); 1578 carry11 = (s11 + (1 << 20)) >> 21; 1579 s12 += carry11; 1580 s11 -= int64_lshift21(carry11); 1581 carry13 = (s13 + (1 << 20)) >> 21; 1582 s14 += carry13; 1583 s13 -= int64_lshift21(carry13); 1584 carry15 = (s15 + (1 << 20)) >> 21; 1585 s16 += carry15; 1586 s15 -= int64_lshift21(carry15); 1587 carry17 = (s17 + (1 << 20)) >> 21; 1588 s18 += carry17; 1589 s17 -= int64_lshift21(carry17); 1590 carry19 = (s19 + (1 << 20)) >> 21; 1591 s20 += carry19; 1592 s19 -= int64_lshift21(carry19); 1593 carry21 = (s21 + (1 << 20)) >> 21; 1594 s22 += carry21; 1595 s21 -= int64_lshift21(carry21); 1596 1597 s11 += s23 * 666643; 1598 s12 += s23 * 470296; 1599 s13 += s23 * 654183; 1600 s14 -= s23 * 997805; 1601 s15 += s23 * 136657; 1602 s16 -= s23 * 683901; 1603 s23 = 0; 1604 1605 s10 += s22 * 666643; 1606 s11 += s22 * 470296; 1607 s12 += s22 * 654183; 1608 s13 -= s22 * 997805; 1609 s14 += s22 * 136657; 1610 s15 -= s22 * 683901; 1611 s22 = 0; 1612 1613 s9 += s21 * 666643; 1614 s10 += s21 * 470296; 1615 s11 += s21 * 654183; 1616 s12 -= s21 * 997805; 1617 s13 += s21 * 136657; 1618 s14 -= s21 * 683901; 1619 s21 = 0; 1620 1621 s8 += s20 * 666643; 1622 s9 += s20 * 470296; 1623 s10 += s20 * 654183; 1624 s11 -= s20 * 997805; 1625 s12 += s20 * 136657; 1626 s13 -= s20 * 683901; 1627 s20 = 0; 1628 1629 s7 += s19 * 666643; 1630 s8 += s19 * 470296; 1631 s9 += s19 * 654183; 1632 s10 -= s19 * 997805; 1633 s11 += s19 * 136657; 1634 s12 -= s19 * 683901; 1635 s19 = 0; 1636 1637 s6 += s18 * 666643; 1638 s7 += s18 * 470296; 1639 s8 += s18 * 654183; 1640 s9 -= s18 * 997805; 1641 s10 += s18 * 136657; 1642 s11 -= s18 * 683901; 1643 s18 = 0; 1644 1645 carry6 = (s6 + (1 << 20)) >> 21; 1646 s7 += carry6; 1647 s6 -= int64_lshift21(carry6); 1648 carry8 = (s8 + (1 << 20)) >> 21; 1649 s9 += 
carry8; 1650 s8 -= int64_lshift21(carry8); 1651 carry10 = (s10 + (1 << 20)) >> 21; 1652 s11 += carry10; 1653 s10 -= int64_lshift21(carry10); 1654 carry12 = (s12 + (1 << 20)) >> 21; 1655 s13 += carry12; 1656 s12 -= int64_lshift21(carry12); 1657 carry14 = (s14 + (1 << 20)) >> 21; 1658 s15 += carry14; 1659 s14 -= int64_lshift21(carry14); 1660 carry16 = (s16 + (1 << 20)) >> 21; 1661 s17 += carry16; 1662 s16 -= int64_lshift21(carry16); 1663 1664 carry7 = (s7 + (1 << 20)) >> 21; 1665 s8 += carry7; 1666 s7 -= int64_lshift21(carry7); 1667 carry9 = (s9 + (1 << 20)) >> 21; 1668 s10 += carry9; 1669 s9 -= int64_lshift21(carry9); 1670 carry11 = (s11 + (1 << 20)) >> 21; 1671 s12 += carry11; 1672 s11 -= int64_lshift21(carry11); 1673 carry13 = (s13 + (1 << 20)) >> 21; 1674 s14 += carry13; 1675 s13 -= int64_lshift21(carry13); 1676 carry15 = (s15 + (1 << 20)) >> 21; 1677 s16 += carry15; 1678 s15 -= int64_lshift21(carry15); 1679 1680 s5 += s17 * 666643; 1681 s6 += s17 * 470296; 1682 s7 += s17 * 654183; 1683 s8 -= s17 * 997805; 1684 s9 += s17 * 136657; 1685 s10 -= s17 * 683901; 1686 s17 = 0; 1687 1688 s4 += s16 * 666643; 1689 s5 += s16 * 470296; 1690 s6 += s16 * 654183; 1691 s7 -= s16 * 997805; 1692 s8 += s16 * 136657; 1693 s9 -= s16 * 683901; 1694 s16 = 0; 1695 1696 s3 += s15 * 666643; 1697 s4 += s15 * 470296; 1698 s5 += s15 * 654183; 1699 s6 -= s15 * 997805; 1700 s7 += s15 * 136657; 1701 s8 -= s15 * 683901; 1702 s15 = 0; 1703 1704 s2 += s14 * 666643; 1705 s3 += s14 * 470296; 1706 s4 += s14 * 654183; 1707 s5 -= s14 * 997805; 1708 s6 += s14 * 136657; 1709 s7 -= s14 * 683901; 1710 s14 = 0; 1711 1712 s1 += s13 * 666643; 1713 s2 += s13 * 470296; 1714 s3 += s13 * 654183; 1715 s4 -= s13 * 997805; 1716 s5 += s13 * 136657; 1717 s6 -= s13 * 683901; 1718 s13 = 0; 1719 1720 s0 += s12 * 666643; 1721 s1 += s12 * 470296; 1722 s2 += s12 * 654183; 1723 s3 -= s12 * 997805; 1724 s4 += s12 * 136657; 1725 s5 -= s12 * 683901; 1726 s12 = 0; 1727 1728 carry0 = (s0 + (1 << 20)) >> 21; 1729 s1 += carry0; 
1730 s0 -= int64_lshift21(carry0); 1731 carry2 = (s2 + (1 << 20)) >> 21; 1732 s3 += carry2; 1733 s2 -= int64_lshift21(carry2); 1734 carry4 = (s4 + (1 << 20)) >> 21; 1735 s5 += carry4; 1736 s4 -= int64_lshift21(carry4); 1737 carry6 = (s6 + (1 << 20)) >> 21; 1738 s7 += carry6; 1739 s6 -= int64_lshift21(carry6); 1740 carry8 = (s8 + (1 << 20)) >> 21; 1741 s9 += carry8; 1742 s8 -= int64_lshift21(carry8); 1743 carry10 = (s10 + (1 << 20)) >> 21; 1744 s11 += carry10; 1745 s10 -= int64_lshift21(carry10); 1746 1747 carry1 = (s1 + (1 << 20)) >> 21; 1748 s2 += carry1; 1749 s1 -= int64_lshift21(carry1); 1750 carry3 = (s3 + (1 << 20)) >> 21; 1751 s4 += carry3; 1752 s3 -= int64_lshift21(carry3); 1753 carry5 = (s5 + (1 << 20)) >> 21; 1754 s6 += carry5; 1755 s5 -= int64_lshift21(carry5); 1756 carry7 = (s7 + (1 << 20)) >> 21; 1757 s8 += carry7; 1758 s7 -= int64_lshift21(carry7); 1759 carry9 = (s9 + (1 << 20)) >> 21; 1760 s10 += carry9; 1761 s9 -= int64_lshift21(carry9); 1762 carry11 = (s11 + (1 << 20)) >> 21; 1763 s12 += carry11; 1764 s11 -= int64_lshift21(carry11); 1765 1766 s0 += s12 * 666643; 1767 s1 += s12 * 470296; 1768 s2 += s12 * 654183; 1769 s3 -= s12 * 997805; 1770 s4 += s12 * 136657; 1771 s5 -= s12 * 683901; 1772 s12 = 0; 1773 1774 carry0 = s0 >> 21; 1775 s1 += carry0; 1776 s0 -= int64_lshift21(carry0); 1777 carry1 = s1 >> 21; 1778 s2 += carry1; 1779 s1 -= int64_lshift21(carry1); 1780 carry2 = s2 >> 21; 1781 s3 += carry2; 1782 s2 -= int64_lshift21(carry2); 1783 carry3 = s3 >> 21; 1784 s4 += carry3; 1785 s3 -= int64_lshift21(carry3); 1786 carry4 = s4 >> 21; 1787 s5 += carry4; 1788 s4 -= int64_lshift21(carry4); 1789 carry5 = s5 >> 21; 1790 s6 += carry5; 1791 s5 -= int64_lshift21(carry5); 1792 carry6 = s6 >> 21; 1793 s7 += carry6; 1794 s6 -= int64_lshift21(carry6); 1795 carry7 = s7 >> 21; 1796 s8 += carry7; 1797 s7 -= int64_lshift21(carry7); 1798 carry8 = s8 >> 21; 1799 s9 += carry8; 1800 s8 -= int64_lshift21(carry8); 1801 carry9 = s9 >> 21; 1802 s10 += carry9; 1803 s9 -= 
int64_lshift21(carry9); 1804 carry10 = s10 >> 21; 1805 s11 += carry10; 1806 s10 -= int64_lshift21(carry10); 1807 carry11 = s11 >> 21; 1808 s12 += carry11; 1809 s11 -= int64_lshift21(carry11); 1810 1811 s0 += s12 * 666643; 1812 s1 += s12 * 470296; 1813 s2 += s12 * 654183; 1814 s3 -= s12 * 997805; 1815 s4 += s12 * 136657; 1816 s5 -= s12 * 683901; 1817 s12 = 0; 1818 1819 carry0 = s0 >> 21; 1820 s1 += carry0; 1821 s0 -= int64_lshift21(carry0); 1822 carry1 = s1 >> 21; 1823 s2 += carry1; 1824 s1 -= int64_lshift21(carry1); 1825 carry2 = s2 >> 21; 1826 s3 += carry2; 1827 s2 -= int64_lshift21(carry2); 1828 carry3 = s3 >> 21; 1829 s4 += carry3; 1830 s3 -= int64_lshift21(carry3); 1831 carry4 = s4 >> 21; 1832 s5 += carry4; 1833 s4 -= int64_lshift21(carry4); 1834 carry5 = s5 >> 21; 1835 s6 += carry5; 1836 s5 -= int64_lshift21(carry5); 1837 carry6 = s6 >> 21; 1838 s7 += carry6; 1839 s6 -= int64_lshift21(carry6); 1840 carry7 = s7 >> 21; 1841 s8 += carry7; 1842 s7 -= int64_lshift21(carry7); 1843 carry8 = s8 >> 21; 1844 s9 += carry8; 1845 s8 -= int64_lshift21(carry8); 1846 carry9 = s9 >> 21; 1847 s10 += carry9; 1848 s9 -= int64_lshift21(carry9); 1849 carry10 = s10 >> 21; 1850 s11 += carry10; 1851 s10 -= int64_lshift21(carry10); 1852 1853 s[0] = s0 >> 0; 1854 s[1] = s0 >> 8; 1855 s[2] = (s0 >> 16) | (s1 << 5); 1856 s[3] = s1 >> 3; 1857 s[4] = s1 >> 11; 1858 s[5] = (s1 >> 19) | (s2 << 2); 1859 s[6] = s2 >> 6; 1860 s[7] = (s2 >> 14) | (s3 << 7); 1861 s[8] = s3 >> 1; 1862 s[9] = s3 >> 9; 1863 s[10] = (s3 >> 17) | (s4 << 4); 1864 s[11] = s4 >> 4; 1865 s[12] = s4 >> 12; 1866 s[13] = (s4 >> 20) | (s5 << 1); 1867 s[14] = s5 >> 7; 1868 s[15] = (s5 >> 15) | (s6 << 6); 1869 s[16] = s6 >> 2; 1870 s[17] = s6 >> 10; 1871 s[18] = (s6 >> 18) | (s7 << 3); 1872 s[19] = s7 >> 5; 1873 s[20] = s7 >> 13; 1874 s[21] = s8 >> 0; 1875 s[22] = s8 >> 8; 1876 s[23] = (s8 >> 16) | (s9 << 5); 1877 s[24] = s9 >> 3; 1878 s[25] = s9 >> 11; 1879 s[26] = (s9 >> 19) | (s10 << 2); 1880 s[27] = s10 >> 6; 1881 s[28] = 
(s10 >> 14) | (s11 << 7); 1882 s[29] = s11 >> 1; 1883 s[30] = s11 >> 9; 1884 s[31] = s11 >> 17; 1885 } 1886 1887 void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) { 1888 uint8_t seed[32]; 1889 RAND_bytes(seed, 32); 1890 ED25519_keypair_from_seed(out_public_key, out_private_key, seed); 1891 } 1892 1893 int ED25519_sign(uint8_t out_sig[64], const uint8_t *message, 1894 size_t message_len, const uint8_t private_key[64]) { 1895 // NOTE: The documentation on this function says that it returns zero on 1896 // allocation failure. While that can't happen with the current 1897 // implementation, we want to reserve the ability to allocate in this 1898 // implementation in the future. 1899 1900 uint8_t az[SHA512_DIGEST_LENGTH]; 1901 SHA512(private_key, 32, az); 1902 1903 az[0] &= 248; 1904 az[31] &= 63; 1905 az[31] |= 64; 1906 1907 SHA512_CTX hash_ctx; 1908 SHA512_Init(&hash_ctx); 1909 SHA512_Update(&hash_ctx, az + 32, 32); 1910 SHA512_Update(&hash_ctx, message, message_len); 1911 uint8_t nonce[SHA512_DIGEST_LENGTH]; 1912 SHA512_Final(nonce, &hash_ctx); 1913 1914 x25519_sc_reduce(nonce); 1915 ge_p3 R; 1916 x25519_ge_scalarmult_base(&R, nonce); 1917 ge_p3_tobytes(out_sig, &R); 1918 1919 SHA512_Init(&hash_ctx); 1920 SHA512_Update(&hash_ctx, out_sig, 32); 1921 SHA512_Update(&hash_ctx, private_key + 32, 32); 1922 SHA512_Update(&hash_ctx, message, message_len); 1923 uint8_t hram[SHA512_DIGEST_LENGTH]; 1924 SHA512_Final(hram, &hash_ctx); 1925 1926 x25519_sc_reduce(hram); 1927 sc_muladd(out_sig + 32, hram, az, nonce); 1928 1929 return 1; 1930 } 1931 1932 int ED25519_verify(const uint8_t *message, size_t message_len, 1933 const uint8_t signature[64], const uint8_t public_key[32]) { 1934 ge_p3 A; 1935 if ((signature[63] & 224) != 0 || 1936 !x25519_ge_frombytes_vartime(&A, public_key)) { 1937 return 0; 1938 } 1939 1940 fe_loose t; 1941 fe_neg(&t, &A.X); 1942 fe_carry(&A.X, &t); 1943 fe_neg(&t, &A.T); 1944 fe_carry(&A.T, &t); 1945 1946 uint8_t pkcopy[32]; 
1947 OPENSSL_memcpy(pkcopy, public_key, 32); 1948 uint8_t rcopy[32]; 1949 OPENSSL_memcpy(rcopy, signature, 32); 1950 union { 1951 uint64_t u64[4]; 1952 uint8_t u8[32]; 1953 } scopy; 1954 OPENSSL_memcpy(&scopy.u8[0], signature + 32, 32); 1955 1956 // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in 1957 // the range [0, order) in order to prevent signature malleability. 1958 1959 // kOrder is the order of Curve25519 in little-endian form. 1960 static const uint64_t kOrder[4] = { 1961 UINT64_C(0x5812631a5cf5d3ed), 1962 UINT64_C(0x14def9dea2f79cd6), 1963 0, 1964 UINT64_C(0x1000000000000000), 1965 }; 1966 for (size_t i = 3;; i--) { 1967 if (scopy.u64[i] > kOrder[i]) { 1968 return 0; 1969 } else if (scopy.u64[i] < kOrder[i]) { 1970 break; 1971 } else if (i == 0) { 1972 return 0; 1973 } 1974 } 1975 1976 SHA512_CTX hash_ctx; 1977 SHA512_Init(&hash_ctx); 1978 SHA512_Update(&hash_ctx, signature, 32); 1979 SHA512_Update(&hash_ctx, public_key, 32); 1980 SHA512_Update(&hash_ctx, message, message_len); 1981 uint8_t h[SHA512_DIGEST_LENGTH]; 1982 SHA512_Final(h, &hash_ctx); 1983 1984 x25519_sc_reduce(h); 1985 1986 ge_p2 R; 1987 ge_double_scalarmult_vartime(&R, h, &A, scopy.u8); 1988 1989 uint8_t rcheck[32]; 1990 x25519_ge_tobytes(rcheck, &R); 1991 1992 return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0; 1993 } 1994 1995 void ED25519_keypair_from_seed(uint8_t out_public_key[32], 1996 uint8_t out_private_key[64], 1997 const uint8_t seed[32]) { 1998 uint8_t az[SHA512_DIGEST_LENGTH]; 1999 SHA512(seed, 32, az); 2000 2001 az[0] &= 248; 2002 az[31] &= 127; 2003 az[31] |= 64; 2004 2005 ge_p3 A; 2006 x25519_ge_scalarmult_base(&A, az); 2007 ge_p3_tobytes(out_public_key, &A); 2008 2009 OPENSSL_memcpy(out_private_key, seed, 32); 2010 OPENSSL_memcpy(out_private_key + 32, out_public_key, 32); 2011 } 2012 2013 2014 static void x25519_scalar_mult_generic(uint8_t out[32], 2015 const uint8_t scalar[32], 2016 const uint8_t point[32]) { 2017 fe x1, x2, z2, x3, z3, tmp0, 
tmp1; 2018 fe_loose x2l, z2l, x3l, tmp0l, tmp1l; 2019 2020 uint8_t e[32]; 2021 OPENSSL_memcpy(e, scalar, 32); 2022 e[0] &= 248; 2023 e[31] &= 127; 2024 e[31] |= 64; 2025 2026 // The following implementation was transcribed to Coq and proven to 2027 // correspond to unary scalar multiplication in affine coordinates given that 2028 // x1 != 0 is the x coordinate of some point on the curve. It was also checked 2029 // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 2030 // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the 2031 // underlying field, so it applies to Curve25519 itself and the quadratic 2032 // twist of Curve25519. It was not proven in Coq that prime-field arithmetic 2033 // correctly simulates extension-field arithmetic on prime-field values. 2034 // The decoding of the byte array representation of e was not considered. 2035 // Specification of Montgomery curves in affine coordinates: 2036 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27> 2037 // Proof that these form a group that is isomorphic to a Weierstrass curve: 2038 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35> 2039 // Coq transcription and correctness proof of the loop (where scalarbits=255): 2040 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118> 2041 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278> 2042 // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 2043 fe_frombytes(&x1, point); 2044 fe_1(&x2); 2045 fe_0(&z2); 2046 fe_copy(&x3, &x1); 2047 fe_1(&z3); 2048 2049 unsigned swap = 0; 2050 int pos; 2051 for (pos = 254; pos >= 0; --pos) { 2052 // loop invariant as of right before the test, for the case where x1 != 0: 2053 // 
pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero 2054 // let r := e >> (pos+1) in the following equalities of projective points: 2055 // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) 2056 // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) 2057 // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) 2058 unsigned b = 1 & (e[pos / 8] >> (pos & 7)); 2059 swap ^= b; 2060 fe_cswap(&x2, &x3, swap); 2061 fe_cswap(&z2, &z3, swap); 2062 swap = b; 2063 // Coq transcription of ladderstep formula (called from transcribed loop): 2064 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89> 2065 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131> 2066 // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217> 2067 // x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147> 2068 fe_sub(&tmp0l, &x3, &z3); 2069 fe_sub(&tmp1l, &x2, &z2); 2070 fe_add(&x2l, &x2, &z2); 2071 fe_add(&z2l, &x3, &z3); 2072 fe_mul_tll(&z3, &tmp0l, &x2l); 2073 fe_mul_tll(&z2, &z2l, &tmp1l); 2074 fe_sq_tl(&tmp0, &tmp1l); 2075 fe_sq_tl(&tmp1, &x2l); 2076 fe_add(&x3l, &z3, &z2); 2077 fe_sub(&z2l, &z3, &z2); 2078 fe_mul_ttt(&x2, &tmp1, &tmp0); 2079 fe_sub(&tmp1l, &tmp1, &tmp0); 2080 fe_sq_tl(&z2, &z2l); 2081 fe_mul121666(&z3, &tmp1l); 2082 fe_sq_tl(&x3, &x3l); 2083 fe_add(&tmp0l, &tmp0, &z3); 2084 fe_mul_ttt(&z3, &x1, &z2); 2085 fe_mul_tll(&z2, &tmp1l, &tmp0l); 2086 } 2087 // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) 2088 fe_cswap(&x2, &x3, swap); 2089 fe_cswap(&z2, &z3, swap); 2090 2091 fe_invert(&z2, &z2); 2092 fe_mul_ttt(&x2, &x2, &z2); 2093 fe_tobytes(out, &x2); 2094 } 2095 2096 static void x25519_scalar_mult(uint8_t out[32], const uint8_t 
scalar[32], 2097 const uint8_t point[32]) { 2098 #if defined(BORINGSSL_X25519_NEON) 2099 if (CRYPTO_is_NEON_capable()) { 2100 x25519_NEON(out, scalar, point); 2101 return; 2102 } 2103 #endif 2104 2105 x25519_scalar_mult_generic(out, scalar, point); 2106 } 2107 2108 void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) { 2109 RAND_bytes(out_private_key, 32); 2110 2111 // All X25519 implementations should decode scalars correctly (see 2112 // https://tools.ietf.org/html/rfc7748#section-5). However, if an 2113 // implementation doesn't then it might interoperate with random keys a 2114 // fraction of the time because they'll, randomly, happen to be correctly 2115 // formed. 2116 // 2117 // Thus we do the opposite of the masking here to make sure that our private 2118 // keys are never correctly masked and so, hopefully, any incorrect 2119 // implementations are deterministically broken. 2120 // 2121 // This does not affect security because, although we're throwing away 2122 // entropy, a valid implementation of scalarmult should throw away the exact 2123 // same bits anyway. 2124 out_private_key[0] |= ~248; 2125 out_private_key[31] &= ~64; 2126 out_private_key[31] |= ~127; 2127 2128 X25519_public_from_private(out_public_value, out_private_key); 2129 } 2130 2131 int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32], 2132 const uint8_t peer_public_value[32]) { 2133 static const uint8_t kZeros[32] = {0}; 2134 x25519_scalar_mult(out_shared_key, private_key, peer_public_value); 2135 // The all-zero output results when the input is a point of small order. 
2136 return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0; 2137 } 2138 2139 void X25519_public_from_private(uint8_t out_public_value[32], 2140 const uint8_t private_key[32]) { 2141 #if defined(BORINGSSL_X25519_NEON) 2142 if (CRYPTO_is_NEON_capable()) { 2143 static const uint8_t kMongomeryBasePoint[32] = {9}; 2144 x25519_NEON(out_public_value, private_key, kMongomeryBasePoint); 2145 return; 2146 } 2147 #endif 2148 2149 uint8_t e[32]; 2150 OPENSSL_memcpy(e, private_key, 32); 2151 e[0] &= 248; 2152 e[31] &= 127; 2153 e[31] |= 64; 2154 2155 ge_p3 A; 2156 x25519_ge_scalarmult_base(&A, e); 2157 2158 // We only need the u-coordinate of the curve25519 point. The map is 2159 // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). 2160 fe_loose zplusy, zminusy; 2161 fe zminusy_inv; 2162 fe_add(&zplusy, &A.Z, &A.Y); 2163 fe_sub(&zminusy, &A.Z, &A.Y); 2164 fe_loose_invert(&zminusy_inv, &zminusy); 2165 fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); 2166 fe_tobytes(out_public_value, &zminusy_inv); 2167 } 2168