      1 // The MIT License (MIT)
      2 //
      3 // Copyright (c) 2015-2016 the fiat-crypto authors (see the AUTHORS file).
      4 //
      5 // Permission is hereby granted, free of charge, to any person obtaining a copy
      6 // of this software and associated documentation files (the "Software"), to deal
      7 // in the Software without restriction, including without limitation the rights
      8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      9 // copies of the Software, and to permit persons to whom the Software is
     10 // furnished to do so, subject to the following conditions:
     11 //
     12 // The above copyright notice and this permission notice shall be included in all
     13 // copies or substantial portions of the Software.
     14 //
     15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     21 // SOFTWARE.
     22 
     23 // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
     24 // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
     25 // public domain but parts have been replaced with code generated by Fiat
     26 // (https://github.com/mit-plv/fiat-crypto), which is MIT licensed.
     27 //
     28 // The field functions are shared by Ed25519 and X25519 where possible.
     29 
     30 #include <openssl/curve25519.h>
     31 
     32 #include <assert.h>
     33 #include <string.h>
     34 
     35 #include <openssl/cpu.h>
     36 #include <openssl/mem.h>
     37 #include <openssl/rand.h>
     38 #include <openssl/sha.h>
     39 #include <openssl/type_check.h>
     40 
     41 #include "internal.h"
     42 #include "../../crypto/internal.h"
     43 
     44 
     45 // Various pre-computed constants.
     46 #include "./curve25519_tables.h"
     47 
     48 #if defined(BORINGSSL_CURVE25519_64BIT)
     49 #include "./curve25519_64.h"
     50 #else
     51 #include "./curve25519_32.h"
     52 #endif  // BORINGSSL_CURVE25519_64BIT
     53 
     54 
     55 // Low-level intrinsic operations
     56 
     57 static uint64_t load_3(const uint8_t *in) {
     58   uint64_t result;
     59   result = (uint64_t)in[0];
     60   result |= ((uint64_t)in[1]) << 8;
     61   result |= ((uint64_t)in[2]) << 16;
     62   return result;
     63 }
     64 
     65 static uint64_t load_4(const uint8_t *in) {
     66   uint64_t result;
     67   result = (uint64_t)in[0];
     68   result |= ((uint64_t)in[1]) << 8;
     69   result |= ((uint64_t)in[2]) << 16;
     70   result |= ((uint64_t)in[3]) << 24;
     71   return result;
     72 }
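// Both helpers read a little-endian integer from |in|. As a worked example,
// load_3 applied to the bytes {0x01, 0x02, 0x03} returns
//   0x01 | (0x02 << 8) | (0x03 << 16) = 0x030201,
// and load_4 additionally folds in a fourth byte at bit 24. The results always
// fit in the low 24 (respectively 32) bits of the returned uint64_t.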
     73 
     74 
     75 // Field operations.
     76 
     77 #if defined(BORINGSSL_CURVE25519_64BIT)
     78 
     79 typedef uint64_t fe_limb_t;
     80 #define FE_NUM_LIMBS 5
     81 
     82 // assert_fe asserts that |f| satisfies bounds:
     83 //
     84 //  [[0x0 ~> 0x8cccccccccccc],
     85 //   [0x0 ~> 0x8cccccccccccc],
     86 //   [0x0 ~> 0x8cccccccccccc],
     87 //   [0x0 ~> 0x8cccccccccccc],
     88 //   [0x0 ~> 0x8cccccccccccc]]
     89 //
     90 // See comments in curve25519_64.h for which functions use these bounds for
     91 // inputs or outputs.
     92 #define assert_fe(f)                                                    \
     93   do {                                                                  \
     94     for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
     95       assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));             \
     96     }                                                                   \
     97   } while (0)
     98 
     99 // assert_fe_loose asserts that |f| satisfies bounds:
    100 //
    101 //  [[0x0 ~> 0x1a666666666664],
    102 //   [0x0 ~> 0x1a666666666664],
    103 //   [0x0 ~> 0x1a666666666664],
    104 //   [0x0 ~> 0x1a666666666664],
    105 //   [0x0 ~> 0x1a666666666664]]
    106 //
    107 // See comments in curve25519_64.h for which functions use these bounds for
    108 // inputs or outputs.
    109 #define assert_fe_loose(f)                                              \
    110   do {                                                                  \
    111     for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
    112       assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664));            \
    113     }                                                                   \
    114   } while (0)
    115 
    116 #else
    117 
    118 typedef uint32_t fe_limb_t;
    119 #define FE_NUM_LIMBS 10
    120 
    121 // assert_fe asserts that |f| satisfies bounds:
    122 //
    123 //  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
    124 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
    125 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
    126 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
    127 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
    128 //
    129 // See comments in curve25519_32.h for which functions use these bounds for
    130 // inputs or outputs.
    131 #define assert_fe(f)                                                     \
    132   do {                                                                   \
    133     for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
    134       assert(f[_assert_fe_i] <=                                          \
    135              ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
    136     }                                                                    \
    137   } while (0)
    138 
    139 // assert_fe_loose asserts that |f| satisfies bounds:
    140 //
    141 //  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
    142 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
    143 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
    144 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
    145 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
    146 //
    147 // See comments in curve25519_32.h for which functions use these bounds for
    148 // inputs or outputs.
    149 #define assert_fe_loose(f)                                               \
    150   do {                                                                   \
    151     for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
    152       assert(f[_assert_fe_i] <=                                          \
    153              ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
    154     }                                                                    \
    155   } while (0)
    156 
    157 #endif  // BORINGSSL_CURVE25519_64BIT
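// Note that each "loose" bound above is exactly three times the corresponding
// "tight" bound (for example, 0x1a666666666664 == 3 * 0x8cccccccccccc). This
// mirrors how the helpers below are used: |fe_add|, |fe_sub| and |fe_neg| take
// tight inputs and produce loose outputs, while |fe_carry| and the
// multiplication and squaring helpers accept loose inputs and return tight
// outputs again.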
    158 
    159 OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
    160                       "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
    161 
    162 static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
    163   // |fiat_25519_from_bytes| requires the top-most bit be clear.
    164   assert((s[31] & 0x80) == 0);
    165   fiat_25519_from_bytes(h->v, s);
    166   assert_fe(h->v);
    167 }
    168 
    169 static void fe_frombytes(fe *h, const uint8_t s[32]) {
    170   uint8_t s_copy[32];
    171   OPENSSL_memcpy(s_copy, s, 32);
    172   s_copy[31] &= 0x7f;
    173   fe_frombytes_strict(h, s_copy);
    174 }
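// fe_frombytes masks off the most significant bit of s[31] because field
// elements occupy only the low 255 bits of their encoding. In the point
// encodings below, bit 255 instead carries the sign of the x-coordinate (see
// |x25519_ge_tobytes| and |x25519_ge_frombytes_vartime|).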
    175 
    176 static void fe_tobytes(uint8_t s[32], const fe *f) {
    177   assert_fe(f->v);
    178   fiat_25519_to_bytes(s, f->v);
    179 }
    180 
    181 // h = 0
    182 static void fe_0(fe *h) {
    183   OPENSSL_memset(h, 0, sizeof(fe));
    184 }
    185 
    186 static void fe_loose_0(fe_loose *h) {
    187   OPENSSL_memset(h, 0, sizeof(fe_loose));
    188 }
    189 
    190 // h = 1
    191 static void fe_1(fe *h) {
    192   OPENSSL_memset(h, 0, sizeof(fe));
    193   h->v[0] = 1;
    194 }
    195 
    196 static void fe_loose_1(fe_loose *h) {
    197   OPENSSL_memset(h, 0, sizeof(fe_loose));
    198   h->v[0] = 1;
    199 }
    200 
    201 // h = f + g
    202 // Can overlap h with f or g.
    203 static void fe_add(fe_loose *h, const fe *f, const fe *g) {
    204   assert_fe(f->v);
    205   assert_fe(g->v);
    206   fiat_25519_add(h->v, f->v, g->v);
    207   assert_fe_loose(h->v);
    208 }
    209 
    210 // h = f - g
    211 // Can overlap h with f or g.
    212 static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
    213   assert_fe(f->v);
    214   assert_fe(g->v);
    215   fiat_25519_sub(h->v, f->v, g->v);
    216   assert_fe_loose(h->v);
    217 }
    218 
    219 static void fe_carry(fe *h, const fe_loose* f) {
    220   assert_fe_loose(f->v);
    221   fiat_25519_carry(h->v, f->v);
    222   assert_fe(h->v);
    223 }
    224 
    225 static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
    226                         const fe_limb_t in1[FE_NUM_LIMBS],
    227                         const fe_limb_t in2[FE_NUM_LIMBS]) {
    228   assert_fe_loose(in1);
    229   assert_fe_loose(in2);
    230   fiat_25519_carry_mul(out, in1, in2);
    231   assert_fe(out);
    232 }
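// The suffix on each multiplication wrapper below names the bound types of its
// arguments, output first: 't' is a tight |fe| and 'l' is a loose |fe_loose|.
// For example, |fe_mul_tll| produces a tight result from two loose inputs. The
// same convention is used by |fe_sq_tl|, |fe_sq_tt| and |fe_copy_lt|.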
    233 
    234 static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
    235   fe_mul_impl(h->v, f->v, g->v);
    236 }
    237 
    238 static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
    239   fe_mul_impl(h->v, f->v, g->v);
    240 }
    241 
    242 static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
    243   fe_mul_impl(h->v, f->v, g->v);
    244 }
    245 
    246 static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
    247   fe_mul_impl(h->v, f->v, g->v);
    248 }
    249 
    250 static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
    251   fe_mul_impl(h->v, f->v, g->v);
    252 }
    253 
    254 static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
    255   fe_mul_impl(h->v, f->v, g->v);
    256 }
    257 
    258 static void fe_sq_tl(fe *h, const fe_loose *f) {
    259   assert_fe_loose(f->v);
    260   fiat_25519_carry_square(h->v, f->v);
    261   assert_fe(h->v);
    262 }
    263 
    264 static void fe_sq_tt(fe *h, const fe *f) {
    265   assert_fe_loose(f->v);
    266   fiat_25519_carry_square(h->v, f->v);
    267   assert_fe(h->v);
    268 }
    269 
    270 // Replace (f,g) with (g,f) if b == 1;
    271 // replace (f,g) with (f,g) if b == 0.
    272 //
    273 // Preconditions: b in {0,1}.
    274 static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
    275   b = 0-b;
    276   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
    277     fe_limb_t x = f->v[i] ^ g->v[i];
    278     x &= b;
    279     f->v[i] ^= x;
    280     g->v[i] ^= x;
    281   }
    282 }
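// A worked example of the masking trick above: when b == 1, the assignment
// b = 0-b sets every bit of the mask, so x = f->v[i] ^ g->v[i] survives the
// AND and the two XORs exchange the limbs. When b == 0 the mask is zero, x
// becomes zero and both limbs are left unchanged. Either way there are no
// branches or secret-dependent memory accesses.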
    283 
    284 static void fe_mul121666(fe *h, const fe_loose *f) {
    285   assert_fe_loose(f->v);
    286   fiat_25519_carry_scmul_121666(h->v, f->v);
    287   assert_fe(h->v);
    288 }
    289 
    290 // h = -f
    291 static void fe_neg(fe_loose *h, const fe *f) {
    292   assert_fe(f->v);
    293   fiat_25519_opp(h->v, f->v);
    294   assert_fe_loose(h->v);
    295 }
    296 
    297 // Replace (f,g) with (g,g) if b == 1;
    298 // replace (f,g) with (f,g) if b == 0.
    299 //
    300 // Preconditions: b in {0,1}.
    301 static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
     302   // Silence an unused function warning. |fiat_25519_selectznz| doesn't quite
     303   // match the calling convention this code wants, so implement it by hand.
    304   //
    305   // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
    306   // different one.
    307   (void)fiat_25519_selectznz;
    308 
    309   b = 0-b;
    310   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
    311     fe_limb_t x = f->v[i] ^ g->v[i];
    312     x &= b;
    313     f->v[i] ^= x;
    314   }
    315 }
    316 
    317 // h = f
    318 static void fe_copy(fe *h, const fe *f) {
    319   OPENSSL_memmove(h, f, sizeof(fe));
    320 }
    321 
    322 static void fe_copy_lt(fe_loose *h, const fe *f) {
    323   OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
    324                         "fe and fe_loose mismatch");
    325   OPENSSL_memmove(h, f, sizeof(fe));
    326 }
    327 #if !defined(OPENSSL_SMALL)
    328 static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
    329   OPENSSL_memmove(h, f, sizeof(fe_loose));
    330 }
     331 #endif  // !defined(OPENSSL_SMALL)
    332 
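// fe_loose_invert computes z^(q-2) = z^(2^255 - 21) (mod q), which by Fermat's
// little theorem is 1/z for nonzero z. The fixed sequence below is the
// standard ref10 addition chain: 254 squarings and 11 multiplications,
// independent of the value of z.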
    333 static void fe_loose_invert(fe *out, const fe_loose *z) {
    334   fe t0;
    335   fe t1;
    336   fe t2;
    337   fe t3;
    338   int i;
    339 
    340   fe_sq_tl(&t0, z);
    341   fe_sq_tt(&t1, &t0);
    342   for (i = 1; i < 2; ++i) {
    343     fe_sq_tt(&t1, &t1);
    344   }
    345   fe_mul_tlt(&t1, z, &t1);
    346   fe_mul_ttt(&t0, &t0, &t1);
    347   fe_sq_tt(&t2, &t0);
    348   fe_mul_ttt(&t1, &t1, &t2);
    349   fe_sq_tt(&t2, &t1);
    350   for (i = 1; i < 5; ++i) {
    351     fe_sq_tt(&t2, &t2);
    352   }
    353   fe_mul_ttt(&t1, &t2, &t1);
    354   fe_sq_tt(&t2, &t1);
    355   for (i = 1; i < 10; ++i) {
    356     fe_sq_tt(&t2, &t2);
    357   }
    358   fe_mul_ttt(&t2, &t2, &t1);
    359   fe_sq_tt(&t3, &t2);
    360   for (i = 1; i < 20; ++i) {
    361     fe_sq_tt(&t3, &t3);
    362   }
    363   fe_mul_ttt(&t2, &t3, &t2);
    364   fe_sq_tt(&t2, &t2);
    365   for (i = 1; i < 10; ++i) {
    366     fe_sq_tt(&t2, &t2);
    367   }
    368   fe_mul_ttt(&t1, &t2, &t1);
    369   fe_sq_tt(&t2, &t1);
    370   for (i = 1; i < 50; ++i) {
    371     fe_sq_tt(&t2, &t2);
    372   }
    373   fe_mul_ttt(&t2, &t2, &t1);
    374   fe_sq_tt(&t3, &t2);
    375   for (i = 1; i < 100; ++i) {
    376     fe_sq_tt(&t3, &t3);
    377   }
    378   fe_mul_ttt(&t2, &t3, &t2);
    379   fe_sq_tt(&t2, &t2);
    380   for (i = 1; i < 50; ++i) {
    381     fe_sq_tt(&t2, &t2);
    382   }
    383   fe_mul_ttt(&t1, &t2, &t1);
    384   fe_sq_tt(&t1, &t1);
    385   for (i = 1; i < 5; ++i) {
    386     fe_sq_tt(&t1, &t1);
    387   }
    388   fe_mul_ttt(out, &t1, &t0);
    389 }
    390 
    391 static void fe_invert(fe *out, const fe *z) {
    392   fe_loose l;
    393   fe_copy_lt(&l, z);
    394   fe_loose_invert(out, &l);
    395 }
    396 
    397 // return 0 if f == 0
    398 // return 1 if f != 0
    399 static int fe_isnonzero(const fe_loose *f) {
    400   fe tight;
    401   fe_carry(&tight, f);
    402   uint8_t s[32];
    403   fe_tobytes(s, &tight);
    404 
    405   static const uint8_t zero[32] = {0};
    406   return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
    407 }
    408 
    409 // return 1 if f is in {1,3,5,...,q-2}
    410 // return 0 if f is in {0,2,4,...,q-1}
    411 static int fe_isnegative(const fe *f) {
    412   uint8_t s[32];
    413   fe_tobytes(s, f);
    414   return s[0] & 1;
    415 }
    416 
    417 static void fe_sq2_tt(fe *h, const fe *f) {
    418   // h = f^2
    419   fe_sq_tt(h, f);
    420 
    421   // h = h + h
    422   fe_loose tmp;
    423   fe_add(&tmp, h, h);
    424   fe_carry(h, &tmp);
    425 }
    426 
    427 static void fe_pow22523(fe *out, const fe *z) {
    428   fe t0;
    429   fe t1;
    430   fe t2;
    431   int i;
    432 
    433   fe_sq_tt(&t0, z);
    434   fe_sq_tt(&t1, &t0);
    435   for (i = 1; i < 2; ++i) {
    436     fe_sq_tt(&t1, &t1);
    437   }
    438   fe_mul_ttt(&t1, z, &t1);
    439   fe_mul_ttt(&t0, &t0, &t1);
    440   fe_sq_tt(&t0, &t0);
    441   fe_mul_ttt(&t0, &t1, &t0);
    442   fe_sq_tt(&t1, &t0);
    443   for (i = 1; i < 5; ++i) {
    444     fe_sq_tt(&t1, &t1);
    445   }
    446   fe_mul_ttt(&t0, &t1, &t0);
    447   fe_sq_tt(&t1, &t0);
    448   for (i = 1; i < 10; ++i) {
    449     fe_sq_tt(&t1, &t1);
    450   }
    451   fe_mul_ttt(&t1, &t1, &t0);
    452   fe_sq_tt(&t2, &t1);
    453   for (i = 1; i < 20; ++i) {
    454     fe_sq_tt(&t2, &t2);
    455   }
    456   fe_mul_ttt(&t1, &t2, &t1);
    457   fe_sq_tt(&t1, &t1);
    458   for (i = 1; i < 10; ++i) {
    459     fe_sq_tt(&t1, &t1);
    460   }
    461   fe_mul_ttt(&t0, &t1, &t0);
    462   fe_sq_tt(&t1, &t0);
    463   for (i = 1; i < 50; ++i) {
    464     fe_sq_tt(&t1, &t1);
    465   }
    466   fe_mul_ttt(&t1, &t1, &t0);
    467   fe_sq_tt(&t2, &t1);
    468   for (i = 1; i < 100; ++i) {
    469     fe_sq_tt(&t2, &t2);
    470   }
    471   fe_mul_ttt(&t1, &t2, &t1);
    472   fe_sq_tt(&t1, &t1);
    473   for (i = 1; i < 50; ++i) {
    474     fe_sq_tt(&t1, &t1);
    475   }
    476   fe_mul_ttt(&t0, &t1, &t0);
    477   fe_sq_tt(&t0, &t0);
    478   for (i = 1; i < 2; ++i) {
    479     fe_sq_tt(&t0, &t0);
    480   }
    481   fe_mul_ttt(out, &t0, z);
    482 }
    483 
    484 
    485 // Group operations.
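// These functions operate on points of the twisted Edwards curve
// -x^2 + y^2 = 1 + d*x^2*y^2 in the coordinate systems inherited from ref10
// (see internal.h for the struct definitions):
//
//   ge_p2      projective (X:Y:Z) with x = X/Z, y = Y/Z.
//   ge_p3      extended (X:Y:Z:T), additionally satisfying X*Y = Z*T.
//   ge_p1p1    intermediate ((X:Z), (Y:T)) produced by additions and doublings.
//   ge_precomp precomputed form (y+x, y-x, 2*d*x*y).
//   ge_cached  cached form (Y+X, Y-X, Z, 2*d*T).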
    486 
    487 void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
    488   fe recip;
    489   fe x;
    490   fe y;
    491 
    492   fe_invert(&recip, &h->Z);
    493   fe_mul_ttt(&x, &h->X, &recip);
    494   fe_mul_ttt(&y, &h->Y, &recip);
    495   fe_tobytes(s, &y);
    496   s[31] ^= fe_isnegative(&x) << 7;
    497 }
    498 
    499 static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
    500   fe recip;
    501   fe x;
    502   fe y;
    503 
    504   fe_invert(&recip, &h->Z);
    505   fe_mul_ttt(&x, &h->X, &recip);
    506   fe_mul_ttt(&y, &h->Y, &recip);
    507   fe_tobytes(s, &y);
    508   s[31] ^= fe_isnegative(&x) << 7;
    509 }
    510 
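// Point decompression. The input encodes y in the low 255 bits and the sign of
// x in bit 255. With u = y^2 - 1 and v = d*y^2 + 1, the code computes the
// candidate root x = u*v^3 * (u*v^7)^((q-5)/8), then checks v*x^2 == +/- u,
// multiplying x by sqrt(-1) in the negative case, and finally negates x if its
// parity does not match bit 255 of the input. Returns 0 when v*x^2 equals
// neither u nor -u, i.e. when the input is not a valid point encoding.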
    511 int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
    512   fe u;
    513   fe_loose v;
    514   fe v3;
    515   fe vxx;
    516   fe_loose check;
    517 
    518   fe_frombytes(&h->Y, s);
    519   fe_1(&h->Z);
    520   fe_sq_tt(&v3, &h->Y);
    521   fe_mul_ttt(&vxx, &v3, &d);
    522   fe_sub(&v, &v3, &h->Z);  // u = y^2-1
    523   fe_carry(&u, &v);
    524   fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
    525 
    526   fe_sq_tl(&v3, &v);
    527   fe_mul_ttl(&v3, &v3, &v);  // v3 = v^3
    528   fe_sq_tt(&h->X, &v3);
    529   fe_mul_ttl(&h->X, &h->X, &v);
    530   fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^7
    531 
    532   fe_pow22523(&h->X, &h->X);  // x = (uv^7)^((q-5)/8)
    533   fe_mul_ttt(&h->X, &h->X, &v3);
    534   fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^3(uv^7)^((q-5)/8)
    535 
    536   fe_sq_tt(&vxx, &h->X);
    537   fe_mul_ttl(&vxx, &vxx, &v);
    538   fe_sub(&check, &vxx, &u);
    539   if (fe_isnonzero(&check)) {
    540     fe_add(&check, &vxx, &u);
    541     if (fe_isnonzero(&check)) {
    542       return 0;
    543     }
    544     fe_mul_ttt(&h->X, &h->X, &sqrtm1);
    545   }
    546 
    547   if (fe_isnegative(&h->X) != (s[31] >> 7)) {
    548     fe_loose t;
    549     fe_neg(&t, &h->X);
    550     fe_carry(&h->X, &t);
    551   }
    552 
    553   fe_mul_ttt(&h->T, &h->X, &h->Y);
    554   return 1;
    555 }
    556 
    557 static void ge_p2_0(ge_p2 *h) {
    558   fe_0(&h->X);
    559   fe_1(&h->Y);
    560   fe_1(&h->Z);
    561 }
    562 
    563 static void ge_p3_0(ge_p3 *h) {
    564   fe_0(&h->X);
    565   fe_1(&h->Y);
    566   fe_1(&h->Z);
    567   fe_0(&h->T);
    568 }
    569 
    570 static void ge_cached_0(ge_cached *h) {
    571   fe_loose_1(&h->YplusX);
    572   fe_loose_1(&h->YminusX);
    573   fe_loose_1(&h->Z);
    574   fe_loose_0(&h->T2d);
    575 }
    576 
    577 static void ge_precomp_0(ge_precomp *h) {
    578   fe_loose_1(&h->yplusx);
    579   fe_loose_1(&h->yminusx);
    580   fe_loose_0(&h->xy2d);
    581 }
    582 
    583 // r = p
    584 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
    585   fe_copy(&r->X, &p->X);
    586   fe_copy(&r->Y, &p->Y);
    587   fe_copy(&r->Z, &p->Z);
    588 }
    589 
    590 // r = p
    591 void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
    592   fe_add(&r->YplusX, &p->Y, &p->X);
    593   fe_sub(&r->YminusX, &p->Y, &p->X);
    594   fe_copy_lt(&r->Z, &p->Z);
    595   fe_mul_ltt(&r->T2d, &p->T, &d2);
    596 }
    597 
    598 // r = p
    599 void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
    600   fe_mul_tll(&r->X, &p->X, &p->T);
    601   fe_mul_tll(&r->Y, &p->Y, &p->Z);
    602   fe_mul_tll(&r->Z, &p->Z, &p->T);
    603 }
    604 
    605 // r = p
    606 void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
    607   fe_mul_tll(&r->X, &p->X, &p->T);
    608   fe_mul_tll(&r->Y, &p->Y, &p->Z);
    609   fe_mul_tll(&r->Z, &p->Z, &p->T);
    610   fe_mul_tll(&r->T, &p->X, &p->Y);
    611 }
    612 
    613 // r = p
    614 static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
    615   ge_p3 t;
    616   x25519_ge_p1p1_to_p3(&t, p);
    617   x25519_ge_p3_to_cached(r, &t);
    618 }
    619 
    620 // r = 2 * p
    621 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
    622   fe trX, trZ, trT;
    623   fe t0;
    624 
    625   fe_sq_tt(&trX, &p->X);
    626   fe_sq_tt(&trZ, &p->Y);
    627   fe_sq2_tt(&trT, &p->Z);
    628   fe_add(&r->Y, &p->X, &p->Y);
    629   fe_sq_tl(&t0, &r->Y);
    630 
    631   fe_add(&r->Y, &trZ, &trX);
    632   fe_sub(&r->Z, &trZ, &trX);
    633   fe_carry(&trZ, &r->Y);
    634   fe_sub(&r->X, &t0, &trZ);
    635   fe_carry(&trZ, &r->Z);
    636   fe_sub(&r->T, &trT, &trZ);
    637 }
    638 
    639 // r = 2 * p
    640 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
    641   ge_p2 q;
    642   ge_p3_to_p2(&q, p);
    643   ge_p2_dbl(r, &q);
    644 }
    645 
    646 // r = p + q
    647 static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
    648   fe trY, trZ, trT;
    649 
    650   fe_add(&r->X, &p->Y, &p->X);
    651   fe_sub(&r->Y, &p->Y, &p->X);
    652   fe_mul_tll(&trZ, &r->X, &q->yplusx);
    653   fe_mul_tll(&trY, &r->Y, &q->yminusx);
    654   fe_mul_tlt(&trT, &q->xy2d, &p->T);
    655   fe_add(&r->T, &p->Z, &p->Z);
    656   fe_sub(&r->X, &trZ, &trY);
    657   fe_add(&r->Y, &trZ, &trY);
    658   fe_carry(&trZ, &r->T);
    659   fe_add(&r->Z, &trZ, &trT);
    660   fe_sub(&r->T, &trZ, &trT);
    661 }
    662 
    663 // r = p - q
    664 static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
    665   fe trY, trZ, trT;
    666 
    667   fe_add(&r->X, &p->Y, &p->X);
    668   fe_sub(&r->Y, &p->Y, &p->X);
    669   fe_mul_tll(&trZ, &r->X, &q->yminusx);
    670   fe_mul_tll(&trY, &r->Y, &q->yplusx);
    671   fe_mul_tlt(&trT, &q->xy2d, &p->T);
    672   fe_add(&r->T, &p->Z, &p->Z);
    673   fe_sub(&r->X, &trZ, &trY);
    674   fe_add(&r->Y, &trZ, &trY);
    675   fe_carry(&trZ, &r->T);
    676   fe_sub(&r->Z, &trZ, &trT);
    677   fe_add(&r->T, &trZ, &trT);
    678 }
    679 
    680 // r = p + q
    681 void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
    682   fe trX, trY, trZ, trT;
    683 
    684   fe_add(&r->X, &p->Y, &p->X);
    685   fe_sub(&r->Y, &p->Y, &p->X);
    686   fe_mul_tll(&trZ, &r->X, &q->YplusX);
    687   fe_mul_tll(&trY, &r->Y, &q->YminusX);
    688   fe_mul_tlt(&trT, &q->T2d, &p->T);
    689   fe_mul_ttl(&trX, &p->Z, &q->Z);
    690   fe_add(&r->T, &trX, &trX);
    691   fe_sub(&r->X, &trZ, &trY);
    692   fe_add(&r->Y, &trZ, &trY);
    693   fe_carry(&trZ, &r->T);
    694   fe_add(&r->Z, &trZ, &trT);
    695   fe_sub(&r->T, &trZ, &trT);
    696 }
    697 
    698 // r = p - q
    699 void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
    700   fe trX, trY, trZ, trT;
    701 
    702   fe_add(&r->X, &p->Y, &p->X);
    703   fe_sub(&r->Y, &p->Y, &p->X);
    704   fe_mul_tll(&trZ, &r->X, &q->YminusX);
    705   fe_mul_tll(&trY, &r->Y, &q->YplusX);
    706   fe_mul_tlt(&trT, &q->T2d, &p->T);
    707   fe_mul_ttl(&trX, &p->Z, &q->Z);
    708   fe_add(&r->T, &trX, &trX);
    709   fe_sub(&r->X, &trZ, &trY);
    710   fe_add(&r->Y, &trZ, &trY);
    711   fe_carry(&trZ, &r->T);
    712   fe_sub(&r->Z, &trZ, &trT);
    713   fe_add(&r->T, &trZ, &trT);
    714 }
    715 
    716 static uint8_t equal(signed char b, signed char c) {
    717   uint8_t ub = b;
    718   uint8_t uc = c;
    719   uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
    720   uint32_t y = x;       // 0: yes; 1..255: no
    721   y -= 1;               // 4294967295: yes; 0..254: no
    722   y >>= 31;             // 1: yes; 0: no
    723   return y;
    724 }
    725 
    726 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
    727   fe_cmov(&t->yplusx, &u->yplusx, b);
    728   fe_cmov(&t->yminusx, &u->yminusx, b);
    729   fe_cmov(&t->xy2d, &u->xy2d, b);
    730 }
    731 
    732 void x25519_ge_scalarmult_small_precomp(
    733     ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
    734   // precomp_table is first expanded into matching |ge_precomp|
    735   // elements.
    736   ge_precomp multiples[15];
    737 
    738   unsigned i;
    739   for (i = 0; i < 15; i++) {
    740     // The precomputed table is assumed to already clear the top bit, so
    741     // |fe_frombytes_strict| may be used directly.
    742     const uint8_t *bytes = &precomp_table[i*(2 * 32)];
    743     fe x, y;
    744     fe_frombytes_strict(&x, bytes);
    745     fe_frombytes_strict(&y, bytes + 32);
    746 
    747     ge_precomp *out = &multiples[i];
    748     fe_add(&out->yplusx, &y, &x);
    749     fe_sub(&out->yminusx, &y, &x);
    750     fe_mul_ltt(&out->xy2d, &x, &y);
    751     fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
    752   }
    753 
    754   // See the comment above |k25519SmallPrecomp| about the structure of the
    755   // precomputed elements. This loop does 64 additions and 64 doublings to
    756   // calculate the result.
    757   ge_p3_0(h);
    758 
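  // |i| is unsigned, so this loop runs for i = 63, 62, ..., 0 and exits when
  // the final decrement wraps around to a value >= 64. Each iteration doubles
  // the accumulator (h = h + h via the cached addition below) and then adds
  // the table entry selected, in constant time, by one bit taken from each of
  // the four 8-byte quarters of |a|.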
    759   for (i = 63; i < 64; i--) {
    760     unsigned j;
    761     signed char index = 0;
    762 
    763     for (j = 0; j < 4; j++) {
    764       const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
    765       index |= (bit << j);
    766     }
    767 
    768     ge_precomp e;
    769     ge_precomp_0(&e);
    770 
    771     for (j = 1; j < 16; j++) {
    772       cmov(&e, &multiples[j-1], equal(index, j));
    773     }
    774 
    775     ge_cached cached;
    776     ge_p1p1 r;
    777     x25519_ge_p3_to_cached(&cached, h);
    778     x25519_ge_add(&r, h, &cached);
    779     x25519_ge_p1p1_to_p3(h, &r);
    780 
    781     ge_madd(&r, h, &e);
    782     x25519_ge_p1p1_to_p3(h, &r);
    783   }
    784 }
    785 
    786 #if defined(OPENSSL_SMALL)
    787 
    788 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
    789   x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
    790 }
    791 
    792 #else
    793 
    794 static uint8_t negative(signed char b) {
    795   uint32_t x = b;
    796   x >>= 31;  // 1: yes; 0: no
    797   return x;
    798 }
    799 
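// table_select loads the entry matching babs = abs(b) from k25519Precomp[pos]
// in constant time (leaving the identity when b == 0) and negates it when b is
// negative. Here b is a signed digit in [-8, 8] produced by the recoding in
// |x25519_ge_scalarmult_base| below. The absolute value is computed without
// branching; as a worked example, b = -3 gives bnegative = 1 and
// babs = (uint8_t)(-3 - 2 * (uint8_t)-3) = (uint8_t)(-509) = 3.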
    800 static void table_select(ge_precomp *t, int pos, signed char b) {
    801   ge_precomp minust;
    802   uint8_t bnegative = negative(b);
    803   uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
    804 
    805   ge_precomp_0(t);
    806   cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
    807   cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
    808   cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
    809   cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
    810   cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
    811   cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
    812   cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
    813   cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
    814   fe_copy_ll(&minust.yplusx, &t->yminusx);
    815   fe_copy_ll(&minust.yminusx, &t->yplusx);
    816 
    817   // NOTE: the input table is canonical, but types don't encode it
    818   fe tmp;
    819   fe_carry(&tmp, &t->xy2d);
    820   fe_neg(&minust.xy2d, &tmp);
    821 
    822   cmov(t, &minust, bnegative);
    823 }
    824 
    825 // h = a * B
    826 // where a = a[0]+256*a[1]+...+256^31 a[31]
    827 // B is the Ed25519 base point (x,4/5) with x positive.
    828 //
    829 // Preconditions:
    830 //   a[31] <= 127
    831 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
    832   signed char e[64];
    833   signed char carry;
    834   ge_p1p1 r;
    835   ge_p2 s;
    836   ge_precomp t;
    837   int i;
    838 
    839   for (i = 0; i < 32; ++i) {
    840     e[2 * i + 0] = (a[i] >> 0) & 15;
    841     e[2 * i + 1] = (a[i] >> 4) & 15;
    842   }
    843   // each e[i] is between 0 and 15
    844   // e[63] is between 0 and 7
    845 
    846   carry = 0;
    847   for (i = 0; i < 63; ++i) {
    848     e[i] += carry;
    849     carry = e[i] + 8;
    850     carry >>= 4;
    851     e[i] -= carry << 4;
    852   }
    853   e[63] += carry;
    854   // each e[i] is between -8 and 8
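  // A worked example of the carry propagation above: a nibble e[i] = 15 makes
  // carry = (15 + 8) >> 4 = 1, so e[i] becomes 15 - 16 = -1 and the following
  // digit absorbs the +1. The value a = sum(e[i] * 16^i) is unchanged
  // throughout.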
    855 
    856   ge_p3_0(h);
    857   for (i = 1; i < 64; i += 2) {
    858     table_select(&t, i / 2, e[i]);
    859     ge_madd(&r, h, &t);
    860     x25519_ge_p1p1_to_p3(h, &r);
    861   }
    862 
    863   ge_p3_dbl(&r, h);
    864   x25519_ge_p1p1_to_p2(&s, &r);
    865   ge_p2_dbl(&r, &s);
    866   x25519_ge_p1p1_to_p2(&s, &r);
    867   ge_p2_dbl(&r, &s);
    868   x25519_ge_p1p1_to_p2(&s, &r);
    869   ge_p2_dbl(&r, &s);
    870   x25519_ge_p1p1_to_p3(h, &r);
    871 
    872   for (i = 0; i < 64; i += 2) {
    873     table_select(&t, i / 2, e[i]);
    874     ge_madd(&r, h, &t);
    875     x25519_ge_p1p1_to_p3(h, &r);
    876   }
    877 }
    878 
    879 #endif
    880 
    881 static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
    882   fe_cmov(&t->YplusX, &u->YplusX, b);
    883   fe_cmov(&t->YminusX, &u->YminusX, b);
    884   fe_cmov(&t->Z, &u->Z, b);
    885   fe_cmov(&t->T2d, &u->T2d, b);
    886 }
    887 
    888 // r = scalar * A.
     889 // where scalar = scalar[0]+256*scalar[1]+...+256^31 scalar[31].
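//
// Ai[i] below holds i*A for i = 0..15 (Ai[0] is the identity). The main loop
// scans the scalar four bits at a time, starting from the most significant
// nibble: four doublings followed by one addition of the table entry selected
// by the nibble. The selection walks all 16 entries with |cmov_cached| so that
// the memory access pattern does not depend on the secret scalar.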
    890 void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
    891   ge_p2 Ai_p2[8];
    892   ge_cached Ai[16];
    893   ge_p1p1 t;
    894 
    895   ge_cached_0(&Ai[0]);
    896   x25519_ge_p3_to_cached(&Ai[1], A);
    897   ge_p3_to_p2(&Ai_p2[1], A);
    898 
    899   unsigned i;
    900   for (i = 2; i < 16; i += 2) {
    901     ge_p2_dbl(&t, &Ai_p2[i / 2]);
    902     ge_p1p1_to_cached(&Ai[i], &t);
    903     if (i < 8) {
    904       x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
    905     }
    906     x25519_ge_add(&t, A, &Ai[i]);
    907     ge_p1p1_to_cached(&Ai[i + 1], &t);
    908     if (i < 7) {
    909       x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
    910     }
    911   }
    912 
    913   ge_p2_0(r);
    914   ge_p3 u;
    915 
    916   for (i = 0; i < 256; i += 4) {
    917     ge_p2_dbl(&t, r);
    918     x25519_ge_p1p1_to_p2(r, &t);
    919     ge_p2_dbl(&t, r);
    920     x25519_ge_p1p1_to_p2(r, &t);
    921     ge_p2_dbl(&t, r);
    922     x25519_ge_p1p1_to_p2(r, &t);
    923     ge_p2_dbl(&t, r);
    924     x25519_ge_p1p1_to_p3(&u, &t);
    925 
    926     uint8_t index = scalar[31 - i/8];
    927     index >>= 4 - (i & 4);
    928     index &= 0xf;
    929 
    930     unsigned j;
    931     ge_cached selected;
    932     ge_cached_0(&selected);
    933     for (j = 0; j < 16; j++) {
    934       cmov_cached(&selected, &Ai[j], equal(j, index));
    935     }
    936 
    937     x25519_ge_add(&t, &u, &selected);
    938     x25519_ge_p1p1_to_p2(r, &t);
    939   }
    940 }
    941 
    942 static void slide(signed char *r, const uint8_t *a) {
    943   int i;
    944   int b;
    945   int k;
    946 
    947   for (i = 0; i < 256; ++i) {
    948     r[i] = 1 & (a[i >> 3] >> (i & 7));
    949   }
    950 
    951   for (i = 0; i < 256; ++i) {
    952     if (r[i]) {
    953       for (b = 1; b <= 6 && i + b < 256; ++b) {
    954         if (r[i + b]) {
    955           if (r[i] + (r[i + b] << b) <= 15) {
    956             r[i] += r[i + b] << b;
    957             r[i + b] = 0;
    958           } else if (r[i] - (r[i + b] << b) >= -15) {
    959             r[i] -= r[i + b] << b;
    960             for (k = i + b; k < 256; ++k) {
    961               if (!r[k]) {
    962                 r[k] = 1;
    963                 break;
    964               }
    965               r[k] = 0;
    966             }
    967           } else {
    968             break;
    969           }
    970         }
    971       }
    972     }
    973   }
    974 }
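// slide converts a 32-byte scalar into a signed sliding-window form: 256
// digits r[i], each either zero or an odd value in [-15, 15], with
// a = sum(r[i] * 2^i). Because nonzero digits are odd, r[i]/2 directly indexes
// a table of odd multiples (A, 3A, 5A, ..., 15A). This representation is
// variable-time to produce and consume, so it should only be used with public
// inputs, as in |ge_double_scalarmult_vartime| below.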
    975 
    976 // r = a * A + b * B
    977 // where a = a[0]+256*a[1]+...+256^31 a[31].
    978 // and b = b[0]+256*b[1]+...+256^31 b[31].
    979 // B is the Ed25519 base point (x,4/5) with x positive.
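// Ai (odd multiples of A) is computed on the fly below, while Bi (odd
// multiples of the base point B) comes from the precomputed constants included
// near the top of this file.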
    980 static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
    981                                          const ge_p3 *A, const uint8_t *b) {
    982   signed char aslide[256];
    983   signed char bslide[256];
    984   ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
    985   ge_p1p1 t;
    986   ge_p3 u;
    987   ge_p3 A2;
    988   int i;
    989 
    990   slide(aslide, a);
    991   slide(bslide, b);
    992 
    993   x25519_ge_p3_to_cached(&Ai[0], A);
    994   ge_p3_dbl(&t, A);
    995   x25519_ge_p1p1_to_p3(&A2, &t);
    996   x25519_ge_add(&t, &A2, &Ai[0]);
    997   x25519_ge_p1p1_to_p3(&u, &t);
    998   x25519_ge_p3_to_cached(&Ai[1], &u);
    999   x25519_ge_add(&t, &A2, &Ai[1]);
   1000   x25519_ge_p1p1_to_p3(&u, &t);
   1001   x25519_ge_p3_to_cached(&Ai[2], &u);
   1002   x25519_ge_add(&t, &A2, &Ai[2]);
   1003   x25519_ge_p1p1_to_p3(&u, &t);
   1004   x25519_ge_p3_to_cached(&Ai[3], &u);
   1005   x25519_ge_add(&t, &A2, &Ai[3]);
   1006   x25519_ge_p1p1_to_p3(&u, &t);
   1007   x25519_ge_p3_to_cached(&Ai[4], &u);
   1008   x25519_ge_add(&t, &A2, &Ai[4]);
   1009   x25519_ge_p1p1_to_p3(&u, &t);
   1010   x25519_ge_p3_to_cached(&Ai[5], &u);
   1011   x25519_ge_add(&t, &A2, &Ai[5]);
   1012   x25519_ge_p1p1_to_p3(&u, &t);
   1013   x25519_ge_p3_to_cached(&Ai[6], &u);
   1014   x25519_ge_add(&t, &A2, &Ai[6]);
   1015   x25519_ge_p1p1_to_p3(&u, &t);
   1016   x25519_ge_p3_to_cached(&Ai[7], &u);
   1017 
   1018   ge_p2_0(r);
   1019 
   1020   for (i = 255; i >= 0; --i) {
   1021     if (aslide[i] || bslide[i]) {
   1022       break;
   1023     }
   1024   }
   1025 
   1026   for (; i >= 0; --i) {
   1027     ge_p2_dbl(&t, r);
   1028 
   1029     if (aslide[i] > 0) {
   1030       x25519_ge_p1p1_to_p3(&u, &t);
   1031       x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
   1032     } else if (aslide[i] < 0) {
   1033       x25519_ge_p1p1_to_p3(&u, &t);
   1034       x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
   1035     }
   1036 
   1037     if (bslide[i] > 0) {
   1038       x25519_ge_p1p1_to_p3(&u, &t);
   1039       ge_madd(&t, &u, &Bi[bslide[i] / 2]);
   1040     } else if (bslide[i] < 0) {
   1041       x25519_ge_p1p1_to_p3(&u, &t);
   1042       ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
   1043     }
   1044 
   1045     x25519_ge_p1p1_to_p2(r, &t);
   1046   }
   1047 }
   1048 
   1049 // int64_lshift21 returns |a << 21| but is defined when shifting bits into the
   1050 // sign bit. This works around a language flaw in C.
   1051 static inline int64_t int64_lshift21(int64_t a) {
   1052   return (int64_t)((uint64_t)a << 21);
   1053 }
   1054 
   1055 // The set of scalars is \Z/l
   1056 // where l = 2^252 + 27742317777372353535851937790883648493.
   1057 
   1058 // Input:
   1059 //   s[0]+256*s[1]+...+256^63*s[63] = s
   1060 //
   1061 // Output:
   1062 //   s[0]+256*s[1]+...+256^31*s[31] = s mod l
   1063 //   where l = 2^252 + 27742317777372353535851937790883648493.
   1064 //   Overwrites s in place.
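// Internally, s is split into 24 limbs of roughly 21 bits each, so that
// s = sum(s_i * 2^(21*i)). Writing l = 2^252 + delta, reduction repeatedly
// substitutes 2^252 == -delta (mod l); the constants 666643, 470296, 654183,
// -997805, 136657 and -683901 are the signed radix-2^21 digits of -delta. The
// rounded carries, carry = (s_i + 2^20) >> 21, keep each limb in
// [-2^20, 2^20) between substitution passes.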
   1065 void x25519_sc_reduce(uint8_t s[64]) {
   1066   int64_t s0 = 2097151 & load_3(s);
   1067   int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
   1068   int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
   1069   int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
   1070   int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
   1071   int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
   1072   int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
   1073   int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
   1074   int64_t s8 = 2097151 & load_3(s + 21);
   1075   int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
   1076   int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
   1077   int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
   1078   int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
   1079   int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
   1080   int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
   1081   int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
   1082   int64_t s16 = 2097151 & load_3(s + 42);
   1083   int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
   1084   int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
   1085   int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
   1086   int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
   1087   int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
   1088   int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
   1089   int64_t s23 = (load_4(s + 60) >> 3);
   1090   int64_t carry0;
   1091   int64_t carry1;
   1092   int64_t carry2;
   1093   int64_t carry3;
   1094   int64_t carry4;
   1095   int64_t carry5;
   1096   int64_t carry6;
   1097   int64_t carry7;
   1098   int64_t carry8;
   1099   int64_t carry9;
   1100   int64_t carry10;
   1101   int64_t carry11;
   1102   int64_t carry12;
   1103   int64_t carry13;
   1104   int64_t carry14;
   1105   int64_t carry15;
   1106   int64_t carry16;
   1107 
   1108   s11 += s23 * 666643;
   1109   s12 += s23 * 470296;
   1110   s13 += s23 * 654183;
   1111   s14 -= s23 * 997805;
   1112   s15 += s23 * 136657;
   1113   s16 -= s23 * 683901;
   1114   s23 = 0;
   1115 
   1116   s10 += s22 * 666643;
   1117   s11 += s22 * 470296;
   1118   s12 += s22 * 654183;
   1119   s13 -= s22 * 997805;
   1120   s14 += s22 * 136657;
   1121   s15 -= s22 * 683901;
   1122   s22 = 0;
   1123 
   1124   s9 += s21 * 666643;
   1125   s10 += s21 * 470296;
   1126   s11 += s21 * 654183;
   1127   s12 -= s21 * 997805;
   1128   s13 += s21 * 136657;
   1129   s14 -= s21 * 683901;
   1130   s21 = 0;
   1131 
   1132   s8 += s20 * 666643;
   1133   s9 += s20 * 470296;
   1134   s10 += s20 * 654183;
   1135   s11 -= s20 * 997805;
   1136   s12 += s20 * 136657;
   1137   s13 -= s20 * 683901;
   1138   s20 = 0;
   1139 
   1140   s7 += s19 * 666643;
   1141   s8 += s19 * 470296;
   1142   s9 += s19 * 654183;
   1143   s10 -= s19 * 997805;
   1144   s11 += s19 * 136657;
   1145   s12 -= s19 * 683901;
   1146   s19 = 0;
   1147 
   1148   s6 += s18 * 666643;
   1149   s7 += s18 * 470296;
   1150   s8 += s18 * 654183;
   1151   s9 -= s18 * 997805;
   1152   s10 += s18 * 136657;
   1153   s11 -= s18 * 683901;
   1154   s18 = 0;
   1155 
   1156   carry6 = (s6 + (1 << 20)) >> 21;
   1157   s7 += carry6;
   1158   s6 -= int64_lshift21(carry6);
   1159   carry8 = (s8 + (1 << 20)) >> 21;
   1160   s9 += carry8;
   1161   s8 -= int64_lshift21(carry8);
   1162   carry10 = (s10 + (1 << 20)) >> 21;
   1163   s11 += carry10;
   1164   s10 -= int64_lshift21(carry10);
   1165   carry12 = (s12 + (1 << 20)) >> 21;
   1166   s13 += carry12;
   1167   s12 -= int64_lshift21(carry12);
   1168   carry14 = (s14 + (1 << 20)) >> 21;
   1169   s15 += carry14;
   1170   s14 -= int64_lshift21(carry14);
   1171   carry16 = (s16 + (1 << 20)) >> 21;
   1172   s17 += carry16;
   1173   s16 -= int64_lshift21(carry16);
   1174 
   1175   carry7 = (s7 + (1 << 20)) >> 21;
   1176   s8 += carry7;
   1177   s7 -= int64_lshift21(carry7);
   1178   carry9 = (s9 + (1 << 20)) >> 21;
   1179   s10 += carry9;
   1180   s9 -= int64_lshift21(carry9);
   1181   carry11 = (s11 + (1 << 20)) >> 21;
   1182   s12 += carry11;
   1183   s11 -= int64_lshift21(carry11);
   1184   carry13 = (s13 + (1 << 20)) >> 21;
   1185   s14 += carry13;
   1186   s13 -= int64_lshift21(carry13);
   1187   carry15 = (s15 + (1 << 20)) >> 21;
   1188   s16 += carry15;
   1189   s15 -= int64_lshift21(carry15);
   1190 
   1191   s5 += s17 * 666643;
   1192   s6 += s17 * 470296;
   1193   s7 += s17 * 654183;
   1194   s8 -= s17 * 997805;
   1195   s9 += s17 * 136657;
   1196   s10 -= s17 * 683901;
   1197   s17 = 0;
   1198 
   1199   s4 += s16 * 666643;
   1200   s5 += s16 * 470296;
   1201   s6 += s16 * 654183;
   1202   s7 -= s16 * 997805;
   1203   s8 += s16 * 136657;
   1204   s9 -= s16 * 683901;
   1205   s16 = 0;
   1206 
   1207   s3 += s15 * 666643;
   1208   s4 += s15 * 470296;
   1209   s5 += s15 * 654183;
   1210   s6 -= s15 * 997805;
   1211   s7 += s15 * 136657;
   1212   s8 -= s15 * 683901;
   1213   s15 = 0;
   1214 
   1215   s2 += s14 * 666643;
   1216   s3 += s14 * 470296;
   1217   s4 += s14 * 654183;
   1218   s5 -= s14 * 997805;
   1219   s6 += s14 * 136657;
   1220   s7 -= s14 * 683901;
   1221   s14 = 0;
   1222 
   1223   s1 += s13 * 666643;
   1224   s2 += s13 * 470296;
   1225   s3 += s13 * 654183;
   1226   s4 -= s13 * 997805;
   1227   s5 += s13 * 136657;
   1228   s6 -= s13 * 683901;
   1229   s13 = 0;
   1230 
   1231   s0 += s12 * 666643;
   1232   s1 += s12 * 470296;
   1233   s2 += s12 * 654183;
   1234   s3 -= s12 * 997805;
   1235   s4 += s12 * 136657;
   1236   s5 -= s12 * 683901;
   1237   s12 = 0;
   1238 
   1239   carry0 = (s0 + (1 << 20)) >> 21;
   1240   s1 += carry0;
   1241   s0 -= int64_lshift21(carry0);
   1242   carry2 = (s2 + (1 << 20)) >> 21;
   1243   s3 += carry2;
   1244   s2 -= int64_lshift21(carry2);
   1245   carry4 = (s4 + (1 << 20)) >> 21;
   1246   s5 += carry4;
   1247   s4 -= int64_lshift21(carry4);
   1248   carry6 = (s6 + (1 << 20)) >> 21;
   1249   s7 += carry6;
   1250   s6 -= int64_lshift21(carry6);
   1251   carry8 = (s8 + (1 << 20)) >> 21;
   1252   s9 += carry8;
   1253   s8 -= int64_lshift21(carry8);
   1254   carry10 = (s10 + (1 << 20)) >> 21;
   1255   s11 += carry10;
   1256   s10 -= int64_lshift21(carry10);
   1257 
   1258   carry1 = (s1 + (1 << 20)) >> 21;
   1259   s2 += carry1;
   1260   s1 -= int64_lshift21(carry1);
   1261   carry3 = (s3 + (1 << 20)) >> 21;
   1262   s4 += carry3;
   1263   s3 -= int64_lshift21(carry3);
   1264   carry5 = (s5 + (1 << 20)) >> 21;
   1265   s6 += carry5;
   1266   s5 -= int64_lshift21(carry5);
   1267   carry7 = (s7 + (1 << 20)) >> 21;
   1268   s8 += carry7;
   1269   s7 -= int64_lshift21(carry7);
   1270   carry9 = (s9 + (1 << 20)) >> 21;
   1271   s10 += carry9;
   1272   s9 -= int64_lshift21(carry9);
   1273   carry11 = (s11 + (1 << 20)) >> 21;
   1274   s12 += carry11;
   1275   s11 -= int64_lshift21(carry11);
   1276 
   1277   s0 += s12 * 666643;
   1278   s1 += s12 * 470296;
   1279   s2 += s12 * 654183;
   1280   s3 -= s12 * 997805;
   1281   s4 += s12 * 136657;
   1282   s5 -= s12 * 683901;
   1283   s12 = 0;
   1284 
   1285   carry0 = s0 >> 21;
   1286   s1 += carry0;
   1287   s0 -= int64_lshift21(carry0);
   1288   carry1 = s1 >> 21;
   1289   s2 += carry1;
   1290   s1 -= int64_lshift21(carry1);
   1291   carry2 = s2 >> 21;
   1292   s3 += carry2;
   1293   s2 -= int64_lshift21(carry2);
   1294   carry3 = s3 >> 21;
   1295   s4 += carry3;
   1296   s3 -= int64_lshift21(carry3);
   1297   carry4 = s4 >> 21;
   1298   s5 += carry4;
   1299   s4 -= int64_lshift21(carry4);
   1300   carry5 = s5 >> 21;
   1301   s6 += carry5;
   1302   s5 -= int64_lshift21(carry5);
   1303   carry6 = s6 >> 21;
   1304   s7 += carry6;
   1305   s6 -= int64_lshift21(carry6);
   1306   carry7 = s7 >> 21;
   1307   s8 += carry7;
   1308   s7 -= int64_lshift21(carry7);
   1309   carry8 = s8 >> 21;
   1310   s9 += carry8;
   1311   s8 -= int64_lshift21(carry8);
   1312   carry9 = s9 >> 21;
   1313   s10 += carry9;
   1314   s9 -= int64_lshift21(carry9);
   1315   carry10 = s10 >> 21;
   1316   s11 += carry10;
   1317   s10 -= int64_lshift21(carry10);
   1318   carry11 = s11 >> 21;
   1319   s12 += carry11;
   1320   s11 -= int64_lshift21(carry11);
   1321 
   1322   s0 += s12 * 666643;
   1323   s1 += s12 * 470296;
   1324   s2 += s12 * 654183;
   1325   s3 -= s12 * 997805;
   1326   s4 += s12 * 136657;
   1327   s5 -= s12 * 683901;
   1328   s12 = 0;
   1329 
   1330   carry0 = s0 >> 21;
   1331   s1 += carry0;
   1332   s0 -= int64_lshift21(carry0);
   1333   carry1 = s1 >> 21;
   1334   s2 += carry1;
   1335   s1 -= int64_lshift21(carry1);
   1336   carry2 = s2 >> 21;
   1337   s3 += carry2;
   1338   s2 -= int64_lshift21(carry2);
   1339   carry3 = s3 >> 21;
   1340   s4 += carry3;
   1341   s3 -= int64_lshift21(carry3);
   1342   carry4 = s4 >> 21;
   1343   s5 += carry4;
   1344   s4 -= int64_lshift21(carry4);
   1345   carry5 = s5 >> 21;
   1346   s6 += carry5;
   1347   s5 -= int64_lshift21(carry5);
   1348   carry6 = s6 >> 21;
   1349   s7 += carry6;
   1350   s6 -= int64_lshift21(carry6);
   1351   carry7 = s7 >> 21;
   1352   s8 += carry7;
   1353   s7 -= int64_lshift21(carry7);
   1354   carry8 = s8 >> 21;
   1355   s9 += carry8;
   1356   s8 -= int64_lshift21(carry8);
   1357   carry9 = s9 >> 21;
   1358   s10 += carry9;
   1359   s9 -= int64_lshift21(carry9);
   1360   carry10 = s10 >> 21;
   1361   s11 += carry10;
   1362   s10 -= int64_lshift21(carry10);
   1363 
   1364   s[0] = s0 >> 0;
   1365   s[1] = s0 >> 8;
   1366   s[2] = (s0 >> 16) | (s1 << 5);
   1367   s[3] = s1 >> 3;
   1368   s[4] = s1 >> 11;
   1369   s[5] = (s1 >> 19) | (s2 << 2);
   1370   s[6] = s2 >> 6;
   1371   s[7] = (s2 >> 14) | (s3 << 7);
   1372   s[8] = s3 >> 1;
   1373   s[9] = s3 >> 9;
   1374   s[10] = (s3 >> 17) | (s4 << 4);
   1375   s[11] = s4 >> 4;
   1376   s[12] = s4 >> 12;
   1377   s[13] = (s4 >> 20) | (s5 << 1);
   1378   s[14] = s5 >> 7;
   1379   s[15] = (s5 >> 15) | (s6 << 6);
   1380   s[16] = s6 >> 2;
   1381   s[17] = s6 >> 10;
   1382   s[18] = (s6 >> 18) | (s7 << 3);
   1383   s[19] = s7 >> 5;
   1384   s[20] = s7 >> 13;
   1385   s[21] = s8 >> 0;
   1386   s[22] = s8 >> 8;
   1387   s[23] = (s8 >> 16) | (s9 << 5);
   1388   s[24] = s9 >> 3;
   1389   s[25] = s9 >> 11;
   1390   s[26] = (s9 >> 19) | (s10 << 2);
   1391   s[27] = s10 >> 6;
   1392   s[28] = (s10 >> 14) | (s11 << 7);
   1393   s[29] = s11 >> 1;
   1394   s[30] = s11 >> 9;
   1395   s[31] = s11 >> 17;
   1396 }
   1397 
   1398 // Input:
   1399 //   a[0]+256*a[1]+...+256^31*a[31] = a
   1400 //   b[0]+256*b[1]+...+256^31*b[31] = b
   1401 //   c[0]+256*c[1]+...+256^31*c[31] = c
   1402 //
   1403 // Output:
   1404 //   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
   1405 //   where l = 2^252 + 27742317777372353535851937790883648493.
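// a, b and c are each split into twelve limbs of roughly 21 bits
// (12 * 21 = 252), the 23-limb schoolbook product a*b is accumulated together
// with c, and the result is reduced mod l using the same substitution and
// carry pattern as |x25519_sc_reduce| above before being re-encoded as 32
// bytes.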
   1406 static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
   1407                       const uint8_t *c) {
   1408   int64_t a0 = 2097151 & load_3(a);
   1409   int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
   1410   int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
   1411   int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
   1412   int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
   1413   int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
   1414   int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
   1415   int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
   1416   int64_t a8 = 2097151 & load_3(a + 21);
   1417   int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
   1418   int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
   1419   int64_t a11 = (load_4(a + 28) >> 7);
   1420   int64_t b0 = 2097151 & load_3(b);
   1421   int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
   1422   int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
   1423   int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
   1424   int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
   1425   int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
   1426   int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
   1427   int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
   1428   int64_t b8 = 2097151 & load_3(b + 21);
   1429   int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
   1430   int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
   1431   int64_t b11 = (load_4(b + 28) >> 7);
   1432   int64_t c0 = 2097151 & load_3(c);
   1433   int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
   1434   int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
   1435   int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
   1436   int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
   1437   int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
   1438   int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
   1439   int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
   1440   int64_t c8 = 2097151 & load_3(c + 21);
   1441   int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
   1442   int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
   1443   int64_t c11 = (load_4(c + 28) >> 7);
   1444   int64_t s0;
   1445   int64_t s1;
   1446   int64_t s2;
   1447   int64_t s3;
   1448   int64_t s4;
   1449   int64_t s5;
   1450   int64_t s6;
   1451   int64_t s7;
   1452   int64_t s8;
   1453   int64_t s9;
   1454   int64_t s10;
   1455   int64_t s11;
   1456   int64_t s12;
   1457   int64_t s13;
   1458   int64_t s14;
   1459   int64_t s15;
   1460   int64_t s16;
   1461   int64_t s17;
   1462   int64_t s18;
   1463   int64_t s19;
   1464   int64_t s20;
   1465   int64_t s21;
   1466   int64_t s22;
   1467   int64_t s23;
   1468   int64_t carry0;
   1469   int64_t carry1;
   1470   int64_t carry2;
   1471   int64_t carry3;
   1472   int64_t carry4;
   1473   int64_t carry5;
   1474   int64_t carry6;
   1475   int64_t carry7;
   1476   int64_t carry8;
   1477   int64_t carry9;
   1478   int64_t carry10;
   1479   int64_t carry11;
   1480   int64_t carry12;
   1481   int64_t carry13;
   1482   int64_t carry14;
   1483   int64_t carry15;
   1484   int64_t carry16;
   1485   int64_t carry17;
   1486   int64_t carry18;
   1487   int64_t carry19;
   1488   int64_t carry20;
   1489   int64_t carry21;
   1490   int64_t carry22;
   1491 
   1492   s0 = c0 + a0 * b0;
   1493   s1 = c1 + a0 * b1 + a1 * b0;
   1494   s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
   1495   s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
   1496   s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
   1497   s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
   1498   s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
   1499   s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
   1500        a6 * b1 + a7 * b0;
   1501   s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
   1502        a6 * b2 + a7 * b1 + a8 * b0;
   1503   s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
   1504        a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
   1505   s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
   1506         a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
   1507   s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
   1508         a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
   1509   s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
   1510         a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
   1511   s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
   1512         a9 * b4 + a10 * b3 + a11 * b2;
   1513   s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
   1514         a10 * b4 + a11 * b3;
   1515   s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
   1516         a11 * b4;
   1517   s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
   1518   s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
   1519   s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
   1520   s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
   1521   s20 = a9 * b11 + a10 * b10 + a11 * b9;
   1522   s21 = a10 * b11 + a11 * b10;
   1523   s22 = a11 * b11;
   1524   s23 = 0;
   1525 
   1526   carry0 = (s0 + (1 << 20)) >> 21;
   1527   s1 += carry0;
   1528   s0 -= int64_lshift21(carry0);
   1529   carry2 = (s2 + (1 << 20)) >> 21;
   1530   s3 += carry2;
   1531   s2 -= int64_lshift21(carry2);
   1532   carry4 = (s4 + (1 << 20)) >> 21;
   1533   s5 += carry4;
   1534   s4 -= int64_lshift21(carry4);
   1535   carry6 = (s6 + (1 << 20)) >> 21;
   1536   s7 += carry6;
   1537   s6 -= int64_lshift21(carry6);
   1538   carry8 = (s8 + (1 << 20)) >> 21;
   1539   s9 += carry8;
   1540   s8 -= int64_lshift21(carry8);
   1541   carry10 = (s10 + (1 << 20)) >> 21;
   1542   s11 += carry10;
   1543   s10 -= int64_lshift21(carry10);
   1544   carry12 = (s12 + (1 << 20)) >> 21;
   1545   s13 += carry12;
   1546   s12 -= int64_lshift21(carry12);
   1547   carry14 = (s14 + (1 << 20)) >> 21;
   1548   s15 += carry14;
   1549   s14 -= int64_lshift21(carry14);
   1550   carry16 = (s16 + (1 << 20)) >> 21;
   1551   s17 += carry16;
   1552   s16 -= int64_lshift21(carry16);
   1553   carry18 = (s18 + (1 << 20)) >> 21;
   1554   s19 += carry18;
   1555   s18 -= int64_lshift21(carry18);
   1556   carry20 = (s20 + (1 << 20)) >> 21;
   1557   s21 += carry20;
   1558   s20 -= int64_lshift21(carry20);
   1559   carry22 = (s22 + (1 << 20)) >> 21;
   1560   s23 += carry22;
   1561   s22 -= int64_lshift21(carry22);
   1562 
   1563   carry1 = (s1 + (1 << 20)) >> 21;
   1564   s2 += carry1;
   1565   s1 -= int64_lshift21(carry1);
   1566   carry3 = (s3 + (1 << 20)) >> 21;
   1567   s4 += carry3;
   1568   s3 -= int64_lshift21(carry3);
   1569   carry5 = (s5 + (1 << 20)) >> 21;
   1570   s6 += carry5;
   1571   s5 -= int64_lshift21(carry5);
   1572   carry7 = (s7 + (1 << 20)) >> 21;
   1573   s8 += carry7;
   1574   s7 -= int64_lshift21(carry7);
   1575   carry9 = (s9 + (1 << 20)) >> 21;
   1576   s10 += carry9;
   1577   s9 -= int64_lshift21(carry9);
   1578   carry11 = (s11 + (1 << 20)) >> 21;
   1579   s12 += carry11;
   1580   s11 -= int64_lshift21(carry11);
   1581   carry13 = (s13 + (1 << 20)) >> 21;
   1582   s14 += carry13;
   1583   s13 -= int64_lshift21(carry13);
   1584   carry15 = (s15 + (1 << 20)) >> 21;
   1585   s16 += carry15;
   1586   s15 -= int64_lshift21(carry15);
   1587   carry17 = (s17 + (1 << 20)) >> 21;
   1588   s18 += carry17;
   1589   s17 -= int64_lshift21(carry17);
   1590   carry19 = (s19 + (1 << 20)) >> 21;
   1591   s20 += carry19;
   1592   s19 -= int64_lshift21(carry19);
   1593   carry21 = (s21 + (1 << 20)) >> 21;
   1594   s22 += carry21;
   1595   s21 -= int64_lshift21(carry21);
   1596 
   1597   s11 += s23 * 666643;
   1598   s12 += s23 * 470296;
   1599   s13 += s23 * 654183;
   1600   s14 -= s23 * 997805;
   1601   s15 += s23 * 136657;
   1602   s16 -= s23 * 683901;
   1603   s23 = 0;
   1604 
   1605   s10 += s22 * 666643;
   1606   s11 += s22 * 470296;
   1607   s12 += s22 * 654183;
   1608   s13 -= s22 * 997805;
   1609   s14 += s22 * 136657;
   1610   s15 -= s22 * 683901;
   1611   s22 = 0;
   1612 
   1613   s9 += s21 * 666643;
   1614   s10 += s21 * 470296;
   1615   s11 += s21 * 654183;
   1616   s12 -= s21 * 997805;
   1617   s13 += s21 * 136657;
   1618   s14 -= s21 * 683901;
   1619   s21 = 0;
   1620 
   1621   s8 += s20 * 666643;
   1622   s9 += s20 * 470296;
   1623   s10 += s20 * 654183;
   1624   s11 -= s20 * 997805;
   1625   s12 += s20 * 136657;
   1626   s13 -= s20 * 683901;
   1627   s20 = 0;
   1628 
   1629   s7 += s19 * 666643;
   1630   s8 += s19 * 470296;
   1631   s9 += s19 * 654183;
   1632   s10 -= s19 * 997805;
   1633   s11 += s19 * 136657;
   1634   s12 -= s19 * 683901;
   1635   s19 = 0;
   1636 
   1637   s6 += s18 * 666643;
   1638   s7 += s18 * 470296;
   1639   s8 += s18 * 654183;
   1640   s9 -= s18 * 997805;
   1641   s10 += s18 * 136657;
   1642   s11 -= s18 * 683901;
   1643   s18 = 0;
   1644 
   1645   carry6 = (s6 + (1 << 20)) >> 21;
   1646   s7 += carry6;
   1647   s6 -= int64_lshift21(carry6);
   1648   carry8 = (s8 + (1 << 20)) >> 21;
   1649   s9 += carry8;
   1650   s8 -= int64_lshift21(carry8);
   1651   carry10 = (s10 + (1 << 20)) >> 21;
   1652   s11 += carry10;
   1653   s10 -= int64_lshift21(carry10);
   1654   carry12 = (s12 + (1 << 20)) >> 21;
   1655   s13 += carry12;
   1656   s12 -= int64_lshift21(carry12);
   1657   carry14 = (s14 + (1 << 20)) >> 21;
   1658   s15 += carry14;
   1659   s14 -= int64_lshift21(carry14);
   1660   carry16 = (s16 + (1 << 20)) >> 21;
   1661   s17 += carry16;
   1662   s16 -= int64_lshift21(carry16);
   1663 
   1664   carry7 = (s7 + (1 << 20)) >> 21;
   1665   s8 += carry7;
   1666   s7 -= int64_lshift21(carry7);
   1667   carry9 = (s9 + (1 << 20)) >> 21;
   1668   s10 += carry9;
   1669   s9 -= int64_lshift21(carry9);
   1670   carry11 = (s11 + (1 << 20)) >> 21;
   1671   s12 += carry11;
   1672   s11 -= int64_lshift21(carry11);
   1673   carry13 = (s13 + (1 << 20)) >> 21;
   1674   s14 += carry13;
   1675   s13 -= int64_lshift21(carry13);
   1676   carry15 = (s15 + (1 << 20)) >> 21;
   1677   s16 += carry15;
   1678   s15 -= int64_lshift21(carry15);
   1679 
   1680   s5 += s17 * 666643;
   1681   s6 += s17 * 470296;
   1682   s7 += s17 * 654183;
   1683   s8 -= s17 * 997805;
   1684   s9 += s17 * 136657;
   1685   s10 -= s17 * 683901;
   1686   s17 = 0;
   1687 
   1688   s4 += s16 * 666643;
   1689   s5 += s16 * 470296;
   1690   s6 += s16 * 654183;
   1691   s7 -= s16 * 997805;
   1692   s8 += s16 * 136657;
   1693   s9 -= s16 * 683901;
   1694   s16 = 0;
   1695 
   1696   s3 += s15 * 666643;
   1697   s4 += s15 * 470296;
   1698   s5 += s15 * 654183;
   1699   s6 -= s15 * 997805;
   1700   s7 += s15 * 136657;
   1701   s8 -= s15 * 683901;
   1702   s15 = 0;
   1703 
   1704   s2 += s14 * 666643;
   1705   s3 += s14 * 470296;
   1706   s4 += s14 * 654183;
   1707   s5 -= s14 * 997805;
   1708   s6 += s14 * 136657;
   1709   s7 -= s14 * 683901;
   1710   s14 = 0;
   1711 
   1712   s1 += s13 * 666643;
   1713   s2 += s13 * 470296;
   1714   s3 += s13 * 654183;
   1715   s4 -= s13 * 997805;
   1716   s5 += s13 * 136657;
   1717   s6 -= s13 * 683901;
   1718   s13 = 0;
   1719 
   1720   s0 += s12 * 666643;
   1721   s1 += s12 * 470296;
   1722   s2 += s12 * 654183;
   1723   s3 -= s12 * 997805;
   1724   s4 += s12 * 136657;
   1725   s5 -= s12 * 683901;
   1726   s12 = 0;
   1727 
   1728   carry0 = (s0 + (1 << 20)) >> 21;
   1729   s1 += carry0;
   1730   s0 -= int64_lshift21(carry0);
   1731   carry2 = (s2 + (1 << 20)) >> 21;
   1732   s3 += carry2;
   1733   s2 -= int64_lshift21(carry2);
   1734   carry4 = (s4 + (1 << 20)) >> 21;
   1735   s5 += carry4;
   1736   s4 -= int64_lshift21(carry4);
   1737   carry6 = (s6 + (1 << 20)) >> 21;
   1738   s7 += carry6;
   1739   s6 -= int64_lshift21(carry6);
   1740   carry8 = (s8 + (1 << 20)) >> 21;
   1741   s9 += carry8;
   1742   s8 -= int64_lshift21(carry8);
   1743   carry10 = (s10 + (1 << 20)) >> 21;
   1744   s11 += carry10;
   1745   s10 -= int64_lshift21(carry10);
   1746 
   1747   carry1 = (s1 + (1 << 20)) >> 21;
   1748   s2 += carry1;
   1749   s1 -= int64_lshift21(carry1);
   1750   carry3 = (s3 + (1 << 20)) >> 21;
   1751   s4 += carry3;
   1752   s3 -= int64_lshift21(carry3);
   1753   carry5 = (s5 + (1 << 20)) >> 21;
   1754   s6 += carry5;
   1755   s5 -= int64_lshift21(carry5);
   1756   carry7 = (s7 + (1 << 20)) >> 21;
   1757   s8 += carry7;
   1758   s7 -= int64_lshift21(carry7);
   1759   carry9 = (s9 + (1 << 20)) >> 21;
   1760   s10 += carry9;
   1761   s9 -= int64_lshift21(carry9);
   1762   carry11 = (s11 + (1 << 20)) >> 21;
   1763   s12 += carry11;
   1764   s11 -= int64_lshift21(carry11);
   1765 
   1766   s0 += s12 * 666643;
   1767   s1 += s12 * 470296;
   1768   s2 += s12 * 654183;
   1769   s3 -= s12 * 997805;
   1770   s4 += s12 * 136657;
   1771   s5 -= s12 * 683901;
   1772   s12 = 0;
   1773 
   1774   carry0 = s0 >> 21;
   1775   s1 += carry0;
   1776   s0 -= int64_lshift21(carry0);
   1777   carry1 = s1 >> 21;
   1778   s2 += carry1;
   1779   s1 -= int64_lshift21(carry1);
   1780   carry2 = s2 >> 21;
   1781   s3 += carry2;
   1782   s2 -= int64_lshift21(carry2);
   1783   carry3 = s3 >> 21;
   1784   s4 += carry3;
   1785   s3 -= int64_lshift21(carry3);
   1786   carry4 = s4 >> 21;
   1787   s5 += carry4;
   1788   s4 -= int64_lshift21(carry4);
   1789   carry5 = s5 >> 21;
   1790   s6 += carry5;
   1791   s5 -= int64_lshift21(carry5);
   1792   carry6 = s6 >> 21;
   1793   s7 += carry6;
   1794   s6 -= int64_lshift21(carry6);
   1795   carry7 = s7 >> 21;
   1796   s8 += carry7;
   1797   s7 -= int64_lshift21(carry7);
   1798   carry8 = s8 >> 21;
   1799   s9 += carry8;
   1800   s8 -= int64_lshift21(carry8);
   1801   carry9 = s9 >> 21;
   1802   s10 += carry9;
   1803   s9 -= int64_lshift21(carry9);
   1804   carry10 = s10 >> 21;
   1805   s11 += carry10;
   1806   s10 -= int64_lshift21(carry10);
   1807   carry11 = s11 >> 21;
   1808   s12 += carry11;
   1809   s11 -= int64_lshift21(carry11);
   1810 
   1811   s0 += s12 * 666643;
   1812   s1 += s12 * 470296;
   1813   s2 += s12 * 654183;
   1814   s3 -= s12 * 997805;
   1815   s4 += s12 * 136657;
   1816   s5 -= s12 * 683901;
   1817   s12 = 0;
   1818 
   1819   carry0 = s0 >> 21;
   1820   s1 += carry0;
   1821   s0 -= int64_lshift21(carry0);
   1822   carry1 = s1 >> 21;
   1823   s2 += carry1;
   1824   s1 -= int64_lshift21(carry1);
   1825   carry2 = s2 >> 21;
   1826   s3 += carry2;
   1827   s2 -= int64_lshift21(carry2);
   1828   carry3 = s3 >> 21;
   1829   s4 += carry3;
   1830   s3 -= int64_lshift21(carry3);
   1831   carry4 = s4 >> 21;
   1832   s5 += carry4;
   1833   s4 -= int64_lshift21(carry4);
   1834   carry5 = s5 >> 21;
   1835   s6 += carry5;
   1836   s5 -= int64_lshift21(carry5);
   1837   carry6 = s6 >> 21;
   1838   s7 += carry6;
   1839   s6 -= int64_lshift21(carry6);
   1840   carry7 = s7 >> 21;
   1841   s8 += carry7;
   1842   s7 -= int64_lshift21(carry7);
   1843   carry8 = s8 >> 21;
   1844   s9 += carry8;
   1845   s8 -= int64_lshift21(carry8);
   1846   carry9 = s9 >> 21;
   1847   s10 += carry9;
   1848   s9 -= int64_lshift21(carry9);
   1849   carry10 = s10 >> 21;
   1850   s11 += carry10;
   1851   s10 -= int64_lshift21(carry10);
   1852 
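           // The limbs s0..s11 now hold the fully reduced scalar
           //   s = s0 + s1*2^21 + s2*2^42 + ... + s11*2^231,
           // and the lines below write its 32-byte little-endian encoding.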
   1853   s[0] = s0 >> 0;
   1854   s[1] = s0 >> 8;
   1855   s[2] = (s0 >> 16) | (s1 << 5);
   1856   s[3] = s1 >> 3;
   1857   s[4] = s1 >> 11;
   1858   s[5] = (s1 >> 19) | (s2 << 2);
   1859   s[6] = s2 >> 6;
   1860   s[7] = (s2 >> 14) | (s3 << 7);
   1861   s[8] = s3 >> 1;
   1862   s[9] = s3 >> 9;
   1863   s[10] = (s3 >> 17) | (s4 << 4);
   1864   s[11] = s4 >> 4;
   1865   s[12] = s4 >> 12;
   1866   s[13] = (s4 >> 20) | (s5 << 1);
   1867   s[14] = s5 >> 7;
   1868   s[15] = (s5 >> 15) | (s6 << 6);
   1869   s[16] = s6 >> 2;
   1870   s[17] = s6 >> 10;
   1871   s[18] = (s6 >> 18) | (s7 << 3);
   1872   s[19] = s7 >> 5;
   1873   s[20] = s7 >> 13;
   1874   s[21] = s8 >> 0;
   1875   s[22] = s8 >> 8;
   1876   s[23] = (s8 >> 16) | (s9 << 5);
   1877   s[24] = s9 >> 3;
   1878   s[25] = s9 >> 11;
   1879   s[26] = (s9 >> 19) | (s10 << 2);
   1880   s[27] = s10 >> 6;
   1881   s[28] = (s10 >> 14) | (s11 << 7);
   1882   s[29] = s11 >> 1;
   1883   s[30] = s11 >> 9;
   1884   s[31] = s11 >> 17;
   1885 }
   1886 
   1887 void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
   1888   uint8_t seed[32];
   1889   RAND_bytes(seed, 32);
   1890   ED25519_keypair_from_seed(out_public_key, out_private_key, seed);
   1891 }
   1892 
   1893 int ED25519_sign(uint8_t out_sig[64], const uint8_t *message,
   1894                  size_t message_len, const uint8_t private_key[64]) {
   1895   // NOTE: The documentation on this function says that it returns zero on
   1896   // allocation failure. While that can't happen with the current
   1897   // implementation, we want to reserve the ability to allocate in this
   1898   // implementation in the future.
   1899 
   1900   uint8_t az[SHA512_DIGEST_LENGTH];
   1901   SHA512(private_key, 32, az);
   1902 
   1903   az[0] &= 248;
   1904   az[31] &= 63;
   1905   az[31] |= 64;
   1906 
   1907   SHA512_CTX hash_ctx;
   1908   SHA512_Init(&hash_ctx);
   1909   SHA512_Update(&hash_ctx, az + 32, 32);
   1910   SHA512_Update(&hash_ctx, message, message_len);
   1911   uint8_t nonce[SHA512_DIGEST_LENGTH];
   1912   SHA512_Final(nonce, &hash_ctx);
   1913 
   1914   x25519_sc_reduce(nonce);
   1915   ge_p3 R;
   1916   x25519_ge_scalarmult_base(&R, nonce);
   1917   ge_p3_tobytes(out_sig, &R);
   1918 
   1919   SHA512_Init(&hash_ctx);
   1920   SHA512_Update(&hash_ctx, out_sig, 32);
   1921   SHA512_Update(&hash_ctx, private_key + 32, 32);
   1922   SHA512_Update(&hash_ctx, message, message_len);
   1923   uint8_t hram[SHA512_DIGEST_LENGTH];
   1924   SHA512_Final(hram, &hash_ctx);
   1925 
   1926   x25519_sc_reduce(hram);
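           // The second half of the signature is S = (hram * az + nonce) mod the
           // group order; sc_muladd computes a*b + c mod l.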
   1927   sc_muladd(out_sig + 32, hram, az, nonce);
   1928 
   1929   return 1;
   1930 }
   1931 
   1932 int ED25519_verify(const uint8_t *message, size_t message_len,
   1933                    const uint8_t signature[64], const uint8_t public_key[32]) {
   1934   ge_p3 A;
   1935   if ((signature[63] & 224) != 0 ||
   1936       !x25519_ge_frombytes_vartime(&A, public_key)) {
   1937     return 0;
   1938   }
   1939 
   1940   fe_loose t;
   1941   fe_neg(&t, &A.X);
   1942   fe_carry(&A.X, &t);
   1943   fe_neg(&t, &A.T);
   1944   fe_carry(&A.T, &t);
   1945 
   1946   uint8_t pkcopy[32];
   1947   OPENSSL_memcpy(pkcopy, public_key, 32);
   1948   uint8_t rcopy[32];
   1949   OPENSSL_memcpy(rcopy, signature, 32);
   1950   union {
   1951     uint64_t u64[4];
   1952     uint8_t u8[32];
   1953   } scopy;
   1954   OPENSSL_memcpy(&scopy.u8[0], signature + 32, 32);
   1955 
    1956   // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be
    1957   // in the range [0, order) to prevent signature malleability.
   1958 
    1959   // kOrder is the Ed25519 group order in four little-endian 64-bit words.
   1960   static const uint64_t kOrder[4] = {
   1961     UINT64_C(0x5812631a5cf5d3ed),
   1962     UINT64_C(0x14def9dea2f79cd6),
   1963     0,
   1964     UINT64_C(0x1000000000000000),
   1965   };
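           // Compare s to kOrder from the most-significant word down: a larger
           // word means s > order, a smaller word means s < order, and running
           // out of words means s == order, which is also rejected.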
   1966   for (size_t i = 3;; i--) {
   1967     if (scopy.u64[i] > kOrder[i]) {
   1968       return 0;
   1969     } else if (scopy.u64[i] < kOrder[i]) {
   1970       break;
   1971     } else if (i == 0) {
   1972       return 0;
   1973     }
   1974   }
   1975 
   1976   SHA512_CTX hash_ctx;
   1977   SHA512_Init(&hash_ctx);
   1978   SHA512_Update(&hash_ctx, signature, 32);
   1979   SHA512_Update(&hash_ctx, public_key, 32);
   1980   SHA512_Update(&hash_ctx, message, message_len);
   1981   uint8_t h[SHA512_DIGEST_LENGTH];
   1982   SHA512_Final(h, &hash_ctx);
   1983 
   1984   x25519_sc_reduce(h);
   1985 
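           // ge_double_scalarmult_vartime computes h*A + s*B. Because A was
           // negated above, this is s*B - h*A_pub, which must equal the R encoded
           // in the first 32 bytes of the signature.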
   1986   ge_p2 R;
   1987   ge_double_scalarmult_vartime(&R, h, &A, scopy.u8);
   1988 
   1989   uint8_t rcheck[32];
   1990   x25519_ge_tobytes(rcheck, &R);
   1991 
   1992   return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
   1993 }
   1994 
   1995 void ED25519_keypair_from_seed(uint8_t out_public_key[32],
   1996                                uint8_t out_private_key[64],
   1997                                const uint8_t seed[32]) {
   1998   uint8_t az[SHA512_DIGEST_LENGTH];
   1999   SHA512(seed, 32, az);
   2000 
   2001   az[0] &= 248;
   2002   az[31] &= 127;
   2003   az[31] |= 64;
   2004 
   2005   ge_p3 A;
   2006   x25519_ge_scalarmult_base(&A, az);
   2007   ge_p3_tobytes(out_public_key, &A);
   2008 
   2009   OPENSSL_memcpy(out_private_key, seed, 32);
   2010   OPENSSL_memcpy(out_private_key + 32, out_public_key, 32);
   2011 }
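
         // Usage sketch (editor's addition, not part of the original file):
         // generate a key pair, sign a message and verify the resulting signature
         // using the public API above. The function and message names are
         // illustrative only.
         static int ed25519_usage_example(void) {
           uint8_t public_key[32], private_key[64], sig[64];
           static const uint8_t kMsg[] = {'h', 'i'};

           ED25519_keypair(public_key, private_key);
           if (!ED25519_sign(sig, kMsg, sizeof(kMsg), private_key)) {
             return 0;  // reserved failure case (e.g. future allocation failure)
           }
           return ED25519_verify(kMsg, sizeof(kMsg), sig, public_key);
         }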
   2012 
   2013 
   2014 static void x25519_scalar_mult_generic(uint8_t out[32],
   2015                                        const uint8_t scalar[32],
   2016                                        const uint8_t point[32]) {
   2017   fe x1, x2, z2, x3, z3, tmp0, tmp1;
   2018   fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
   2019 
   2020   uint8_t e[32];
   2021   OPENSSL_memcpy(e, scalar, 32);
   2022   e[0] &= 248;
   2023   e[31] &= 127;
   2024   e[31] |= 64;
   2025 
   2026   // The following implementation was transcribed to Coq and proven to
   2027   // correspond to unary scalar multiplication in affine coordinates given that
   2028   // x1 != 0 is the x coordinate of some point on the curve. It was also checked
   2029   // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
   2030   // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
   2031   // underlying field, so it applies to Curve25519 itself and the quadratic
   2032   // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
   2033   // correctly simulates extension-field arithmetic on prime-field values.
   2034   // The decoding of the byte array representation of e was not considered.
   2035   // Specification of Montgomery curves in affine coordinates:
   2036   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
   2037   // Proof that these form a group that is isomorphic to a Weierstrass curve:
   2038   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
   2039   // Coq transcription and correctness proof of the loop (where scalarbits=255):
   2040   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
   2041   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
   2042   // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
   2043   fe_frombytes(&x1, point);
   2044   fe_1(&x2);
   2045   fe_0(&z2);
   2046   fe_copy(&x3, &x1);
   2047   fe_1(&z3);
   2048 
   2049   unsigned swap = 0;
   2050   int pos;
   2051   for (pos = 254; pos >= 0; --pos) {
   2052     // loop invariant as of right before the test, for the case where x1 != 0:
   2053     //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
   2054     //   let r := e >> (pos+1) in the following equalities of projective points:
   2055     //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
   2056     //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
   2057     //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
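             // b is bit |pos| of the clamped scalar e; bits are scanned from 254
             // down to 0.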
   2058     unsigned b = 1 & (e[pos / 8] >> (pos & 7));
   2059     swap ^= b;
   2060     fe_cswap(&x2, &x3, swap);
   2061     fe_cswap(&z2, &z3, swap);
   2062     swap = b;
   2063     // Coq transcription of ladderstep formula (called from transcribed loop):
   2064     // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
   2065     // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
   2066     // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
   2067     // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
   2068     fe_sub(&tmp0l, &x3, &z3);
   2069     fe_sub(&tmp1l, &x2, &z2);
   2070     fe_add(&x2l, &x2, &z2);
   2071     fe_add(&z2l, &x3, &z3);
   2072     fe_mul_tll(&z3, &tmp0l, &x2l);
   2073     fe_mul_tll(&z2, &z2l, &tmp1l);
   2074     fe_sq_tl(&tmp0, &tmp1l);
   2075     fe_sq_tl(&tmp1, &x2l);
   2076     fe_add(&x3l, &z3, &z2);
   2077     fe_sub(&z2l, &z3, &z2);
   2078     fe_mul_ttt(&x2, &tmp1, &tmp0);
   2079     fe_sub(&tmp1l, &tmp1, &tmp0);
   2080     fe_sq_tl(&z2, &z2l);
   2081     fe_mul121666(&z3, &tmp1l);
   2082     fe_sq_tl(&x3, &x3l);
   2083     fe_add(&tmp0l, &tmp0, &z3);
   2084     fe_mul_ttt(&z3, &x1, &z2);
   2085     fe_mul_tll(&z2, &tmp1l, &tmp0l);
   2086   }
   2087   // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
   2088   fe_cswap(&x2, &x3, swap);
   2089   fe_cswap(&z2, &z3, swap);
   2090 
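           // Convert (x2 : z2) to the affine u-coordinate x2/z2 and serialize it.
           // Per the precondition above, fe_invert(0) = 0, so a zero denominator
           // yields an all-zero output.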
   2091   fe_invert(&z2, &z2);
   2092   fe_mul_ttt(&x2, &x2, &z2);
   2093   fe_tobytes(out, &x2);
   2094 }
   2095 
   2096 static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
   2097                                const uint8_t point[32]) {
   2098 #if defined(BORINGSSL_X25519_NEON)
   2099   if (CRYPTO_is_NEON_capable()) {
   2100     x25519_NEON(out, scalar, point);
   2101     return;
   2102   }
   2103 #endif
   2104 
   2105   x25519_scalar_mult_generic(out, scalar, point);
   2106 }
   2107 
   2108 void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
   2109   RAND_bytes(out_private_key, 32);
   2110 
   2111   // All X25519 implementations should decode scalars correctly (see
   2112   // https://tools.ietf.org/html/rfc7748#section-5). However, if an
    2113   // implementation doesn't, it might still interoperate with random keys
    2114   // some fraction of the time, because a few of those keys will, by
    2115   // chance, happen to be correctly formed.
   2116   //
   2117   // Thus we do the opposite of the masking here to make sure that our private
   2118   // keys are never correctly masked and so, hopefully, any incorrect
   2119   // implementations are deterministically broken.
   2120   //
   2121   // This does not affect security because, although we're throwing away
   2122   // entropy, a valid implementation of scalarmult should throw away the exact
   2123   // same bits anyway.
   2124   out_private_key[0] |= ~248;
   2125   out_private_key[31] &= ~64;
   2126   out_private_key[31] |= ~127;
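
           // Illustration (editor's addition): a conforming implementation
           // re-applies the RFC 7748 clamping before use, e.g.
           //
           //   e[0] &= 248;   // clears the bits set by |= ~248 above
           //   e[31] &= 127;  // clears the bit set by |= ~127 above
           //   e[31] |= 64;   // restores the bit cleared by &= ~64 above
           //
           // so the clamped scalar, and hence the derived public value and any
           // shared secrets, are unaffected by the masking above.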
   2127 
   2128   X25519_public_from_private(out_public_value, out_private_key);
   2129 }
   2130 
   2131 int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
   2132            const uint8_t peer_public_value[32]) {
   2133   static const uint8_t kZeros[32] = {0};
   2134   x25519_scalar_mult(out_shared_key, private_key, peer_public_value);
   2135   // The all-zero output results when the input is a point of small order.
   2136   return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
   2137 }
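
         // Key-agreement sketch (editor's addition, not part of the original
         // file): both parties generate a key pair, exchange public values and
         // derive the same 32-byte shared secret. The return value of X25519 must
         // be checked so that small-order peer points are rejected.
         static int x25519_usage_example(void) {
           uint8_t alice_private[32], alice_public[32];
           uint8_t bob_private[32], bob_public[32];
           uint8_t alice_shared[32], bob_shared[32];

           X25519_keypair(alice_public, alice_private);
           X25519_keypair(bob_public, bob_private);

           if (!X25519(alice_shared, alice_private, bob_public) ||
               !X25519(bob_shared, bob_private, alice_public)) {
             return 0;  // the peer supplied a small-order point
           }
           // Both sides now hold the same secret.
           return CRYPTO_memcmp(alice_shared, bob_shared, 32) == 0;
         }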
   2138 
   2139 void X25519_public_from_private(uint8_t out_public_value[32],
   2140                                 const uint8_t private_key[32]) {
   2141 #if defined(BORINGSSL_X25519_NEON)
   2142   if (CRYPTO_is_NEON_capable()) {
    2143     static const uint8_t kMontgomeryBasePoint[32] = {9};
    2144     x25519_NEON(out_public_value, private_key, kMontgomeryBasePoint);
   2145     return;
   2146   }
   2147 #endif
   2148 
   2149   uint8_t e[32];
   2150   OPENSSL_memcpy(e, private_key, 32);
   2151   e[0] &= 248;
   2152   e[31] &= 127;
   2153   e[31] |= 64;
   2154 
   2155   ge_p3 A;
   2156   x25519_ge_scalarmult_base(&A, e);
   2157 
   2158   // We only need the u-coordinate of the curve25519 point. The map is
   2159   // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y).
   2160   fe_loose zplusy, zminusy;
   2161   fe zminusy_inv;
   2162   fe_add(&zplusy, &A.Z, &A.Y);
   2163   fe_sub(&zminusy, &A.Z, &A.Y);
   2164   fe_loose_invert(&zminusy_inv, &zminusy);
   2165   fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);
   2166   fe_tobytes(out_public_value, &zminusy_inv);
   2167 }
   2168