Home | History | Annotate | Download | only in crypto
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 /*
      6  * curve25519-donna: Curve25519 elliptic curve, public key function
      7  *
      8  * http://code.google.com/p/curve25519-donna/
      9  *
     10  * Adam Langley <agl (at) imperialviolet.org>
     11  *
     12  * Derived from public domain C code by Daniel J. Bernstein <djb (at) cr.yp.to>
     13  *
     14  * More information about curve25519 can be found here
     15  *   http://cr.yp.to/ecdh.html
     16  *
     17  * djb's sample implementation of curve25519 is written in a special assembly
     18  * language called qhasm and uses the floating point registers.
     19  *
     20  * This is, almost, a clean room reimplementation from the curve25519 paper. It
     21  * uses many of the tricks described therein. Only the crecip function is taken
     22  * from the sample implementation.
     23  */
     24 
     25 #include <string.h>
     26 #include <stdint.h>
     27 
     28 typedef uint8_t u8;
     29 typedef int32_t s32;
     30 typedef int64_t limb;
     31 
     32 /* Field element representation:
     33  *
     34  * Field elements are written as an array of signed, 64-bit limbs, least
     35  * significant first. The value of the field element is:
     36  *   x[0] + 2^26x[1] + x^51x[2] + 2^102x[3] + ...
     37  *
     38  * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
     39  */
     40 
     41 /* Sum two numbers: output += in */
     42 static void fsum(limb *output, const limb *in) {
     43   unsigned i;
     44   for (i = 0; i < 10; i += 2) {
     45     output[0+i] = (output[0+i] + in[0+i]);
     46     output[1+i] = (output[1+i] + in[1+i]);
     47   }
     48 }
     49 
     50 /* Find the difference of two numbers: output = in - output
     51  * (note the order of the arguments!)
     52  */
     53 static void fdifference(limb *output, const limb *in) {
     54   unsigned i;
     55   for (i = 0; i < 10; ++i) {
     56     output[i] = (in[i] - output[i]);
     57   }
     58 }
     59 
     60 /* Multiply a number my a scalar: output = in * scalar */
     61 static void fscalar_product(limb *output, const limb *in, const limb scalar) {
     62   unsigned i;
     63   for (i = 0; i < 10; ++i) {
     64     output[i] = in[i] * scalar;
     65   }
     66 }
     67 
     68 /* Multiply two numbers: output = in2 * in
     69  *
     70  * output must be distinct to both inputs. The inputs are reduced coefficient
     71  * form, the output is not.
     72  */
     73 static void fproduct(limb *output, const limb *in2, const limb *in) {
     74   output[0] =       ((limb) ((s32) in2[0])) * ((s32) in[0]);
     75   output[1] =       ((limb) ((s32) in2[0])) * ((s32) in[1]) +
     76                     ((limb) ((s32) in2[1])) * ((s32) in[0]);
     77   output[2] =  2 *  ((limb) ((s32) in2[1])) * ((s32) in[1]) +
     78                     ((limb) ((s32) in2[0])) * ((s32) in[2]) +
     79                     ((limb) ((s32) in2[2])) * ((s32) in[0]);
     80   output[3] =       ((limb) ((s32) in2[1])) * ((s32) in[2]) +
     81                     ((limb) ((s32) in2[2])) * ((s32) in[1]) +
     82                     ((limb) ((s32) in2[0])) * ((s32) in[3]) +
     83                     ((limb) ((s32) in2[3])) * ((s32) in[0]);
     84   output[4] =       ((limb) ((s32) in2[2])) * ((s32) in[2]) +
     85                2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
     86                     ((limb) ((s32) in2[3])) * ((s32) in[1])) +
     87                     ((limb) ((s32) in2[0])) * ((s32) in[4]) +
     88                     ((limb) ((s32) in2[4])) * ((s32) in[0]);
     89   output[5] =       ((limb) ((s32) in2[2])) * ((s32) in[3]) +
     90                     ((limb) ((s32) in2[3])) * ((s32) in[2]) +
     91                     ((limb) ((s32) in2[1])) * ((s32) in[4]) +
     92                     ((limb) ((s32) in2[4])) * ((s32) in[1]) +
     93                     ((limb) ((s32) in2[0])) * ((s32) in[5]) +
     94                     ((limb) ((s32) in2[5])) * ((s32) in[0]);
     95   output[6] =  2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
     96                     ((limb) ((s32) in2[1])) * ((s32) in[5]) +
     97                     ((limb) ((s32) in2[5])) * ((s32) in[1])) +
     98                     ((limb) ((s32) in2[2])) * ((s32) in[4]) +
     99                     ((limb) ((s32) in2[4])) * ((s32) in[2]) +
    100                     ((limb) ((s32) in2[0])) * ((s32) in[6]) +
    101                     ((limb) ((s32) in2[6])) * ((s32) in[0]);
    102   output[7] =       ((limb) ((s32) in2[3])) * ((s32) in[4]) +
    103                     ((limb) ((s32) in2[4])) * ((s32) in[3]) +
    104                     ((limb) ((s32) in2[2])) * ((s32) in[5]) +
    105                     ((limb) ((s32) in2[5])) * ((s32) in[2]) +
    106                     ((limb) ((s32) in2[1])) * ((s32) in[6]) +
    107                     ((limb) ((s32) in2[6])) * ((s32) in[1]) +
    108                     ((limb) ((s32) in2[0])) * ((s32) in[7]) +
    109                     ((limb) ((s32) in2[7])) * ((s32) in[0]);
    110   output[8] =       ((limb) ((s32) in2[4])) * ((s32) in[4]) +
    111                2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
    112                     ((limb) ((s32) in2[5])) * ((s32) in[3]) +
    113                     ((limb) ((s32) in2[1])) * ((s32) in[7]) +
    114                     ((limb) ((s32) in2[7])) * ((s32) in[1])) +
    115                     ((limb) ((s32) in2[2])) * ((s32) in[6]) +
    116                     ((limb) ((s32) in2[6])) * ((s32) in[2]) +
    117                     ((limb) ((s32) in2[0])) * ((s32) in[8]) +
    118                     ((limb) ((s32) in2[8])) * ((s32) in[0]);
    119   output[9] =       ((limb) ((s32) in2[4])) * ((s32) in[5]) +
    120                     ((limb) ((s32) in2[5])) * ((s32) in[4]) +
    121                     ((limb) ((s32) in2[3])) * ((s32) in[6]) +
    122                     ((limb) ((s32) in2[6])) * ((s32) in[3]) +
    123                     ((limb) ((s32) in2[2])) * ((s32) in[7]) +
    124                     ((limb) ((s32) in2[7])) * ((s32) in[2]) +
    125                     ((limb) ((s32) in2[1])) * ((s32) in[8]) +
    126                     ((limb) ((s32) in2[8])) * ((s32) in[1]) +
    127                     ((limb) ((s32) in2[0])) * ((s32) in[9]) +
    128                     ((limb) ((s32) in2[9])) * ((s32) in[0]);
    129   output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
    130                     ((limb) ((s32) in2[3])) * ((s32) in[7]) +
    131                     ((limb) ((s32) in2[7])) * ((s32) in[3]) +
    132                     ((limb) ((s32) in2[1])) * ((s32) in[9]) +
    133                     ((limb) ((s32) in2[9])) * ((s32) in[1])) +
    134                     ((limb) ((s32) in2[4])) * ((s32) in[6]) +
    135                     ((limb) ((s32) in2[6])) * ((s32) in[4]) +
    136                     ((limb) ((s32) in2[2])) * ((s32) in[8]) +
    137                     ((limb) ((s32) in2[8])) * ((s32) in[2]);
    138   output[11] =      ((limb) ((s32) in2[5])) * ((s32) in[6]) +
    139                     ((limb) ((s32) in2[6])) * ((s32) in[5]) +
    140                     ((limb) ((s32) in2[4])) * ((s32) in[7]) +
    141                     ((limb) ((s32) in2[7])) * ((s32) in[4]) +
    142                     ((limb) ((s32) in2[3])) * ((s32) in[8]) +
    143                     ((limb) ((s32) in2[8])) * ((s32) in[3]) +
    144                     ((limb) ((s32) in2[2])) * ((s32) in[9]) +
    145                     ((limb) ((s32) in2[9])) * ((s32) in[2]);
    146   output[12] =      ((limb) ((s32) in2[6])) * ((s32) in[6]) +
    147                2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
    148                     ((limb) ((s32) in2[7])) * ((s32) in[5]) +
    149                     ((limb) ((s32) in2[3])) * ((s32) in[9]) +
    150                     ((limb) ((s32) in2[9])) * ((s32) in[3])) +
    151                     ((limb) ((s32) in2[4])) * ((s32) in[8]) +
    152                     ((limb) ((s32) in2[8])) * ((s32) in[4]);
    153   output[13] =      ((limb) ((s32) in2[6])) * ((s32) in[7]) +
    154                     ((limb) ((s32) in2[7])) * ((s32) in[6]) +
    155                     ((limb) ((s32) in2[5])) * ((s32) in[8]) +
    156                     ((limb) ((s32) in2[8])) * ((s32) in[5]) +
    157                     ((limb) ((s32) in2[4])) * ((s32) in[9]) +
    158                     ((limb) ((s32) in2[9])) * ((s32) in[4]);
    159   output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
    160                     ((limb) ((s32) in2[5])) * ((s32) in[9]) +
    161                     ((limb) ((s32) in2[9])) * ((s32) in[5])) +
    162                     ((limb) ((s32) in2[6])) * ((s32) in[8]) +
    163                     ((limb) ((s32) in2[8])) * ((s32) in[6]);
    164   output[15] =      ((limb) ((s32) in2[7])) * ((s32) in[8]) +
    165                     ((limb) ((s32) in2[8])) * ((s32) in[7]) +
    166                     ((limb) ((s32) in2[6])) * ((s32) in[9]) +
    167                     ((limb) ((s32) in2[9])) * ((s32) in[6]);
    168   output[16] =      ((limb) ((s32) in2[8])) * ((s32) in[8]) +
    169                2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
    170                     ((limb) ((s32) in2[9])) * ((s32) in[7]));
    171   output[17] =      ((limb) ((s32) in2[8])) * ((s32) in[9]) +
    172                     ((limb) ((s32) in2[9])) * ((s32) in[8]);
    173   output[18] = 2 *  ((limb) ((s32) in2[9])) * ((s32) in[9]);
    174 }
    175 
    176 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
    177 static void freduce_degree(limb *output) {
    178   /* Each of these shifts and adds ends up multiplying the value by 19. */
    179   output[8] += output[18] << 4;
    180   output[8] += output[18] << 1;
    181   output[8] += output[18];
    182   output[7] += output[17] << 4;
    183   output[7] += output[17] << 1;
    184   output[7] += output[17];
    185   output[6] += output[16] << 4;
    186   output[6] += output[16] << 1;
    187   output[6] += output[16];
    188   output[5] += output[15] << 4;
    189   output[5] += output[15] << 1;
    190   output[5] += output[15];
    191   output[4] += output[14] << 4;
    192   output[4] += output[14] << 1;
    193   output[4] += output[14];
    194   output[3] += output[13] << 4;
    195   output[3] += output[13] << 1;
    196   output[3] += output[13];
    197   output[2] += output[12] << 4;
    198   output[2] += output[12] << 1;
    199   output[2] += output[12];
    200   output[1] += output[11] << 4;
    201   output[1] += output[11] << 1;
    202   output[1] += output[11];
    203   output[0] += output[10] << 4;
    204   output[0] += output[10] << 1;
    205   output[0] += output[10];
    206 }
    207 
    208 /* Reduce all coefficients of the short form input so that |x| < 2^26.
    209  *
    210  * On entry: |output[i]| < 2^62
    211  */
    212 static void freduce_coefficients(limb *output) {
    213   unsigned i;
    214   do {
    215     output[10] = 0;
    216 
    217     for (i = 0; i < 10; i += 2) {
    218       limb over = output[i] / 0x4000000l;
    219       output[i+1] += over;
    220       output[i] -= over * 0x4000000l;
    221 
    222       over = output[i+1] / 0x2000000;
    223       output[i+2] += over;
    224       output[i+1] -= over * 0x2000000;
    225     }
    226     output[0] += 19 * output[10];
    227   } while (output[10]);
    228 }
    229 
    230 /* A helpful wrapper around fproduct: output = in * in2.
    231  *
    232  * output must be distinct to both inputs. The output is reduced degree and
    233  * reduced coefficient.
    234  */
    235 static void
    236 fmul(limb *output, const limb *in, const limb *in2) {
    237   limb t[19];
    238   fproduct(t, in, in2);
    239   freduce_degree(t);
    240   freduce_coefficients(t);
    241   memcpy(output, t, sizeof(limb) * 10);
    242 }
    243 
    244 static void fsquare_inner(limb *output, const limb *in) {
    245   output[0] =       ((limb) ((s32) in[0])) * ((s32) in[0]);
    246   output[1] =  2 *  ((limb) ((s32) in[0])) * ((s32) in[1]);
    247   output[2] =  2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
    248                     ((limb) ((s32) in[0])) * ((s32) in[2]));
    249   output[3] =  2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
    250                     ((limb) ((s32) in[0])) * ((s32) in[3]));
    251   output[4] =       ((limb) ((s32) in[2])) * ((s32) in[2]) +
    252                4 *  ((limb) ((s32) in[1])) * ((s32) in[3]) +
    253                2 *  ((limb) ((s32) in[0])) * ((s32) in[4]);
    254   output[5] =  2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
    255                     ((limb) ((s32) in[1])) * ((s32) in[4]) +
    256                     ((limb) ((s32) in[0])) * ((s32) in[5]));
    257   output[6] =  2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
    258                     ((limb) ((s32) in[2])) * ((s32) in[4]) +
    259                     ((limb) ((s32) in[0])) * ((s32) in[6]) +
    260                2 *  ((limb) ((s32) in[1])) * ((s32) in[5]));
    261   output[7] =  2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
    262                     ((limb) ((s32) in[2])) * ((s32) in[5]) +
    263                     ((limb) ((s32) in[1])) * ((s32) in[6]) +
    264                     ((limb) ((s32) in[0])) * ((s32) in[7]));
    265   output[8] =       ((limb) ((s32) in[4])) * ((s32) in[4]) +
    266                2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
    267                     ((limb) ((s32) in[0])) * ((s32) in[8]) +
    268                2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
    269                     ((limb) ((s32) in[3])) * ((s32) in[5])));
    270   output[9] =  2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
    271                     ((limb) ((s32) in[3])) * ((s32) in[6]) +
    272                     ((limb) ((s32) in[2])) * ((s32) in[7]) +
    273                     ((limb) ((s32) in[1])) * ((s32) in[8]) +
    274                     ((limb) ((s32) in[0])) * ((s32) in[9]));
    275   output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
    276                     ((limb) ((s32) in[4])) * ((s32) in[6]) +
    277                     ((limb) ((s32) in[2])) * ((s32) in[8]) +
    278                2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
    279                     ((limb) ((s32) in[1])) * ((s32) in[9])));
    280   output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
    281                     ((limb) ((s32) in[4])) * ((s32) in[7]) +
    282                     ((limb) ((s32) in[3])) * ((s32) in[8]) +
    283                     ((limb) ((s32) in[2])) * ((s32) in[9]));
    284   output[12] =      ((limb) ((s32) in[6])) * ((s32) in[6]) +
    285                2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
    286                2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
    287                     ((limb) ((s32) in[3])) * ((s32) in[9])));
    288   output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
    289                     ((limb) ((s32) in[5])) * ((s32) in[8]) +
    290                     ((limb) ((s32) in[4])) * ((s32) in[9]));
    291   output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
    292                     ((limb) ((s32) in[6])) * ((s32) in[8]) +
    293                2 *  ((limb) ((s32) in[5])) * ((s32) in[9]));
    294   output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
    295                     ((limb) ((s32) in[6])) * ((s32) in[9]));
    296   output[16] =      ((limb) ((s32) in[8])) * ((s32) in[8]) +
    297                4 *  ((limb) ((s32) in[7])) * ((s32) in[9]);
    298   output[17] = 2 *  ((limb) ((s32) in[8])) * ((s32) in[9]);
    299   output[18] = 2 *  ((limb) ((s32) in[9])) * ((s32) in[9]);
    300 }
    301 
    302 static void
    303 fsquare(limb *output, const limb *in) {
    304   limb t[19];
    305   fsquare_inner(t, in);
    306   freduce_degree(t);
    307   freduce_coefficients(t);
    308   memcpy(output, t, sizeof(limb) * 10);
    309 }
    310 
    311 /* Take a little-endian, 32-byte number and expand it into polynomial form */
    312 static void
    313 fexpand(limb *output, const u8 *input) {
    314 #define F(n,start,shift,mask) \
    315   output[n] = ((((limb) input[start + 0]) | \
    316                 ((limb) input[start + 1]) << 8 | \
    317                 ((limb) input[start + 2]) << 16 | \
    318                 ((limb) input[start + 3]) << 24) >> shift) & mask;
    319   F(0, 0, 0, 0x3ffffff);
    320   F(1, 3, 2, 0x1ffffff);
    321   F(2, 6, 3, 0x3ffffff);
    322   F(3, 9, 5, 0x1ffffff);
    323   F(4, 12, 6, 0x3ffffff);
    324   F(5, 16, 0, 0x1ffffff);
    325   F(6, 19, 1, 0x3ffffff);
    326   F(7, 22, 3, 0x1ffffff);
    327   F(8, 25, 4, 0x3ffffff);
    328   F(9, 28, 6, 0x1ffffff);
    329 #undef F
    330 }
    331 
    332 /* Take a fully reduced polynomial form number and contract it into a
    333  * little-endian, 32-byte array
    334  */
    335 static void
    336 fcontract(u8 *output, limb *input) {
    337   int i;
    338 
    339   do {
    340     for (i = 0; i < 9; ++i) {
    341       if ((i & 1) == 1) {
    342         while (input[i] < 0) {
    343           input[i] += 0x2000000;
    344           input[i + 1]--;
    345         }
    346       } else {
    347         while (input[i] < 0) {
    348           input[i] += 0x4000000;
    349           input[i + 1]--;
    350         }
    351       }
    352     }
    353     while (input[9] < 0) {
    354       input[9] += 0x2000000;
    355       input[0] -= 19;
    356     }
    357   } while (input[0] < 0);
    358 
    359   input[1] <<= 2;
    360   input[2] <<= 3;
    361   input[3] <<= 5;
    362   input[4] <<= 6;
    363   input[6] <<= 1;
    364   input[7] <<= 3;
    365   input[8] <<= 4;
    366   input[9] <<= 6;
    367 #define F(i, s) \
    368   output[s+0] |=  input[i] & 0xff; \
    369   output[s+1]  = (input[i] >> 8) & 0xff; \
    370   output[s+2]  = (input[i] >> 16) & 0xff; \
    371   output[s+3]  = (input[i] >> 24) & 0xff;
    372   output[0] = 0;
    373   output[16] = 0;
    374   F(0,0);
    375   F(1,3);
    376   F(2,6);
    377   F(3,9);
    378   F(4,12);
    379   F(5,16);
    380   F(6,19);
    381   F(7,22);
    382   F(8,25);
    383   F(9,28);
    384 #undef F
    385 }
    386 
    387 /* Input: Q, Q', Q-Q'
    388  * Output: 2Q, Q+Q'
    389  *
    390  *   x2 z3: long form
    391  *   x3 z3: long form
    392  *   x z: short form, destroyed
    393  *   xprime zprime: short form, destroyed
    394  *   qmqp: short form, preserved
    395  */
    396 static void fmonty(limb *x2, limb *z2,  /* output 2Q */
    397                    limb *x3, limb *z3,  /* output Q + Q' */
    398                    limb *x, limb *z,    /* input Q */
    399                    limb *xprime, limb *zprime,  /* input Q' */
    400                    const limb *qmqp /* input Q - Q' */) {
    401   limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
    402         zzprime[19], zzzprime[19], xxxprime[19];
    403 
    404   memcpy(origx, x, 10 * sizeof(limb));
    405   fsum(x, z);
    406   fdifference(z, origx);  // does x - z
    407 
    408   memcpy(origxprime, xprime, sizeof(limb) * 10);
    409   fsum(xprime, zprime);
    410   fdifference(zprime, origxprime);
    411   fproduct(xxprime, xprime, z);
    412   fproduct(zzprime, x, zprime);
    413   freduce_degree(xxprime);
    414   freduce_coefficients(xxprime);
    415   freduce_degree(zzprime);
    416   freduce_coefficients(zzprime);
    417   memcpy(origxprime, xxprime, sizeof(limb) * 10);
    418   fsum(xxprime, zzprime);
    419   fdifference(zzprime, origxprime);
    420   fsquare(xxxprime, xxprime);
    421   fsquare(zzzprime, zzprime);
    422   fproduct(zzprime, zzzprime, qmqp);
    423   freduce_degree(zzprime);
    424   freduce_coefficients(zzprime);
    425   memcpy(x3, xxxprime, sizeof(limb) * 10);
    426   memcpy(z3, zzprime, sizeof(limb) * 10);
    427 
    428   fsquare(xx, x);
    429   fsquare(zz, z);
    430   fproduct(x2, xx, zz);
    431   freduce_degree(x2);
    432   freduce_coefficients(x2);
    433   fdifference(zz, xx);  // does zz = xx - zz
    434   memset(zzz + 10, 0, sizeof(limb) * 9);
    435   fscalar_product(zzz, zz, 121665);
    436   freduce_degree(zzz);
    437   freduce_coefficients(zzz);
    438   fsum(zzz, xx);
    439   fproduct(z2, zz, zzz);
    440   freduce_degree(z2);
    441   freduce_coefficients(z2);
    442 }
    443 
    444 /* Calculates nQ where Q is the x-coordinate of a point on the curve
    445  *
    446  *   resultx/resultz: the x coordinate of the resulting curve point (short form)
    447  *   n: a little endian, 32-byte number
    448  *   q: a point of the curve (short form)
    449  */
    450 static void
    451 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
    452   limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
    453   limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
    454   limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
    455   limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
    456 
    457   unsigned i, j;
    458 
    459   memcpy(nqpqx, q, sizeof(limb) * 10);
    460 
    461   for (i = 0; i < 32; ++i) {
    462     u8 byte = n[31 - i];
    463     for (j = 0; j < 8; ++j) {
    464       if (byte & 0x80) {
    465         fmonty(nqpqx2, nqpqz2,
    466                nqx2, nqz2,
    467                nqpqx, nqpqz,
    468                nqx, nqz,
    469                q);
    470       } else {
    471         fmonty(nqx2, nqz2,
    472                nqpqx2, nqpqz2,
    473                nqx, nqz,
    474                nqpqx, nqpqz,
    475                q);
    476       }
    477 
    478       t = nqx;
    479       nqx = nqx2;
    480       nqx2 = t;
    481       t = nqz;
    482       nqz = nqz2;
    483       nqz2 = t;
    484       t = nqpqx;
    485       nqpqx = nqpqx2;
    486       nqpqx2 = t;
    487       t = nqpqz;
    488       nqpqz = nqpqz2;
    489       nqpqz2 = t;
    490 
    491       byte <<= 1;
    492     }
    493   }
    494 
    495   memcpy(resultx, nqx, sizeof(limb) * 10);
    496   memcpy(resultz, nqz, sizeof(limb) * 10);
    497 }
    498 
    499 // -----------------------------------------------------------------------------
    500 // Shamelessly copied from djb's code
    501 // -----------------------------------------------------------------------------
    502 static void
    503 crecip(limb *out, const limb *z) {
    504   limb z2[10];
    505   limb z9[10];
    506   limb z11[10];
    507   limb z2_5_0[10];
    508   limb z2_10_0[10];
    509   limb z2_20_0[10];
    510   limb z2_50_0[10];
    511   limb z2_100_0[10];
    512   limb t0[10];
    513   limb t1[10];
    514   int i;
    515 
    516   /* 2 */ fsquare(z2,z);
    517   /* 4 */ fsquare(t1,z2);
    518   /* 8 */ fsquare(t0,t1);
    519   /* 9 */ fmul(z9,t0,z);
    520   /* 11 */ fmul(z11,z9,z2);
    521   /* 22 */ fsquare(t0,z11);
    522   /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
    523 
    524   /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
    525   /* 2^7 - 2^2 */ fsquare(t1,t0);
    526   /* 2^8 - 2^3 */ fsquare(t0,t1);
    527   /* 2^9 - 2^4 */ fsquare(t1,t0);
    528   /* 2^10 - 2^5 */ fsquare(t0,t1);
    529   /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
    530 
    531   /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
    532   /* 2^12 - 2^2 */ fsquare(t1,t0);
    533   /* 2^20 - 2^10 */
    534   for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
    535   /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
    536 
    537   /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
    538   /* 2^22 - 2^2 */ fsquare(t1,t0);
    539   /* 2^40 - 2^20 */
    540   for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
    541   /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
    542 
    543   /* 2^41 - 2^1 */ fsquare(t1,t0);
    544   /* 2^42 - 2^2 */ fsquare(t0,t1);
    545   /* 2^50 - 2^10 */
    546   for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
    547   /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
    548 
    549   /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
    550   /* 2^52 - 2^2 */ fsquare(t1,t0);
    551   /* 2^100 - 2^50 */
    552   for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
    553   /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
    554 
    555   /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
    556   /* 2^102 - 2^2 */ fsquare(t0,t1);
    557   /* 2^200 - 2^100 */
    558   for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
    559   /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
    560 
    561   /* 2^201 - 2^1 */ fsquare(t0,t1);
    562   /* 2^202 - 2^2 */ fsquare(t1,t0);
    563   /* 2^250 - 2^50 */
    564   for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
    565   /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
    566 
    567   /* 2^251 - 2^1 */ fsquare(t1,t0);
    568   /* 2^252 - 2^2 */ fsquare(t0,t1);
    569   /* 2^253 - 2^3 */ fsquare(t1,t0);
    570   /* 2^254 - 2^4 */ fsquare(t0,t1);
    571   /* 2^255 - 2^5 */ fsquare(t1,t0);
    572   /* 2^255 - 21 */ fmul(out,t1,z11);
    573 }
    574 
    575 int
    576 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
    577   limb bp[10], x[10], z[10], zmone[10];
    578   uint8_t e[32];
    579   int i;
    580 
    581   for (i = 0; i < 32; ++i) e[i] = secret[i];
    582   e[0] &= 248;
    583   e[31] &= 127;
    584   e[31] |= 64;
    585 
    586   fexpand(bp, basepoint);
    587   cmult(x, z, e, bp);
    588   crecip(zmone, z);
    589   fmul(z, x, zmone);
    590   fcontract(mypublic, z);
    591   return 0;
    592 }
    593