Home | History | Annotate | Download | only in bn
      1 /* Copyright (C) 1995-1998 Eric Young (eay (at) cryptsoft.com)
      2  * All rights reserved.
      3  *
      4  * This package is an SSL implementation written
      5  * by Eric Young (eay (at) cryptsoft.com).
      6  * The implementation was written so as to conform with Netscapes SSL.
      7  *
      8  * This library is free for commercial and non-commercial use as long as
      9  * the following conditions are aheared to.  The following conditions
     10  * apply to all code found in this distribution, be it the RC4, RSA,
     11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
     12  * included with this distribution is covered by the same copyright terms
     13  * except that the holder is Tim Hudson (tjh (at) cryptsoft.com).
     14  *
     15  * Copyright remains Eric Young's, and as such any Copyright notices in
     16  * the code are not to be removed.
     17  * If this package is used in a product, Eric Young should be given attribution
     18  * as the author of the parts of the library used.
     19  * This can be in the form of a textual message at program startup or
     20  * in documentation (online or textual) provided with the package.
     21  *
     22  * Redistribution and use in source and binary forms, with or without
     23  * modification, are permitted provided that the following conditions
     24  * are met:
     25  * 1. Redistributions of source code must retain the copyright
     26  *    notice, this list of conditions and the following disclaimer.
     27  * 2. Redistributions in binary form must reproduce the above copyright
     28  *    notice, this list of conditions and the following disclaimer in the
     29  *    documentation and/or other materials provided with the distribution.
     30  * 3. All advertising materials mentioning features or use of this software
     31  *    must display the following acknowledgement:
     32  *    "This product includes cryptographic software written by
     33  *     Eric Young (eay (at) cryptsoft.com)"
     34  *    The word 'cryptographic' can be left out if the rouines from the library
     35  *    being used are not cryptographic related :-).
     36  * 4. If you include any Windows specific code (or a derivative thereof) from
     37  *    the apps directory (application code) you must include an acknowledgement:
     38  *    "This product includes software written by Tim Hudson (tjh (at) cryptsoft.com)"
     39  *
     40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
     41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     50  * SUCH DAMAGE.
     51  *
     52  * The licence and distribution terms for any publically available version or
     53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
     54  * copied and put under another distribution licence
     55  * [including the GNU Public Licence.] */
     56 
     57 #include <openssl/bn.h>
     58 
     59 #include <assert.h>
     60 
     61 #include "internal.h"
     62 
     63 
     64 /* Generic implementations of most operations are needed for:
     65  * - Configurations without inline assembly.
     66  * - Architectures other than x86 or x86_64.
     67  * - Windows x84_64; x86_64-gcc.c does not build on MSVC. */
     68 #if defined(OPENSSL_NO_ASM) || \
     69     (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
     70     (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
     71 
     72 #ifdef BN_ULLONG
     73 #define mul_add(r, a, w, c)             \
     74   {                                     \
     75     BN_ULLONG t;                        \
     76     t = (BN_ULLONG)w * (a) + (r) + (c); \
     77     (r) = Lw(t);                        \
     78     (c) = Hw(t);                        \
     79   }
     80 
     81 #define mul(r, a, w, c)           \
     82   {                               \
     83     BN_ULLONG t;                  \
     84     t = (BN_ULLONG)w * (a) + (c); \
     85     (r) = Lw(t);                  \
     86     (c) = Hw(t);                  \
     87   }
     88 
     89 #define sqr(r0, r1, a)        \
     90   {                           \
     91     BN_ULLONG t;              \
     92     t = (BN_ULLONG)(a) * (a); \
     93     (r0) = Lw(t);             \
     94     (r1) = Hw(t);             \
     95   }
     96 
     97 #elif defined(BN_UMULT_LOHI)
     98 #define mul_add(r, a, w, c)             \
     99   {                                     \
    100     BN_ULONG high, low, ret, tmp = (a); \
    101     ret = (r);                          \
    102     BN_UMULT_LOHI(low, high, w, tmp);   \
    103     ret += (c);                         \
    104     (c) = (ret < (c)) ? 1 : 0;          \
    105     (c) += high;                        \
    106     ret += low;                         \
    107     (c) += (ret < low) ? 1 : 0;         \
    108     (r) = ret;                          \
    109   }
    110 
    111 #define mul(r, a, w, c)                \
    112   {                                    \
    113     BN_ULONG high, low, ret, ta = (a); \
    114     BN_UMULT_LOHI(low, high, w, ta);   \
    115     ret = low + (c);                   \
    116     (c) = high;                        \
    117     (c) += (ret < low) ? 1 : 0;        \
    118     (r) = ret;                         \
    119   }
    120 
    121 #define sqr(r0, r1, a)               \
    122   {                                  \
    123     BN_ULONG tmp = (a);              \
    124     BN_UMULT_LOHI(r0, r1, tmp, tmp); \
    125   }
    126 
    127 #else
    128 
    129 /*************************************************************
    130  * No long long type
    131  */
    132 
    133 #define LBITS(a) ((a) & BN_MASK2l)
    134 #define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
    135 #define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
    136 
    137 #define LLBITS(a) ((a) & BN_MASKl)
    138 #define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
    139 #define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
    140 
    141 #define mul64(l, h, bl, bh)       \
    142   {                               \
    143     BN_ULONG m, m1, lt, ht;       \
    144                                   \
    145     lt = l;                       \
    146     ht = h;                       \
    147     m = (bh) * (lt);              \
    148     lt = (bl) * (lt);             \
    149     m1 = (bl) * (ht);             \
    150     ht = (bh) * (ht);             \
    151     m = (m + m1) & BN_MASK2;      \
    152     if (m < m1)                   \
    153       ht += L2HBITS((BN_ULONG)1); \
    154     ht += HBITS(m);               \
    155     m1 = L2HBITS(m);              \
    156     lt = (lt + m1) & BN_MASK2;    \
    157     if (lt < m1)                  \
    158       ht++;                       \
    159     (l) = lt;                     \
    160     (h) = ht;                     \
    161   }
    162 
    163 #define sqr64(lo, ho, in)                    \
    164   {                                          \
    165     BN_ULONG l, h, m;                        \
    166                                              \
    167     h = (in);                                \
    168     l = LBITS(h);                            \
    169     h = HBITS(h);                            \
    170     m = (l) * (h);                           \
    171     l *= l;                                  \
    172     h *= h;                                  \
    173     h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    174     m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
    175     l = (l + m) & BN_MASK2;                  \
    176     if (l < m)                               \
    177       h++;                                   \
    178     (lo) = l;                                \
    179     (ho) = h;                                \
    180   }
    181 
    182 #define mul_add(r, a, bl, bh, c) \
    183   {                              \
    184     BN_ULONG l, h;               \
    185                                  \
    186     h = (a);                     \
    187     l = LBITS(h);                \
    188     h = HBITS(h);                \
    189     mul64(l, h, (bl), (bh));     \
    190                                  \
    191     /* non-multiply part */      \
    192     l = (l + (c)) & BN_MASK2;    \
    193     if (l < (c))                 \
    194       h++;                       \
    195     (c) = (r);                   \
    196     l = (l + (c)) & BN_MASK2;    \
    197     if (l < (c))                 \
    198       h++;                       \
    199     (c) = h & BN_MASK2;          \
    200     (r) = l;                     \
    201   }
    202 
    203 #define mul(r, a, bl, bh, c)  \
    204   {                           \
    205     BN_ULONG l, h;            \
    206                               \
    207     h = (a);                  \
    208     l = LBITS(h);             \
    209     h = HBITS(h);             \
    210     mul64(l, h, (bl), (bh));  \
    211                               \
    212     /* non-multiply part */   \
    213     l += (c);                 \
    214     if ((l & BN_MASK2) < (c)) \
    215       h++;                    \
    216     (c) = h & BN_MASK2;       \
    217     (r) = l & BN_MASK2;       \
    218   }
    219 #endif /* !BN_ULLONG */
    220 
    221 #if defined(BN_ULLONG) || defined(BN_UMULT_HIGH)
    222 
    223 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    224                           BN_ULONG w) {
    225   BN_ULONG c1 = 0;
    226 
    227   assert(num >= 0);
    228   if (num <= 0) {
    229     return c1;
    230   }
    231 
    232   while (num & ~3) {
    233     mul_add(rp[0], ap[0], w, c1);
    234     mul_add(rp[1], ap[1], w, c1);
    235     mul_add(rp[2], ap[2], w, c1);
    236     mul_add(rp[3], ap[3], w, c1);
    237     ap += 4;
    238     rp += 4;
    239     num -= 4;
    240   }
    241 
    242   while (num) {
    243     mul_add(rp[0], ap[0], w, c1);
    244     ap++;
    245     rp++;
    246     num--;
    247   }
    248 
    249   return c1;
    250 }
    251 
    252 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
    253   BN_ULONG c1 = 0;
    254 
    255   assert(num >= 0);
    256   if (num <= 0) {
    257     return c1;
    258   }
    259 
    260   while (num & ~3) {
    261     mul(rp[0], ap[0], w, c1);
    262     mul(rp[1], ap[1], w, c1);
    263     mul(rp[2], ap[2], w, c1);
    264     mul(rp[3], ap[3], w, c1);
    265     ap += 4;
    266     rp += 4;
    267     num -= 4;
    268   }
    269   while (num) {
    270     mul(rp[0], ap[0], w, c1);
    271     ap++;
    272     rp++;
    273     num--;
    274   }
    275   return c1;
    276 }
    277 
    278 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
    279   assert(n >= 0);
    280   if (n <= 0) {
    281     return;
    282   }
    283 
    284   while (n & ~3) {
    285     sqr(r[0], r[1], a[0]);
    286     sqr(r[2], r[3], a[1]);
    287     sqr(r[4], r[5], a[2]);
    288     sqr(r[6], r[7], a[3]);
    289     a += 4;
    290     r += 8;
    291     n -= 4;
    292   }
    293   while (n) {
    294     sqr(r[0], r[1], a[0]);
    295     a++;
    296     r += 2;
    297     n--;
    298   }
    299 }
    300 
    301 #else /* !(defined(BN_ULLONG) || defined(BN_UMULT_HIGH)) */
    302 
    303 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    304                           BN_ULONG w) {
    305   BN_ULONG c = 0;
    306   BN_ULONG bl, bh;
    307 
    308   assert(num >= 0);
    309   if (num <= 0) {
    310     return (BN_ULONG)0;
    311   }
    312 
    313   bl = LBITS(w);
    314   bh = HBITS(w);
    315 
    316   while (num & ~3) {
    317     mul_add(rp[0], ap[0], bl, bh, c);
    318     mul_add(rp[1], ap[1], bl, bh, c);
    319     mul_add(rp[2], ap[2], bl, bh, c);
    320     mul_add(rp[3], ap[3], bl, bh, c);
    321     ap += 4;
    322     rp += 4;
    323     num -= 4;
    324   }
    325   while (num) {
    326     mul_add(rp[0], ap[0], bl, bh, c);
    327     ap++;
    328     rp++;
    329     num--;
    330   }
    331   return c;
    332 }
    333 
    334 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
    335   BN_ULONG carry = 0;
    336   BN_ULONG bl, bh;
    337 
    338   assert(num >= 0);
    339   if (num <= 0) {
    340     return (BN_ULONG)0;
    341   }
    342 
    343   bl = LBITS(w);
    344   bh = HBITS(w);
    345 
    346   while (num & ~3) {
    347     mul(rp[0], ap[0], bl, bh, carry);
    348     mul(rp[1], ap[1], bl, bh, carry);
    349     mul(rp[2], ap[2], bl, bh, carry);
    350     mul(rp[3], ap[3], bl, bh, carry);
    351     ap += 4;
    352     rp += 4;
    353     num -= 4;
    354   }
    355   while (num) {
    356     mul(rp[0], ap[0], bl, bh, carry);
    357     ap++;
    358     rp++;
    359     num--;
    360   }
    361   return carry;
    362 }
    363 
    364 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
    365   assert(n >= 0);
    366   if (n <= 0) {
    367     return;
    368   }
    369 
    370   while (n & ~3) {
    371     sqr64(r[0], r[1], a[0]);
    372     sqr64(r[2], r[3], a[1]);
    373     sqr64(r[4], r[5], a[2]);
    374     sqr64(r[6], r[7], a[3]);
    375     a += 4;
    376     r += 8;
    377     n -= 4;
    378   }
    379   while (n) {
    380     sqr64(r[0], r[1], a[0]);
    381     a++;
    382     r += 2;
    383     n--;
    384   }
    385 }
    386 
    387 #endif /* !(defined(BN_ULLONG) || defined(BN_UMULT_HIGH)) */
    388 
    389 #if defined(BN_ULLONG)
    390 
    391 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
    392   return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
    393 }
    394 
    395 #else
    396 
    397 /* Divide h,l by d and return the result. */
    398 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
    399   BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    400   int i, count = 2;
    401 
    402   if (d == 0) {
    403     return BN_MASK2;
    404   }
    405 
    406   i = BN_num_bits_word(d);
    407   assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
    408 
    409   i = BN_BITS2 - i;
    410   if (h >= d) {
    411     h -= d;
    412   }
    413 
    414   if (i) {
    415     d <<= i;
    416     h = (h << i) | (l >> (BN_BITS2 - i));
    417     l <<= i;
    418   }
    419   dh = (d & BN_MASK2h) >> BN_BITS4;
    420   dl = (d & BN_MASK2l);
    421   for (;;) {
    422     if ((h >> BN_BITS4) == dh) {
    423       q = BN_MASK2l;
    424     } else {
    425       q = h / dh;
    426     }
    427 
    428     th = q * dh;
    429     tl = dl * q;
    430     for (;;) {
    431       t = h - th;
    432       if ((t & BN_MASK2h) ||
    433           ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
    434         break;
    435       }
    436       q--;
    437       th -= dh;
    438       tl -= dl;
    439     }
    440     t = (tl >> BN_BITS4);
    441     tl = (tl << BN_BITS4) & BN_MASK2h;
    442     th += t;
    443 
    444     if (l < tl) {
    445       th++;
    446     }
    447     l -= tl;
    448     if (h < th) {
    449       h += d;
    450       q--;
    451     }
    452     h -= th;
    453 
    454     if (--count == 0) {
    455       break;
    456     }
    457 
    458     ret = q << BN_BITS4;
    459     h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    460     l = (l & BN_MASK2l) << BN_BITS4;
    461   }
    462 
    463   ret |= q;
    464   return ret;
    465 }
    466 
    467 #endif /* !defined(BN_ULLONG) */
    468 
    469 #ifdef BN_ULLONG
    470 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    471                       int n) {
    472   BN_ULLONG ll = 0;
    473 
    474   assert(n >= 0);
    475   if (n <= 0) {
    476     return (BN_ULONG)0;
    477   }
    478 
    479   while (n & ~3) {
    480     ll += (BN_ULLONG)a[0] + b[0];
    481     r[0] = (BN_ULONG)ll & BN_MASK2;
    482     ll >>= BN_BITS2;
    483     ll += (BN_ULLONG)a[1] + b[1];
    484     r[1] = (BN_ULONG)ll & BN_MASK2;
    485     ll >>= BN_BITS2;
    486     ll += (BN_ULLONG)a[2] + b[2];
    487     r[2] = (BN_ULONG)ll & BN_MASK2;
    488     ll >>= BN_BITS2;
    489     ll += (BN_ULLONG)a[3] + b[3];
    490     r[3] = (BN_ULONG)ll & BN_MASK2;
    491     ll >>= BN_BITS2;
    492     a += 4;
    493     b += 4;
    494     r += 4;
    495     n -= 4;
    496   }
    497   while (n) {
    498     ll += (BN_ULLONG)a[0] + b[0];
    499     r[0] = (BN_ULONG)ll & BN_MASK2;
    500     ll >>= BN_BITS2;
    501     a++;
    502     b++;
    503     r++;
    504     n--;
    505   }
    506   return (BN_ULONG)ll;
    507 }
    508 
    509 #else /* !BN_ULLONG */
    510 
    511 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    512                       int n) {
    513   BN_ULONG c, l, t;
    514 
    515   assert(n >= 0);
    516   if (n <= 0) {
    517     return (BN_ULONG)0;
    518   }
    519 
    520   c = 0;
    521   while (n & ~3) {
    522     t = a[0];
    523     t = (t + c) & BN_MASK2;
    524     c = (t < c);
    525     l = (t + b[0]) & BN_MASK2;
    526     c += (l < t);
    527     r[0] = l;
    528     t = a[1];
    529     t = (t + c) & BN_MASK2;
    530     c = (t < c);
    531     l = (t + b[1]) & BN_MASK2;
    532     c += (l < t);
    533     r[1] = l;
    534     t = a[2];
    535     t = (t + c) & BN_MASK2;
    536     c = (t < c);
    537     l = (t + b[2]) & BN_MASK2;
    538     c += (l < t);
    539     r[2] = l;
    540     t = a[3];
    541     t = (t + c) & BN_MASK2;
    542     c = (t < c);
    543     l = (t + b[3]) & BN_MASK2;
    544     c += (l < t);
    545     r[3] = l;
    546     a += 4;
    547     b += 4;
    548     r += 4;
    549     n -= 4;
    550   }
    551   while (n) {
    552     t = a[0];
    553     t = (t + c) & BN_MASK2;
    554     c = (t < c);
    555     l = (t + b[0]) & BN_MASK2;
    556     c += (l < t);
    557     r[0] = l;
    558     a++;
    559     b++;
    560     r++;
    561     n--;
    562   }
    563   return (BN_ULONG)c;
    564 }
    565 
    566 #endif /* !BN_ULLONG */
    567 
    568 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    569                       int n) {
    570   BN_ULONG t1, t2;
    571   int c = 0;
    572 
    573   assert(n >= 0);
    574   if (n <= 0) {
    575     return (BN_ULONG)0;
    576   }
    577 
    578   while (n & ~3) {
    579     t1 = a[0];
    580     t2 = b[0];
    581     r[0] = (t1 - t2 - c) & BN_MASK2;
    582     if (t1 != t2) {
    583       c = (t1 < t2);
    584     }
    585     t1 = a[1];
    586     t2 = b[1];
    587     r[1] = (t1 - t2 - c) & BN_MASK2;
    588     if (t1 != t2) {
    589       c = (t1 < t2);
    590     }
    591     t1 = a[2];
    592     t2 = b[2];
    593     r[2] = (t1 - t2 - c) & BN_MASK2;
    594     if (t1 != t2) {
    595       c = (t1 < t2);
    596     }
    597     t1 = a[3];
    598     t2 = b[3];
    599     r[3] = (t1 - t2 - c) & BN_MASK2;
    600     if (t1 != t2) {
    601       c = (t1 < t2);
    602     }
    603     a += 4;
    604     b += 4;
    605     r += 4;
    606     n -= 4;
    607   }
    608   while (n) {
    609     t1 = a[0];
    610     t2 = b[0];
    611     r[0] = (t1 - t2 - c) & BN_MASK2;
    612     if (t1 != t2) {
    613       c = (t1 < t2);
    614     }
    615     a++;
    616     b++;
    617     r++;
    618     n--;
    619   }
    620   return c;
    621 }
    622 
    623 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
    624 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
    625 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
    626 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
    627 
    628 #ifdef BN_ULLONG
    629 
    630 /* Keep in mind that additions to multiplication result can not overflow,
    631  * because its high half cannot be all-ones. */
    632 #define mul_add_c(a, b, c0, c1, c2)     \
    633   do {                                  \
    634     BN_ULONG hi;                        \
    635     BN_ULLONG t = (BN_ULLONG)(a) * (b); \
    636     t += c0; /* no carry */             \
    637     c0 = (BN_ULONG)Lw(t);               \
    638     hi = (BN_ULONG)Hw(t);               \
    639     c1 = (c1 + hi) & BN_MASK2;          \
    640     if (c1 < hi)                        \
    641       c2++;                             \
    642   } while (0)
    643 
    644 #define mul_add_c2(a, b, c0, c1, c2)      \
    645   do {                                    \
    646     BN_ULONG hi;                          \
    647     BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    648     BN_ULLONG tt = t + c0; /* no carry */ \
    649     c0 = (BN_ULONG)Lw(tt);                \
    650     hi = (BN_ULONG)Hw(tt);                \
    651     c1 = (c1 + hi) & BN_MASK2;            \
    652     if (c1 < hi)                          \
    653       c2++;                               \
    654     t += c0; /* no carry */               \
    655     c0 = (BN_ULONG)Lw(t);                 \
    656     hi = (BN_ULONG)Hw(t);                 \
    657     c1 = (c1 + hi) & BN_MASK2;            \
    658     if (c1 < hi)                          \
    659       c2++;                               \
    660   } while (0)
    661 
    662 #define sqr_add_c(a, i, c0, c1, c2)       \
    663   do {                                    \
    664     BN_ULONG hi;                          \
    665     BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
    666     t += c0; /* no carry */               \
    667     c0 = (BN_ULONG)Lw(t);                 \
    668     hi = (BN_ULONG)Hw(t);                 \
    669     c1 = (c1 + hi) & BN_MASK2;            \
    670     if (c1 < hi)                          \
    671       c2++;                               \
    672   } while (0)
    673 
    674 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    675 
    676 #elif defined(BN_UMULT_LOHI)
    677 
    678 /* Keep in mind that additions to hi can not overflow, because the high word of
    679  * a multiplication result cannot be all-ones. */
    680 #define mul_add_c(a, b, c0, c1, c2) \
    681   do {                              \
    682     BN_ULONG ta = (a), tb = (b);    \
    683     BN_ULONG lo, hi;                \
    684     BN_UMULT_LOHI(lo, hi, ta, tb);  \
    685     c0 += lo;                       \
    686     hi += (c0 < lo) ? 1 : 0;        \
    687     c1 += hi;                       \
    688     c2 += (c1 < hi) ? 1 : 0;        \
    689   } while (0)
    690 
    691 #define mul_add_c2(a, b, c0, c1, c2) \
    692   do {                               \
    693     BN_ULONG ta = (a), tb = (b);     \
    694     BN_ULONG lo, hi, tt;             \
    695     BN_UMULT_LOHI(lo, hi, ta, tb);   \
    696     c0 += lo;                        \
    697     tt = hi + ((c0 < lo) ? 1 : 0);   \
    698     c1 += tt;                        \
    699     c2 += (c1 < tt) ? 1 : 0;         \
    700     c0 += lo;                        \
    701     hi += (c0 < lo) ? 1 : 0;         \
    702     c1 += hi;                        \
    703     c2 += (c1 < hi) ? 1 : 0;         \
    704   } while (0)
    705 
    706 #define sqr_add_c(a, i, c0, c1, c2) \
    707   do {                              \
    708     BN_ULONG ta = (a)[i];           \
    709     BN_ULONG lo, hi;                \
    710     BN_UMULT_LOHI(lo, hi, ta, ta);  \
    711     c0 += lo;                       \
    712     hi += (c0 < lo) ? 1 : 0;        \
    713     c1 += hi;                       \
    714     c2 += (c1 < hi) ? 1 : 0;        \
    715   } while (0)
    716 
    717 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    718 
    719 #else /* !BN_ULLONG */
    720 
    721 /* Keep in mind that additions to hi can not overflow, because
    722  * the high word of a multiplication result cannot be all-ones. */
    723 
    724 #define mul_add_c(a, b, c0, c1, c2)        \
    725   do {                                     \
    726     BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    727     BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    728     mul64(lo, hi, bl, bh);                 \
    729     c0 = (c0 + lo) & BN_MASK2;             \
    730     if (c0 < lo)                           \
    731       hi++;                                \
    732     c1 = (c1 + hi) & BN_MASK2;             \
    733     if (c1 < hi)                           \
    734       c2++;                                \
    735   } while (0)
    736 
    737 #define mul_add_c2(a, b, c0, c1, c2)       \
    738   do {                                     \
    739     BN_ULONG tt;                           \
    740     BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    741     BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    742     mul64(lo, hi, bl, bh);                 \
    743     tt = hi;                               \
    744     c0 = (c0 + lo) & BN_MASK2;             \
    745     if (c0 < lo)                           \
    746       tt++;                                \
    747     c1 = (c1 + tt) & BN_MASK2;             \
    748     if (c1 < tt)                           \
    749       c2++;                                \
    750     c0 = (c0 + lo) & BN_MASK2;             \
    751     if (c0 < lo)                           \
    752       hi++;                                \
    753     c1 = (c1 + hi) & BN_MASK2;             \
    754     if (c1 < hi)                           \
    755       c2++;                                \
    756   } while (0)
    757 
    758 #define sqr_add_c(a, i, c0, c1, c2) \
    759   do {                              \
    760     BN_ULONG lo, hi;                \
    761     sqr64(lo, hi, (a)[i]);          \
    762     c0 = (c0 + lo) & BN_MASK2;      \
    763     if (c0 < lo)                    \
    764       hi++;                         \
    765     c1 = (c1 + hi) & BN_MASK2;      \
    766     if (c1 < hi)                    \
    767       c2++;                         \
    768   } while (0)
    769 
    770 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    771 #endif /* !BN_ULLONG */
    772 
    773 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
    774   BN_ULONG c1, c2, c3;
    775 
    776   c1 = 0;
    777   c2 = 0;
    778   c3 = 0;
    779   mul_add_c(a[0], b[0], c1, c2, c3);
    780   r[0] = c1;
    781   c1 = 0;
    782   mul_add_c(a[0], b[1], c2, c3, c1);
    783   mul_add_c(a[1], b[0], c2, c3, c1);
    784   r[1] = c2;
    785   c2 = 0;
    786   mul_add_c(a[2], b[0], c3, c1, c2);
    787   mul_add_c(a[1], b[1], c3, c1, c2);
    788   mul_add_c(a[0], b[2], c3, c1, c2);
    789   r[2] = c3;
    790   c3 = 0;
    791   mul_add_c(a[0], b[3], c1, c2, c3);
    792   mul_add_c(a[1], b[2], c1, c2, c3);
    793   mul_add_c(a[2], b[1], c1, c2, c3);
    794   mul_add_c(a[3], b[0], c1, c2, c3);
    795   r[3] = c1;
    796   c1 = 0;
    797   mul_add_c(a[4], b[0], c2, c3, c1);
    798   mul_add_c(a[3], b[1], c2, c3, c1);
    799   mul_add_c(a[2], b[2], c2, c3, c1);
    800   mul_add_c(a[1], b[3], c2, c3, c1);
    801   mul_add_c(a[0], b[4], c2, c3, c1);
    802   r[4] = c2;
    803   c2 = 0;
    804   mul_add_c(a[0], b[5], c3, c1, c2);
    805   mul_add_c(a[1], b[4], c3, c1, c2);
    806   mul_add_c(a[2], b[3], c3, c1, c2);
    807   mul_add_c(a[3], b[2], c3, c1, c2);
    808   mul_add_c(a[4], b[1], c3, c1, c2);
    809   mul_add_c(a[5], b[0], c3, c1, c2);
    810   r[5] = c3;
    811   c3 = 0;
    812   mul_add_c(a[6], b[0], c1, c2, c3);
    813   mul_add_c(a[5], b[1], c1, c2, c3);
    814   mul_add_c(a[4], b[2], c1, c2, c3);
    815   mul_add_c(a[3], b[3], c1, c2, c3);
    816   mul_add_c(a[2], b[4], c1, c2, c3);
    817   mul_add_c(a[1], b[5], c1, c2, c3);
    818   mul_add_c(a[0], b[6], c1, c2, c3);
    819   r[6] = c1;
    820   c1 = 0;
    821   mul_add_c(a[0], b[7], c2, c3, c1);
    822   mul_add_c(a[1], b[6], c2, c3, c1);
    823   mul_add_c(a[2], b[5], c2, c3, c1);
    824   mul_add_c(a[3], b[4], c2, c3, c1);
    825   mul_add_c(a[4], b[3], c2, c3, c1);
    826   mul_add_c(a[5], b[2], c2, c3, c1);
    827   mul_add_c(a[6], b[1], c2, c3, c1);
    828   mul_add_c(a[7], b[0], c2, c3, c1);
    829   r[7] = c2;
    830   c2 = 0;
    831   mul_add_c(a[7], b[1], c3, c1, c2);
    832   mul_add_c(a[6], b[2], c3, c1, c2);
    833   mul_add_c(a[5], b[3], c3, c1, c2);
    834   mul_add_c(a[4], b[4], c3, c1, c2);
    835   mul_add_c(a[3], b[5], c3, c1, c2);
    836   mul_add_c(a[2], b[6], c3, c1, c2);
    837   mul_add_c(a[1], b[7], c3, c1, c2);
    838   r[8] = c3;
    839   c3 = 0;
    840   mul_add_c(a[2], b[7], c1, c2, c3);
    841   mul_add_c(a[3], b[6], c1, c2, c3);
    842   mul_add_c(a[4], b[5], c1, c2, c3);
    843   mul_add_c(a[5], b[4], c1, c2, c3);
    844   mul_add_c(a[6], b[3], c1, c2, c3);
    845   mul_add_c(a[7], b[2], c1, c2, c3);
    846   r[9] = c1;
    847   c1 = 0;
    848   mul_add_c(a[7], b[3], c2, c3, c1);
    849   mul_add_c(a[6], b[4], c2, c3, c1);
    850   mul_add_c(a[5], b[5], c2, c3, c1);
    851   mul_add_c(a[4], b[6], c2, c3, c1);
    852   mul_add_c(a[3], b[7], c2, c3, c1);
    853   r[10] = c2;
    854   c2 = 0;
    855   mul_add_c(a[4], b[7], c3, c1, c2);
    856   mul_add_c(a[5], b[6], c3, c1, c2);
    857   mul_add_c(a[6], b[5], c3, c1, c2);
    858   mul_add_c(a[7], b[4], c3, c1, c2);
    859   r[11] = c3;
    860   c3 = 0;
    861   mul_add_c(a[7], b[5], c1, c2, c3);
    862   mul_add_c(a[6], b[6], c1, c2, c3);
    863   mul_add_c(a[5], b[7], c1, c2, c3);
    864   r[12] = c1;
    865   c1 = 0;
    866   mul_add_c(a[6], b[7], c2, c3, c1);
    867   mul_add_c(a[7], b[6], c2, c3, c1);
    868   r[13] = c2;
    869   c2 = 0;
    870   mul_add_c(a[7], b[7], c3, c1, c2);
    871   r[14] = c3;
    872   r[15] = c1;
    873 }
    874 
    875 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
    876   BN_ULONG c1, c2, c3;
    877 
    878   c1 = 0;
    879   c2 = 0;
    880   c3 = 0;
    881   mul_add_c(a[0], b[0], c1, c2, c3);
    882   r[0] = c1;
    883   c1 = 0;
    884   mul_add_c(a[0], b[1], c2, c3, c1);
    885   mul_add_c(a[1], b[0], c2, c3, c1);
    886   r[1] = c2;
    887   c2 = 0;
    888   mul_add_c(a[2], b[0], c3, c1, c2);
    889   mul_add_c(a[1], b[1], c3, c1, c2);
    890   mul_add_c(a[0], b[2], c3, c1, c2);
    891   r[2] = c3;
    892   c3 = 0;
    893   mul_add_c(a[0], b[3], c1, c2, c3);
    894   mul_add_c(a[1], b[2], c1, c2, c3);
    895   mul_add_c(a[2], b[1], c1, c2, c3);
    896   mul_add_c(a[3], b[0], c1, c2, c3);
    897   r[3] = c1;
    898   c1 = 0;
    899   mul_add_c(a[3], b[1], c2, c3, c1);
    900   mul_add_c(a[2], b[2], c2, c3, c1);
    901   mul_add_c(a[1], b[3], c2, c3, c1);
    902   r[4] = c2;
    903   c2 = 0;
    904   mul_add_c(a[2], b[3], c3, c1, c2);
    905   mul_add_c(a[3], b[2], c3, c1, c2);
    906   r[5] = c3;
    907   c3 = 0;
    908   mul_add_c(a[3], b[3], c1, c2, c3);
    909   r[6] = c1;
    910   r[7] = c2;
    911 }
    912 
    913 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
    914   BN_ULONG c1, c2, c3;
    915 
    916   c1 = 0;
    917   c2 = 0;
    918   c3 = 0;
    919   sqr_add_c(a, 0, c1, c2, c3);
    920   r[0] = c1;
    921   c1 = 0;
    922   sqr_add_c2(a, 1, 0, c2, c3, c1);
    923   r[1] = c2;
    924   c2 = 0;
    925   sqr_add_c(a, 1, c3, c1, c2);
    926   sqr_add_c2(a, 2, 0, c3, c1, c2);
    927   r[2] = c3;
    928   c3 = 0;
    929   sqr_add_c2(a, 3, 0, c1, c2, c3);
    930   sqr_add_c2(a, 2, 1, c1, c2, c3);
    931   r[3] = c1;
    932   c1 = 0;
    933   sqr_add_c(a, 2, c2, c3, c1);
    934   sqr_add_c2(a, 3, 1, c2, c3, c1);
    935   sqr_add_c2(a, 4, 0, c2, c3, c1);
    936   r[4] = c2;
    937   c2 = 0;
    938   sqr_add_c2(a, 5, 0, c3, c1, c2);
    939   sqr_add_c2(a, 4, 1, c3, c1, c2);
    940   sqr_add_c2(a, 3, 2, c3, c1, c2);
    941   r[5] = c3;
    942   c3 = 0;
    943   sqr_add_c(a, 3, c1, c2, c3);
    944   sqr_add_c2(a, 4, 2, c1, c2, c3);
    945   sqr_add_c2(a, 5, 1, c1, c2, c3);
    946   sqr_add_c2(a, 6, 0, c1, c2, c3);
    947   r[6] = c1;
    948   c1 = 0;
    949   sqr_add_c2(a, 7, 0, c2, c3, c1);
    950   sqr_add_c2(a, 6, 1, c2, c3, c1);
    951   sqr_add_c2(a, 5, 2, c2, c3, c1);
    952   sqr_add_c2(a, 4, 3, c2, c3, c1);
    953   r[7] = c2;
    954   c2 = 0;
    955   sqr_add_c(a, 4, c3, c1, c2);
    956   sqr_add_c2(a, 5, 3, c3, c1, c2);
    957   sqr_add_c2(a, 6, 2, c3, c1, c2);
    958   sqr_add_c2(a, 7, 1, c3, c1, c2);
    959   r[8] = c3;
    960   c3 = 0;
    961   sqr_add_c2(a, 7, 2, c1, c2, c3);
    962   sqr_add_c2(a, 6, 3, c1, c2, c3);
    963   sqr_add_c2(a, 5, 4, c1, c2, c3);
    964   r[9] = c1;
    965   c1 = 0;
    966   sqr_add_c(a, 5, c2, c3, c1);
    967   sqr_add_c2(a, 6, 4, c2, c3, c1);
    968   sqr_add_c2(a, 7, 3, c2, c3, c1);
    969   r[10] = c2;
    970   c2 = 0;
    971   sqr_add_c2(a, 7, 4, c3, c1, c2);
    972   sqr_add_c2(a, 6, 5, c3, c1, c2);
    973   r[11] = c3;
    974   c3 = 0;
    975   sqr_add_c(a, 6, c1, c2, c3);
    976   sqr_add_c2(a, 7, 5, c1, c2, c3);
    977   r[12] = c1;
    978   c1 = 0;
    979   sqr_add_c2(a, 7, 6, c2, c3, c1);
    980   r[13] = c2;
    981   c2 = 0;
    982   sqr_add_c(a, 7, c3, c1, c2);
    983   r[14] = c3;
    984   r[15] = c1;
    985 }
    986 
    987 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
    988   BN_ULONG c1, c2, c3;
    989 
    990   c1 = 0;
    991   c2 = 0;
    992   c3 = 0;
    993   sqr_add_c(a, 0, c1, c2, c3);
    994   r[0] = c1;
    995   c1 = 0;
    996   sqr_add_c2(a, 1, 0, c2, c3, c1);
    997   r[1] = c2;
    998   c2 = 0;
    999   sqr_add_c(a, 1, c3, c1, c2);
   1000   sqr_add_c2(a, 2, 0, c3, c1, c2);
   1001   r[2] = c3;
   1002   c3 = 0;
   1003   sqr_add_c2(a, 3, 0, c1, c2, c3);
   1004   sqr_add_c2(a, 2, 1, c1, c2, c3);
   1005   r[3] = c1;
   1006   c1 = 0;
   1007   sqr_add_c(a, 2, c2, c3, c1);
   1008   sqr_add_c2(a, 3, 1, c2, c3, c1);
   1009   r[4] = c2;
   1010   c2 = 0;
   1011   sqr_add_c2(a, 3, 2, c3, c1, c2);
   1012   r[5] = c3;
   1013   c3 = 0;
   1014   sqr_add_c(a, 3, c1, c2, c3);
   1015   r[6] = c1;
   1016   r[7] = c2;
   1017 }
   1018 
   1019 #endif
   1020