Home | History | Annotate | Download | only in bn
      1 /* Copyright (C) 1995-1998 Eric Young (eay (at) cryptsoft.com)
      2  * All rights reserved.
      3  *
      4  * This package is an SSL implementation written
      5  * by Eric Young (eay (at) cryptsoft.com).
      6  * The implementation was written so as to conform with Netscapes SSL.
      7  *
      8  * This library is free for commercial and non-commercial use as long as
      9  * the following conditions are aheared to.  The following conditions
     10  * apply to all code found in this distribution, be it the RC4, RSA,
     11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
     12  * included with this distribution is covered by the same copyright terms
     13  * except that the holder is Tim Hudson (tjh (at) cryptsoft.com).
     14  *
     15  * Copyright remains Eric Young's, and as such any Copyright notices in
     16  * the code are not to be removed.
     17  * If this package is used in a product, Eric Young should be given attribution
     18  * as the author of the parts of the library used.
     19  * This can be in the form of a textual message at program startup or
     20  * in documentation (online or textual) provided with the package.
     21  *
     22  * Redistribution and use in source and binary forms, with or without
     23  * modification, are permitted provided that the following conditions
     24  * are met:
     25  * 1. Redistributions of source code must retain the copyright
     26  *    notice, this list of conditions and the following disclaimer.
     27  * 2. Redistributions in binary form must reproduce the above copyright
     28  *    notice, this list of conditions and the following disclaimer in the
     29  *    documentation and/or other materials provided with the distribution.
     30  * 3. All advertising materials mentioning features or use of this software
     31  *    must display the following acknowledgement:
     32  *    "This product includes cryptographic software written by
     33  *     Eric Young (eay (at) cryptsoft.com)"
     34  *    The word 'cryptographic' can be left out if the rouines from the library
     35  *    being used are not cryptographic related :-).
     36  * 4. If you include any Windows specific code (or a derivative thereof) from
     37  *    the apps directory (application code) you must include an acknowledgement:
     38  *    "This product includes software written by Tim Hudson (tjh (at) cryptsoft.com)"
     39  *
     40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
     41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     50  * SUCH DAMAGE.
     51  *
     52  * The licence and distribution terms for any publically available version or
     53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
     54  * copied and put under another distribution licence
     55  * [including the GNU Public Licence.] */
     56 
     57 #include <openssl/bn.h>
     58 
     59 #include <assert.h>
     60 
     61 #include "internal.h"
     62 
     63 
     64 #if defined(OPENSSL_WINDOWS) || defined(OPENSSL_NO_ASM) || \
     65     (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86))
     66 
     67 #if defined(OPENSSL_WINDOWS)
     68 #define alloca _alloca
     69 #else
     70 #include <alloca.h>
     71 #endif
     72 
     73 #ifdef BN_LLONG
     74 #define mul_add(r, a, w, c)             \
     75   {                                     \
     76     BN_ULLONG t;                        \
     77     t = (BN_ULLONG)w * (a) + (r) + (c); \
     78     (r) = Lw(t);                        \
     79     (c) = Hw(t);                        \
     80   }
     81 
     82 #define mul(r, a, w, c)           \
     83   {                               \
     84     BN_ULLONG t;                  \
     85     t = (BN_ULLONG)w * (a) + (c); \
     86     (r) = Lw(t);                  \
     87     (c) = Hw(t);                  \
     88   }
     89 
     90 #define sqr(r0, r1, a)        \
     91   {                           \
     92     BN_ULLONG t;              \
     93     t = (BN_ULLONG)(a) * (a); \
     94     (r0) = Lw(t);             \
     95     (r1) = Hw(t);             \
     96   }
     97 
     98 #elif defined(BN_UMULT_LOHI)
     99 #define mul_add(r, a, w, c)             \
    100   {                                     \
    101     BN_ULONG high, low, ret, tmp = (a); \
    102     ret = (r);                          \
    103     BN_UMULT_LOHI(low, high, w, tmp);   \
    104     ret += (c);                         \
    105     (c) = (ret < (c)) ? 1 : 0;          \
    106     (c) += high;                        \
    107     ret += low;                         \
    108     (c) += (ret < low) ? 1 : 0;         \
    109     (r) = ret;                          \
    110   }
    111 
    112 #define mul(r, a, w, c)                \
    113   {                                    \
    114     BN_ULONG high, low, ret, ta = (a); \
    115     BN_UMULT_LOHI(low, high, w, ta);   \
    116     ret = low + (c);                   \
    117     (c) = high;                        \
    118     (c) += (ret < low) ? 1 : 0;        \
    119     (r) = ret;                         \
    120   }
    121 
    122 #define sqr(r0, r1, a)               \
    123   {                                  \
    124     BN_ULONG tmp = (a);              \
    125     BN_UMULT_LOHI(r0, r1, tmp, tmp); \
    126   }
    127 
    128 #elif defined(BN_UMULT_HIGH)
    129 #define mul_add(r, a, w, c)             \
    130   {                                     \
    131     BN_ULONG high, low, ret, tmp = (a); \
    132     ret = (r);                          \
    133     high = BN_UMULT_HIGH(w, tmp);       \
    134     ret += (c);                         \
    135     low = (w) * tmp;                    \
    136     (c) = (ret < (c)) ? 1 : 0;          \
    137     (c) += high;                        \
    138     ret += low;                         \
    139     (c) += (ret < low) ? 1 : 0;         \
    140     (r) = ret;                          \
    141   }
    142 
    143 #define mul(r, a, w, c)                \
    144   {                                    \
    145     BN_ULONG high, low, ret, ta = (a); \
    146     low = (w) * ta;                    \
    147     high = BN_UMULT_HIGH(w, ta);       \
    148     ret = low + (c);                   \
    149     (c) = high;                        \
    150     (c) += (ret < low) ? 1 : 0;        \
    151     (r) = ret;                         \
    152   }
    153 
    154 #define sqr(r0, r1, a)              \
    155   {                                 \
    156     BN_ULONG tmp = (a);             \
    157     (r0) = tmp * tmp;               \
    158     (r1) = BN_UMULT_HIGH(tmp, tmp); \
    159   }
    160 
    161 #else
    162 /*************************************************************
    163  * No long long type
    164  */
    165 
    166 #define LBITS(a) ((a) & BN_MASK2l)
    167 #define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
    168 #define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
    169 
    170 #define LLBITS(a) ((a) & BN_MASKl)
    171 #define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
    172 #define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
    173 
    174 #define mul64(l, h, bl, bh)       \
    175   {                               \
    176     BN_ULONG m, m1, lt, ht;       \
    177                                   \
    178     lt = l;                       \
    179     ht = h;                       \
    180     m = (bh) * (lt);              \
    181     lt = (bl) * (lt);             \
    182     m1 = (bl) * (ht);             \
    183     ht = (bh) * (ht);             \
    184     m = (m + m1) & BN_MASK2;      \
    185     if (m < m1)                   \
    186       ht += L2HBITS((BN_ULONG)1); \
    187     ht += HBITS(m);               \
    188     m1 = L2HBITS(m);              \
    189     lt = (lt + m1) & BN_MASK2;    \
    190     if (lt < m1)                  \
    191       ht++;                       \
    192     (l) = lt;                     \
    193     (h) = ht;                     \
    194   }
    195 
    196 #define sqr64(lo, ho, in)                    \
    197   {                                          \
    198     BN_ULONG l, h, m;                        \
    199                                              \
    200     h = (in);                                \
    201     l = LBITS(h);                            \
    202     h = HBITS(h);                            \
    203     m = (l) * (h);                           \
    204     l *= l;                                  \
    205     h *= h;                                  \
    206     h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    207     m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
    208     l = (l + m) & BN_MASK2;                  \
    209     if (l < m)                               \
    210       h++;                                   \
    211     (lo) = l;                                \
    212     (ho) = h;                                \
    213   }
    214 
    215 #define mul_add(r, a, bl, bh, c) \
    216   {                              \
    217     BN_ULONG l, h;               \
    218                                  \
    219     h = (a);                     \
    220     l = LBITS(h);                \
    221     h = HBITS(h);                \
    222     mul64(l, h, (bl), (bh));     \
    223                                  \
    224     /* non-multiply part */      \
    225     l = (l + (c)) & BN_MASK2;    \
    226     if (l < (c))                 \
    227       h++;                       \
    228     (c) = (r);                   \
    229     l = (l + (c)) & BN_MASK2;    \
    230     if (l < (c))                 \
    231       h++;                       \
    232     (c) = h & BN_MASK2;          \
    233     (r) = l;                     \
    234   }
    235 
    236 #define mul(r, a, bl, bh, c)  \
    237   {                           \
    238     BN_ULONG l, h;            \
    239                               \
    240     h = (a);                  \
    241     l = LBITS(h);             \
    242     h = HBITS(h);             \
    243     mul64(l, h, (bl), (bh));  \
    244                               \
    245     /* non-multiply part */   \
    246     l += (c);                 \
    247     if ((l & BN_MASK2) < (c)) \
    248       h++;                    \
    249     (c) = h & BN_MASK2;       \
    250     (r) = l & BN_MASK2;       \
    251   }
    252 #endif /* !BN_LLONG */
    253 
    254 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
    255 
    256 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    257                           BN_ULONG w) {
    258   BN_ULONG c1 = 0;
    259 
    260   assert(num >= 0);
    261   if (num <= 0) {
    262     return c1;
    263   }
    264 
    265   while (num & ~3) {
    266     mul_add(rp[0], ap[0], w, c1);
    267     mul_add(rp[1], ap[1], w, c1);
    268     mul_add(rp[2], ap[2], w, c1);
    269     mul_add(rp[3], ap[3], w, c1);
    270     ap += 4;
    271     rp += 4;
    272     num -= 4;
    273   }
    274 
    275   while (num) {
    276     mul_add(rp[0], ap[0], w, c1);
    277     ap++;
    278     rp++;
    279     num--;
    280   }
    281 
    282   return c1;
    283 }
    284 
    285 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
    286   BN_ULONG c1 = 0;
    287 
    288   assert(num >= 0);
    289   if (num <= 0) {
    290     return c1;
    291   }
    292 
    293   while (num & ~3) {
    294     mul(rp[0], ap[0], w, c1);
    295     mul(rp[1], ap[1], w, c1);
    296     mul(rp[2], ap[2], w, c1);
    297     mul(rp[3], ap[3], w, c1);
    298     ap += 4;
    299     rp += 4;
    300     num -= 4;
    301   }
    302   while (num) {
    303     mul(rp[0], ap[0], w, c1);
    304     ap++;
    305     rp++;
    306     num--;
    307   }
    308   return c1;
    309 }
    310 
    311 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
    312   assert(n >= 0);
    313   if (n <= 0) {
    314     return;
    315   }
    316 
    317   while (n & ~3) {
    318     sqr(r[0], r[1], a[0]);
    319     sqr(r[2], r[3], a[1]);
    320     sqr(r[4], r[5], a[2]);
    321     sqr(r[6], r[7], a[3]);
    322     a += 4;
    323     r += 8;
    324     n -= 4;
    325   }
    326   while (n) {
    327     sqr(r[0], r[1], a[0]);
    328     a++;
    329     r += 2;
    330     n--;
    331   }
    332 }
    333 
    334 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
    335 
    336 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    337                           BN_ULONG w) {
    338   BN_ULONG c = 0;
    339   BN_ULONG bl, bh;
    340 
    341   assert(num >= 0);
    342   if (num <= 0) {
    343     return (BN_ULONG)0;
    344   }
    345 
    346   bl = LBITS(w);
    347   bh = HBITS(w);
    348 
    349   while (num & ~3) {
    350     mul_add(rp[0], ap[0], bl, bh, c);
    351     mul_add(rp[1], ap[1], bl, bh, c);
    352     mul_add(rp[2], ap[2], bl, bh, c);
    353     mul_add(rp[3], ap[3], bl, bh, c);
    354     ap += 4;
    355     rp += 4;
    356     num -= 4;
    357   }
    358   while (num) {
    359     mul_add(rp[0], ap[0], bl, bh, c);
    360     ap++;
    361     rp++;
    362     num--;
    363   }
    364   return c;
    365 }
    366 
    367 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
    368   BN_ULONG carry = 0;
    369   BN_ULONG bl, bh;
    370 
    371   assert(num >= 0);
    372   if (num <= 0) {
    373     return (BN_ULONG)0;
    374   }
    375 
    376   bl = LBITS(w);
    377   bh = HBITS(w);
    378 
    379   while (num & ~3) {
    380     mul(rp[0], ap[0], bl, bh, carry);
    381     mul(rp[1], ap[1], bl, bh, carry);
    382     mul(rp[2], ap[2], bl, bh, carry);
    383     mul(rp[3], ap[3], bl, bh, carry);
    384     ap += 4;
    385     rp += 4;
    386     num -= 4;
    387   }
    388   while (num) {
    389     mul(rp[0], ap[0], bl, bh, carry);
    390     ap++;
    391     rp++;
    392     num--;
    393   }
    394   return carry;
    395 }
    396 
    397 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
    398   assert(n >= 0);
    399   if (n <= 0) {
    400     return;
    401   }
    402 
    403   while (n & ~3) {
    404     sqr64(r[0], r[1], a[0]);
    405     sqr64(r[2], r[3], a[1]);
    406     sqr64(r[4], r[5], a[2]);
    407     sqr64(r[6], r[7], a[3]);
    408     a += 4;
    409     r += 8;
    410     n -= 4;
    411   }
    412   while (n) {
    413     sqr64(r[0], r[1], a[0]);
    414     a++;
    415     r += 2;
    416     n--;
    417   }
    418 }
    419 
    420 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
    421 
    422 #if defined(BN_LLONG) && defined(BN_DIV2W)
    423 
    424 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
    425   return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
    426 }
    427 
    428 #else
    429 
    430 /* Divide h,l by d and return the result. */
    431 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
    432   BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    433   int i, count = 2;
    434 
    435   if (d == 0) {
    436     return BN_MASK2;
    437   }
    438 
    439   i = BN_num_bits_word(d);
    440   assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
    441 
    442   i = BN_BITS2 - i;
    443   if (h >= d) {
    444     h -= d;
    445   }
    446 
    447   if (i) {
    448     d <<= i;
    449     h = (h << i) | (l >> (BN_BITS2 - i));
    450     l <<= i;
    451   }
    452   dh = (d & BN_MASK2h) >> BN_BITS4;
    453   dl = (d & BN_MASK2l);
    454   for (;;) {
    455     if ((h >> BN_BITS4) == dh) {
    456       q = BN_MASK2l;
    457     } else {
    458       q = h / dh;
    459     }
    460 
    461     th = q * dh;
    462     tl = dl * q;
    463     for (;;) {
    464       t = h - th;
    465       if ((t & BN_MASK2h) ||
    466           ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
    467         break;
    468       }
    469       q--;
    470       th -= dh;
    471       tl -= dl;
    472     }
    473     t = (tl >> BN_BITS4);
    474     tl = (tl << BN_BITS4) & BN_MASK2h;
    475     th += t;
    476 
    477     if (l < tl) {
    478       th++;
    479     }
    480     l -= tl;
    481     if (h < th) {
    482       h += d;
    483       q--;
    484     }
    485     h -= th;
    486 
    487     if (--count == 0) {
    488       break;
    489     }
    490 
    491     ret = q << BN_BITS4;
    492     h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    493     l = (l & BN_MASK2l) << BN_BITS4;
    494   }
    495 
    496   ret |= q;
    497   return ret;
    498 }
    499 
    500 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
    501 
    502 #ifdef BN_LLONG
    503 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    504                       int n) {
    505   BN_ULLONG ll = 0;
    506 
    507   assert(n >= 0);
    508   if (n <= 0) {
    509     return (BN_ULONG)0;
    510   }
    511 
    512   while (n & ~3) {
    513     ll += (BN_ULLONG)a[0] + b[0];
    514     r[0] = (BN_ULONG)ll & BN_MASK2;
    515     ll >>= BN_BITS2;
    516     ll += (BN_ULLONG)a[1] + b[1];
    517     r[1] = (BN_ULONG)ll & BN_MASK2;
    518     ll >>= BN_BITS2;
    519     ll += (BN_ULLONG)a[2] + b[2];
    520     r[2] = (BN_ULONG)ll & BN_MASK2;
    521     ll >>= BN_BITS2;
    522     ll += (BN_ULLONG)a[3] + b[3];
    523     r[3] = (BN_ULONG)ll & BN_MASK2;
    524     ll >>= BN_BITS2;
    525     a += 4;
    526     b += 4;
    527     r += 4;
    528     n -= 4;
    529   }
    530   while (n) {
    531     ll += (BN_ULLONG)a[0] + b[0];
    532     r[0] = (BN_ULONG)ll & BN_MASK2;
    533     ll >>= BN_BITS2;
    534     a++;
    535     b++;
    536     r++;
    537     n--;
    538   }
    539   return (BN_ULONG)ll;
    540 }
    541 
    542 #else /* !BN_LLONG */
    543 
    544 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    545                       int n) {
    546   BN_ULONG c, l, t;
    547 
    548   assert(n >= 0);
    549   if (n <= 0) {
    550     return (BN_ULONG)0;
    551   }
    552 
    553   c = 0;
    554   while (n & ~3) {
    555     t = a[0];
    556     t = (t + c) & BN_MASK2;
    557     c = (t < c);
    558     l = (t + b[0]) & BN_MASK2;
    559     c += (l < t);
    560     r[0] = l;
    561     t = a[1];
    562     t = (t + c) & BN_MASK2;
    563     c = (t < c);
    564     l = (t + b[1]) & BN_MASK2;
    565     c += (l < t);
    566     r[1] = l;
    567     t = a[2];
    568     t = (t + c) & BN_MASK2;
    569     c = (t < c);
    570     l = (t + b[2]) & BN_MASK2;
    571     c += (l < t);
    572     r[2] = l;
    573     t = a[3];
    574     t = (t + c) & BN_MASK2;
    575     c = (t < c);
    576     l = (t + b[3]) & BN_MASK2;
    577     c += (l < t);
    578     r[3] = l;
    579     a += 4;
    580     b += 4;
    581     r += 4;
    582     n -= 4;
    583   }
    584   while (n) {
    585     t = a[0];
    586     t = (t + c) & BN_MASK2;
    587     c = (t < c);
    588     l = (t + b[0]) & BN_MASK2;
    589     c += (l < t);
    590     r[0] = l;
    591     a++;
    592     b++;
    593     r++;
    594     n--;
    595   }
    596   return (BN_ULONG)c;
    597 }
    598 
    599 #endif /* !BN_LLONG */
    600 
    601 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    602                       int n) {
    603   BN_ULONG t1, t2;
    604   int c = 0;
    605 
    606   assert(n >= 0);
    607   if (n <= 0) {
    608     return (BN_ULONG)0;
    609   }
    610 
    611   while (n & ~3) {
    612     t1 = a[0];
    613     t2 = b[0];
    614     r[0] = (t1 - t2 - c) & BN_MASK2;
    615     if (t1 != t2)
    616       c = (t1 < t2);
    617     t1 = a[1];
    618     t2 = b[1];
    619     r[1] = (t1 - t2 - c) & BN_MASK2;
    620     if (t1 != t2)
    621       c = (t1 < t2);
    622     t1 = a[2];
    623     t2 = b[2];
    624     r[2] = (t1 - t2 - c) & BN_MASK2;
    625     if (t1 != t2)
    626       c = (t1 < t2);
    627     t1 = a[3];
    628     t2 = b[3];
    629     r[3] = (t1 - t2 - c) & BN_MASK2;
    630     if (t1 != t2)
    631       c = (t1 < t2);
    632     a += 4;
    633     b += 4;
    634     r += 4;
    635     n -= 4;
    636   }
    637   while (n) {
    638     t1 = a[0];
    639     t2 = b[0];
    640     r[0] = (t1 - t2 - c) & BN_MASK2;
    641     if (t1 != t2)
    642       c = (t1 < t2);
    643     a++;
    644     b++;
    645     r++;
    646     n--;
    647   }
    648   return c;
    649 }
    650 
    651 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
    652 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
    653 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
    654 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
    655 
    656 #ifdef BN_LLONG
    657 #define mul_add_c(a, b, c0, c1, c2) \
    658   t = (BN_ULLONG)a * b;             \
    659   t1 = (BN_ULONG)Lw(t);             \
    660   t2 = (BN_ULONG)Hw(t);             \
    661   c0 = (c0 + t1) & BN_MASK2;        \
    662   if ((c0) < t1)                    \
    663     t2++;                           \
    664   c1 = (c1 + t2) & BN_MASK2;        \
    665   if ((c1) < t2)                    \
    666     c2++;
    667 
    668 #define mul_add_c2(a, b, c0, c1, c2)           \
    669   t = (BN_ULLONG)a * b;                        \
    670   tt = (t + t) & BN_MASK;                      \
    671   if (tt < t)                                  \
    672     c2++;                                      \
    673   t1 = (BN_ULONG)Lw(tt);                       \
    674   t2 = (BN_ULONG)Hw(tt);                       \
    675   c0 = (c0 + t1) & BN_MASK2;                   \
    676   if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
    677     c2++;                                      \
    678   c1 = (c1 + t2) & BN_MASK2;                   \
    679   if ((c1) < t2)                               \
    680     c2++;
    681 
    682 #define sqr_add_c(a, i, c0, c1, c2) \
    683   t = (BN_ULLONG)a[i] * a[i];       \
    684   t1 = (BN_ULONG)Lw(t);             \
    685   t2 = (BN_ULONG)Hw(t);             \
    686   c0 = (c0 + t1) & BN_MASK2;        \
    687   if ((c0) < t1)                    \
    688     t2++;                           \
    689   c1 = (c1 + t2) & BN_MASK2;        \
    690   if ((c1) < t2)                    \
    691     c2++;
    692 
    693 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    694 
    695 #elif defined(BN_UMULT_LOHI)
    696 
    697 #define mul_add_c(a, b, c0, c1, c2) \
    698   {                                 \
    699     BN_ULONG ta = (a), tb = (b);    \
    700     BN_UMULT_LOHI(t1, t2, ta, tb);  \
    701     c0 += t1;                       \
    702     t2 += (c0 < t1) ? 1 : 0;        \
    703     c1 += t2;                       \
    704     c2 += (c1 < t2) ? 1 : 0;        \
    705   }
    706 
    707 #define mul_add_c2(a, b, c0, c1, c2) \
    708   {                                  \
    709     BN_ULONG ta = (a), tb = (b), t0; \
    710     BN_UMULT_LOHI(t0, t1, ta, tb);   \
    711     t2 = t1 + t1;                    \
    712     c2 += (t2 < t1) ? 1 : 0;         \
    713     t1 = t0 + t0;                    \
    714     t2 += (t1 < t0) ? 1 : 0;         \
    715     c0 += t1;                        \
    716     t2 += (c0 < t1) ? 1 : 0;         \
    717     c1 += t2;                        \
    718     c2 += (c1 < t2) ? 1 : 0;         \
    719   }
    720 
    721 #define sqr_add_c(a, i, c0, c1, c2) \
    722   {                                 \
    723     BN_ULONG ta = (a)[i];           \
    724     BN_UMULT_LOHI(t1, t2, ta, ta);  \
    725     c0 += t1;                       \
    726     t2 += (c0 < t1) ? 1 : 0;        \
    727     c1 += t2;                       \
    728     c2 += (c1 < t2) ? 1 : 0;        \
    729   }
    730 
    731 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    732 
    733 #elif defined(BN_UMULT_HIGH)
    734 
    735 #define mul_add_c(a, b, c0, c1, c2) \
    736   {                                 \
    737     BN_ULONG ta = (a), tb = (b);    \
    738     t1 = ta * tb;                   \
    739     t2 = BN_UMULT_HIGH(ta, tb);     \
    740     c0 += t1;                       \
    741     t2 += (c0 < t1) ? 1 : 0;        \
    742     c1 += t2;                       \
    743     c2 += (c1 < t2) ? 1 : 0;        \
    744   }
    745 
    746 #define mul_add_c2(a, b, c0, c1, c2) \
    747   {                                  \
    748     BN_ULONG ta = (a), tb = (b), t0; \
    749     t1 = BN_UMULT_HIGH(ta, tb);      \
    750     t0 = ta * tb;                    \
    751     t2 = t1 + t1;                    \
    752     c2 += (t2 < t1) ? 1 : 0;         \
    753     t1 = t0 + t0;                    \
    754     t2 += (t1 < t0) ? 1 : 0;         \
    755     c0 += t1;                        \
    756     t2 += (c0 < t1) ? 1 : 0;         \
    757     c1 += t2;                        \
    758     c2 += (c1 < t2) ? 1 : 0;         \
    759   }
    760 
    761 #define sqr_add_c(a, i, c0, c1, c2) \
    762   {                                 \
    763     BN_ULONG ta = (a)[i];           \
    764     t1 = ta * ta;                   \
    765     t2 = BN_UMULT_HIGH(ta, ta);     \
    766     c0 += t1;                       \
    767     t2 += (c0 < t1) ? 1 : 0;        \
    768     c1 += t2;                       \
    769     c2 += (c1 < t2) ? 1 : 0;        \
    770   }
    771 
    772 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    773 
    774 #else /* !BN_LLONG */
    775 #define mul_add_c(a, b, c0, c1, c2) \
    776   t1 = LBITS(a);                    \
    777   t2 = HBITS(a);                    \
    778   bl = LBITS(b);                    \
    779   bh = HBITS(b);                    \
    780   mul64(t1, t2, bl, bh);            \
    781   c0 = (c0 + t1) & BN_MASK2;        \
    782   if ((c0) < t1)                    \
    783     t2++;                           \
    784   c1 = (c1 + t2) & BN_MASK2;        \
    785   if ((c1) < t2)                    \
    786     c2++;
    787 
    788 #define mul_add_c2(a, b, c0, c1, c2)           \
    789   t1 = LBITS(a);                               \
    790   t2 = HBITS(a);                               \
    791   bl = LBITS(b);                               \
    792   bh = HBITS(b);                               \
    793   mul64(t1, t2, bl, bh);                       \
    794   if (t2 & BN_TBIT)                            \
    795     c2++;                                      \
    796   t2 = (t2 + t2) & BN_MASK2;                   \
    797   if (t1 & BN_TBIT)                            \
    798     t2++;                                      \
    799   t1 = (t1 + t1) & BN_MASK2;                   \
    800   c0 = (c0 + t1) & BN_MASK2;                   \
    801   if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
    802     c2++;                                      \
    803   c1 = (c1 + t2) & BN_MASK2;                   \
    804   if ((c1) < t2)                               \
    805     c2++;
    806 
    807 #define sqr_add_c(a, i, c0, c1, c2) \
    808   sqr64(t1, t2, (a)[i]);            \
    809   c0 = (c0 + t1) & BN_MASK2;        \
    810   if ((c0) < t1)                    \
    811     t2++;                           \
    812   c1 = (c1 + t2) & BN_MASK2;        \
    813   if ((c1) < t2)                    \
    814     c2++;
    815 
    816 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    817 #endif /* !BN_LLONG */
    818 
    819 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
    820 #ifdef BN_LLONG
    821   BN_ULLONG t;
    822 #else
    823   BN_ULONG bl, bh;
    824 #endif
    825   BN_ULONG t1, t2;
    826   BN_ULONG c1, c2, c3;
    827 
    828   c1 = 0;
    829   c2 = 0;
    830   c3 = 0;
    831   mul_add_c(a[0], b[0], c1, c2, c3);
    832   r[0] = c1;
    833   c1 = 0;
    834   mul_add_c(a[0], b[1], c2, c3, c1);
    835   mul_add_c(a[1], b[0], c2, c3, c1);
    836   r[1] = c2;
    837   c2 = 0;
    838   mul_add_c(a[2], b[0], c3, c1, c2);
    839   mul_add_c(a[1], b[1], c3, c1, c2);
    840   mul_add_c(a[0], b[2], c3, c1, c2);
    841   r[2] = c3;
    842   c3 = 0;
    843   mul_add_c(a[0], b[3], c1, c2, c3);
    844   mul_add_c(a[1], b[2], c1, c2, c3);
    845   mul_add_c(a[2], b[1], c1, c2, c3);
    846   mul_add_c(a[3], b[0], c1, c2, c3);
    847   r[3] = c1;
    848   c1 = 0;
    849   mul_add_c(a[4], b[0], c2, c3, c1);
    850   mul_add_c(a[3], b[1], c2, c3, c1);
    851   mul_add_c(a[2], b[2], c2, c3, c1);
    852   mul_add_c(a[1], b[3], c2, c3, c1);
    853   mul_add_c(a[0], b[4], c2, c3, c1);
    854   r[4] = c2;
    855   c2 = 0;
    856   mul_add_c(a[0], b[5], c3, c1, c2);
    857   mul_add_c(a[1], b[4], c3, c1, c2);
    858   mul_add_c(a[2], b[3], c3, c1, c2);
    859   mul_add_c(a[3], b[2], c3, c1, c2);
    860   mul_add_c(a[4], b[1], c3, c1, c2);
    861   mul_add_c(a[5], b[0], c3, c1, c2);
    862   r[5] = c3;
    863   c3 = 0;
    864   mul_add_c(a[6], b[0], c1, c2, c3);
    865   mul_add_c(a[5], b[1], c1, c2, c3);
    866   mul_add_c(a[4], b[2], c1, c2, c3);
    867   mul_add_c(a[3], b[3], c1, c2, c3);
    868   mul_add_c(a[2], b[4], c1, c2, c3);
    869   mul_add_c(a[1], b[5], c1, c2, c3);
    870   mul_add_c(a[0], b[6], c1, c2, c3);
    871   r[6] = c1;
    872   c1 = 0;
    873   mul_add_c(a[0], b[7], c2, c3, c1);
    874   mul_add_c(a[1], b[6], c2, c3, c1);
    875   mul_add_c(a[2], b[5], c2, c3, c1);
    876   mul_add_c(a[3], b[4], c2, c3, c1);
    877   mul_add_c(a[4], b[3], c2, c3, c1);
    878   mul_add_c(a[5], b[2], c2, c3, c1);
    879   mul_add_c(a[6], b[1], c2, c3, c1);
    880   mul_add_c(a[7], b[0], c2, c3, c1);
    881   r[7] = c2;
    882   c2 = 0;
    883   mul_add_c(a[7], b[1], c3, c1, c2);
    884   mul_add_c(a[6], b[2], c3, c1, c2);
    885   mul_add_c(a[5], b[3], c3, c1, c2);
    886   mul_add_c(a[4], b[4], c3, c1, c2);
    887   mul_add_c(a[3], b[5], c3, c1, c2);
    888   mul_add_c(a[2], b[6], c3, c1, c2);
    889   mul_add_c(a[1], b[7], c3, c1, c2);
    890   r[8] = c3;
    891   c3 = 0;
    892   mul_add_c(a[2], b[7], c1, c2, c3);
    893   mul_add_c(a[3], b[6], c1, c2, c3);
    894   mul_add_c(a[4], b[5], c1, c2, c3);
    895   mul_add_c(a[5], b[4], c1, c2, c3);
    896   mul_add_c(a[6], b[3], c1, c2, c3);
    897   mul_add_c(a[7], b[2], c1, c2, c3);
    898   r[9] = c1;
    899   c1 = 0;
    900   mul_add_c(a[7], b[3], c2, c3, c1);
    901   mul_add_c(a[6], b[4], c2, c3, c1);
    902   mul_add_c(a[5], b[5], c2, c3, c1);
    903   mul_add_c(a[4], b[6], c2, c3, c1);
    904   mul_add_c(a[3], b[7], c2, c3, c1);
    905   r[10] = c2;
    906   c2 = 0;
    907   mul_add_c(a[4], b[7], c3, c1, c2);
    908   mul_add_c(a[5], b[6], c3, c1, c2);
    909   mul_add_c(a[6], b[5], c3, c1, c2);
    910   mul_add_c(a[7], b[4], c3, c1, c2);
    911   r[11] = c3;
    912   c3 = 0;
    913   mul_add_c(a[7], b[5], c1, c2, c3);
    914   mul_add_c(a[6], b[6], c1, c2, c3);
    915   mul_add_c(a[5], b[7], c1, c2, c3);
    916   r[12] = c1;
    917   c1 = 0;
    918   mul_add_c(a[6], b[7], c2, c3, c1);
    919   mul_add_c(a[7], b[6], c2, c3, c1);
    920   r[13] = c2;
    921   c2 = 0;
    922   mul_add_c(a[7], b[7], c3, c1, c2);
    923   r[14] = c3;
    924   r[15] = c1;
    925 }
    926 
    927 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
    928 #ifdef BN_LLONG
    929   BN_ULLONG t;
    930 #else
    931   BN_ULONG bl, bh;
    932 #endif
    933   BN_ULONG t1, t2;
    934   BN_ULONG c1, c2, c3;
    935 
    936   c1 = 0;
    937   c2 = 0;
    938   c3 = 0;
    939   mul_add_c(a[0], b[0], c1, c2, c3);
    940   r[0] = c1;
    941   c1 = 0;
    942   mul_add_c(a[0], b[1], c2, c3, c1);
    943   mul_add_c(a[1], b[0], c2, c3, c1);
    944   r[1] = c2;
    945   c2 = 0;
    946   mul_add_c(a[2], b[0], c3, c1, c2);
    947   mul_add_c(a[1], b[1], c3, c1, c2);
    948   mul_add_c(a[0], b[2], c3, c1, c2);
    949   r[2] = c3;
    950   c3 = 0;
    951   mul_add_c(a[0], b[3], c1, c2, c3);
    952   mul_add_c(a[1], b[2], c1, c2, c3);
    953   mul_add_c(a[2], b[1], c1, c2, c3);
    954   mul_add_c(a[3], b[0], c1, c2, c3);
    955   r[3] = c1;
    956   c1 = 0;
    957   mul_add_c(a[3], b[1], c2, c3, c1);
    958   mul_add_c(a[2], b[2], c2, c3, c1);
    959   mul_add_c(a[1], b[3], c2, c3, c1);
    960   r[4] = c2;
    961   c2 = 0;
    962   mul_add_c(a[2], b[3], c3, c1, c2);
    963   mul_add_c(a[3], b[2], c3, c1, c2);
    964   r[5] = c3;
    965   c3 = 0;
    966   mul_add_c(a[3], b[3], c1, c2, c3);
    967   r[6] = c1;
    968   r[7] = c2;
    969 }
    970 
    971 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
    972 #ifdef BN_LLONG
    973   BN_ULLONG t, tt;
    974 #else
    975   BN_ULONG bl, bh;
    976 #endif
    977   BN_ULONG t1, t2;
    978   BN_ULONG c1, c2, c3;
    979 
    980   c1 = 0;
    981   c2 = 0;
    982   c3 = 0;
    983   sqr_add_c(a, 0, c1, c2, c3);
    984   r[0] = c1;
    985   c1 = 0;
    986   sqr_add_c2(a, 1, 0, c2, c3, c1);
    987   r[1] = c2;
    988   c2 = 0;
    989   sqr_add_c(a, 1, c3, c1, c2);
    990   sqr_add_c2(a, 2, 0, c3, c1, c2);
    991   r[2] = c3;
    992   c3 = 0;
    993   sqr_add_c2(a, 3, 0, c1, c2, c3);
    994   sqr_add_c2(a, 2, 1, c1, c2, c3);
    995   r[3] = c1;
    996   c1 = 0;
    997   sqr_add_c(a, 2, c2, c3, c1);
    998   sqr_add_c2(a, 3, 1, c2, c3, c1);
    999   sqr_add_c2(a, 4, 0, c2, c3, c1);
   1000   r[4] = c2;
   1001   c2 = 0;
   1002   sqr_add_c2(a, 5, 0, c3, c1, c2);
   1003   sqr_add_c2(a, 4, 1, c3, c1, c2);
   1004   sqr_add_c2(a, 3, 2, c3, c1, c2);
   1005   r[5] = c3;
   1006   c3 = 0;
   1007   sqr_add_c(a, 3, c1, c2, c3);
   1008   sqr_add_c2(a, 4, 2, c1, c2, c3);
   1009   sqr_add_c2(a, 5, 1, c1, c2, c3);
   1010   sqr_add_c2(a, 6, 0, c1, c2, c3);
   1011   r[6] = c1;
   1012   c1 = 0;
   1013   sqr_add_c2(a, 7, 0, c2, c3, c1);
   1014   sqr_add_c2(a, 6, 1, c2, c3, c1);
   1015   sqr_add_c2(a, 5, 2, c2, c3, c1);
   1016   sqr_add_c2(a, 4, 3, c2, c3, c1);
   1017   r[7] = c2;
   1018   c2 = 0;
   1019   sqr_add_c(a, 4, c3, c1, c2);
   1020   sqr_add_c2(a, 5, 3, c3, c1, c2);
   1021   sqr_add_c2(a, 6, 2, c3, c1, c2);
   1022   sqr_add_c2(a, 7, 1, c3, c1, c2);
   1023   r[8] = c3;
   1024   c3 = 0;
   1025   sqr_add_c2(a, 7, 2, c1, c2, c3);
   1026   sqr_add_c2(a, 6, 3, c1, c2, c3);
   1027   sqr_add_c2(a, 5, 4, c1, c2, c3);
   1028   r[9] = c1;
   1029   c1 = 0;
   1030   sqr_add_c(a, 5, c2, c3, c1);
   1031   sqr_add_c2(a, 6, 4, c2, c3, c1);
   1032   sqr_add_c2(a, 7, 3, c2, c3, c1);
   1033   r[10] = c2;
   1034   c2 = 0;
   1035   sqr_add_c2(a, 7, 4, c3, c1, c2);
   1036   sqr_add_c2(a, 6, 5, c3, c1, c2);
   1037   r[11] = c3;
   1038   c3 = 0;
   1039   sqr_add_c(a, 6, c1, c2, c3);
   1040   sqr_add_c2(a, 7, 5, c1, c2, c3);
   1041   r[12] = c1;
   1042   c1 = 0;
   1043   sqr_add_c2(a, 7, 6, c2, c3, c1);
   1044   r[13] = c2;
   1045   c2 = 0;
   1046   sqr_add_c(a, 7, c3, c1, c2);
   1047   r[14] = c3;
   1048   r[15] = c1;
   1049 }
   1050 
   1051 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
   1052 #ifdef BN_LLONG
   1053   BN_ULLONG t, tt;
   1054 #else
   1055   BN_ULONG bl, bh;
   1056 #endif
   1057   BN_ULONG t1, t2;
   1058   BN_ULONG c1, c2, c3;
   1059 
   1060   c1 = 0;
   1061   c2 = 0;
   1062   c3 = 0;
   1063   sqr_add_c(a, 0, c1, c2, c3);
   1064   r[0] = c1;
   1065   c1 = 0;
   1066   sqr_add_c2(a, 1, 0, c2, c3, c1);
   1067   r[1] = c2;
   1068   c2 = 0;
   1069   sqr_add_c(a, 1, c3, c1, c2);
   1070   sqr_add_c2(a, 2, 0, c3, c1, c2);
   1071   r[2] = c3;
   1072   c3 = 0;
   1073   sqr_add_c2(a, 3, 0, c1, c2, c3);
   1074   sqr_add_c2(a, 2, 1, c1, c2, c3);
   1075   r[3] = c1;
   1076   c1 = 0;
   1077   sqr_add_c(a, 2, c2, c3, c1);
   1078   sqr_add_c2(a, 3, 1, c2, c3, c1);
   1079   r[4] = c2;
   1080   c2 = 0;
   1081   sqr_add_c2(a, 3, 2, c3, c1, c2);
   1082   r[5] = c3;
   1083   c3 = 0;
   1084   sqr_add_c(a, 3, c1, c2, c3);
   1085   r[6] = c1;
   1086   r[7] = c2;
   1087 }
   1088 
   1089 #if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
   1090 /* This is essentially reference implementation, which may or may not
   1091  * result in performance improvement. E.g. on IA-32 this routine was
   1092  * observed to give 40% faster rsa1024 private key operations and 10%
   1093  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
   1094  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
   1095  * reference implementation, one to be used as starting point for
   1096  * platform-specific assembler. Mentioned numbers apply to compiler
   1097  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
   1098  * can vary not only from platform to platform, but even for compiler
   1099  * versions. Assembler vs. assembler improvement coefficients can
   1100  * [and are known to] differ and are to be documented elsewhere. */
   1101 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
   1102                 const BN_ULONG *np, const BN_ULONG *n0p, int num) {
   1103   BN_ULONG c0, c1, ml, *tp, n0;
   1104 #ifdef mul64
   1105   BN_ULONG mh;
   1106 #endif
   1107   volatile BN_ULONG *vp;
   1108   int i = 0, j;
   1109 
   1110 #if 0 /* template for platform-specific implementation */
   1111 	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
   1112 #endif
   1113   vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
   1114 
   1115   n0 = *n0p;
   1116 
   1117   c0 = 0;
   1118   ml = bp[0];
   1119 #ifdef mul64
   1120   mh = HBITS(ml);
   1121   ml = LBITS(ml);
   1122   for (j = 0; j < num; ++j)
   1123     mul(tp[j], ap[j], ml, mh, c0);
   1124 #else
   1125   for (j = 0; j < num; ++j)
   1126     mul(tp[j], ap[j], ml, c0);
   1127 #endif
   1128 
   1129   tp[num] = c0;
   1130   tp[num + 1] = 0;
   1131   goto enter;
   1132 
   1133   for (i = 0; i < num; i++) {
   1134     c0 = 0;
   1135     ml = bp[i];
   1136 #ifdef mul64
   1137     mh = HBITS(ml);
   1138     ml = LBITS(ml);
   1139     for (j = 0; j < num; ++j)
   1140       mul_add(tp[j], ap[j], ml, mh, c0);
   1141 #else
   1142     for (j = 0; j < num; ++j)
   1143       mul_add(tp[j], ap[j], ml, c0);
   1144 #endif
   1145     c1 = (tp[num] + c0) & BN_MASK2;
   1146     tp[num] = c1;
   1147     tp[num + 1] = (c1 < c0 ? 1 : 0);
   1148   enter:
   1149     c1 = tp[0];
   1150     ml = (c1 * n0) & BN_MASK2;
   1151     c0 = 0;
   1152 #ifdef mul64
   1153     mh = HBITS(ml);
   1154     ml = LBITS(ml);
   1155     mul_add(c1, np[0], ml, mh, c0);
   1156 #else
   1157     mul_add(c1, ml, np[0], c0);
   1158 #endif
   1159     for (j = 1; j < num; j++) {
   1160       c1 = tp[j];
   1161 #ifdef mul64
   1162       mul_add(c1, np[j], ml, mh, c0);
   1163 #else
   1164       mul_add(c1, ml, np[j], c0);
   1165 #endif
   1166       tp[j - 1] = c1 & BN_MASK2;
   1167     }
   1168     c1 = (tp[num] + c0) & BN_MASK2;
   1169     tp[num - 1] = c1;
   1170     tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
   1171   }
   1172 
   1173   if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
   1174     c0 = bn_sub_words(rp, tp, np, num);
   1175     if (tp[num] != 0 || c0 == 0) {
   1176       for (i = 0; i < num + 2; i++)
   1177         vp[i] = 0;
   1178       return 1;
   1179     }
   1180   }
   1181   for (i = 0; i < num; i++)
   1182     rp[i] = tp[i], vp[i] = 0;
   1183   vp[num] = 0;
   1184   vp[num + 1] = 0;
   1185   return 1;
   1186 }
   1187 #endif
   1188 
   1189 #endif
   1190