#include <openssl/bn.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)

#include "../internal.h"

/* x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro (at) fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For reference: the IA-32 assembler implementation performs very
 *    much like 64-bit code compiled with no-asm on the same machine.
 */

/* TODO(davidben): Get this file working on Windows x64. */

#undef mul
#undef mul_add

#define asm __asm__

/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value zero;
 */
#define mul_add(r, a, word, carry)                                     \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+m"(r), "+d"(high)                                          \
        : "r"(carry), "g"(0)                                           \
        : "cc");                                                       \
    carry = high;                                                      \
  } while (0)

#define mul(r, a, word, carry)                                         \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    (r) = carry, carry = high;                                         \
  } while (0)
#undef sqr
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
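
/* For reference, the three macros above compute the following, shown here as
 * a portable-C sketch only (kept out of the build).  It assumes the GCC/Clang
 * unsigned __int128 extension available on x86_64; the *_ref names are
 * illustrative and not part of this file. */
#if 0
static void mul_add_ref(BN_ULONG *r, BN_ULONG a, BN_ULONG word,
                        BN_ULONG *carry) {
  /* (carry, *r) = *r + a * word + carry */
  unsigned __int128 t = (unsigned __int128)a * word + *r + *carry;
  *r = (BN_ULONG)t;
  *carry = (BN_ULONG)(t >> 64);
}

static void mul_ref(BN_ULONG *r, BN_ULONG a, BN_ULONG word, BN_ULONG *carry) {
  /* (carry, *r) = a * word + carry */
  unsigned __int128 t = (unsigned __int128)a * word + *carry;
  *r = (BN_ULONG)t;
  *carry = (BN_ULONG)(t >> 64);
}

static void sqr_ref(BN_ULONG *r0, BN_ULONG *r1, BN_ULONG a) {
  /* (*r1, *r0) = a * a */
  unsigned __int128 t = (unsigned __int128)a * a;
  *r0 = (BN_ULONG)t;
  *r1 = (BN_ULONG)(t >> 64);
}
#endif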

/* bn_mul_add_words adds ap[0..num-1] * w to rp[0..num-1] in place and returns
 * the final carry word. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[2], ap[2], w, c1);
    return c1;
  }

  return c1;
}

/* bn_mul_words stores ap[0..num-1] * w in rp[0..num-1] and returns the final
 * carry word. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[2], ap[2], w, c1);
  }
  return c1;
}
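
/* Usage sketch (not compiled): a schoolbook num_a x num_b word multiplication
 * can be assembled from the two routines above, which is essentially how the
 * generic (non-comba) multiplication path uses them.  The helper name below is
 * illustrative only; it assumes num_a, num_b >= 1 and that r has room for
 * num_a + num_b words. */
#if 0
static void mul_schoolbook_sketch(BN_ULONG *r, const BN_ULONG *a, int num_a,
                                  const BN_ULONG *b, int num_b) {
  r[num_a] = bn_mul_words(r, a, num_a, b[0]);
  for (int i = 1; i < num_b; i++) {
    /* Add a * b[i], shifted by i words; the carry becomes the next top word. */
    r[num_a + i] = bn_mul_add_words(&r[i], a, num_a, b[i]);
  }
}
#endif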

/* bn_sqr_words writes 2*n words: (r[2*i+1], r[2*i]) = a[i] * a[i]. */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  if (n) {
    sqr(r[0], r[1], a[0]);
    if (--n == 0) {
      return;
    }
    sqr(r[2], r[3], a[1]);
    if (--n == 0) {
      return;
    }
    sqr(r[4], r[5], a[2]);
  }
}

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG ret, waste;

  asm("divq	%4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");

  return ret;
}
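
/* bn_div_words returns the quotient (h * 2^64 + l) / d, i.e. the rax output of
 * divq.  Note that divq faults (#DE) if d is zero or if the quotient does not
 * fit in a single word, so callers are expected to guarantee h < d.  A checked
 * wrapper would look like the sketch below (illustrative only, not compiled;
 * the name bn_div_words_checked is hypothetical). */
#if 0
static int bn_div_words_checked(BN_ULONG *q, BN_ULONG h, BN_ULONG l,
                                BN_ULONG d) {
  if (d == 0 || h >= d) {
    return 0;  /* quotient would not fit in one word (or division by zero) */
  }
  *q = bn_div_words(h, l, d);
  return 1;
}
#endif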

/* bn_add_words computes rp[0..n-1] = ap[0..n-1] + bp[0..n-1] and returns the
 * carry out of the most significant word. */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  asm volatile (
      "	subq	%0,%0		\n" /* clear carry */
      "	jmp	1f		\n"
      ".p2align 4			\n"
      "1:	movq	(%4,%2,8),%0	\n"
      "	adcq	(%5,%2,8),%0	\n"
      "	movq	%0,(%3,%2,8)	\n"
      "	lea	1(%2),%2	\n"
      "	loop	1b		\n"
      "	sbbq	%0,%0		\n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
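
/* Portable sketch of what the loop above computes (illustrative only, not
 * compiled; the *_ref name is not part of this file): word-wise addition with
 * carry propagation, returning the final carry (0 or 1). */
#if 0
static BN_ULONG bn_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                                 const BN_ULONG *bp, int n) {
  BN_ULONG carry = 0;
  for (int i = 0; i < n; i++) {
    BN_ULONG t = ap[i] + carry;
    carry = (t < carry);   /* carry out of ap[i] + carry_in */
    rp[i] = t + bp[i];
    carry += (rp[i] < t);  /* carry out of the second addition */
  }
  return carry;
}
#endif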

#ifndef SIMICS
/* bn_sub_words computes rp[0..n-1] = ap[0..n-1] - bp[0..n-1] and returns the
 * borrow out of the most significant word. */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  asm volatile (
      "	subq	%0,%0		\n" /* clear borrow */
      "	jmp	1f		\n"
      ".p2align 4			\n"
      "1:	movq	(%4,%2,8),%0	\n"
      "	sbbq	(%5,%2,8),%0	\n"
      "	movq	%0,(%3,%2,8)	\n"
      "	lea	1(%2),%2	\n"
      "	loop	1b		\n"
      "	sbbq	%0,%0		\n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
#else
/* Simics 1.4<7 has a buggy sbbq :-( */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
  BN_ULONG t1, t2;
  int c = 0;

  if (n <= 0) {
    return (BN_ULONG)0;
  }

  for (;;) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    if (--n <= 0) {
      break;
    }

    t1 = a[1];
    t2 = b[1];
    r[1] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    if (--n <= 0) {
      break;
    }

    t1 = a[2];
    t2 = b[2];
    r[2] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    if (--n <= 0) {
      break;
    }

    t1 = a[3];
    t2 = b[3];
    r[3] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    if (--n <= 0) {
      break;
    }

    a += 4;
    b += 4;
    r += 4;
  }
  return c;
}
#endif

/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three-word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three-word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three-word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three-word number
 * c=(c2,c1,c0) */

/* Keep in mind that the carry into the high part of the multiplication result
 * cannot overflow, because the high part can never be all-ones. */
#define mul_add_c(a, b, c0, c1, c2)                               \
  do {                                                            \
    BN_ULONG t1, t2;                                              \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");  \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
        : "+r"(c0), "+r"(c1), "+r"(c2)                            \
        : "r"(t1), "r"(t2), "g"(0)                                \
        : "cc");                                                  \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)                               \
  do {                                                            \
    BN_ULONG t1, t2;                                              \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"(a[i]) : "cc");       \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
        : "+r"(c0), "+r"(c1), "+r"(c2)                            \
        : "r"(t1), "r"(t2), "g"(0)                                \
        : "cc");                                                  \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)                              \
  do {                                                            \
    BN_ULONG t1, t2;                                              \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");  \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
        : "+r"(c0), "+r"(c1), "+r"(c2)                            \
        : "r"(t1), "r"(t2), "g"(0)                                \
        : "cc");                                                  \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
        : "+r"(c0), "+r"(c1), "+r"(c2)                            \
        : "r"(t1), "r"(t2), "g"(0)                                \
        : "cc");                                                  \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
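
/* The comba routines below accumulate partial products column by column into a
 * three-word accumulator (c2,c1,c0).  In portable C the basic step is the
 * sketch below (illustrative only, not compiled; it assumes the GCC/Clang
 * unsigned __int128 extension, and the *_ref name is not part of this file). */
#if 0
static void mul_add_c_ref(BN_ULONG a, BN_ULONG b, BN_ULONG *c0, BN_ULONG *c1,
                          BN_ULONG *c2) {
  unsigned __int128 t = (unsigned __int128)a * b;
  BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
  *c0 += lo;
  hi += (*c0 < lo);      /* cannot overflow: hi is never all-ones */
  *c1 += hi;
  *c2 += (*c1 < hi);
}
#endif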

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
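
/* bn_mul_comba8 above is a fully unrolled column-oriented ("comba") 8x8 word
 * multiplication: for each result index k it sums every a[i]*b[j] with
 * i + j == k into the rolling three-word accumulator and stores the low word
 * in r[k].  A generic, non-unrolled version would look like the sketch below
 * (illustrative only, not compiled; the helper name is hypothetical). */
#if 0
static void mul_comba_sketch(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                             int n) {
  BN_ULONG c0 = 0, c1 = 0, c2 = 0;
  for (int k = 0; k < 2 * n - 1; k++) {
    for (int i = (k < n) ? 0 : k - n + 1; i <= k && i < n; i++) {
      mul_add_c(a[i], b[k - i], c0, c1, c2);
    }
    r[k] = c0;
    /* Shift the accumulator window down by one word for the next column. */
    c0 = c1;
    c1 = c2;
    c2 = 0;
  }
  r[2 * n - 1] = c0;
}
#endif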

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

#endif  /* !NO_ASM && X86_64 && !WINDOWS */