Home | History | Annotate | Download | only in asm
      1 /* x86_64 BIGNUM accelerator version 0.1, December 2002.
      2  *
      3  * Implemented by Andy Polyakov <appro (at) fy.chalmers.se> for the OpenSSL
      4  * project.
      5  *
      6  * Rights for redistribution and usage in source and binary forms are
      7  * granted according to the OpenSSL license. Warranty of any kind is
      8  * disclaimed.
      9  *
     10  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
     11  *    versions, like 1.0...
     12  * A. Well, that's because this code is basically a quick-n-dirty
     13  *    proof-of-concept hack. As you can see it's implemented with
     14  *    inline assembler, which means that you're bound to GCC and that
     15  *    there might be enough room for further improvement.
     16  *
     17  * Q. Why inline assembler?
     18  * A. x86_64 features own ABI which I'm not familiar with. This is
     19  *    why I decided to let the compiler take care of subroutine
     20  *    prologue/epilogue as well as register allocation. For reference.
     21  *    Win64 implements different ABI for AMD64, different from Linux.
     22  *
     23  * Q. How much faster does it get?
     24  * A. 'apps/openssl speed rsa dsa' output with no-asm:
     25  *
     26  *	                  sign    verify    sign/s verify/s
     27  *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
     28  *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
     29  *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
     30  *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
     31  *	                  sign    verify    sign/s verify/s
     32  *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
     33  *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
     34  *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
     35  *
     36  *    'apps/openssl speed rsa dsa' output with this module:
     37  *
     38  *	                  sign    verify    sign/s verify/s
     39  *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
     40  *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
     41  *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
     42  *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
     43  *	                  sign    verify    sign/s verify/s
     44  *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
     45  *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
     46  *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
     47  *
     48  *    For the reference. IA-32 assembler implementation performs
     49  *    very much like 64-bit code compiled with no-asm on the same
     50  *    machine.
     51  */
     52 
     53 #include <openssl/bn.h>
     54 
     55 /* TODO(davidben): Get this file working on Windows x64. */
     56 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
     57 
     58 #include "../internal.h"
     59 
     60 
     61 #undef mul
     62 #undef mul_add
     63 
     64 #define asm __asm__
     65 
     66 /*
     67  * "m"(a), "+m"(r)	is the way to favor DirectPath -code;
     68  * "g"(0)		let the compiler to decide where does it
     69  *			want to keep the value of zero;
     70  */
     71 #define mul_add(r, a, word, carry)                                     \
     72   do {                                                                 \
     73     register BN_ULONG high, low;                                       \
     74     asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
     75     asm("addq %2,%0; adcq %3,%1"                                       \
     76         : "+r"(carry), "+d"(high)                                      \
     77         : "a"(low), "g"(0)                                             \
     78         : "cc");                                                       \
     79     asm("addq %2,%0; adcq %3,%1"                                       \
     80         : "+m"(r), "+d"(high)                                          \
     81         : "r"(carry), "g"(0)                                           \
     82         : "cc");                                                       \
     83     (carry) = high;                                                    \
     84   } while (0)
     85 
     86 #define mul(r, a, word, carry)                                         \
     87   do {                                                                 \
     88     register BN_ULONG high, low;                                       \
     89     asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
     90     asm("addq %2,%0; adcq %3,%1"                                       \
     91         : "+r"(carry), "+d"(high)                                      \
     92         : "a"(low), "g"(0)                                             \
     93         : "cc");                                                       \
     94     (r) = (carry);                                                     \
     95     (carry) = high;                                                    \
     96   } while (0)
     97 #undef sqr
     98 #define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
     99 
    100 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    101                           BN_ULONG w) {
    102   BN_ULONG c1 = 0;
    103 
    104   if (num <= 0) {
    105     return (c1);
    106   }
    107 
    108   while (num & ~3) {
    109     mul_add(rp[0], ap[0], w, c1);
    110     mul_add(rp[1], ap[1], w, c1);
    111     mul_add(rp[2], ap[2], w, c1);
    112     mul_add(rp[3], ap[3], w, c1);
    113     ap += 4;
    114     rp += 4;
    115     num -= 4;
    116   }
    117   if (num) {
    118     mul_add(rp[0], ap[0], w, c1);
    119     if (--num == 0) {
    120       return c1;
    121     }
    122     mul_add(rp[1], ap[1], w, c1);
    123     if (--num == 0) {
    124       return c1;
    125     }
    126     mul_add(rp[2], ap[2], w, c1);
    127     return c1;
    128   }
    129 
    130   return c1;
    131 }
    132 
    133 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
    134   BN_ULONG c1 = 0;
    135 
    136   if (num <= 0) {
    137     return c1;
    138   }
    139 
    140   while (num & ~3) {
    141     mul(rp[0], ap[0], w, c1);
    142     mul(rp[1], ap[1], w, c1);
    143     mul(rp[2], ap[2], w, c1);
    144     mul(rp[3], ap[3], w, c1);
    145     ap += 4;
    146     rp += 4;
    147     num -= 4;
    148   }
    149   if (num) {
    150     mul(rp[0], ap[0], w, c1);
    151     if (--num == 0) {
    152       return c1;
    153     }
    154     mul(rp[1], ap[1], w, c1);
    155     if (--num == 0) {
    156       return c1;
    157     }
    158     mul(rp[2], ap[2], w, c1);
    159   }
    160   return c1;
    161 }
    162 
    163 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
    164   if (n <= 0) {
    165     return;
    166   }
    167 
    168   while (n & ~3) {
    169     sqr(r[0], r[1], a[0]);
    170     sqr(r[2], r[3], a[1]);
    171     sqr(r[4], r[5], a[2]);
    172     sqr(r[6], r[7], a[3]);
    173     a += 4;
    174     r += 8;
    175     n -= 4;
    176   }
    177   if (n) {
    178     sqr(r[0], r[1], a[0]);
    179     if (--n == 0) {
    180       return;
    181     }
    182     sqr(r[2], r[3], a[1]);
    183     if (--n == 0) {
    184       return;
    185     }
    186     sqr(r[4], r[5], a[2]);
    187   }
    188 }
    189 
    190 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    191                       int n) {
    192   BN_ULONG ret;
    193   size_t i = 0;
    194 
    195   if (n <= 0) {
    196     return 0;
    197   }
    198 
    199   asm volatile (
    200       "	subq	%0,%0		\n" /* clear carry */
    201       "	jmp	1f		\n"
    202       ".p2align 4			\n"
    203       "1:"
    204       "	movq	(%4,%2,8),%0	\n"
    205       "	adcq	(%5,%2,8),%0	\n"
    206       "	movq	%0,(%3,%2,8)	\n"
    207       "	lea	1(%2),%2	\n"
    208       "	loop	1b		\n"
    209       "	sbbq	%0,%0		\n"
    210       : "=&r"(ret), "+c"(n), "+r"(i)
    211       : "r"(rp), "r"(ap), "r"(bp)
    212       : "cc", "memory");
    213 
    214   return ret & 1;
    215 }
    216 
    217 BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    218                       int n) {
    219   BN_ULONG ret;
    220   size_t i = 0;
    221 
    222   if (n <= 0) {
    223     return 0;
    224   }
    225 
    226   asm volatile (
    227       "	subq	%0,%0		\n" /* clear borrow */
    228       "	jmp	1f		\n"
    229       ".p2align 4			\n"
    230       "1:"
    231       "	movq	(%4,%2,8),%0	\n"
    232       "	sbbq	(%5,%2,8),%0	\n"
    233       "	movq	%0,(%3,%2,8)	\n"
    234       "	lea	1(%2),%2	\n"
    235       "	loop	1b		\n"
    236       "	sbbq	%0,%0		\n"
    237       : "=&r"(ret), "+c"(n), "+r"(i)
    238       : "r"(rp), "r"(ap), "r"(bp)
    239       : "cc", "memory");
    240 
    241   return ret & 1;
    242 }
    243 
    244 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
    245 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
    246 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
    247 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
    248  */
    249 
    250 /* Keep in mind that carrying into high part of multiplication result can not
    251  * overflow, because it cannot be all-ones. */
    252 #define mul_add_c(a, b, c0, c1, c2)          \
    253   do {                                       \
    254     BN_ULONG t1, t2;                \
    255     asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    256     asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
    257         : "+r"(c0), "+r"(c1), "+r"(c2)       \
    258         : "r"(t1), "r"(t2), "g"(0)           \
    259         : "cc");                             \
    260   } while (0)
    261 
    262 #define sqr_add_c(a, i, c0, c1, c2)                           \
    263   do {                                                        \
    264     BN_ULONG t1, t2;                                          \
    265     asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
    266     asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                  \
    267         : "+r"(c0), "+r"(c1), "+r"(c2)                        \
    268         : "r"(t1), "r"(t2), "g"(0)                            \
    269         : "cc");                                              \
    270   } while (0)
    271 
    272 #define mul_add_c2(a, b, c0, c1, c2)         \
    273   do {                                       \
    274     BN_ULONG t1, t2;                                                    \
    275     asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");        \
    276     asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
    277         : "+r"(c0), "+r"(c1), "+r"(c2)       \
    278         : "r"(t1), "r"(t2), "g"(0)           \
    279         : "cc");                             \
    280     asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
    281         : "+r"(c0), "+r"(c1), "+r"(c2)       \
    282         : "r"(t1), "r"(t2), "g"(0)           \
    283         : "cc");                             \
    284   } while (0)
    285 
    286 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
    287 
    288 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
    289   BN_ULONG c1, c2, c3;
    290 
    291   c1 = 0;
    292   c2 = 0;
    293   c3 = 0;
    294   mul_add_c(a[0], b[0], c1, c2, c3);
    295   r[0] = c1;
    296   c1 = 0;
    297   mul_add_c(a[0], b[1], c2, c3, c1);
    298   mul_add_c(a[1], b[0], c2, c3, c1);
    299   r[1] = c2;
    300   c2 = 0;
    301   mul_add_c(a[2], b[0], c3, c1, c2);
    302   mul_add_c(a[1], b[1], c3, c1, c2);
    303   mul_add_c(a[0], b[2], c3, c1, c2);
    304   r[2] = c3;
    305   c3 = 0;
    306   mul_add_c(a[0], b[3], c1, c2, c3);
    307   mul_add_c(a[1], b[2], c1, c2, c3);
    308   mul_add_c(a[2], b[1], c1, c2, c3);
    309   mul_add_c(a[3], b[0], c1, c2, c3);
    310   r[3] = c1;
    311   c1 = 0;
    312   mul_add_c(a[4], b[0], c2, c3, c1);
    313   mul_add_c(a[3], b[1], c2, c3, c1);
    314   mul_add_c(a[2], b[2], c2, c3, c1);
    315   mul_add_c(a[1], b[3], c2, c3, c1);
    316   mul_add_c(a[0], b[4], c2, c3, c1);
    317   r[4] = c2;
    318   c2 = 0;
    319   mul_add_c(a[0], b[5], c3, c1, c2);
    320   mul_add_c(a[1], b[4], c3, c1, c2);
    321   mul_add_c(a[2], b[3], c3, c1, c2);
    322   mul_add_c(a[3], b[2], c3, c1, c2);
    323   mul_add_c(a[4], b[1], c3, c1, c2);
    324   mul_add_c(a[5], b[0], c3, c1, c2);
    325   r[5] = c3;
    326   c3 = 0;
    327   mul_add_c(a[6], b[0], c1, c2, c3);
    328   mul_add_c(a[5], b[1], c1, c2, c3);
    329   mul_add_c(a[4], b[2], c1, c2, c3);
    330   mul_add_c(a[3], b[3], c1, c2, c3);
    331   mul_add_c(a[2], b[4], c1, c2, c3);
    332   mul_add_c(a[1], b[5], c1, c2, c3);
    333   mul_add_c(a[0], b[6], c1, c2, c3);
    334   r[6] = c1;
    335   c1 = 0;
    336   mul_add_c(a[0], b[7], c2, c3, c1);
    337   mul_add_c(a[1], b[6], c2, c3, c1);
    338   mul_add_c(a[2], b[5], c2, c3, c1);
    339   mul_add_c(a[3], b[4], c2, c3, c1);
    340   mul_add_c(a[4], b[3], c2, c3, c1);
    341   mul_add_c(a[5], b[2], c2, c3, c1);
    342   mul_add_c(a[6], b[1], c2, c3, c1);
    343   mul_add_c(a[7], b[0], c2, c3, c1);
    344   r[7] = c2;
    345   c2 = 0;
    346   mul_add_c(a[7], b[1], c3, c1, c2);
    347   mul_add_c(a[6], b[2], c3, c1, c2);
    348   mul_add_c(a[5], b[3], c3, c1, c2);
    349   mul_add_c(a[4], b[4], c3, c1, c2);
    350   mul_add_c(a[3], b[5], c3, c1, c2);
    351   mul_add_c(a[2], b[6], c3, c1, c2);
    352   mul_add_c(a[1], b[7], c3, c1, c2);
    353   r[8] = c3;
    354   c3 = 0;
    355   mul_add_c(a[2], b[7], c1, c2, c3);
    356   mul_add_c(a[3], b[6], c1, c2, c3);
    357   mul_add_c(a[4], b[5], c1, c2, c3);
    358   mul_add_c(a[5], b[4], c1, c2, c3);
    359   mul_add_c(a[6], b[3], c1, c2, c3);
    360   mul_add_c(a[7], b[2], c1, c2, c3);
    361   r[9] = c1;
    362   c1 = 0;
    363   mul_add_c(a[7], b[3], c2, c3, c1);
    364   mul_add_c(a[6], b[4], c2, c3, c1);
    365   mul_add_c(a[5], b[5], c2, c3, c1);
    366   mul_add_c(a[4], b[6], c2, c3, c1);
    367   mul_add_c(a[3], b[7], c2, c3, c1);
    368   r[10] = c2;
    369   c2 = 0;
    370   mul_add_c(a[4], b[7], c3, c1, c2);
    371   mul_add_c(a[5], b[6], c3, c1, c2);
    372   mul_add_c(a[6], b[5], c3, c1, c2);
    373   mul_add_c(a[7], b[4], c3, c1, c2);
    374   r[11] = c3;
    375   c3 = 0;
    376   mul_add_c(a[7], b[5], c1, c2, c3);
    377   mul_add_c(a[6], b[6], c1, c2, c3);
    378   mul_add_c(a[5], b[7], c1, c2, c3);
    379   r[12] = c1;
    380   c1 = 0;
    381   mul_add_c(a[6], b[7], c2, c3, c1);
    382   mul_add_c(a[7], b[6], c2, c3, c1);
    383   r[13] = c2;
    384   c2 = 0;
    385   mul_add_c(a[7], b[7], c3, c1, c2);
    386   r[14] = c3;
    387   r[15] = c1;
    388 }
    389 
    390 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
    391   BN_ULONG c1, c2, c3;
    392 
    393   c1 = 0;
    394   c2 = 0;
    395   c3 = 0;
    396   mul_add_c(a[0], b[0], c1, c2, c3);
    397   r[0] = c1;
    398   c1 = 0;
    399   mul_add_c(a[0], b[1], c2, c3, c1);
    400   mul_add_c(a[1], b[0], c2, c3, c1);
    401   r[1] = c2;
    402   c2 = 0;
    403   mul_add_c(a[2], b[0], c3, c1, c2);
    404   mul_add_c(a[1], b[1], c3, c1, c2);
    405   mul_add_c(a[0], b[2], c3, c1, c2);
    406   r[2] = c3;
    407   c3 = 0;
    408   mul_add_c(a[0], b[3], c1, c2, c3);
    409   mul_add_c(a[1], b[2], c1, c2, c3);
    410   mul_add_c(a[2], b[1], c1, c2, c3);
    411   mul_add_c(a[3], b[0], c1, c2, c3);
    412   r[3] = c1;
    413   c1 = 0;
    414   mul_add_c(a[3], b[1], c2, c3, c1);
    415   mul_add_c(a[2], b[2], c2, c3, c1);
    416   mul_add_c(a[1], b[3], c2, c3, c1);
    417   r[4] = c2;
    418   c2 = 0;
    419   mul_add_c(a[2], b[3], c3, c1, c2);
    420   mul_add_c(a[3], b[2], c3, c1, c2);
    421   r[5] = c3;
    422   c3 = 0;
    423   mul_add_c(a[3], b[3], c1, c2, c3);
    424   r[6] = c1;
    425   r[7] = c2;
    426 }
    427 
    428 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
    429   BN_ULONG c1, c2, c3;
    430 
    431   c1 = 0;
    432   c2 = 0;
    433   c3 = 0;
    434   sqr_add_c(a, 0, c1, c2, c3);
    435   r[0] = c1;
    436   c1 = 0;
    437   sqr_add_c2(a, 1, 0, c2, c3, c1);
    438   r[1] = c2;
    439   c2 = 0;
    440   sqr_add_c(a, 1, c3, c1, c2);
    441   sqr_add_c2(a, 2, 0, c3, c1, c2);
    442   r[2] = c3;
    443   c3 = 0;
    444   sqr_add_c2(a, 3, 0, c1, c2, c3);
    445   sqr_add_c2(a, 2, 1, c1, c2, c3);
    446   r[3] = c1;
    447   c1 = 0;
    448   sqr_add_c(a, 2, c2, c3, c1);
    449   sqr_add_c2(a, 3, 1, c2, c3, c1);
    450   sqr_add_c2(a, 4, 0, c2, c3, c1);
    451   r[4] = c2;
    452   c2 = 0;
    453   sqr_add_c2(a, 5, 0, c3, c1, c2);
    454   sqr_add_c2(a, 4, 1, c3, c1, c2);
    455   sqr_add_c2(a, 3, 2, c3, c1, c2);
    456   r[5] = c3;
    457   c3 = 0;
    458   sqr_add_c(a, 3, c1, c2, c3);
    459   sqr_add_c2(a, 4, 2, c1, c2, c3);
    460   sqr_add_c2(a, 5, 1, c1, c2, c3);
    461   sqr_add_c2(a, 6, 0, c1, c2, c3);
    462   r[6] = c1;
    463   c1 = 0;
    464   sqr_add_c2(a, 7, 0, c2, c3, c1);
    465   sqr_add_c2(a, 6, 1, c2, c3, c1);
    466   sqr_add_c2(a, 5, 2, c2, c3, c1);
    467   sqr_add_c2(a, 4, 3, c2, c3, c1);
    468   r[7] = c2;
    469   c2 = 0;
    470   sqr_add_c(a, 4, c3, c1, c2);
    471   sqr_add_c2(a, 5, 3, c3, c1, c2);
    472   sqr_add_c2(a, 6, 2, c3, c1, c2);
    473   sqr_add_c2(a, 7, 1, c3, c1, c2);
    474   r[8] = c3;
    475   c3 = 0;
    476   sqr_add_c2(a, 7, 2, c1, c2, c3);
    477   sqr_add_c2(a, 6, 3, c1, c2, c3);
    478   sqr_add_c2(a, 5, 4, c1, c2, c3);
    479   r[9] = c1;
    480   c1 = 0;
    481   sqr_add_c(a, 5, c2, c3, c1);
    482   sqr_add_c2(a, 6, 4, c2, c3, c1);
    483   sqr_add_c2(a, 7, 3, c2, c3, c1);
    484   r[10] = c2;
    485   c2 = 0;
    486   sqr_add_c2(a, 7, 4, c3, c1, c2);
    487   sqr_add_c2(a, 6, 5, c3, c1, c2);
    488   r[11] = c3;
    489   c3 = 0;
    490   sqr_add_c(a, 6, c1, c2, c3);
    491   sqr_add_c2(a, 7, 5, c1, c2, c3);
    492   r[12] = c1;
    493   c1 = 0;
    494   sqr_add_c2(a, 7, 6, c2, c3, c1);
    495   r[13] = c2;
    496   c2 = 0;
    497   sqr_add_c(a, 7, c3, c1, c2);
    498   r[14] = c3;
    499   r[15] = c1;
    500 }
    501 
    502 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
    503   BN_ULONG c1, c2, c3;
    504 
    505   c1 = 0;
    506   c2 = 0;
    507   c3 = 0;
    508   sqr_add_c(a, 0, c1, c2, c3);
    509   r[0] = c1;
    510   c1 = 0;
    511   sqr_add_c2(a, 1, 0, c2, c3, c1);
    512   r[1] = c2;
    513   c2 = 0;
    514   sqr_add_c(a, 1, c3, c1, c2);
    515   sqr_add_c2(a, 2, 0, c3, c1, c2);
    516   r[2] = c3;
    517   c3 = 0;
    518   sqr_add_c2(a, 3, 0, c1, c2, c3);
    519   sqr_add_c2(a, 2, 1, c1, c2, c3);
    520   r[3] = c1;
    521   c1 = 0;
    522   sqr_add_c(a, 2, c2, c3, c1);
    523   sqr_add_c2(a, 3, 1, c2, c3, c1);
    524   r[4] = c2;
    525   c2 = 0;
    526   sqr_add_c2(a, 3, 2, c3, c1, c2);
    527   r[5] = c3;
    528   c3 = 0;
    529   sqr_add_c(a, 3, c1, c2, c3);
    530   r[6] = c1;
    531   r[7] = c2;
    532 }
    533 
    534 #undef mul_add
    535 #undef mul
    536 #undef sqr
    537 #undef mul_add_c
    538 #undef sqr_add_c
    539 #undef mul_add_c2
    540 #undef sqr_add_c2
    541 
    542 #endif  /* !NO_ASM && X86_64 && __GNUC__ */
    543