      1 /* crypto/bn/bn_asm.c */
      2 /* Copyright (C) 1995-1998 Eric Young (eay (at) cryptsoft.com)
      3  * All rights reserved.
      4  *
      5  * This package is an SSL implementation written
      6  * by Eric Young (eay (at) cryptsoft.com).
      7  * The implementation was written so as to conform with Netscapes SSL.
      8  *
      9  * This library is free for commercial and non-commercial use as long as
     10  * the following conditions are aheared to.  The following conditions
     11  * apply to all code found in this distribution, be it the RC4, RSA,
     12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
     13  * included with this distribution is covered by the same copyright terms
     14  * except that the holder is Tim Hudson (tjh (at) cryptsoft.com).
     15  *
     16  * Copyright remains Eric Young's, and as such any Copyright notices in
     17  * the code are not to be removed.
     18  * If this package is used in a product, Eric Young should be given attribution
     19  * as the author of the parts of the library used.
     20  * This can be in the form of a textual message at program startup or
     21  * in documentation (online or textual) provided with the package.
     22  *
     23  * Redistribution and use in source and binary forms, with or without
     24  * modification, are permitted provided that the following conditions
     25  * are met:
     26  * 1. Redistributions of source code must retain the copyright
     27  *    notice, this list of conditions and the following disclaimer.
     28  * 2. Redistributions in binary form must reproduce the above copyright
     29  *    notice, this list of conditions and the following disclaimer in the
     30  *    documentation and/or other materials provided with the distribution.
     31  * 3. All advertising materials mentioning features or use of this software
     32  *    must display the following acknowledgement:
     33  *    "This product includes cryptographic software written by
     34  *     Eric Young (eay (at) cryptsoft.com)"
     35  *    The word 'cryptographic' can be left out if the rouines from the library
     36  *    being used are not cryptographic related :-).
     37  * 4. If you include any Windows specific code (or a derivative thereof) from
     38  *    the apps directory (application code) you must include an acknowledgement:
     39  *    "This product includes software written by Tim Hudson (tjh (at) cryptsoft.com)"
     40  *
     41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
     42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     51  * SUCH DAMAGE.
     52  *
     53  * The licence and distribution terms for any publically available version or
     54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
     55  * copied and put under another distribution licence
     56  * [including the GNU Public Licence.]
     57  */
     58 
     59 #ifndef BN_DEBUG
     60 # undef NDEBUG /* avoid conflicting definitions */
     61 # define NDEBUG
     62 #endif
     63 
     64 #include <stdio.h>
     65 #include <assert.h>
     66 #include "cryptlib.h"
     67 #include "bn_lcl.h"
     68 
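         /*
          * Portable C fallbacks for the word-level BIGNUM primitives
          * (bn_mul_add_words, bn_mul_words, bn_sqr_words, bn_div_words,
          * bn_add_words, bn_sub_words, bn_mul_mont and the comba routines).
          * When platform-specific assembler is configured it normally
          * provides these entry points instead.
          */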
     69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
     70 
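         /*
          * rp[i] += ap[i]*w for 0 <= i < num; the carry out of the most
          * significant word is returned.
          */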
     71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
     72 	{
     73 	BN_ULONG c1=0;
     74 
     75 	assert(num >= 0);
     76 	if (num <= 0) return(c1);
     77 
     78 #ifndef OPENSSL_SMALL_FOOTPRINT
     79 	while (num&~3)
     80 		{
     81 		mul_add(rp[0],ap[0],w,c1);
     82 		mul_add(rp[1],ap[1],w,c1);
     83 		mul_add(rp[2],ap[2],w,c1);
     84 		mul_add(rp[3],ap[3],w,c1);
     85 		ap+=4; rp+=4; num-=4;
     86 		}
     87 #endif
     88 	while (num)
     89 		{
     90 		mul_add(rp[0],ap[0],w,c1);
     91 		ap++; rp++; num--;
     92 		}
     93 
     94 	return(c1);
     95 	}
     96 
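         /*
          * rp[0..num-1] = the low num words of ap[0..num-1]*w; the top
          * (carry) word of the product is returned.
          */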
     97 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
     98 	{
     99 	BN_ULONG c1=0;
    100 
    101 	assert(num >= 0);
    102 	if (num <= 0) return(c1);
    103 
    104 #ifndef OPENSSL_SMALL_FOOTPRINT
    105 	while (num&~3)
    106 		{
    107 		mul(rp[0],ap[0],w,c1);
    108 		mul(rp[1],ap[1],w,c1);
    109 		mul(rp[2],ap[2],w,c1);
    110 		mul(rp[3],ap[3],w,c1);
    111 		ap+=4; rp+=4; num-=4;
    112 		}
    113 #endif
    114 	while (num)
    115 		{
    116 		mul(rp[0],ap[0],w,c1);
    117 		ap++; rp++; num--;
    118 		}
    119 	return(c1);
    120 	}
    121 
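         /*
          * (r[2*i], r[2*i+1]) = a[i]^2 (low and high word) for 0 <= i < n.
          * Each input word is squared independently; this is not a full
          * big-number square.
          */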
    122 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
    123         {
    124 	assert(n >= 0);
    125 	if (n <= 0) return;
    126 
    127 #ifndef OPENSSL_SMALL_FOOTPRINT
    128 	while (n&~3)
    129 		{
    130 		sqr(r[0],r[1],a[0]);
    131 		sqr(r[2],r[3],a[1]);
    132 		sqr(r[4],r[5],a[2]);
    133 		sqr(r[6],r[7],a[3]);
    134 		a+=4; r+=8; n-=4;
    135 		}
    136 #endif
    137 	while (n)
    138 		{
    139 		sqr(r[0],r[1],a[0]);
    140 		a++; r+=2; n--;
    141 		}
    142 	}
    143 
    144 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
    145 
    146 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
    147 	{
    148 	BN_ULONG c=0;
    149 	BN_ULONG bl,bh;
    150 
    151 	assert(num >= 0);
    152 	if (num <= 0) return((BN_ULONG)0);
    153 
    154 	bl=LBITS(w);
    155 	bh=HBITS(w);
    156 
    157 #ifndef OPENSSL_SMALL_FOOTPRINT
    158 	while (num&~3)
    159 		{
    160 		mul_add(rp[0],ap[0],bl,bh,c);
    161 		mul_add(rp[1],ap[1],bl,bh,c);
    162 		mul_add(rp[2],ap[2],bl,bh,c);
    163 		mul_add(rp[3],ap[3],bl,bh,c);
    164 		ap+=4; rp+=4; num-=4;
    165 		}
    166 #endif
    167 	while (num)
    168 		{
    169 		mul_add(rp[0],ap[0],bl,bh,c);
    170 		ap++; rp++; num--;
    171 		}
    172 	return(c);
    173 	}
    174 
    175 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
    176 	{
    177 	BN_ULONG carry=0;
    178 	BN_ULONG bl,bh;
    179 
    180 	assert(num >= 0);
    181 	if (num <= 0) return((BN_ULONG)0);
    182 
    183 	bl=LBITS(w);
    184 	bh=HBITS(w);
    185 
    186 #ifndef OPENSSL_SMALL_FOOTPRINT
    187 	while (num&~3)
    188 		{
    189 		mul(rp[0],ap[0],bl,bh,carry);
    190 		mul(rp[1],ap[1],bl,bh,carry);
    191 		mul(rp[2],ap[2],bl,bh,carry);
    192 		mul(rp[3],ap[3],bl,bh,carry);
    193 		ap+=4; rp+=4; num-=4;
    194 		}
    195 #endif
    196 	while (num)
    197 		{
    198 		mul(rp[0],ap[0],bl,bh,carry);
    199 		ap++; rp++; num--;
    200 		}
    201 	return(carry);
    202 	}
    203 
    204 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
    205         {
    206 	assert(n >= 0);
    207 	if (n <= 0) return;
    208 
    209 #ifndef OPENSSL_SMALL_FOOTPRINT
    210 	while (n&~3)
    211 		{
    212 		sqr64(r[0],r[1],a[0]);
    213 		sqr64(r[2],r[3],a[1]);
    214 		sqr64(r[4],r[5],a[2]);
    215 		sqr64(r[6],r[7],a[3]);
    216 		a+=4; r+=8; n-=4;
    217 		}
    218 #endif
    219 	while (n)
    220 		{
    221 		sqr64(r[0],r[1],a[0]);
    222 		a++; r+=2; n--;
    223 		}
    224 	}
    225 
    226 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
    227 
    228 #if defined(BN_LLONG) && defined(BN_DIV2W)
    229 
    230 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
    231 	{
    232 	return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
    233 	}
    234 
    235 #else
    236 
    237 /* Divide h,l by d and return the result. */
    238 /* I need to test this some more :-( */
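         /*
          * Roughly: the divisor d is normalised so that its top bit is set
          * (h and l are shifted by the same amount), and the quotient is then
          * produced as two half-word digits.  Each digit is first estimated
          * from the high halves (q = h/dh) and then corrected downwards until
          * q*d no longer exceeds the current remainder, i.e. the classical
          * schoolbook division step in base 2^BN_BITS4.
          */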
    239 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
    240 	{
    241 	BN_ULONG dh,dl,q,ret=0,th,tl,t;
    242 	int i,count=2;
    243 
    244 	if (d == 0) return(BN_MASK2);
    245 
    246 	i=BN_num_bits_word(d);
    247 	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));
    248 
    249 	i=BN_BITS2-i;
    250 	if (h >= d) h-=d;
    251 
    252 	if (i)
    253 		{
    254 		d<<=i;
    255 		h=(h<<i)|(l>>(BN_BITS2-i));
    256 		l<<=i;
    257 		}
    258 	dh=(d&BN_MASK2h)>>BN_BITS4;
    259 	dl=(d&BN_MASK2l);
    260 	for (;;)
    261 		{
    262 		if ((h>>BN_BITS4) == dh)
    263 			q=BN_MASK2l;
    264 		else
    265 			q=h/dh;
    266 
    267 		th=q*dh;
    268 		tl=dl*q;
    269 		for (;;)
    270 			{
    271 			t=h-th;
    272 			if ((t&BN_MASK2h) ||
    273 				((tl) <= (
    274 					(t<<BN_BITS4)|
    275 					((l&BN_MASK2h)>>BN_BITS4))))
    276 				break;
    277 			q--;
    278 			th-=dh;
    279 			tl-=dl;
    280 			}
    281 		t=(tl>>BN_BITS4);
    282 		tl=(tl<<BN_BITS4)&BN_MASK2h;
    283 		th+=t;
    284 
    285 		if (l < tl) th++;
    286 		l-=tl;
    287 		if (h < th)
    288 			{
    289 			h+=d;
    290 			q--;
    291 			}
    292 		h-=th;
    293 
    294 		if (--count == 0) break;
    295 
    296 		ret=q<<BN_BITS4;
    297 		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
    298 		l=(l&BN_MASK2l)<<BN_BITS4;
    299 		}
    300 	ret|=q;
    301 	return(ret);
    302 	}
     303 #endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
    304 
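         /*
          * r[] = a[] + b[] over n words; the carry (0 or 1) out of the most
          * significant word is returned.
          */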
    305 #ifdef BN_LLONG
    306 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
    307         {
    308 	BN_ULLONG ll=0;
    309 
    310 	assert(n >= 0);
    311 	if (n <= 0) return((BN_ULONG)0);
    312 
    313 #ifndef OPENSSL_SMALL_FOOTPRINT
    314 	while (n&~3)
    315 		{
    316 		ll+=(BN_ULLONG)a[0]+b[0];
    317 		r[0]=(BN_ULONG)ll&BN_MASK2;
    318 		ll>>=BN_BITS2;
    319 		ll+=(BN_ULLONG)a[1]+b[1];
    320 		r[1]=(BN_ULONG)ll&BN_MASK2;
    321 		ll>>=BN_BITS2;
    322 		ll+=(BN_ULLONG)a[2]+b[2];
    323 		r[2]=(BN_ULONG)ll&BN_MASK2;
    324 		ll>>=BN_BITS2;
    325 		ll+=(BN_ULLONG)a[3]+b[3];
    326 		r[3]=(BN_ULONG)ll&BN_MASK2;
    327 		ll>>=BN_BITS2;
    328 		a+=4; b+=4; r+=4; n-=4;
    329 		}
    330 #endif
    331 	while (n)
    332 		{
    333 		ll+=(BN_ULLONG)a[0]+b[0];
    334 		r[0]=(BN_ULONG)ll&BN_MASK2;
    335 		ll>>=BN_BITS2;
    336 		a++; b++; r++; n--;
    337 		}
    338 	return((BN_ULONG)ll);
    339 	}
    340 #else /* !BN_LLONG */
    341 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
    342         {
    343 	BN_ULONG c,l,t;
    344 
    345 	assert(n >= 0);
    346 	if (n <= 0) return((BN_ULONG)0);
    347 
    348 	c=0;
    349 #ifndef OPENSSL_SMALL_FOOTPRINT
    350 	while (n&~3)
    351 		{
    352 		t=a[0];
    353 		t=(t+c)&BN_MASK2;
    354 		c=(t < c);
    355 		l=(t+b[0])&BN_MASK2;
    356 		c+=(l < t);
    357 		r[0]=l;
    358 		t=a[1];
    359 		t=(t+c)&BN_MASK2;
    360 		c=(t < c);
    361 		l=(t+b[1])&BN_MASK2;
    362 		c+=(l < t);
    363 		r[1]=l;
    364 		t=a[2];
    365 		t=(t+c)&BN_MASK2;
    366 		c=(t < c);
    367 		l=(t+b[2])&BN_MASK2;
    368 		c+=(l < t);
    369 		r[2]=l;
    370 		t=a[3];
    371 		t=(t+c)&BN_MASK2;
    372 		c=(t < c);
    373 		l=(t+b[3])&BN_MASK2;
    374 		c+=(l < t);
    375 		r[3]=l;
    376 		a+=4; b+=4; r+=4; n-=4;
    377 		}
    378 #endif
    379 	while(n)
    380 		{
    381 		t=a[0];
    382 		t=(t+c)&BN_MASK2;
    383 		c=(t < c);
    384 		l=(t+b[0])&BN_MASK2;
    385 		c+=(l < t);
    386 		r[0]=l;
    387 		a++; b++; r++; n--;
    388 		}
    389 	return((BN_ULONG)c);
    390 	}
    391 #endif /* !BN_LLONG */
    392 
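         /*
          * r[] = a[] - b[] over n words; the borrow (0 or 1) out of the most
          * significant word is returned.
          */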
    393 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
    394         {
    395 	BN_ULONG t1,t2;
    396 	int c=0;
    397 
    398 	assert(n >= 0);
    399 	if (n <= 0) return((BN_ULONG)0);
    400 
    401 #ifndef OPENSSL_SMALL_FOOTPRINT
    402 	while (n&~3)
    403 		{
    404 		t1=a[0]; t2=b[0];
    405 		r[0]=(t1-t2-c)&BN_MASK2;
    406 		if (t1 != t2) c=(t1 < t2);
    407 		t1=a[1]; t2=b[1];
    408 		r[1]=(t1-t2-c)&BN_MASK2;
    409 		if (t1 != t2) c=(t1 < t2);
    410 		t1=a[2]; t2=b[2];
    411 		r[2]=(t1-t2-c)&BN_MASK2;
    412 		if (t1 != t2) c=(t1 < t2);
    413 		t1=a[3]; t2=b[3];
    414 		r[3]=(t1-t2-c)&BN_MASK2;
    415 		if (t1 != t2) c=(t1 < t2);
    416 		a+=4; b+=4; r+=4; n-=4;
    417 		}
    418 #endif
    419 	while (n)
    420 		{
    421 		t1=a[0]; t2=b[0];
    422 		r[0]=(t1-t2-c)&BN_MASK2;
    423 		if (t1 != t2) c=(t1 < t2);
    424 		a++; b++; r++; n--;
    425 		}
    426 	return(c);
    427 	}
    428 
    429 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
    430 
    431 #undef bn_mul_comba8
    432 #undef bn_mul_comba4
    433 #undef bn_sqr_comba8
    434 #undef bn_sqr_comba4
    435 
    436 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
    437 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
    438 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
    439 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
    440 
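         /*
          * In the comba routines below each output column is accumulated in a
          * three-word register, (c0,c1,c2) in the macros above: products are
          * added into c0 and carries ripple into c1 and c2.  In bn_mul_comba8,
          * for example, the column-1 products a[0]*b[1] and a[1]*b[0] are
          * added into (c2,c3,c1); c2 is then stored to r[1] and cleared, while
          * c3 and c1 carry the overflow forward to the next two columns.
          */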
    441 #ifdef BN_LLONG
    442 #define mul_add_c(a,b,c0,c1,c2) \
    443 	t=(BN_ULLONG)a*b; \
    444 	t1=(BN_ULONG)Lw(t); \
    445 	t2=(BN_ULONG)Hw(t); \
    446 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
    447 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
    448 
    449 #define mul_add_c2(a,b,c0,c1,c2) \
    450 	t=(BN_ULLONG)a*b; \
    451 	tt=(t+t)&BN_MASK; \
    452 	if (tt < t) c2++; \
    453 	t1=(BN_ULONG)Lw(tt); \
    454 	t2=(BN_ULONG)Hw(tt); \
    455 	c0=(c0+t1)&BN_MASK2;  \
    456 	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
    457 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
    458 
    459 #define sqr_add_c(a,i,c0,c1,c2) \
    460 	t=(BN_ULLONG)a[i]*a[i]; \
    461 	t1=(BN_ULONG)Lw(t); \
    462 	t2=(BN_ULONG)Hw(t); \
    463 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
    464 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
    465 
    466 #define sqr_add_c2(a,i,j,c0,c1,c2) \
    467 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
    468 
    469 #elif defined(BN_UMULT_LOHI)
    470 
    471 #define mul_add_c(a,b,c0,c1,c2)	{	\
    472 	BN_ULONG ta=(a),tb=(b);		\
    473 	BN_UMULT_LOHI(t1,t2,ta,tb);	\
    474 	c0 += t1; t2 += (c0<t1)?1:0;	\
    475 	c1 += t2; c2 += (c1<t2)?1:0;	\
    476 	}
    477 
    478 #define mul_add_c2(a,b,c0,c1,c2) {	\
    479 	BN_ULONG ta=(a),tb=(b),t0;	\
    480 	BN_UMULT_LOHI(t0,t1,ta,tb);	\
    481 	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
    482 	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
    483 	c0 += t1; t2 += (c0<t1)?1:0;	\
    484 	c1 += t2; c2 += (c1<t2)?1:0;	\
    485 	}
    486 
    487 #define sqr_add_c(a,i,c0,c1,c2)	{	\
    488 	BN_ULONG ta=(a)[i];		\
    489 	BN_UMULT_LOHI(t1,t2,ta,ta);	\
    490 	c0 += t1; t2 += (c0<t1)?1:0;	\
    491 	c1 += t2; c2 += (c1<t2)?1:0;	\
    492 	}
    493 
    494 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
    495 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
    496 
    497 #elif defined(BN_UMULT_HIGH)
    498 
    499 #define mul_add_c(a,b,c0,c1,c2)	{	\
    500 	BN_ULONG ta=(a),tb=(b);		\
    501 	t1 = ta * tb;			\
    502 	t2 = BN_UMULT_HIGH(ta,tb);	\
    503 	c0 += t1; t2 += (c0<t1)?1:0;	\
    504 	c1 += t2; c2 += (c1<t2)?1:0;	\
    505 	}
    506 
    507 #define mul_add_c2(a,b,c0,c1,c2) {	\
    508 	BN_ULONG ta=(a),tb=(b),t0;	\
    509 	t1 = BN_UMULT_HIGH(ta,tb);	\
    510 	t0 = ta * tb;			\
    511 	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
    512 	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
    513 	c0 += t1; t2 += (c0<t1)?1:0;	\
    514 	c1 += t2; c2 += (c1<t2)?1:0;	\
    515 	}
    516 
    517 #define sqr_add_c(a,i,c0,c1,c2)	{	\
    518 	BN_ULONG ta=(a)[i];		\
    519 	t1 = ta * ta;			\
    520 	t2 = BN_UMULT_HIGH(ta,ta);	\
    521 	c0 += t1; t2 += (c0<t1)?1:0;	\
    522 	c1 += t2; c2 += (c1<t2)?1:0;	\
    523 	}
    524 
    525 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
    526 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
    527 
    528 #else /* !BN_LLONG */
    529 #define mul_add_c(a,b,c0,c1,c2) \
    530 	t1=LBITS(a); t2=HBITS(a); \
    531 	bl=LBITS(b); bh=HBITS(b); \
    532 	mul64(t1,t2,bl,bh); \
    533 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
    534 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
    535 
    536 #define mul_add_c2(a,b,c0,c1,c2) \
    537 	t1=LBITS(a); t2=HBITS(a); \
    538 	bl=LBITS(b); bh=HBITS(b); \
    539 	mul64(t1,t2,bl,bh); \
    540 	if (t2 & BN_TBIT) c2++; \
    541 	t2=(t2+t2)&BN_MASK2; \
    542 	if (t1 & BN_TBIT) t2++; \
    543 	t1=(t1+t1)&BN_MASK2; \
    544 	c0=(c0+t1)&BN_MASK2;  \
    545 	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
    546 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
    547 
    548 #define sqr_add_c(a,i,c0,c1,c2) \
    549 	sqr64(t1,t2,(a)[i]); \
    550 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
    551 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
    552 
    553 #define sqr_add_c2(a,i,j,c0,c1,c2) \
    554 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
    555 #endif /* !BN_LLONG */
    556 
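         /*
          * bn_mul_comba8: r[0..15] = a[0..7] * b[0..7], computed column by
          * column (least significant output word first) rather than row by
          * row, so each r[k] is written exactly once.
          */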
    557 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    558 	{
    559 #ifdef BN_LLONG
    560 	BN_ULLONG t;
    561 #else
    562 	BN_ULONG bl,bh;
    563 #endif
    564 	BN_ULONG t1,t2;
    565 	BN_ULONG c1,c2,c3;
    566 
    567 	c1=0;
    568 	c2=0;
    569 	c3=0;
    570 	mul_add_c(a[0],b[0],c1,c2,c3);
    571 	r[0]=c1;
    572 	c1=0;
    573 	mul_add_c(a[0],b[1],c2,c3,c1);
    574 	mul_add_c(a[1],b[0],c2,c3,c1);
    575 	r[1]=c2;
    576 	c2=0;
    577 	mul_add_c(a[2],b[0],c3,c1,c2);
    578 	mul_add_c(a[1],b[1],c3,c1,c2);
    579 	mul_add_c(a[0],b[2],c3,c1,c2);
    580 	r[2]=c3;
    581 	c3=0;
    582 	mul_add_c(a[0],b[3],c1,c2,c3);
    583 	mul_add_c(a[1],b[2],c1,c2,c3);
    584 	mul_add_c(a[2],b[1],c1,c2,c3);
    585 	mul_add_c(a[3],b[0],c1,c2,c3);
    586 	r[3]=c1;
    587 	c1=0;
    588 	mul_add_c(a[4],b[0],c2,c3,c1);
    589 	mul_add_c(a[3],b[1],c2,c3,c1);
    590 	mul_add_c(a[2],b[2],c2,c3,c1);
    591 	mul_add_c(a[1],b[3],c2,c3,c1);
    592 	mul_add_c(a[0],b[4],c2,c3,c1);
    593 	r[4]=c2;
    594 	c2=0;
    595 	mul_add_c(a[0],b[5],c3,c1,c2);
    596 	mul_add_c(a[1],b[4],c3,c1,c2);
    597 	mul_add_c(a[2],b[3],c3,c1,c2);
    598 	mul_add_c(a[3],b[2],c3,c1,c2);
    599 	mul_add_c(a[4],b[1],c3,c1,c2);
    600 	mul_add_c(a[5],b[0],c3,c1,c2);
    601 	r[5]=c3;
    602 	c3=0;
    603 	mul_add_c(a[6],b[0],c1,c2,c3);
    604 	mul_add_c(a[5],b[1],c1,c2,c3);
    605 	mul_add_c(a[4],b[2],c1,c2,c3);
    606 	mul_add_c(a[3],b[3],c1,c2,c3);
    607 	mul_add_c(a[2],b[4],c1,c2,c3);
    608 	mul_add_c(a[1],b[5],c1,c2,c3);
    609 	mul_add_c(a[0],b[6],c1,c2,c3);
    610 	r[6]=c1;
    611 	c1=0;
    612 	mul_add_c(a[0],b[7],c2,c3,c1);
    613 	mul_add_c(a[1],b[6],c2,c3,c1);
    614 	mul_add_c(a[2],b[5],c2,c3,c1);
    615 	mul_add_c(a[3],b[4],c2,c3,c1);
    616 	mul_add_c(a[4],b[3],c2,c3,c1);
    617 	mul_add_c(a[5],b[2],c2,c3,c1);
    618 	mul_add_c(a[6],b[1],c2,c3,c1);
    619 	mul_add_c(a[7],b[0],c2,c3,c1);
    620 	r[7]=c2;
    621 	c2=0;
    622 	mul_add_c(a[7],b[1],c3,c1,c2);
    623 	mul_add_c(a[6],b[2],c3,c1,c2);
    624 	mul_add_c(a[5],b[3],c3,c1,c2);
    625 	mul_add_c(a[4],b[4],c3,c1,c2);
    626 	mul_add_c(a[3],b[5],c3,c1,c2);
    627 	mul_add_c(a[2],b[6],c3,c1,c2);
    628 	mul_add_c(a[1],b[7],c3,c1,c2);
    629 	r[8]=c3;
    630 	c3=0;
    631 	mul_add_c(a[2],b[7],c1,c2,c3);
    632 	mul_add_c(a[3],b[6],c1,c2,c3);
    633 	mul_add_c(a[4],b[5],c1,c2,c3);
    634 	mul_add_c(a[5],b[4],c1,c2,c3);
    635 	mul_add_c(a[6],b[3],c1,c2,c3);
    636 	mul_add_c(a[7],b[2],c1,c2,c3);
    637 	r[9]=c1;
    638 	c1=0;
    639 	mul_add_c(a[7],b[3],c2,c3,c1);
    640 	mul_add_c(a[6],b[4],c2,c3,c1);
    641 	mul_add_c(a[5],b[5],c2,c3,c1);
    642 	mul_add_c(a[4],b[6],c2,c3,c1);
    643 	mul_add_c(a[3],b[7],c2,c3,c1);
    644 	r[10]=c2;
    645 	c2=0;
    646 	mul_add_c(a[4],b[7],c3,c1,c2);
    647 	mul_add_c(a[5],b[6],c3,c1,c2);
    648 	mul_add_c(a[6],b[5],c3,c1,c2);
    649 	mul_add_c(a[7],b[4],c3,c1,c2);
    650 	r[11]=c3;
    651 	c3=0;
    652 	mul_add_c(a[7],b[5],c1,c2,c3);
    653 	mul_add_c(a[6],b[6],c1,c2,c3);
    654 	mul_add_c(a[5],b[7],c1,c2,c3);
    655 	r[12]=c1;
    656 	c1=0;
    657 	mul_add_c(a[6],b[7],c2,c3,c1);
    658 	mul_add_c(a[7],b[6],c2,c3,c1);
    659 	r[13]=c2;
    660 	c2=0;
    661 	mul_add_c(a[7],b[7],c3,c1,c2);
    662 	r[14]=c3;
    663 	r[15]=c1;
    664 	}
    665 
    666 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    667 	{
    668 #ifdef BN_LLONG
    669 	BN_ULLONG t;
    670 #else
    671 	BN_ULONG bl,bh;
    672 #endif
    673 	BN_ULONG t1,t2;
    674 	BN_ULONG c1,c2,c3;
    675 
    676 	c1=0;
    677 	c2=0;
    678 	c3=0;
    679 	mul_add_c(a[0],b[0],c1,c2,c3);
    680 	r[0]=c1;
    681 	c1=0;
    682 	mul_add_c(a[0],b[1],c2,c3,c1);
    683 	mul_add_c(a[1],b[0],c2,c3,c1);
    684 	r[1]=c2;
    685 	c2=0;
    686 	mul_add_c(a[2],b[0],c3,c1,c2);
    687 	mul_add_c(a[1],b[1],c3,c1,c2);
    688 	mul_add_c(a[0],b[2],c3,c1,c2);
    689 	r[2]=c3;
    690 	c3=0;
    691 	mul_add_c(a[0],b[3],c1,c2,c3);
    692 	mul_add_c(a[1],b[2],c1,c2,c3);
    693 	mul_add_c(a[2],b[1],c1,c2,c3);
    694 	mul_add_c(a[3],b[0],c1,c2,c3);
    695 	r[3]=c1;
    696 	c1=0;
    697 	mul_add_c(a[3],b[1],c2,c3,c1);
    698 	mul_add_c(a[2],b[2],c2,c3,c1);
    699 	mul_add_c(a[1],b[3],c2,c3,c1);
    700 	r[4]=c2;
    701 	c2=0;
    702 	mul_add_c(a[2],b[3],c3,c1,c2);
    703 	mul_add_c(a[3],b[2],c3,c1,c2);
    704 	r[5]=c3;
    705 	c3=0;
    706 	mul_add_c(a[3],b[3],c1,c2,c3);
    707 	r[6]=c1;
    708 	r[7]=c2;
    709 	}
    710 
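         /*
          * bn_sqr_comba8: r[0..15] = a[0..7]^2.  Off-diagonal products
          * a[i]*a[j] (i != j) occur twice in the square and are added with
          * sqr_add_c2; the diagonal terms a[i]^2 are added once with
          * sqr_add_c.
          */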
    711 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
    712 	{
    713 #ifdef BN_LLONG
    714 	BN_ULLONG t,tt;
    715 #else
    716 	BN_ULONG bl,bh;
    717 #endif
    718 	BN_ULONG t1,t2;
    719 	BN_ULONG c1,c2,c3;
    720 
    721 	c1=0;
    722 	c2=0;
    723 	c3=0;
    724 	sqr_add_c(a,0,c1,c2,c3);
    725 	r[0]=c1;
    726 	c1=0;
    727 	sqr_add_c2(a,1,0,c2,c3,c1);
    728 	r[1]=c2;
    729 	c2=0;
    730 	sqr_add_c(a,1,c3,c1,c2);
    731 	sqr_add_c2(a,2,0,c3,c1,c2);
    732 	r[2]=c3;
    733 	c3=0;
    734 	sqr_add_c2(a,3,0,c1,c2,c3);
    735 	sqr_add_c2(a,2,1,c1,c2,c3);
    736 	r[3]=c1;
    737 	c1=0;
    738 	sqr_add_c(a,2,c2,c3,c1);
    739 	sqr_add_c2(a,3,1,c2,c3,c1);
    740 	sqr_add_c2(a,4,0,c2,c3,c1);
    741 	r[4]=c2;
    742 	c2=0;
    743 	sqr_add_c2(a,5,0,c3,c1,c2);
    744 	sqr_add_c2(a,4,1,c3,c1,c2);
    745 	sqr_add_c2(a,3,2,c3,c1,c2);
    746 	r[5]=c3;
    747 	c3=0;
    748 	sqr_add_c(a,3,c1,c2,c3);
    749 	sqr_add_c2(a,4,2,c1,c2,c3);
    750 	sqr_add_c2(a,5,1,c1,c2,c3);
    751 	sqr_add_c2(a,6,0,c1,c2,c3);
    752 	r[6]=c1;
    753 	c1=0;
    754 	sqr_add_c2(a,7,0,c2,c3,c1);
    755 	sqr_add_c2(a,6,1,c2,c3,c1);
    756 	sqr_add_c2(a,5,2,c2,c3,c1);
    757 	sqr_add_c2(a,4,3,c2,c3,c1);
    758 	r[7]=c2;
    759 	c2=0;
    760 	sqr_add_c(a,4,c3,c1,c2);
    761 	sqr_add_c2(a,5,3,c3,c1,c2);
    762 	sqr_add_c2(a,6,2,c3,c1,c2);
    763 	sqr_add_c2(a,7,1,c3,c1,c2);
    764 	r[8]=c3;
    765 	c3=0;
    766 	sqr_add_c2(a,7,2,c1,c2,c3);
    767 	sqr_add_c2(a,6,3,c1,c2,c3);
    768 	sqr_add_c2(a,5,4,c1,c2,c3);
    769 	r[9]=c1;
    770 	c1=0;
    771 	sqr_add_c(a,5,c2,c3,c1);
    772 	sqr_add_c2(a,6,4,c2,c3,c1);
    773 	sqr_add_c2(a,7,3,c2,c3,c1);
    774 	r[10]=c2;
    775 	c2=0;
    776 	sqr_add_c2(a,7,4,c3,c1,c2);
    777 	sqr_add_c2(a,6,5,c3,c1,c2);
    778 	r[11]=c3;
    779 	c3=0;
    780 	sqr_add_c(a,6,c1,c2,c3);
    781 	sqr_add_c2(a,7,5,c1,c2,c3);
    782 	r[12]=c1;
    783 	c1=0;
    784 	sqr_add_c2(a,7,6,c2,c3,c1);
    785 	r[13]=c2;
    786 	c2=0;
    787 	sqr_add_c(a,7,c3,c1,c2);
    788 	r[14]=c3;
    789 	r[15]=c1;
    790 	}
    791 
    792 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
    793 	{
    794 #ifdef BN_LLONG
    795 	BN_ULLONG t,tt;
    796 #else
    797 	BN_ULONG bl,bh;
    798 #endif
    799 	BN_ULONG t1,t2;
    800 	BN_ULONG c1,c2,c3;
    801 
    802 	c1=0;
    803 	c2=0;
    804 	c3=0;
    805 	sqr_add_c(a,0,c1,c2,c3);
    806 	r[0]=c1;
    807 	c1=0;
    808 	sqr_add_c2(a,1,0,c2,c3,c1);
    809 	r[1]=c2;
    810 	c2=0;
    811 	sqr_add_c(a,1,c3,c1,c2);
    812 	sqr_add_c2(a,2,0,c3,c1,c2);
    813 	r[2]=c3;
    814 	c3=0;
    815 	sqr_add_c2(a,3,0,c1,c2,c3);
    816 	sqr_add_c2(a,2,1,c1,c2,c3);
    817 	r[3]=c1;
    818 	c1=0;
    819 	sqr_add_c(a,2,c2,c3,c1);
    820 	sqr_add_c2(a,3,1,c2,c3,c1);
    821 	r[4]=c2;
    822 	c2=0;
    823 	sqr_add_c2(a,3,2,c3,c1,c2);
    824 	r[5]=c3;
    825 	c3=0;
    826 	sqr_add_c(a,3,c1,c2,c3);
    827 	r[6]=c1;
    828 	r[7]=c2;
    829 	}
    830 
    831 #ifdef OPENSSL_NO_ASM
    832 #ifdef OPENSSL_BN_ASM_MONT
    833 #include <alloca.h>
    834 /*
     835  * This is essentially a reference implementation, which may or may
     836  * not result in a performance improvement. E.g. on IA-32 this routine
     837  * was observed to give 40% faster rsa1024 private key operations and
     838  * 10% faster rsa4096 ones, while on AMD64 it improves rsa1024 sign
     839  * only by 10% and *worsens* rsa4096 sign by 15%. Once again, it is a
     840  * reference implementation, one to be used as a starting point for
     841  * platform-specific assembler. The numbers above apply to compiler-
     842  * generated code built with and without -DOPENSSL_BN_ASM_MONT and can
     843  * vary not only from platform to platform, but even between compiler
     844  * versions. Assembler vs. assembler improvement coefficients can
     845  * [and are known to] differ and are to be documented elsewhere.
    846  */
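         /*
          * In effect, one word of bp is folded in per outer iteration and is
          * interleaved with one step of Montgomery reduction.  For each i:
          *
          *     tp += ap * bp[i];
          *     m   = (tp[0] * n0) & BN_MASK2;
          *     tp  = (tp + np * m) >> BN_BITS2;
          *
          * followed by a final conditional subtraction of np.
          */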
    847 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
    848 	{
    849 	BN_ULONG c0,c1,ml,*tp,n0;
    850 #ifdef mul64
    851 	BN_ULONG mh;
    852 #endif
    853 	volatile BN_ULONG *vp;
    854 	int i=0,j;
    855 
    856 #if 0	/* template for platform-specific implementation */
    857 	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
    858 #endif
    859 	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
    860 
    861 	n0 = *n0p;
    862 
    863 	c0 = 0;
    864 	ml = bp[0];
    865 #ifdef mul64
    866 	mh = HBITS(ml);
    867 	ml = LBITS(ml);
    868 	for (j=0;j<num;++j)
    869 		mul(tp[j],ap[j],ml,mh,c0);
    870 #else
    871 	for (j=0;j<num;++j)
    872 		mul(tp[j],ap[j],ml,c0);
    873 #endif
    874 
    875 	tp[num]   = c0;
    876 	tp[num+1] = 0;
    877 	goto enter;
    878 
    879 	for(i=0;i<num;i++)
    880 		{
    881 		c0 = 0;
    882 		ml = bp[i];
    883 #ifdef mul64
    884 		mh = HBITS(ml);
    885 		ml = LBITS(ml);
    886 		for (j=0;j<num;++j)
    887 			mul_add(tp[j],ap[j],ml,mh,c0);
    888 #else
    889 		for (j=0;j<num;++j)
    890 			mul_add(tp[j],ap[j],ml,c0);
    891 #endif
    892 		c1 = (tp[num] + c0)&BN_MASK2;
    893 		tp[num]   = c1;
    894 		tp[num+1] = (c1<c0?1:0);
    895 	enter:
    896 		c1  = tp[0];
    897 		ml = (c1*n0)&BN_MASK2;
    898 		c0 = 0;
    899 #ifdef mul64
    900 		mh = HBITS(ml);
    901 		ml = LBITS(ml);
    902 		mul_add(c1,np[0],ml,mh,c0);
    903 #else
    904 		mul_add(c1,ml,np[0],c0);
    905 #endif
    906 		for(j=1;j<num;j++)
    907 			{
    908 			c1 = tp[j];
    909 #ifdef mul64
    910 			mul_add(c1,np[j],ml,mh,c0);
    911 #else
    912 			mul_add(c1,ml,np[j],c0);
    913 #endif
    914 			tp[j-1] = c1&BN_MASK2;
    915 			}
    916 		c1        = (tp[num] + c0)&BN_MASK2;
    917 		tp[num-1] = c1;
    918 		tp[num]   = tp[num+1] + (c1<c0?1:0);
    919 		}
    920 
    921 	if (tp[num]!=0 || tp[num-1]>=np[num-1])
    922 		{
    923 		c0 = bn_sub_words(rp,tp,np,num);
    924 		if (tp[num]!=0 || c0==0)
    925 			{
    926 			for(i=0;i<num+2;i++)	vp[i] = 0;
    927 			return 1;
    928 			}
    929 		}
    930 	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
    931 	vp[num]   = 0;
    932 	vp[num+1] = 0;
    933 	return 1;
    934 	}
    935 #else
    936 /*
     937  * A return value of 0 indicates that the multiplication/convolution
     938  * was not performed and that the caller should fall back to the
     939  * alternative/original code path.
    940  */
    941 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
    942 {	return 0;	}
    943 #endif /* OPENSSL_BN_ASM_MONT */
     944 #endif /* OPENSSL_NO_ASM */
    945 
    946 #else /* !BN_MUL_COMBA */
    947 
    948 /* hmm... is it faster just to do a multiply? */
    949 #undef bn_sqr_comba4
    950 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
    951 	{
    952 	BN_ULONG t[8];
    953 	bn_sqr_normal(r,a,4,t);
    954 	}
    955 
    956 #undef bn_sqr_comba8
    957 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
    958 	{
    959 	BN_ULONG t[16];
    960 	bn_sqr_normal(r,a,8,t);
    961 	}
    962 
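         /*
          * Without BN_MUL_COMBA the multiplication entry points reduce to the
          * schoolbook product over the generic word routines: bn_mul_words for
          * the first row and bn_mul_add_words for each further row.
          */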
    963 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    964 	{
    965 	r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
    966 	r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
    967 	r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
    968 	r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
    969 	}
    970 
    971 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    972 	{
    973 	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
    974 	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
    975 	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
    976 	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
    977 	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
    978 	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
    979 	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
    980 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
    981 	}
    982 
    983 #ifdef OPENSSL_NO_ASM
    984 #ifdef OPENSSL_BN_ASM_MONT
    985 #include <alloca.h>
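         /*
          * Same Montgomery multiplication as the reference version above, but
          * expressed with the word routines (bn_mul_add_words, bn_sub_words)
          * rather than per-word macros.
          */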
    986 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
    987 	{
    988 	BN_ULONG c0,c1,*tp,n0=*n0p;
    989 	volatile BN_ULONG *vp;
    990 	int i=0,j;
    991 
    992 	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
    993 
    994 	for(i=0;i<=num;i++)	tp[i]=0;
    995 
    996 	for(i=0;i<num;i++)
    997 		{
    998 		c0         = bn_mul_add_words(tp,ap,num,bp[i]);
    999 		c1         = (tp[num] + c0)&BN_MASK2;
   1000 		tp[num]    = c1;
   1001 		tp[num+1]  = (c1<c0?1:0);
   1002 
   1003 		c0         = bn_mul_add_words(tp,np,num,tp[0]*n0);
   1004 		c1         = (tp[num] + c0)&BN_MASK2;
   1005 		tp[num]    = c1;
   1006 		tp[num+1] += (c1<c0?1:0);
   1007 		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
   1008 		}
   1009 
   1010 	if (tp[num]!=0 || tp[num-1]>=np[num-1])
   1011 		{
   1012 		c0 = bn_sub_words(rp,tp,np,num);
   1013 		if (tp[num]!=0 || c0==0)
   1014 			{
   1015 			for(i=0;i<num+2;i++)	vp[i] = 0;
   1016 			return 1;
   1017 			}
   1018 		}
   1019 	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
   1020 	vp[num]   = 0;
   1021 	vp[num+1] = 0;
   1022 	return 1;
   1023 	}
   1024 #else
   1025 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
   1026 {	return 0;	}
   1027 #endif /* OPENSSL_BN_ASM_MONT */
    1028 #endif /* OPENSSL_NO_ASM */
   1029 
   1030 #endif /* !BN_MUL_COMBA */
   1031