      1 .ident	"sparcv8plus.s, Version 1.4"
      2 .ident	"SPARC v9 ISA artwork by Andy Polyakov <appro (at) fy.chalmers.se>"
      3 
      4 /*
      5  * ====================================================================
      6  * Written by Andy Polyakov <appro (at) fy.chalmers.se> for the OpenSSL
      7  * project.
      8  *
      9  * Rights for redistribution and usage in source and binary forms are
     10  * granted according to the OpenSSL license. Warranty of any kind is
     11  * disclaimed.
     12  * ====================================================================
     13  */
     14 
     15 /*
      16  * This is my modest contribution to the OpenSSL project (see
     17  * http://www.openssl.org/ for more information about it) and is
     18  * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
     19  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
     20  *
     21  * Questions-n-answers.
     22  *
     23  * Q. How to compile?
     24  * A. With SC4.x/SC5.x:
     25  *
     26  *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
     27  *
     28  *    and with gcc:
     29  *
     30  *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
     31  *
      32  *    or if the above fails (it does if you have gas installed):
     33  *
     34  *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
     35  *
     36  *    Quick-n-dirty way to fuse the module into the library.
     37  *    Provided that the library is already configured and built
      38  *    (in the 0.9.2 case, with the no-asm option):
     39  *
     40  *	# cd crypto/bn
     41  *	# cp /some/place/bn_asm.sparc.v8plus.S .
     42  *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
     43  *	# make
     44  *	# cd ../..
     45  *	# make; make test
     46  *
     47  *    Quick-n-dirty way to get rid of it:
     48  *
     49  *	# cd crypto/bn
     50  *	# touch bn_asm.c
     51  *	# make
     52  *	# cd ../..
     53  *	# make; make test
     54  *
      55  * Q. V8plus architecture? What kind of beast is that?
      56  * A. Well, it's rather a programming model than an architecture...
      57  *    It's actually v9-compliant code that runs on *any* UltraSPARC CPU,
      58  *    but under special conditions, namely when the kernel doesn't preserve
      59  *    the upper 32 bits of otherwise 64-bit registers during a context switch.
     60  *
     61  * Q. Why just UltraSPARC? What about SuperSPARC?
      62  * A. The original release targeted UltraSPARC only. A SuperSPARC
      63  *    version is now provided alongside. Both versions share the
      64  *    bn_*comba[48] implementations (see the comment later in the code
      65  *    for an explanation). But what's so special about this UltraSPARC
      66  *    implementation? Why didn't I let the compiler do the job? The
      67  *    trouble is that most of the available compilers (well, SC5.0 is
      68  *    the only exception) don't attempt to take advantage of
      69  *    UltraSPARC's 64-bitness under 32-bit kernels even though it's
      70  *    perfectly possible (see the next question).
     71  *
     72  * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
     73  *    doesn't work?
      74  * A. You can't address *all* registers as 64-bit wide:-( The catch is
      75  *    that you may actually rely upon %o0-%o5 and %g1-%g4 being fully
      76  *    preserved if you're in a leaf function, i.e. one that never calls
      77  *    any other function. All functions in this module are leaf, and
      78  *    10 registers is a handful. As a matter of fact, the non-"comba"
      79  *    routines don't even require that much, and I could afford not to
      80  *    allocate a stack frame of their own for 'em:-)
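 *
 *    To illustrate (just a sketch, not part of the module): a leaf
 *    routine is free to treat those registers as full 64-bit values
 *    and return without a register window, e.g.
 *
 *	leaf_mul64:
 *		mulx	%o0,%o1,%o0	! 64x64->64 product, %o registers only
 *		retl			! plain leaf return via %o7
 *		nop
 *
 *    whereas the comba routines further below do allocate a window with
 *    "save %sp,FRAME_SIZE,%sp" because they also need the %l and %i
 *    registers.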
     81  *
     82  * Q. What about 64-bit kernels?
      83  * A. What about 'em? Just kidding:-) A pure 64-bit version is
      84  *    currently under evaluation and development...
     85  *
     86  * Q. What about shared libraries?
      87  * A. What about 'em? Kidding again:-) The code does *not* contain
      88  *    any position-dependent references, so it's safe to include it
      89  *    in a shared library as is.
     90  *
     91  * Q. How much faster does it go?
      92  * A. Do you have a good benchmark? In any case, below is what I
      93  *    observe with the crypto/bn/expspeed.c test program:
     94  *
     95  *	v8plus module on U10/300MHz against bn_asm.c compiled with:
     96  *
     97  *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
     98  *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
     99  *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
    100  *
    101  *	v8 module on SS10/60MHz against bn_asm.c compiled with:
    102  *
    103  *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
    104  *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
    105  *	egcs-1.1.2 -mv8 -O3			+35-45%
    106  *
     107  *    As you can see, it's damn hard to beat the new Sun C compiler,
     108  *    so it's first and foremost GNU C users who will appreciate this
     109  *    assembler implementation:-)
    110  */
    111 
    112 /*
    113  * Revision history.
    114  *
    115  * 1.0	- initial release;
    116  * 1.1	- new loop unrolling model(*);
    117  *	- some more fine tuning;
    118  * 1.2	- made gas friendly;
    119  *	- updates to documentation concerning v9;
    120  *	- new performance comparison matrix;
    121  * 1.3	- fixed problem with /usr/ccs/lib/cpp;
    122  * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
    123  *	  resulting in slight overall performance kick;
    124  *	- some retunes;
    125  *	- support for GNU as added;
    126  *
     127  * (*)	The originally unrolled loop looked like this:
    128  *	    for (;;) {
    129  *		op(p+0); if (--n==0) break;
    130  *		op(p+1); if (--n==0) break;
    131  *		op(p+2); if (--n==0) break;
    132  *		op(p+3); if (--n==0) break;
    133  *		p+=4;
    134  *	    }
     135  *	Now I unroll according to the following:
    136  *	    while (n&~3) {
    137  *		op(p+0); op(p+1); op(p+2); op(p+3);
     138  *		p+=4; n-=4;
    139  *	    }
    140  *	    if (n) {
    141  *		op(p+0); if (--n==0) return;
     142  *		op(p+1); if (--n==0) return;
     143  *		op(p+2); return;
    144  *	    }
    145  */
    146 
    147 #if defined(__SUNPRO_C) && defined(__sparcv9)
     148   /* They've said -xarch=v9 on the command line */
    149   .register	%g2,#scratch
    150   .register	%g3,#scratch
    151 # define	FRAME_SIZE	-192
    152 #elif defined(__GNUC__) && defined(__arch64__)
     153   /* They've said -m64 on the command line */
    154   .register	%g2,#scratch
    155   .register	%g3,#scratch
    156 # define	FRAME_SIZE	-192
    157 #else
    158 # define	FRAME_SIZE	-96
    159 #endif
    160 /*
    161  * GNU assembler can't stand stuw:-(
    162  */
    163 #define stuw st
    164 
    165 .section	".text",#alloc,#execinstr
    166 .file		"bn_asm.sparc.v8plus.S"
    167 
    168 .align	32
    169 
    170 .global bn_mul_add_words
    171 /*
    172  * BN_ULONG bn_mul_add_words(rp,ap,num,w)
    173  * BN_ULONG *rp,*ap;
    174  * int num;
    175  * BN_ULONG w;
    176  */
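/*
 * A C-level sketch of what this routine computes (an illustration only,
 * assuming 32-bit BN_ULONG as in this v8plus model; the portable
 * reference is the crypto/bn/bn_asm.c module this file replaces):
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 *	{
 *		unsigned long long t, carry = 0;
 *
 *		if (num <= 0) return 0;
 *		while (num--) {
 *			t = (unsigned long long)(*ap++) * w + *rp + carry;
 *			*rp++ = (BN_ULONG)t;	// low 32 bits back to r[]
 *			carry = t >> 32;	// high 32 bits carried along
 *		}
 *		return (BN_ULONG)carry;
 *	}
 */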
    177 bn_mul_add_words:
    178 	sra	%o2,%g0,%o2	! signx %o2
    179 	brgz,a	%o2,.L_bn_mul_add_words_proceed
    180 	lduw	[%o1],%g2
    181 	retl
    182 	clr	%o0
    183 	nop
    184 	nop
    185 	nop
    186 
    187 .L_bn_mul_add_words_proceed:
    188 	srl	%o3,%g0,%o3	! clruw	%o3
    189 	andcc	%o2,-4,%g0
    190 	bz,pn	%icc,.L_bn_mul_add_words_tail
    191 	clr	%o5
    192 
    193 .L_bn_mul_add_words_loop:	! wow! 32 aligned!
    194 	lduw	[%o0],%g1
    195 	lduw	[%o1+4],%g3
    196 	mulx	%o3,%g2,%g2
    197 	add	%g1,%o5,%o4
    198 	nop
    199 	add	%o4,%g2,%o4
    200 	stuw	%o4,[%o0]
    201 	srlx	%o4,32,%o5
    202 
    203 	lduw	[%o0+4],%g1
    204 	lduw	[%o1+8],%g2
    205 	mulx	%o3,%g3,%g3
    206 	add	%g1,%o5,%o4
    207 	dec	4,%o2
    208 	add	%o4,%g3,%o4
    209 	stuw	%o4,[%o0+4]
    210 	srlx	%o4,32,%o5
    211 
    212 	lduw	[%o0+8],%g1
    213 	lduw	[%o1+12],%g3
    214 	mulx	%o3,%g2,%g2
    215 	add	%g1,%o5,%o4
    216 	inc	16,%o1
    217 	add	%o4,%g2,%o4
    218 	stuw	%o4,[%o0+8]
    219 	srlx	%o4,32,%o5
    220 
    221 	lduw	[%o0+12],%g1
    222 	mulx	%o3,%g3,%g3
    223 	add	%g1,%o5,%o4
    224 	inc	16,%o0
    225 	add	%o4,%g3,%o4
    226 	andcc	%o2,-4,%g0
    227 	stuw	%o4,[%o0-4]
    228 	srlx	%o4,32,%o5
    229 	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
    230 	lduw	[%o1],%g2
    231 
    232 	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
    233 	lduw	[%o1],%g2
    234 .L_bn_mul_add_words_return:
    235 	retl
    236 	mov	%o5,%o0
    237 
    238 .L_bn_mul_add_words_tail:
    239 	lduw	[%o0],%g1
    240 	mulx	%o3,%g2,%g2
    241 	add	%g1,%o5,%o4
    242 	dec	%o2
    243 	add	%o4,%g2,%o4
    244 	srlx	%o4,32,%o5
    245 	brz,pt	%o2,.L_bn_mul_add_words_return
    246 	stuw	%o4,[%o0]
    247 
    248 	lduw	[%o1+4],%g2
    249 	lduw	[%o0+4],%g1
    250 	mulx	%o3,%g2,%g2
    251 	add	%g1,%o5,%o4
    252 	dec	%o2
    253 	add	%o4,%g2,%o4
    254 	srlx	%o4,32,%o5
    255 	brz,pt	%o2,.L_bn_mul_add_words_return
    256 	stuw	%o4,[%o0+4]
    257 
    258 	lduw	[%o1+8],%g2
    259 	lduw	[%o0+8],%g1
    260 	mulx	%o3,%g2,%g2
    261 	add	%g1,%o5,%o4
    262 	add	%o4,%g2,%o4
    263 	stuw	%o4,[%o0+8]
    264 	retl
    265 	srlx	%o4,32,%o0
    266 
    267 .type	bn_mul_add_words,#function
    268 .size	bn_mul_add_words,(.-bn_mul_add_words)
    269 
    270 .align	32
    271 
    272 .global bn_mul_words
    273 /*
    274  * BN_ULONG bn_mul_words(rp,ap,num,w)
    275  * BN_ULONG *rp,*ap;
    276  * int num;
    277  * BN_ULONG w;
    278  */
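/*
 * C-level sketch (illustration only, 32-bit BN_ULONG assumed): same idea
 * as bn_mul_add_words above, except that the old contents of r[] are not
 * added in:
 *
 *	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 *	{
 *		unsigned long long t, carry = 0;
 *
 *		if (num <= 0) return 0;
 *		while (num--) {
 *			t = (unsigned long long)(*ap++) * w + carry;
 *			*rp++ = (BN_ULONG)t;
 *			carry = t >> 32;
 *		}
 *		return (BN_ULONG)carry;
 *	}
 */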
    279 bn_mul_words:
    280 	sra	%o2,%g0,%o2	! signx %o2
     281 	brgz,a	%o2,.L_bn_mul_words_proceed
    282 	lduw	[%o1],%g2
    283 	retl
    284 	clr	%o0
    285 	nop
    286 	nop
    287 	nop
    288 
     289 .L_bn_mul_words_proceed:
    290 	srl	%o3,%g0,%o3	! clruw	%o3
    291 	andcc	%o2,-4,%g0
    292 	bz,pn	%icc,.L_bn_mul_words_tail
    293 	clr	%o5
    294 
    295 .L_bn_mul_words_loop:		! wow! 32 aligned!
    296 	lduw	[%o1+4],%g3
    297 	mulx	%o3,%g2,%g2
    298 	add	%g2,%o5,%o4
    299 	nop
    300 	stuw	%o4,[%o0]
    301 	srlx	%o4,32,%o5
    302 
    303 	lduw	[%o1+8],%g2
    304 	mulx	%o3,%g3,%g3
    305 	add	%g3,%o5,%o4
    306 	dec	4,%o2
    307 	stuw	%o4,[%o0+4]
    308 	srlx	%o4,32,%o5
    309 
    310 	lduw	[%o1+12],%g3
    311 	mulx	%o3,%g2,%g2
    312 	add	%g2,%o5,%o4
    313 	inc	16,%o1
    314 	stuw	%o4,[%o0+8]
    315 	srlx	%o4,32,%o5
    316 
    317 	mulx	%o3,%g3,%g3
    318 	add	%g3,%o5,%o4
    319 	inc	16,%o0
    320 	stuw	%o4,[%o0-4]
    321 	srlx	%o4,32,%o5
    322 	andcc	%o2,-4,%g0
    323 	bnz,a,pt	%icc,.L_bn_mul_words_loop
    324 	lduw	[%o1],%g2
    325 	nop
    326 	nop
    327 
    328 	brnz,a,pn	%o2,.L_bn_mul_words_tail
    329 	lduw	[%o1],%g2
    330 .L_bn_mul_words_return:
    331 	retl
    332 	mov	%o5,%o0
    333 
    334 .L_bn_mul_words_tail:
    335 	mulx	%o3,%g2,%g2
    336 	add	%g2,%o5,%o4
    337 	dec	%o2
    338 	srlx	%o4,32,%o5
    339 	brz,pt	%o2,.L_bn_mul_words_return
    340 	stuw	%o4,[%o0]
    341 
    342 	lduw	[%o1+4],%g2
    343 	mulx	%o3,%g2,%g2
    344 	add	%g2,%o5,%o4
    345 	dec	%o2
    346 	srlx	%o4,32,%o5
    347 	brz,pt	%o2,.L_bn_mul_words_return
    348 	stuw	%o4,[%o0+4]
    349 
    350 	lduw	[%o1+8],%g2
    351 	mulx	%o3,%g2,%g2
    352 	add	%g2,%o5,%o4
    353 	stuw	%o4,[%o0+8]
    354 	retl
    355 	srlx	%o4,32,%o0
    356 
    357 .type	bn_mul_words,#function
    358 .size	bn_mul_words,(.-bn_mul_words)
    359 
    360 .align  32
    361 .global	bn_sqr_words
    362 /*
    363  * void bn_sqr_words(r,a,n)
    364  * BN_ULONG *r,*a;
    365  * int n;
    366  */
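/*
 * C-level sketch (illustration only, 32-bit BN_ULONG assumed): each input
 * word is squared into a double-word slot of r[]:
 *
 *	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 *	{
 *		while (n-- > 0) {
 *			BN_ULONG x = *a++;
 *			unsigned long long t = (unsigned long long)x * x;
 *			*r++ = (BN_ULONG)t;		// low half
 *			*r++ = (BN_ULONG)(t >> 32);	// high half
 *		}
 *	}
 */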
    367 bn_sqr_words:
    368 	sra	%o2,%g0,%o2	! signx %o2
     369 	brgz,a	%o2,.L_bn_sqr_words_proceed
    370 	lduw	[%o1],%g2
    371 	retl
    372 	clr	%o0
    373 	nop
    374 	nop
    375 	nop
    376 
     377 .L_bn_sqr_words_proceed:
    378 	andcc	%o2,-4,%g0
    379 	nop
    380 	bz,pn	%icc,.L_bn_sqr_words_tail
    381 	nop
    382 
    383 .L_bn_sqr_words_loop:		! wow! 32 aligned!
    384 	lduw	[%o1+4],%g3
    385 	mulx	%g2,%g2,%o4
    386 	stuw	%o4,[%o0]
    387 	srlx	%o4,32,%o5
    388 	stuw	%o5,[%o0+4]
    389 	nop
    390 
    391 	lduw	[%o1+8],%g2
    392 	mulx	%g3,%g3,%o4
    393 	dec	4,%o2
    394 	stuw	%o4,[%o0+8]
    395 	srlx	%o4,32,%o5
    396 	stuw	%o5,[%o0+12]
    397 
    398 	lduw	[%o1+12],%g3
    399 	mulx	%g2,%g2,%o4
    400 	srlx	%o4,32,%o5
    401 	stuw	%o4,[%o0+16]
    402 	inc	16,%o1
    403 	stuw	%o5,[%o0+20]
    404 
    405 	mulx	%g3,%g3,%o4
    406 	inc	32,%o0
    407 	stuw	%o4,[%o0-8]
    408 	srlx	%o4,32,%o5
    409 	andcc	%o2,-4,%g2
    410 	stuw	%o5,[%o0-4]
    411 	bnz,a,pt	%icc,.L_bn_sqr_words_loop
    412 	lduw	[%o1],%g2
    413 	nop
    414 
    415 	brnz,a,pn	%o2,.L_bn_sqr_words_tail
    416 	lduw	[%o1],%g2
    417 .L_bn_sqr_words_return:
    418 	retl
    419 	clr	%o0
    420 
    421 .L_bn_sqr_words_tail:
    422 	mulx	%g2,%g2,%o4
    423 	dec	%o2
    424 	stuw	%o4,[%o0]
    425 	srlx	%o4,32,%o5
    426 	brz,pt	%o2,.L_bn_sqr_words_return
    427 	stuw	%o5,[%o0+4]
    428 
    429 	lduw	[%o1+4],%g2
    430 	mulx	%g2,%g2,%o4
    431 	dec	%o2
    432 	stuw	%o4,[%o0+8]
    433 	srlx	%o4,32,%o5
    434 	brz,pt	%o2,.L_bn_sqr_words_return
    435 	stuw	%o5,[%o0+12]
    436 
    437 	lduw	[%o1+8],%g2
    438 	mulx	%g2,%g2,%o4
    439 	srlx	%o4,32,%o5
    440 	stuw	%o4,[%o0+16]
    441 	stuw	%o5,[%o0+20]
    442 	retl
    443 	clr	%o0
    444 
    445 .type	bn_sqr_words,#function
    446 .size	bn_sqr_words,(.-bn_sqr_words)
    447 
    448 .align	32
    449 .global bn_div_words
    450 /*
    451  * BN_ULONG bn_div_words(h,l,d)
    452  * BN_ULONG h,l,d;
    453  */
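/*
 * C-level sketch (illustration only): the double word h:l is divided by d
 * and the quotient is returned truncated to 32 bits, exactly as the
 * sllx/or/udivx/srl sequence below does (the caller is expected to keep
 * h < d so that the quotient fits):
 *
 *	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 *	{
 *		unsigned long long n = ((unsigned long long)h << 32) | l;
 *		return (BN_ULONG)(n / d);
 *	}
 */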
    454 bn_div_words:
    455 	sllx	%o0,32,%o0
    456 	or	%o0,%o1,%o0
    457 	udivx	%o0,%o2,%o0
    458 	retl
    459 	srl	%o0,%g0,%o0	! clruw	%o0
    460 
    461 .type	bn_div_words,#function
    462 .size	bn_div_words,(.-bn_div_words)
    463 
    464 .align	32
    465 
    466 .global bn_add_words
    467 /*
    468  * BN_ULONG bn_add_words(rp,ap,bp,n)
    469  * BN_ULONG *rp,*ap,*bp;
    470  * int n;
    471  */
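/*
 * C-level sketch (illustration only, 32-bit BN_ULONG assumed): word-wise
 * addition with carry propagation, returning the final carry (0 or 1):
 *
 *	BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
 *	{
 *		unsigned long long t, carry = 0;
 *
 *		while (n-- > 0) {
 *			t = (unsigned long long)*ap++ + *bp++ + carry;
 *			*rp++ = (BN_ULONG)t;
 *			carry = t >> 32;	// 0 or 1
 *		}
 *		return (BN_ULONG)carry;
 *	}
 */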
    472 bn_add_words:
    473 	sra	%o3,%g0,%o3	! signx %o3
    474 	brgz,a	%o3,.L_bn_add_words_proceed
    475 	lduw	[%o1],%o4
    476 	retl
    477 	clr	%o0
    478 
    479 .L_bn_add_words_proceed:
    480 	andcc	%o3,-4,%g0
    481 	bz,pn	%icc,.L_bn_add_words_tail
    482 	addcc	%g0,0,%g0	! clear carry flag
    483 
    484 .L_bn_add_words_loop:		! wow! 32 aligned!
    485 	dec	4,%o3
    486 	lduw	[%o2],%o5
    487 	lduw	[%o1+4],%g1
    488 	lduw	[%o2+4],%g2
    489 	lduw	[%o1+8],%g3
    490 	lduw	[%o2+8],%g4
    491 	addccc	%o5,%o4,%o5
    492 	stuw	%o5,[%o0]
    493 
    494 	lduw	[%o1+12],%o4
    495 	lduw	[%o2+12],%o5
    496 	inc	16,%o1
    497 	addccc	%g1,%g2,%g1
    498 	stuw	%g1,[%o0+4]
    499 
    500 	inc	16,%o2
    501 	addccc	%g3,%g4,%g3
    502 	stuw	%g3,[%o0+8]
    503 
    504 	inc	16,%o0
    505 	addccc	%o5,%o4,%o5
    506 	stuw	%o5,[%o0-4]
    507 	and	%o3,-4,%g1
    508 	brnz,a,pt	%g1,.L_bn_add_words_loop
    509 	lduw	[%o1],%o4
    510 
    511 	brnz,a,pn	%o3,.L_bn_add_words_tail
    512 	lduw	[%o1],%o4
    513 .L_bn_add_words_return:
    514 	clr	%o0
    515 	retl
    516 	movcs	%icc,1,%o0
    517 	nop
    518 
    519 .L_bn_add_words_tail:
    520 	lduw	[%o2],%o5
    521 	dec	%o3
    522 	addccc	%o5,%o4,%o5
    523 	brz,pt	%o3,.L_bn_add_words_return
    524 	stuw	%o5,[%o0]
    525 
    526 	lduw	[%o1+4],%o4
    527 	lduw	[%o2+4],%o5
    528 	dec	%o3
    529 	addccc	%o5,%o4,%o5
    530 	brz,pt	%o3,.L_bn_add_words_return
    531 	stuw	%o5,[%o0+4]
    532 
    533 	lduw	[%o1+8],%o4
    534 	lduw	[%o2+8],%o5
    535 	addccc	%o5,%o4,%o5
    536 	stuw	%o5,[%o0+8]
    537 	clr	%o0
    538 	retl
    539 	movcs	%icc,1,%o0
    540 
    541 .type	bn_add_words,#function
    542 .size	bn_add_words,(.-bn_add_words)
    543 
    544 .global bn_sub_words
    545 /*
    546  * BN_ULONG bn_sub_words(rp,ap,bp,n)
    547  * BN_ULONG *rp,*ap,*bp;
    548  * int n;
    549  */
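/*
 * C-level sketch (illustration only, 32-bit BN_ULONG assumed): word-wise
 * subtraction with borrow propagation, returning the final borrow (0 or 1):
 *
 *	BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
 *	{
 *		unsigned long long t;
 *		BN_ULONG borrow = 0;
 *
 *		while (n-- > 0) {
 *			t = (unsigned long long)*ap++ - *bp++ - borrow;
 *			*rp++ = (BN_ULONG)t;
 *			borrow = (BN_ULONG)(t >> 32) & 1;
 *		}
 *		return borrow;
 *	}
 */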
    550 bn_sub_words:
    551 	sra	%o3,%g0,%o3	! signx %o3
    552 	brgz,a	%o3,.L_bn_sub_words_proceed
    553 	lduw	[%o1],%o4
    554 	retl
    555 	clr	%o0
    556 
    557 .L_bn_sub_words_proceed:
    558 	andcc	%o3,-4,%g0
    559 	bz,pn	%icc,.L_bn_sub_words_tail
    560 	addcc	%g0,0,%g0	! clear carry flag
    561 
    562 .L_bn_sub_words_loop:		! wow! 32 aligned!
    563 	dec	4,%o3
    564 	lduw	[%o2],%o5
    565 	lduw	[%o1+4],%g1
    566 	lduw	[%o2+4],%g2
    567 	lduw	[%o1+8],%g3
    568 	lduw	[%o2+8],%g4
    569 	subccc	%o4,%o5,%o5
    570 	stuw	%o5,[%o0]
    571 
    572 	lduw	[%o1+12],%o4
    573 	lduw	[%o2+12],%o5
    574 	inc	16,%o1
    575 	subccc	%g1,%g2,%g2
    576 	stuw	%g2,[%o0+4]
    577 
    578 	inc	16,%o2
    579 	subccc	%g3,%g4,%g4
    580 	stuw	%g4,[%o0+8]
    581 
    582 	inc	16,%o0
    583 	subccc	%o4,%o5,%o5
    584 	stuw	%o5,[%o0-4]
    585 	and	%o3,-4,%g1
    586 	brnz,a,pt	%g1,.L_bn_sub_words_loop
    587 	lduw	[%o1],%o4
    588 
    589 	brnz,a,pn	%o3,.L_bn_sub_words_tail
    590 	lduw	[%o1],%o4
    591 .L_bn_sub_words_return:
    592 	clr	%o0
    593 	retl
    594 	movcs	%icc,1,%o0
    595 	nop
    596 
    597 .L_bn_sub_words_tail:		! wow! 32 aligned!
    598 	lduw	[%o2],%o5
    599 	dec	%o3
    600 	subccc	%o4,%o5,%o5
    601 	brz,pt	%o3,.L_bn_sub_words_return
    602 	stuw	%o5,[%o0]
    603 
    604 	lduw	[%o1+4],%o4
    605 	lduw	[%o2+4],%o5
    606 	dec	%o3
    607 	subccc	%o4,%o5,%o5
    608 	brz,pt	%o3,.L_bn_sub_words_return
    609 	stuw	%o5,[%o0+4]
    610 
    611 	lduw	[%o1+8],%o4
    612 	lduw	[%o2+8],%o5
    613 	subccc	%o4,%o5,%o5
    614 	stuw	%o5,[%o0+8]
    615 	clr	%o0
    616 	retl
    617 	movcs	%icc,1,%o0
    618 
    619 .type	bn_sub_words,#function
    620 .size	bn_sub_words,(.-bn_sub_words)
    621 
    622 /*
     623  * The code below depends on the fact that the upper halves of %l0-%l7
     624  * and %i0-%i7 are zeroed by the kernel after a context switch. In
     625  * previous versions this comment stated that "the trouble is that
     626  * it's not feasible to implement the mumbo-jumbo in less V9
     627  * instructions:-(", which apparently isn't true thanks to the
     628  * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
     629  * results not from the shorter code, but from the elimination of
     630  * multicycle, non-pairable 'rd %y,%rd' instructions.
    631  *
    632  *							Andy.
    633  */
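/*
 * How the carry bookkeeping in the comba routines works (a rough C-level
 * sketch of one mul_add_c() step, illustration only): c_12 holds the
 * running column accumulator as (c2<<32)|c1, c_3 collects carries out of
 * c_12 already scaled by 2^32 (that is what t_2=1<<32 is for), so that a
 * single 'or' can fold it back into the upper half:
 *
 *	// one product added into the column accumulator:
 *	t_1  = (unsigned long long)a[i] * b[j];
 *	c_12 = c_12 + t_1;
 *	if (c_12 < t_1)			// carry out of bit 63, i.e. c2 overflowed
 *		c_3 = c_3 + t_2;	// remembered, pre-shifted by 32
 *	// ... repeat for every a[i]*b[j] of the column, then:
 *	r[k] = (BN_ULONG)c_12;		// low word is the finished result word
 *	c_12 = (c_12 >> 32) | c_3;	// high word becomes the next column's c1,
 *					// the accumulated c3 sits on top as c2
 */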
    634 
    635 /*
    636  * Here is register usage map for *all* routines below.
    637  */
    638 #define t_1	%o0
    639 #define	t_2	%o1
    640 #define c_12	%o2
    641 #define c_3	%o3
    642 
    643 #define ap(I)	[%i1+4*I]
    644 #define bp(I)	[%i2+4*I]
    645 #define rp(I)	[%i0+4*I]
    646 
    647 #define	a_0	%l0
    648 #define	a_1	%l1
    649 #define	a_2	%l2
    650 #define	a_3	%l3
    651 #define	a_4	%l4
    652 #define	a_5	%l5
    653 #define	a_6	%l6
    654 #define	a_7	%l7
    655 
    656 #define	b_0	%i3
    657 #define	b_1	%i4
    658 #define	b_2	%i5
    659 #define	b_3	%o4
    660 #define	b_4	%o5
    661 #define	b_5	%o7
    662 #define	b_6	%g1
    663 #define	b_7	%g4
    664 
    665 .align	32
    666 .global bn_mul_comba8
    667 /*
    668  * void bn_mul_comba8(r,a,b)
    669  * BN_ULONG *r,*a,*b;
    670  */
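/*
 * Functionally (a sketch only -- the real code below is a fully unrolled
 * Comba multiplication that produces r[] column by column):
 *
 *	void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 *	{
 *		int i, j, k;
 *
 *		for (k = 0; k < 16; k++) r[k] = 0;
 *		for (i = 0; i < 8; i++)
 *			for (j = 0; j < 8; j++) {
 *				unsigned long long t = (unsigned long long)a[i] * b[j];
 *				for (k = i + j; t; k++) {	// add with carry
 *					t += r[k];
 *					r[k] = (BN_ULONG)t;
 *					t >>= 32;
 *				}
 *			}
 *	}
 */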
    671 bn_mul_comba8:
    672 	save	%sp,FRAME_SIZE,%sp
    673 	mov	1,t_2
    674 	lduw	ap(0),a_0
    675 	sllx	t_2,32,t_2
    676 	lduw	bp(0),b_0	!=
    677 	lduw	bp(1),b_1
    678 	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
    679 	srlx	t_1,32,c_12
    680 	stuw	t_1,rp(0)	!=!r[0]=c1;
    681 
    682 	lduw	ap(1),a_1
    683 	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
    684 	addcc	c_12,t_1,c_12
    685 	clr	c_3		!=
    686 	bcs,a	%xcc,.+8
    687 	add	c_3,t_2,c_3
    688 	lduw	ap(2),a_2
    689 	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
    690 	addcc	c_12,t_1,t_1
    691 	bcs,a	%xcc,.+8
    692 	add	c_3,t_2,c_3
    693 	srlx	t_1,32,c_12	!=
    694 	stuw	t_1,rp(1)	!r[1]=c2;
    695 	or	c_12,c_3,c_12
    696 
    697 	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
    698 	addcc	c_12,t_1,c_12	!=
    699 	clr	c_3
    700 	bcs,a	%xcc,.+8
    701 	add	c_3,t_2,c_3
    702 	lduw	bp(2),b_2	!=
    703 	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
    704 	addcc	c_12,t_1,c_12
    705 	bcs,a	%xcc,.+8
    706 	add	c_3,t_2,c_3	!=
    707 	lduw	bp(3),b_3
    708 	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
    709 	addcc	c_12,t_1,t_1
    710 	bcs,a	%xcc,.+8	!=
    711 	add	c_3,t_2,c_3
    712 	srlx	t_1,32,c_12
    713 	stuw	t_1,rp(2)	!r[2]=c3;
    714 	or	c_12,c_3,c_12	!=
    715 
    716 	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
    717 	addcc	c_12,t_1,c_12
    718 	clr	c_3
    719 	bcs,a	%xcc,.+8	!=
    720 	add	c_3,t_2,c_3
    721 	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
    722 	addcc	c_12,t_1,c_12
    723 	bcs,a	%xcc,.+8	!=
    724 	add	c_3,t_2,c_3
    725 	lduw	ap(3),a_3
    726 	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
    727 	addcc	c_12,t_1,c_12	!=
    728 	bcs,a	%xcc,.+8
    729 	add	c_3,t_2,c_3
    730 	lduw	ap(4),a_4
    731 	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
    732 	addcc	c_12,t_1,t_1
    733 	bcs,a	%xcc,.+8
    734 	add	c_3,t_2,c_3
    735 	srlx	t_1,32,c_12	!=
    736 	stuw	t_1,rp(3)	!r[3]=c1;
    737 	or	c_12,c_3,c_12
    738 
    739 	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
    740 	addcc	c_12,t_1,c_12	!=
    741 	clr	c_3
    742 	bcs,a	%xcc,.+8
    743 	add	c_3,t_2,c_3
    744 	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
    745 	addcc	c_12,t_1,c_12
    746 	bcs,a	%xcc,.+8
    747 	add	c_3,t_2,c_3
    748 	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
    749 	addcc	c_12,t_1,c_12
    750 	bcs,a	%xcc,.+8
    751 	add	c_3,t_2,c_3
    752 	lduw	bp(4),b_4	!=
    753 	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
    754 	addcc	c_12,t_1,c_12
    755 	bcs,a	%xcc,.+8
    756 	add	c_3,t_2,c_3	!=
    757 	lduw	bp(5),b_5
    758 	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
    759 	addcc	c_12,t_1,t_1
    760 	bcs,a	%xcc,.+8	!=
    761 	add	c_3,t_2,c_3
    762 	srlx	t_1,32,c_12
    763 	stuw	t_1,rp(4)	!r[4]=c2;
    764 	or	c_12,c_3,c_12	!=
    765 
    766 	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
    767 	addcc	c_12,t_1,c_12
    768 	clr	c_3
    769 	bcs,a	%xcc,.+8	!=
    770 	add	c_3,t_2,c_3
    771 	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
    772 	addcc	c_12,t_1,c_12
    773 	bcs,a	%xcc,.+8	!=
    774 	add	c_3,t_2,c_3
    775 	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
    776 	addcc	c_12,t_1,c_12
    777 	bcs,a	%xcc,.+8	!=
    778 	add	c_3,t_2,c_3
    779 	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
    780 	addcc	c_12,t_1,c_12
    781 	bcs,a	%xcc,.+8	!=
    782 	add	c_3,t_2,c_3
    783 	lduw	ap(5),a_5
    784 	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
    785 	addcc	c_12,t_1,c_12	!=
    786 	bcs,a	%xcc,.+8
    787 	add	c_3,t_2,c_3
    788 	lduw	ap(6),a_6
    789 	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
    790 	addcc	c_12,t_1,t_1
    791 	bcs,a	%xcc,.+8
    792 	add	c_3,t_2,c_3
    793 	srlx	t_1,32,c_12	!=
    794 	stuw	t_1,rp(5)	!r[5]=c3;
    795 	or	c_12,c_3,c_12
    796 
    797 	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
    798 	addcc	c_12,t_1,c_12	!=
    799 	clr	c_3
    800 	bcs,a	%xcc,.+8
    801 	add	c_3,t_2,c_3
    802 	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
    803 	addcc	c_12,t_1,c_12
    804 	bcs,a	%xcc,.+8
    805 	add	c_3,t_2,c_3
    806 	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
    807 	addcc	c_12,t_1,c_12
    808 	bcs,a	%xcc,.+8
    809 	add	c_3,t_2,c_3
    810 	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
    811 	addcc	c_12,t_1,c_12
    812 	bcs,a	%xcc,.+8
    813 	add	c_3,t_2,c_3
    814 	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
    815 	addcc	c_12,t_1,c_12
    816 	bcs,a	%xcc,.+8
    817 	add	c_3,t_2,c_3
    818 	lduw	bp(6),b_6	!=
    819 	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
    820 	addcc	c_12,t_1,c_12
    821 	bcs,a	%xcc,.+8
    822 	add	c_3,t_2,c_3	!=
    823 	lduw	bp(7),b_7
    824 	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
    825 	addcc	c_12,t_1,t_1
    826 	bcs,a	%xcc,.+8	!=
    827 	add	c_3,t_2,c_3
    828 	srlx	t_1,32,c_12
    829 	stuw	t_1,rp(6)	!r[6]=c1;
    830 	or	c_12,c_3,c_12	!=
    831 
    832 	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
    833 	addcc	c_12,t_1,c_12
    834 	clr	c_3
    835 	bcs,a	%xcc,.+8	!=
    836 	add	c_3,t_2,c_3
    837 	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
    838 	addcc	c_12,t_1,c_12
    839 	bcs,a	%xcc,.+8	!=
    840 	add	c_3,t_2,c_3
    841 	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
    842 	addcc	c_12,t_1,c_12
    843 	bcs,a	%xcc,.+8	!=
    844 	add	c_3,t_2,c_3
    845 	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
    846 	addcc	c_12,t_1,c_12
    847 	bcs,a	%xcc,.+8	!=
    848 	add	c_3,t_2,c_3
    849 	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
    850 	addcc	c_12,t_1,c_12
    851 	bcs,a	%xcc,.+8	!=
    852 	add	c_3,t_2,c_3
    853 	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
    854 	addcc	c_12,t_1,c_12
    855 	bcs,a	%xcc,.+8	!=
    856 	add	c_3,t_2,c_3
    857 	lduw	ap(7),a_7
    858 	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
    859 	addcc	c_12,t_1,c_12
    860 	bcs,a	%xcc,.+8
    861 	add	c_3,t_2,c_3
    862 	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
    863 	addcc	c_12,t_1,t_1
    864 	bcs,a	%xcc,.+8
    865 	add	c_3,t_2,c_3
    866 	srlx	t_1,32,c_12	!=
    867 	stuw	t_1,rp(7)	!r[7]=c2;
    868 	or	c_12,c_3,c_12
    869 
    870 	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
    871 	addcc	c_12,t_1,c_12
    872 	clr	c_3
    873 	bcs,a	%xcc,.+8
    874 	add	c_3,t_2,c_3	!=
    875 	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
    876 	addcc	c_12,t_1,c_12
    877 	bcs,a	%xcc,.+8
    878 	add	c_3,t_2,c_3	!=
    879 	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
    880 	addcc	c_12,t_1,c_12
    881 	bcs,a	%xcc,.+8
    882 	add	c_3,t_2,c_3	!=
    883 	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
    884 	addcc	c_12,t_1,c_12
    885 	bcs,a	%xcc,.+8
    886 	add	c_3,t_2,c_3	!=
    887 	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
    888 	addcc	c_12,t_1,c_12
    889 	bcs,a	%xcc,.+8
    890 	add	c_3,t_2,c_3	!=
    891 	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
    892 	addcc	c_12,t_1,c_12
    893 	bcs,a	%xcc,.+8
    894 	add	c_3,t_2,c_3	!=
    895 	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
    896 	addcc	c_12,t_1,t_1
    897 	bcs,a	%xcc,.+8
    898 	add	c_3,t_2,c_3	!=
    899 	srlx	t_1,32,c_12
    900 	stuw	t_1,rp(8)	!r[8]=c3;
    901 	or	c_12,c_3,c_12
    902 
    903 	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
    904 	addcc	c_12,t_1,c_12
    905 	clr	c_3
    906 	bcs,a	%xcc,.+8
    907 	add	c_3,t_2,c_3	!=
    908 	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
    909 	addcc	c_12,t_1,c_12
    910 	bcs,a	%xcc,.+8	!=
    911 	add	c_3,t_2,c_3
    912 	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
    913 	addcc	c_12,t_1,c_12
    914 	bcs,a	%xcc,.+8	!=
    915 	add	c_3,t_2,c_3
    916 	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
    917 	addcc	c_12,t_1,c_12
    918 	bcs,a	%xcc,.+8	!=
    919 	add	c_3,t_2,c_3
    920 	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
    921 	addcc	c_12,t_1,c_12
    922 	bcs,a	%xcc,.+8	!=
    923 	add	c_3,t_2,c_3
    924 	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
    925 	addcc	c_12,t_1,t_1
    926 	bcs,a	%xcc,.+8	!=
    927 	add	c_3,t_2,c_3
    928 	srlx	t_1,32,c_12
    929 	stuw	t_1,rp(9)	!r[9]=c1;
    930 	or	c_12,c_3,c_12	!=
    931 
    932 	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
    933 	addcc	c_12,t_1,c_12
    934 	clr	c_3
    935 	bcs,a	%xcc,.+8	!=
    936 	add	c_3,t_2,c_3
    937 	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
    938 	addcc	c_12,t_1,c_12
    939 	bcs,a	%xcc,.+8	!=
    940 	add	c_3,t_2,c_3
    941 	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
    942 	addcc	c_12,t_1,c_12
    943 	bcs,a	%xcc,.+8	!=
    944 	add	c_3,t_2,c_3
    945 	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
    946 	addcc	c_12,t_1,c_12
    947 	bcs,a	%xcc,.+8	!=
    948 	add	c_3,t_2,c_3
    949 	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
    950 	addcc	c_12,t_1,t_1
    951 	bcs,a	%xcc,.+8	!=
    952 	add	c_3,t_2,c_3
    953 	srlx	t_1,32,c_12
    954 	stuw	t_1,rp(10)	!r[10]=c2;
    955 	or	c_12,c_3,c_12	!=
    956 
    957 	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
    958 	addcc	c_12,t_1,c_12
    959 	clr	c_3
    960 	bcs,a	%xcc,.+8	!=
    961 	add	c_3,t_2,c_3
    962 	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
    963 	addcc	c_12,t_1,c_12
    964 	bcs,a	%xcc,.+8	!=
    965 	add	c_3,t_2,c_3
    966 	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
    967 	addcc	c_12,t_1,c_12
    968 	bcs,a	%xcc,.+8	!=
    969 	add	c_3,t_2,c_3
    970 	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
    971 	addcc	c_12,t_1,t_1
    972 	bcs,a	%xcc,.+8	!=
    973 	add	c_3,t_2,c_3
    974 	srlx	t_1,32,c_12
    975 	stuw	t_1,rp(11)	!r[11]=c3;
    976 	or	c_12,c_3,c_12	!=
    977 
    978 	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
    979 	addcc	c_12,t_1,c_12
    980 	clr	c_3
    981 	bcs,a	%xcc,.+8	!=
    982 	add	c_3,t_2,c_3
    983 	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
    984 	addcc	c_12,t_1,c_12
    985 	bcs,a	%xcc,.+8	!=
    986 	add	c_3,t_2,c_3
    987 	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
    988 	addcc	c_12,t_1,t_1
    989 	bcs,a	%xcc,.+8	!=
    990 	add	c_3,t_2,c_3
    991 	srlx	t_1,32,c_12
    992 	stuw	t_1,rp(12)	!r[12]=c1;
    993 	or	c_12,c_3,c_12	!=
    994 
    995 	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
    996 	addcc	c_12,t_1,c_12
    997 	clr	c_3
    998 	bcs,a	%xcc,.+8	!=
    999 	add	c_3,t_2,c_3
   1000 	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
   1001 	addcc	c_12,t_1,t_1
   1002 	bcs,a	%xcc,.+8	!=
   1003 	add	c_3,t_2,c_3
   1004 	srlx	t_1,32,c_12
    1005 	stuw	t_1,rp(13)	!r[13]=c2;
   1006 	or	c_12,c_3,c_12	!=
   1007 
   1008 	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
   1009 	addcc	c_12,t_1,t_1
   1010 	srlx	t_1,32,c_12	!=
   1011 	stuw	t_1,rp(14)	!r[14]=c3;
   1012 	stuw	c_12,rp(15)	!r[15]=c1;
   1013 
   1014 	ret
   1015 	restore	%g0,%g0,%o0	!=
   1016 
   1017 .type	bn_mul_comba8,#function
   1018 .size	bn_mul_comba8,(.-bn_mul_comba8)
   1019 
   1020 .align	32
   1021 
   1022 .global bn_mul_comba4
   1023 /*
   1024  * void bn_mul_comba4(r,a,b)
   1025  * BN_ULONG *r,*a,*b;
   1026  */
   1027 bn_mul_comba4:
   1028 	save	%sp,FRAME_SIZE,%sp
   1029 	lduw	ap(0),a_0
   1030 	mov	1,t_2
   1031 	lduw	bp(0),b_0
   1032 	sllx	t_2,32,t_2	!=
   1033 	lduw	bp(1),b_1
   1034 	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
   1035 	srlx	t_1,32,c_12
   1036 	stuw	t_1,rp(0)	!=!r[0]=c1;
   1037 
   1038 	lduw	ap(1),a_1
   1039 	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
   1040 	addcc	c_12,t_1,c_12
   1041 	clr	c_3		!=
   1042 	bcs,a	%xcc,.+8
   1043 	add	c_3,t_2,c_3
   1044 	lduw	ap(2),a_2
   1045 	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
   1046 	addcc	c_12,t_1,t_1
   1047 	bcs,a	%xcc,.+8
   1048 	add	c_3,t_2,c_3
   1049 	srlx	t_1,32,c_12	!=
   1050 	stuw	t_1,rp(1)	!r[1]=c2;
   1051 	or	c_12,c_3,c_12
   1052 
   1053 	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
   1054 	addcc	c_12,t_1,c_12	!=
   1055 	clr	c_3
   1056 	bcs,a	%xcc,.+8
   1057 	add	c_3,t_2,c_3
   1058 	lduw	bp(2),b_2	!=
   1059 	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
   1060 	addcc	c_12,t_1,c_12
   1061 	bcs,a	%xcc,.+8
   1062 	add	c_3,t_2,c_3	!=
   1063 	lduw	bp(3),b_3
   1064 	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
   1065 	addcc	c_12,t_1,t_1
   1066 	bcs,a	%xcc,.+8	!=
   1067 	add	c_3,t_2,c_3
   1068 	srlx	t_1,32,c_12
   1069 	stuw	t_1,rp(2)	!r[2]=c3;
   1070 	or	c_12,c_3,c_12	!=
   1071 
   1072 	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
   1073 	addcc	c_12,t_1,c_12
   1074 	clr	c_3
   1075 	bcs,a	%xcc,.+8	!=
   1076 	add	c_3,t_2,c_3
   1077 	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
   1078 	addcc	c_12,t_1,c_12
   1079 	bcs,a	%xcc,.+8	!=
   1080 	add	c_3,t_2,c_3
   1081 	lduw	ap(3),a_3
   1082 	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
   1083 	addcc	c_12,t_1,c_12	!=
   1084 	bcs,a	%xcc,.+8
   1085 	add	c_3,t_2,c_3
   1086 	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
   1087 	addcc	c_12,t_1,t_1	!=
   1088 	bcs,a	%xcc,.+8
   1089 	add	c_3,t_2,c_3
   1090 	srlx	t_1,32,c_12
   1091 	stuw	t_1,rp(3)	!=!r[3]=c1;
   1092 	or	c_12,c_3,c_12
   1093 
   1094 	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
   1095 	addcc	c_12,t_1,c_12
   1096 	clr	c_3		!=
   1097 	bcs,a	%xcc,.+8
   1098 	add	c_3,t_2,c_3
   1099 	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
   1100 	addcc	c_12,t_1,c_12	!=
   1101 	bcs,a	%xcc,.+8
   1102 	add	c_3,t_2,c_3
   1103 	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
   1104 	addcc	c_12,t_1,t_1	!=
   1105 	bcs,a	%xcc,.+8
   1106 	add	c_3,t_2,c_3
   1107 	srlx	t_1,32,c_12
   1108 	stuw	t_1,rp(4)	!=!r[4]=c2;
   1109 	or	c_12,c_3,c_12
   1110 
   1111 	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
   1112 	addcc	c_12,t_1,c_12
   1113 	clr	c_3		!=
   1114 	bcs,a	%xcc,.+8
   1115 	add	c_3,t_2,c_3
   1116 	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
   1117 	addcc	c_12,t_1,t_1	!=
   1118 	bcs,a	%xcc,.+8
   1119 	add	c_3,t_2,c_3
   1120 	srlx	t_1,32,c_12
   1121 	stuw	t_1,rp(5)	!=!r[5]=c3;
   1122 	or	c_12,c_3,c_12
   1123 
   1124 	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
   1125 	addcc	c_12,t_1,t_1
   1126 	srlx	t_1,32,c_12	!=
   1127 	stuw	t_1,rp(6)	!r[6]=c1;
   1128 	stuw	c_12,rp(7)	!r[7]=c2;
   1129 
   1130 	ret
   1131 	restore	%g0,%g0,%o0
   1132 
   1133 .type	bn_mul_comba4,#function
   1134 .size	bn_mul_comba4,(.-bn_mul_comba4)
   1135 
   1136 .align	32
   1137 
   1138 .global bn_sqr_comba8
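/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Functionally (a sketch only) it is equivalent to
 *
 *	void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 *	{
 *		bn_mul_comba8(r, a, a);
 *	}
 *
 * but the unrolled code below exploits the symmetry: sqr_add_c() handles
 * the a[i]*a[i] terms and sqr_add_c2() the cross terms, which are added
 * in twice since a[i]*a[j] and a[j]*a[i] coincide.
 */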
   1139 bn_sqr_comba8:
   1140 	save	%sp,FRAME_SIZE,%sp
   1141 	mov	1,t_2
   1142 	lduw	ap(0),a_0
   1143 	sllx	t_2,32,t_2
   1144 	lduw	ap(1),a_1
   1145 	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
   1146 	srlx	t_1,32,c_12
   1147 	stuw	t_1,rp(0)	!r[0]=c1;
   1148 
   1149 	lduw	ap(2),a_2
   1150 	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
   1151 	addcc	c_12,t_1,c_12
   1152 	clr	c_3
   1153 	bcs,a	%xcc,.+8
   1154 	add	c_3,t_2,c_3
   1155 	addcc	c_12,t_1,t_1
   1156 	bcs,a	%xcc,.+8
   1157 	add	c_3,t_2,c_3
   1158 	srlx	t_1,32,c_12
   1159 	stuw	t_1,rp(1)	!r[1]=c2;
   1160 	or	c_12,c_3,c_12
   1161 
   1162 	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
   1163 	addcc	c_12,t_1,c_12
   1164 	clr	c_3
   1165 	bcs,a	%xcc,.+8
   1166 	add	c_3,t_2,c_3
   1167 	addcc	c_12,t_1,c_12
   1168 	bcs,a	%xcc,.+8
   1169 	add	c_3,t_2,c_3
   1170 	lduw	ap(3),a_3
   1171 	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
   1172 	addcc	c_12,t_1,t_1
   1173 	bcs,a	%xcc,.+8
   1174 	add	c_3,t_2,c_3
   1175 	srlx	t_1,32,c_12
   1176 	stuw	t_1,rp(2)	!r[2]=c3;
   1177 	or	c_12,c_3,c_12
   1178 
   1179 	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
   1180 	addcc	c_12,t_1,c_12
   1181 	clr	c_3
   1182 	bcs,a	%xcc,.+8
   1183 	add	c_3,t_2,c_3
   1184 	addcc	c_12,t_1,c_12
   1185 	bcs,a	%xcc,.+8
   1186 	add	c_3,t_2,c_3
   1187 	lduw	ap(4),a_4
   1188 	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
   1189 	addcc	c_12,t_1,c_12
   1190 	bcs,a	%xcc,.+8
   1191 	add	c_3,t_2,c_3
   1192 	addcc	c_12,t_1,t_1
   1193 	bcs,a	%xcc,.+8
   1194 	add	c_3,t_2,c_3
   1195 	srlx	t_1,32,c_12
    1196 	stuw	t_1,rp(3)	!r[3]=c1;
   1197 	or	c_12,c_3,c_12
   1198 
   1199 	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
   1200 	addcc	c_12,t_1,c_12
   1201 	clr	c_3
   1202 	bcs,a	%xcc,.+8
   1203 	add	c_3,t_2,c_3
   1204 	addcc	c_12,t_1,c_12
   1205 	bcs,a	%xcc,.+8
   1206 	add	c_3,t_2,c_3
   1207 	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
   1208 	addcc	c_12,t_1,c_12
   1209 	bcs,a	%xcc,.+8
   1210 	add	c_3,t_2,c_3
   1211 	addcc	c_12,t_1,c_12
   1212 	bcs,a	%xcc,.+8
   1213 	add	c_3,t_2,c_3
   1214 	lduw	ap(5),a_5
   1215 	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
   1216 	addcc	c_12,t_1,t_1
   1217 	bcs,a	%xcc,.+8
   1218 	add	c_3,t_2,c_3
   1219 	srlx	t_1,32,c_12
   1220 	stuw	t_1,rp(4)	!r[4]=c2;
   1221 	or	c_12,c_3,c_12
   1222 
   1223 	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
   1224 	addcc	c_12,t_1,c_12
   1225 	clr	c_3
   1226 	bcs,a	%xcc,.+8
   1227 	add	c_3,t_2,c_3
   1228 	addcc	c_12,t_1,c_12
   1229 	bcs,a	%xcc,.+8
   1230 	add	c_3,t_2,c_3
   1231 	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
   1232 	addcc	c_12,t_1,c_12
   1233 	bcs,a	%xcc,.+8
   1234 	add	c_3,t_2,c_3
   1235 	addcc	c_12,t_1,c_12
   1236 	bcs,a	%xcc,.+8
   1237 	add	c_3,t_2,c_3
   1238 	lduw	ap(6),a_6
   1239 	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
   1240 	addcc	c_12,t_1,c_12
   1241 	bcs,a	%xcc,.+8
   1242 	add	c_3,t_2,c_3
   1243 	addcc	c_12,t_1,t_1
   1244 	bcs,a	%xcc,.+8
   1245 	add	c_3,t_2,c_3
   1246 	srlx	t_1,32,c_12
   1247 	stuw	t_1,rp(5)	!r[5]=c3;
   1248 	or	c_12,c_3,c_12
   1249 
   1250 	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
   1251 	addcc	c_12,t_1,c_12
   1252 	clr	c_3
   1253 	bcs,a	%xcc,.+8
   1254 	add	c_3,t_2,c_3
   1255 	addcc	c_12,t_1,c_12
   1256 	bcs,a	%xcc,.+8
   1257 	add	c_3,t_2,c_3
   1258 	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
   1259 	addcc	c_12,t_1,c_12
   1260 	bcs,a	%xcc,.+8
   1261 	add	c_3,t_2,c_3
   1262 	addcc	c_12,t_1,c_12
   1263 	bcs,a	%xcc,.+8
   1264 	add	c_3,t_2,c_3
   1265 	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
   1266 	addcc	c_12,t_1,c_12
   1267 	bcs,a	%xcc,.+8
   1268 	add	c_3,t_2,c_3
   1269 	addcc	c_12,t_1,c_12
   1270 	bcs,a	%xcc,.+8
   1271 	add	c_3,t_2,c_3
   1272 	lduw	ap(7),a_7
   1273 	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
   1274 	addcc	c_12,t_1,t_1
   1275 	bcs,a	%xcc,.+8
   1276 	add	c_3,t_2,c_3
   1277 	srlx	t_1,32,c_12
   1278 	stuw	t_1,rp(6)	!r[6]=c1;
   1279 	or	c_12,c_3,c_12
   1280 
   1281 	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
   1282 	addcc	c_12,t_1,c_12
   1283 	clr	c_3
   1284 	bcs,a	%xcc,.+8
   1285 	add	c_3,t_2,c_3
   1286 	addcc	c_12,t_1,c_12
   1287 	bcs,a	%xcc,.+8
   1288 	add	c_3,t_2,c_3
   1289 	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
   1290 	addcc	c_12,t_1,c_12
   1291 	bcs,a	%xcc,.+8
   1292 	add	c_3,t_2,c_3
   1293 	addcc	c_12,t_1,c_12
   1294 	bcs,a	%xcc,.+8
   1295 	add	c_3,t_2,c_3
   1296 	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
   1297 	addcc	c_12,t_1,c_12
   1298 	bcs,a	%xcc,.+8
   1299 	add	c_3,t_2,c_3
   1300 	addcc	c_12,t_1,c_12
   1301 	bcs,a	%xcc,.+8
   1302 	add	c_3,t_2,c_3
   1303 	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
   1304 	addcc	c_12,t_1,c_12
   1305 	bcs,a	%xcc,.+8
   1306 	add	c_3,t_2,c_3
   1307 	addcc	c_12,t_1,t_1
   1308 	bcs,a	%xcc,.+8
   1309 	add	c_3,t_2,c_3
   1310 	srlx	t_1,32,c_12
   1311 	stuw	t_1,rp(7)	!r[7]=c2;
   1312 	or	c_12,c_3,c_12
   1313 
   1314 	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
   1315 	addcc	c_12,t_1,c_12
   1316 	clr	c_3
   1317 	bcs,a	%xcc,.+8
   1318 	add	c_3,t_2,c_3
   1319 	addcc	c_12,t_1,c_12
   1320 	bcs,a	%xcc,.+8
   1321 	add	c_3,t_2,c_3
   1322 	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
   1323 	addcc	c_12,t_1,c_12
   1324 	bcs,a	%xcc,.+8
   1325 	add	c_3,t_2,c_3
   1326 	addcc	c_12,t_1,c_12
   1327 	bcs,a	%xcc,.+8
   1328 	add	c_3,t_2,c_3
   1329 	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
   1330 	addcc	c_12,t_1,c_12
   1331 	bcs,a	%xcc,.+8
   1332 	add	c_3,t_2,c_3
   1333 	addcc	c_12,t_1,c_12
   1334 	bcs,a	%xcc,.+8
   1335 	add	c_3,t_2,c_3
   1336 	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
   1337 	addcc	c_12,t_1,t_1
   1338 	bcs,a	%xcc,.+8
   1339 	add	c_3,t_2,c_3
   1340 	srlx	t_1,32,c_12
   1341 	stuw	t_1,rp(8)	!r[8]=c3;
   1342 	or	c_12,c_3,c_12
   1343 
   1344 	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
   1345 	addcc	c_12,t_1,c_12
   1346 	clr	c_3
   1347 	bcs,a	%xcc,.+8
   1348 	add	c_3,t_2,c_3
   1349 	addcc	c_12,t_1,c_12
   1350 	bcs,a	%xcc,.+8
   1351 	add	c_3,t_2,c_3
   1352 	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
   1353 	addcc	c_12,t_1,c_12
   1354 	bcs,a	%xcc,.+8
   1355 	add	c_3,t_2,c_3
   1356 	addcc	c_12,t_1,c_12
   1357 	bcs,a	%xcc,.+8
   1358 	add	c_3,t_2,c_3
   1359 	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
   1360 	addcc	c_12,t_1,c_12
   1361 	bcs,a	%xcc,.+8
   1362 	add	c_3,t_2,c_3
   1363 	addcc	c_12,t_1,t_1
   1364 	bcs,a	%xcc,.+8
   1365 	add	c_3,t_2,c_3
   1366 	srlx	t_1,32,c_12
   1367 	stuw	t_1,rp(9)	!r[9]=c1;
   1368 	or	c_12,c_3,c_12
   1369 
   1370 	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
   1371 	addcc	c_12,t_1,c_12
   1372 	clr	c_3
   1373 	bcs,a	%xcc,.+8
   1374 	add	c_3,t_2,c_3
   1375 	addcc	c_12,t_1,c_12
   1376 	bcs,a	%xcc,.+8
   1377 	add	c_3,t_2,c_3
   1378 	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
   1379 	addcc	c_12,t_1,c_12
   1380 	bcs,a	%xcc,.+8
   1381 	add	c_3,t_2,c_3
   1382 	addcc	c_12,t_1,c_12
   1383 	bcs,a	%xcc,.+8
   1384 	add	c_3,t_2,c_3
   1385 	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
   1386 	addcc	c_12,t_1,t_1
   1387 	bcs,a	%xcc,.+8
   1388 	add	c_3,t_2,c_3
   1389 	srlx	t_1,32,c_12
   1390 	stuw	t_1,rp(10)	!r[10]=c2;
   1391 	or	c_12,c_3,c_12
   1392 
   1393 	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
   1394 	addcc	c_12,t_1,c_12
   1395 	clr	c_3
   1396 	bcs,a	%xcc,.+8
   1397 	add	c_3,t_2,c_3
   1398 	addcc	c_12,t_1,c_12
   1399 	bcs,a	%xcc,.+8
   1400 	add	c_3,t_2,c_3
   1401 	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
   1402 	addcc	c_12,t_1,c_12
   1403 	bcs,a	%xcc,.+8
   1404 	add	c_3,t_2,c_3
   1405 	addcc	c_12,t_1,t_1
   1406 	bcs,a	%xcc,.+8
   1407 	add	c_3,t_2,c_3
   1408 	srlx	t_1,32,c_12
   1409 	stuw	t_1,rp(11)	!r[11]=c3;
   1410 	or	c_12,c_3,c_12
   1411 
   1412 	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
   1413 	addcc	c_12,t_1,c_12
   1414 	clr	c_3
   1415 	bcs,a	%xcc,.+8
   1416 	add	c_3,t_2,c_3
   1417 	addcc	c_12,t_1,c_12
   1418 	bcs,a	%xcc,.+8
   1419 	add	c_3,t_2,c_3
   1420 	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
   1421 	addcc	c_12,t_1,t_1
   1422 	bcs,a	%xcc,.+8
   1423 	add	c_3,t_2,c_3
   1424 	srlx	t_1,32,c_12
   1425 	stuw	t_1,rp(12)	!r[12]=c1;
   1426 	or	c_12,c_3,c_12
   1427 
   1428 	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
   1429 	addcc	c_12,t_1,c_12
   1430 	clr	c_3
   1431 	bcs,a	%xcc,.+8
   1432 	add	c_3,t_2,c_3
   1433 	addcc	c_12,t_1,t_1
   1434 	bcs,a	%xcc,.+8
   1435 	add	c_3,t_2,c_3
   1436 	srlx	t_1,32,c_12
   1437 	stuw	t_1,rp(13)	!r[13]=c2;
   1438 	or	c_12,c_3,c_12
   1439 
   1440 	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
   1441 	addcc	c_12,t_1,t_1
   1442 	srlx	t_1,32,c_12
   1443 	stuw	t_1,rp(14)	!r[14]=c3;
   1444 	stuw	c_12,rp(15)	!r[15]=c1;
   1445 
   1446 	ret
   1447 	restore	%g0,%g0,%o0
   1448 
   1449 .type	bn_sqr_comba8,#function
   1450 .size	bn_sqr_comba8,(.-bn_sqr_comba8)
   1451 
   1452 .align	32
   1453 
   1454 .global bn_sqr_comba4
   1455 /*
   1456  * void bn_sqr_comba4(r,a)
   1457  * BN_ULONG *r,*a;
   1458  */
   1459 bn_sqr_comba4:
   1460 	save	%sp,FRAME_SIZE,%sp
   1461 	mov	1,t_2
   1462 	lduw	ap(0),a_0
   1463 	sllx	t_2,32,t_2
   1464 	lduw	ap(1),a_1
   1465 	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
   1466 	srlx	t_1,32,c_12
   1467 	stuw	t_1,rp(0)	!r[0]=c1;
   1468 
   1469 	lduw	ap(2),a_2
   1470 	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
   1471 	addcc	c_12,t_1,c_12
   1472 	clr	c_3
   1473 	bcs,a	%xcc,.+8
   1474 	add	c_3,t_2,c_3
   1475 	addcc	c_12,t_1,t_1
   1476 	bcs,a	%xcc,.+8
   1477 	add	c_3,t_2,c_3
   1478 	srlx	t_1,32,c_12
   1479 	stuw	t_1,rp(1)	!r[1]=c2;
   1480 	or	c_12,c_3,c_12
   1481 
   1482 	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
   1483 	addcc	c_12,t_1,c_12
   1484 	clr	c_3
   1485 	bcs,a	%xcc,.+8
   1486 	add	c_3,t_2,c_3
   1487 	addcc	c_12,t_1,c_12
   1488 	bcs,a	%xcc,.+8
   1489 	add	c_3,t_2,c_3
   1490 	lduw	ap(3),a_3
   1491 	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
   1492 	addcc	c_12,t_1,t_1
   1493 	bcs,a	%xcc,.+8
   1494 	add	c_3,t_2,c_3
   1495 	srlx	t_1,32,c_12
   1496 	stuw	t_1,rp(2)	!r[2]=c3;
   1497 	or	c_12,c_3,c_12
   1498 
   1499 	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
   1500 	addcc	c_12,t_1,c_12
   1501 	clr	c_3
   1502 	bcs,a	%xcc,.+8
   1503 	add	c_3,t_2,c_3
   1504 	addcc	c_12,t_1,c_12
   1505 	bcs,a	%xcc,.+8
   1506 	add	c_3,t_2,c_3
   1507 	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
   1508 	addcc	c_12,t_1,c_12
   1509 	bcs,a	%xcc,.+8
   1510 	add	c_3,t_2,c_3
   1511 	addcc	c_12,t_1,t_1
   1512 	bcs,a	%xcc,.+8
   1513 	add	c_3,t_2,c_3
   1514 	srlx	t_1,32,c_12
   1515 	stuw	t_1,rp(3)	!r[3]=c1;
   1516 	or	c_12,c_3,c_12
   1517 
   1518 	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
   1519 	addcc	c_12,t_1,c_12
   1520 	clr	c_3
   1521 	bcs,a	%xcc,.+8
   1522 	add	c_3,t_2,c_3
   1523 	addcc	c_12,t_1,c_12
   1524 	bcs,a	%xcc,.+8
   1525 	add	c_3,t_2,c_3
   1526 	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
   1527 	addcc	c_12,t_1,t_1
   1528 	bcs,a	%xcc,.+8
   1529 	add	c_3,t_2,c_3
   1530 	srlx	t_1,32,c_12
   1531 	stuw	t_1,rp(4)	!r[4]=c2;
   1532 	or	c_12,c_3,c_12
   1533 
   1534 	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
   1535 	addcc	c_12,t_1,c_12
   1536 	clr	c_3
   1537 	bcs,a	%xcc,.+8
   1538 	add	c_3,t_2,c_3
   1539 	addcc	c_12,t_1,t_1
   1540 	bcs,a	%xcc,.+8
   1541 	add	c_3,t_2,c_3
   1542 	srlx	t_1,32,c_12
   1543 	stuw	t_1,rp(5)	!r[5]=c3;
   1544 	or	c_12,c_3,c_12
   1545 
   1546 	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
   1547 	addcc	c_12,t_1,t_1
   1548 	srlx	t_1,32,c_12
   1549 	stuw	t_1,rp(6)	!r[6]=c1;
   1550 	stuw	c_12,rp(7)	!r[7]=c2;
   1551 
   1552 	ret
   1553 	restore	%g0,%g0,%o0
   1554 
   1555 .type	bn_sqr_comba4,#function
   1556 .size	bn_sqr_comba4,(.-bn_sqr_comba4)
   1557 
   1558 .align	32
   1559