Home | History | Annotate | Download | only in asm
      1 .rdata
        /* Module version and author identification strings emitted as
         * NUL-terminated data in the .rdata section. */
      2 .asciiz	"mips3.s, Version 1.1"
      3 .asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro (at) fy.chalmers.se>"
      4 
      5 /*
      6  * ====================================================================
      7  * Written by Andy Polyakov <appro (at) fy.chalmers.se> for the OpenSSL
      8  * project.
      9  *
     10  * Rights for redistribution and usage in source and binary forms are
     11  * granted according to the OpenSSL license. Warranty of any kind is
     12  * disclaimed.
     13  * ====================================================================
     14  */
     15 
     16 /*
     17  * This is my modest contribution to the OpenSSL project (see
     18  * http://www.openssl.org/ for more information about it) and is
     19  * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
     20  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
     21  *
     22  * The module is designed to work with either of the "new" MIPS ABI(5),
     23  * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
     24  * IRIX 5.x not only because it doesn't support new ABIs but also
     25  * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
     26  * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
     27  * cause illegal instruction exception:-(
     28  *
     29  * In addition the code depends on preprocessor flags set up by MIPSpro
     30  * compiler driver (either as or cc) and therefore (probably?) can't be
     31  * compiled by the GNU assembler. GNU C driver manages fine though...
     32  * I mean as long as -mmips-as is specified or is the default option,
     33  * because then it simply invokes /usr/bin/as which in turn takes
     34  * perfect care of the preprocessor definitions. Another neat feature
     35  * offered by the MIPSpro assembler is an optimization pass. This gave
     36  * me the opportunity to have the code looking more regular as all those
     37  * architecture dependent instruction rescheduling details were left to
     38  * the assembler. Cool, huh?
     39  *
     40  * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
     41  * goes way over 3 times faster!
     42  *
     43  *					<appro (at) fy.chalmers.se>
     44  */
     45 #include <asm.h>
     46 #include <regdef.h>
     47 
     48 #if _MIPS_ISA>=4
        /*
         * MOVNZ(cond,dst,src): copy src into dst when cond is non-zero and
         * leave dst untouched otherwise.
         *
         * With _MIPS_ISA >= 4 this is a single movn.  For older ISAs the
         * same effect is built from bnezl: the move sits in the delay slot
         * of a branch-likely, so it runs only when the branch is taken
         * (cond != 0), and the branch target .+8 is simply the instruction
         * following the move.
         */
     49 #define	MOVNZ(cond,dst,src)	\
     50 	movn	dst,src,cond
     51 #else
     52 #define	MOVNZ(cond,dst,src)	\
     53 	.set	noreorder;	\
     54 	bnezl	cond,.+8;	\
     55 	move	dst,src;	\
     56 	.set	reorder
     57 #endif
     58 
     59 .text
     60 
        /* AT is used explicitly as a scratch register by the routines
         * below; presumably "noat" is what permits that without assembler
         * interference -- confirm against the assembler manual. */
     61 .set	noat
     62 .set	reorder
     63 
        /* MINUS4 (v1) is loaded with -4 by each word-loop routine and used
         * as a mask: "and x,count,MINUS4" clears the two low bits of count,
         * leaving the part of the count handled four words at a time. */
     64 #define	MINUS4	v1
     65 
     66 .align	5
        /*
         * bn_mul_add_words
         *
         * In:   a0 = r (array updated in place), a1 = a (source array),
         *       a2 = number of 64-bit words, a3 = multiplier word w.
         * Out:  v0 = carry word left over after the last word
         *       (0 when a2 <= 0, in which case nothing is read or written).
         *
         * For every word i the 128-bit value a[i]*w + r[i] + carry is
         * formed; its low 64 bits are stored to r[i] and its high 64 bits
         * become the carry into word i+1.  The main loop handles four words
         * per pass, the tail handles the remaining one to three words.
         *
         * Clobbers: v1 (MINUS4), t0-t3, ta0-ta3, AT, HI/LO; a0, a1 and a2
         *           are advanced/decremented.
         * NOTE(review): the register roles presumably mirror the arguments
         * of the C routine of the same name in crypto/bn/bn_asm.c --
         * confirm against that prototype.
         */
     67 LEAF(bn_mul_add_words)
     68 	.set	noreorder
     69 	bgtzl	a2,.L_bn_mul_add_words_proceed	/* count > 0: start working */
     70 	ld	t0,0(a1)	/* delay slot of branch-likely: t0 = a[0], only if taken */
     71 	jr	ra
     72 	move	v0,zero	/* delay slot: count <= 0, return 0 */
     73 	.set	reorder
     74 
     75 .L_bn_mul_add_words_proceed:
     76 	li	MINUS4,-4
     77 	and	ta0,a2,MINUS4	/* ta0 = count with the two low bits cleared */
     78 	move	v0,zero	/* carry = 0 */
     79 	beqz	ta0,.L_bn_mul_add_words_tail	/* fewer than 4 words: tail only */
     80 
        	/* Main loop: four words per pass.  On entry t0 = a[0] and
        	 * v0 = carry.  Loads for later words are interleaved with the
        	 * arithmetic of earlier ones. */
     81 .L_bn_mul_add_words_loop:
     82 	dmultu	t0,a3	/* HI:LO = a[0] * w */
     83 	ld	t1,0(a0)
     84 	ld	t2,8(a1)
     85 	ld	t3,8(a0)
     86 	ld	ta0,16(a1)
     87 	ld	ta1,16(a0)
     88 	daddu	t1,v0	/* t1 = r[0] + carry-in */
     89 	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
     90 				 * values", but it seems to work fine
     91 				 * even on 64-bit registers. */
     92 	mflo	AT
     93 	mfhi	t0
     94 	daddu	t1,AT	/* t1 += low half of the product */
     95 	daddu	v0,t0	/* carry = high half + carry of first add */
     96 	sltu	AT,t1,AT	/* AT = carry out of the second add */
     97 	sd	t1,0(a0)
     98 	daddu	v0,AT	/* carry-out for the next word */
     99 
        	/* word 1: same sequence with t2 (a[1]) and t3 (r[1]) */
    100 	dmultu	t2,a3
    101 	ld	ta2,24(a1)
    102 	ld	ta3,24(a0)
    103 	daddu	t3,v0
    104 	sltu	v0,t3,v0
    105 	mflo	AT
    106 	mfhi	t2
    107 	daddu	t3,AT
    108 	daddu	v0,t2
    109 	sltu	AT,t3,AT
    110 	sd	t3,8(a0)
    111 	daddu	v0,AT
    112 
        	/* word 2: ta0 (a[2]) and ta1 (r[2]).  The count and both
        	 * pointers are advanced here, so the remaining stores use
        	 * negative offsets. */
    113 	dmultu	ta0,a3
    114 	subu	a2,4
    115 	PTR_ADD	a0,32
    116 	PTR_ADD	a1,32
    117 	daddu	ta1,v0
    118 	sltu	v0,ta1,v0
    119 	mflo	AT
    120 	mfhi	ta0
    121 	daddu	ta1,AT
    122 	daddu	v0,ta0
    123 	sltu	AT,ta1,AT
    124 	sd	ta1,-16(a0)
    125 	daddu	v0,AT
    126 
    127 
        	/* word 3: ta2 (a[3]) and ta3 (r[3]); ta0 is recomputed as the
        	 * loop condition (remaining count with low two bits cleared) */
    128 	dmultu	ta2,a3
    129 	and	ta0,a2,MINUS4
    130 	daddu	ta3,v0
    131 	sltu	v0,ta3,v0
    132 	mflo	AT
    133 	mfhi	ta2
    134 	daddu	ta3,AT
    135 	daddu	v0,ta2
    136 	sltu	AT,ta3,AT
    137 	sd	ta3,-8(a0)
    138 	daddu	v0,AT
    139 	.set	noreorder
    140 	bgtzl	ta0,.L_bn_mul_add_words_loop	/* at least 4 words left */
    141 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    142 
    143 	bnezl	a2,.L_bn_mul_add_words_tail	/* 1..3 words left */
    144 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    145 	.set	reorder
    146 
    147 .L_bn_mul_add_words_return:
    148 	jr	ra
    149 
        	/* Tail: up to three words, t0 = a[0] already loaded.  Pointers
        	 * are not advanced; offsets 0, 8 and 16 are used instead. */
    150 .L_bn_mul_add_words_tail:
    151 	dmultu	t0,a3
    152 	ld	t1,0(a0)
    153 	subu	a2,1
    154 	daddu	t1,v0
    155 	sltu	v0,t1,v0
    156 	mflo	AT
    157 	mfhi	t0
    158 	daddu	t1,AT
    159 	daddu	v0,t0
    160 	sltu	AT,t1,AT
    161 	sd	t1,0(a0)
    162 	daddu	v0,AT
    163 	beqz	a2,.L_bn_mul_add_words_return
    164 
    165 	ld	t0,8(a1)
    166 	dmultu	t0,a3
    167 	ld	t1,8(a0)
    168 	subu	a2,1
    169 	daddu	t1,v0
    170 	sltu	v0,t1,v0
    171 	mflo	AT
    172 	mfhi	t0
    173 	daddu	t1,AT
    174 	daddu	v0,t0
    175 	sltu	AT,t1,AT
    176 	sd	t1,8(a0)
    177 	daddu	v0,AT
    178 	beqz	a2,.L_bn_mul_add_words_return
    179 
        	/* third and last possible tail word: no count update needed */
    180 	ld	t0,16(a1)
    181 	dmultu	t0,a3
    182 	ld	t1,16(a0)
    183 	daddu	t1,v0
    184 	sltu	v0,t1,v0
    185 	mflo	AT
    186 	mfhi	t0
    187 	daddu	t1,AT
    188 	daddu	v0,t0
    189 	sltu	AT,t1,AT
    190 	sd	t1,16(a0)
    191 	daddu	v0,AT
    192 	jr	ra
    193 END(bn_mul_add_words)
    194 
    195 .align	5
        /*
         * bn_mul_words
         *
         * In:   a0 = r (destination array), a1 = a (source array),
         *       a2 = number of 64-bit words, a3 = multiplier word w.
         * Out:  v0 = carry word left over after the last word
         *       (0 when a2 <= 0, in which case nothing is read or written).
         *
         * For every word i the 128-bit value a[i]*w + carry is formed; its
         * low 64 bits are stored to r[i] and its high 64 bits become the
         * carry into word i+1.  Unlike bn_mul_add_words the previous
         * contents of r are never read.  Four words per main-loop pass,
         * then a tail of one to three words.
         *
         * Clobbers: v1 (MINUS4), t0-t3, ta0-ta3, AT, HI/LO; a0, a1 and a2
         *           are advanced/decremented.
         */
    196 LEAF(bn_mul_words)
    197 	.set	noreorder
    198 	bgtzl	a2,.L_bn_mul_words_proceed	/* count > 0: start working */
    199 	ld	t0,0(a1)	/* delay slot of branch-likely: t0 = a[0], only if taken */
    200 	jr	ra
    201 	move	v0,zero	/* delay slot: count <= 0, return 0 */
    202 	.set	reorder
    203 
    204 .L_bn_mul_words_proceed:
    205 	li	MINUS4,-4
    206 	and	ta0,a2,MINUS4	/* ta0 = count with the two low bits cleared */
    207 	move	v0,zero	/* carry = 0 */
    208 	beqz	ta0,.L_bn_mul_words_tail	/* fewer than 4 words: tail only */
    209 
        	/* Main loop: four words per pass; t0 = a[0], v0 = carry */
    210 .L_bn_mul_words_loop:
    211 	dmultu	t0,a3	/* HI:LO = a[0] * w */
    212 	ld	t2,8(a1)
    213 	ld	ta0,16(a1)
    214 	ld	ta2,24(a1)
    215 	mflo	AT
    216 	mfhi	t0
    217 	daddu	v0,AT	/* v0 = carry-in + low half */
    218 	sltu	t1,v0,AT	/* t1 = carry out of that add */
    219 	sd	v0,0(a0)
    220 	daddu	v0,t1,t0	/* carry-out = high half + t1 */
    221 
        	/* word 1; count and pointers are advanced here, so the
        	 * remaining stores use negative offsets */
    222 	dmultu	t2,a3
    223 	subu	a2,4
    224 	PTR_ADD	a0,32
    225 	PTR_ADD	a1,32
    226 	mflo	AT
    227 	mfhi	t2
    228 	daddu	v0,AT
    229 	sltu	t3,v0,AT
    230 	sd	v0,-24(a0)
    231 	daddu	v0,t3,t2
    232 
        	/* word 2 */
    233 	dmultu	ta0,a3
    234 	mflo	AT
    235 	mfhi	ta0
    236 	daddu	v0,AT
    237 	sltu	ta1,v0,AT
    238 	sd	v0,-16(a0)
    239 	daddu	v0,ta1,ta0
    240 
    241 
        	/* word 3; ta0 is recomputed as the loop condition */
    242 	dmultu	ta2,a3
    243 	and	ta0,a2,MINUS4
    244 	mflo	AT
    245 	mfhi	ta2
    246 	daddu	v0,AT
    247 	sltu	ta3,v0,AT
    248 	sd	v0,-8(a0)
    249 	daddu	v0,ta3,ta2
    250 	.set	noreorder
    251 	bgtzl	ta0,.L_bn_mul_words_loop	/* at least 4 words left */
    252 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    253 
    254 	bnezl	a2,.L_bn_mul_words_tail	/* 1..3 words left */
    255 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    256 	.set	reorder
    257 
    258 .L_bn_mul_words_return:
    259 	jr	ra
    260 
        	/* Tail: up to three words, t0 = a[0] already loaded; fixed
        	 * offsets 0, 8 and 16 are used instead of advancing pointers */
    261 .L_bn_mul_words_tail:
    262 	dmultu	t0,a3
    263 	subu	a2,1
    264 	mflo	AT
    265 	mfhi	t0
    266 	daddu	v0,AT
    267 	sltu	t1,v0,AT
    268 	sd	v0,0(a0)
    269 	daddu	v0,t1,t0
    270 	beqz	a2,.L_bn_mul_words_return
    271 
    272 	ld	t0,8(a1)
    273 	dmultu	t0,a3
    274 	subu	a2,1
    275 	mflo	AT
    276 	mfhi	t0
    277 	daddu	v0,AT
    278 	sltu	t1,v0,AT
    279 	sd	v0,8(a0)
    280 	daddu	v0,t1,t0
    281 	beqz	a2,.L_bn_mul_words_return
    282 
        	/* third and last possible tail word */
    283 	ld	t0,16(a1)
    284 	dmultu	t0,a3
    285 	mflo	AT
    286 	mfhi	t0
    287 	daddu	v0,AT
    288 	sltu	t1,v0,AT
    289 	sd	v0,16(a0)
    290 	daddu	v0,t1,t0
    291 	jr	ra
    292 END(bn_mul_words)
    293 
    294 .align	5
        /*
         * bn_sqr_words
         *
         * In:   a0 = r (destination array, two words written per input
         *       word), a1 = a (source array), a2 = number of 64-bit words.
         * Out:  v0 = 0 on every return path.
         *
         * For every word i the 128-bit square a[i]*a[i] is computed; the
         * low half is stored to r[2*i] and the high half to r[2*i+1].  No
         * carry is propagated between words.  Four input words per
         * main-loop pass (a0 advances by 64 bytes, a1 by 32), then a tail
         * of one to three words.
         *
         * Clobbers: v1 (MINUS4), t0-t3, ta0-ta3, HI/LO; a0, a1 and a2 are
         *           advanced/decremented.
         */
    295 LEAF(bn_sqr_words)
    296 	.set	noreorder
    297 	bgtzl	a2,.L_bn_sqr_words_proceed	/* count > 0: start working */
    298 	ld	t0,0(a1)	/* delay slot of branch-likely: t0 = a[0], only if taken */
    299 	jr	ra
    300 	move	v0,zero	/* delay slot: count <= 0, return 0 */
    301 	.set	reorder
    302 
    303 .L_bn_sqr_words_proceed:
    304 	li	MINUS4,-4
    305 	and	ta0,a2,MINUS4	/* ta0 = count with the two low bits cleared */
    306 	move	v0,zero
    307 	beqz	ta0,.L_bn_sqr_words_tail	/* fewer than 4 words: tail only */
    308 
        	/* Main loop: four input words per pass; t0 = a[0] on entry */
    309 .L_bn_sqr_words_loop:
    310 	dmultu	t0,t0	/* HI:LO = a[0]^2 */
    311 	ld	t2,8(a1)
    312 	ld	ta0,16(a1)
    313 	ld	ta2,24(a1)
    314 	mflo	t1
    315 	mfhi	t0
    316 	sd	t1,0(a0)	/* r[0] = low half */
    317 	sd	t0,8(a0)	/* r[1] = high half */
    318 
        	/* a[1]; count and pointers are advanced here, so the remaining
        	 * stores use negative offsets from the new a0 */
    319 	dmultu	t2,t2
    320 	subu	a2,4
    321 	PTR_ADD	a0,64
    322 	PTR_ADD	a1,32
    323 	mflo	t3
    324 	mfhi	t2
    325 	sd	t3,-48(a0)
    326 	sd	t2,-40(a0)
    327 
        	/* a[2] */
    328 	dmultu	ta0,ta0
    329 	mflo	ta1
    330 	mfhi	ta0
    331 	sd	ta1,-32(a0)
    332 	sd	ta0,-24(a0)
    333 
    334 
        	/* a[3]; ta0 is recomputed as the loop condition */
    335 	dmultu	ta2,ta2
    336 	and	ta0,a2,MINUS4
    337 	mflo	ta3
    338 	mfhi	ta2
    339 	sd	ta3,-16(a0)
    340 	sd	ta2,-8(a0)
    341 
    342 	.set	noreorder
    343 	bgtzl	ta0,.L_bn_sqr_words_loop	/* at least 4 words left */
    344 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    345 
    346 	bnezl	a2,.L_bn_sqr_words_tail	/* 1..3 words left */
    347 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    348 	.set	reorder
    349 
    350 .L_bn_sqr_words_return:
    351 	move	v0,zero
    352 	jr	ra
    353 
        	/* Tail: up to three words, t0 = a[0] already loaded; results go
        	 * to byte offsets 0/8, 16/24 and 32/40 of a0 */
    354 .L_bn_sqr_words_tail:
    355 	dmultu	t0,t0
    356 	subu	a2,1
    357 	mflo	t1
    358 	mfhi	t0
    359 	sd	t1,0(a0)
    360 	sd	t0,8(a0)
    361 	beqz	a2,.L_bn_sqr_words_return
    362 
    363 	ld	t0,8(a1)
    364 	dmultu	t0,t0
    365 	subu	a2,1
    366 	mflo	t1
    367 	mfhi	t0
    368 	sd	t1,16(a0)
    369 	sd	t0,24(a0)
    370 	beqz	a2,.L_bn_sqr_words_return
    371 
        	/* third and last possible tail word; v0 is still the zero set
        	 * at .L_bn_sqr_words_proceed */
    372 	ld	t0,16(a1)
    373 	dmultu	t0,t0
    374 	mflo	t1
    375 	mfhi	t0
    376 	sd	t1,32(a0)
    377 	sd	t0,40(a0)
    378 	jr	ra
    379 END(bn_sqr_words)
    380 
    381 .align	5
        /*
         * bn_add_words
         *
         * In:   a0 = r (destination array), a1 = a, a2 = b (source arrays),
         *       a3 = number of 64-bit words.
         * Out:  v0 = carry out of the last word
         *       (0 when a3 <= 0, in which case nothing is read or written).
         *
         * Computes r[i] = a[i] + b[i] + carry for each word, propagating
         * the carry from word to word.  Each word needs two additions
         * (a+b, then +carry-in); the carry of each is detected with sltu
         * and the two are summed to form the carry-out.  Four words per
         * main-loop pass, then a tail of one to three words.
         *
         * Clobbers: v1 (MINUS4), t0-t3, ta0-ta3, t8, t9, AT; a0-a3 are
         *           advanced/decremented.
         */
    382 LEAF(bn_add_words)
    383 	.set	noreorder
    384 	bgtzl	a3,.L_bn_add_words_proceed	/* count > 0: start working */
    385 	ld	t0,0(a1)	/* delay slot of branch-likely: t0 = a[0], only if taken */
    386 	jr	ra
    387 	move	v0,zero	/* delay slot: count <= 0, return 0 */
    388 	.set	reorder
    389 
    390 .L_bn_add_words_proceed:
    391 	li	MINUS4,-4
    392 	and	AT,a3,MINUS4	/* AT = count with the two low bits cleared */
    393 	move	v0,zero	/* carry = 0 */
    394 	beqz	AT,.L_bn_add_words_tail	/* fewer than 4 words: tail only */
    395 
        	/* Main loop: loads, the count update and the pointer updates
        	 * are interleaved up front; stores therefore use negative
        	 * offsets.  AT becomes the loop condition for the bottom. */
    396 .L_bn_add_words_loop:
    397 	ld	ta0,0(a2)
    398 	subu	a3,4
    399 	ld	t1,8(a1)
    400 	and	AT,a3,MINUS4
    401 	ld	t2,16(a1)
    402 	PTR_ADD	a2,32
    403 	ld	t3,24(a1)
    404 	PTR_ADD	a0,32
    405 	ld	ta1,-24(a2)
    406 	PTR_ADD	a1,32
    407 	ld	ta2,-16(a2)
    408 	ld	ta3,-8(a2)
    409 	daddu	ta0,t0	/* ta0 = b[0] + a[0] */
    410 	sltu	t8,ta0,t0	/* t8 = carry out of a + b */
    411 	daddu	t0,ta0,v0	/* t0 = a + b + carry-in */
    412 	sltu	v0,t0,ta0	/* v0 = carry out of adding carry-in */
    413 	sd	t0,-32(a0)
    414 	daddu	v0,t8	/* carry-out for the next word */
    415 
        	/* word 1 */
    416 	daddu	ta1,t1
    417 	sltu	t9,ta1,t1
    418 	daddu	t1,ta1,v0
    419 	sltu	v0,t1,ta1
    420 	sd	t1,-24(a0)
    421 	daddu	v0,t9
    422 
        	/* word 2 */
    423 	daddu	ta2,t2
    424 	sltu	t8,ta2,t2
    425 	daddu	t2,ta2,v0
    426 	sltu	v0,t2,ta2
    427 	sd	t2,-16(a0)
    428 	daddu	v0,t8
    429 
        	/* word 3 */
    430 	daddu	ta3,t3
    431 	sltu	t9,ta3,t3
    432 	daddu	t3,ta3,v0
    433 	sltu	v0,t3,ta3
    434 	sd	t3,-8(a0)
    435 	daddu	v0,t9
    436 
    437 	.set	noreorder
    438 	bgtzl	AT,.L_bn_add_words_loop	/* at least 4 words left */
    439 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    440 
    441 	bnezl	a3,.L_bn_add_words_tail	/* 1..3 words left */
    442 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    443 	.set	reorder
    444 
    445 .L_bn_add_words_return:
    446 	jr	ra
    447 
        	/* Tail: up to three words, t0 = a[0] already loaded; fixed
        	 * offsets 0, 8 and 16 are used instead of advancing pointers */
    448 .L_bn_add_words_tail:
    449 	ld	ta0,0(a2)
    450 	daddu	ta0,t0
    451 	subu	a3,1
    452 	sltu	t8,ta0,t0
    453 	daddu	t0,ta0,v0
    454 	sltu	v0,t0,ta0
    455 	sd	t0,0(a0)
    456 	daddu	v0,t8
    457 	beqz	a3,.L_bn_add_words_return
    458 
    459 	ld	t1,8(a1)
    460 	ld	ta1,8(a2)
    461 	daddu	ta1,t1
    462 	subu	a3,1
    463 	sltu	t9,ta1,t1
    464 	daddu	t1,ta1,v0
    465 	sltu	v0,t1,ta1
    466 	sd	t1,8(a0)
    467 	daddu	v0,t9
    468 	beqz	a3,.L_bn_add_words_return
    469 
        	/* third and last possible tail word */
    470 	ld	t2,16(a1)
    471 	ld	ta2,16(a2)
    472 	daddu	ta2,t2
    473 	sltu	t8,ta2,t2
    474 	daddu	t2,ta2,v0
    475 	sltu	v0,t2,ta2
    476 	sd	t2,16(a0)
    477 	daddu	v0,t8
    478 	jr	ra
    479 END(bn_add_words)
    480 
    481 .align	5
        /*
         * bn_sub_words
         *
         * In:   a0 = r (destination array), a1 = a, a2 = b (source arrays),
         *       a3 = number of 64-bit words.
         * Out:  v0 = borrow out of the last word
         *       (0 when a3 <= 0, in which case nothing is read or written).
         *
         * Computes r[i] = a[i] - b[i] - borrow for each word.  The
         * borrow-out of a word is chosen with MOVNZ: when a[i] - b[i] is
         * non-zero the new borrow is (a[i] < b[i]); when the difference is
         * zero the incoming borrow is passed through unchanged.
         * Four words per main-loop pass, then a tail of one to three words.
         *
         * Clobbers: v1 (MINUS4), t0-t3, ta0-ta3, t8, t9, AT; a0-a3 are
         *           advanced/decremented.
         */
    482 LEAF(bn_sub_words)
    483 	.set	noreorder
    484 	bgtzl	a3,.L_bn_sub_words_proceed	/* count > 0: start working */
    485 	ld	t0,0(a1)	/* delay slot of branch-likely: t0 = a[0], only if taken */
    486 	jr	ra
    487 	move	v0,zero	/* delay slot: count <= 0, return 0 */
    488 	.set	reorder
    489 
    490 .L_bn_sub_words_proceed:
    491 	li	MINUS4,-4
    492 	and	AT,a3,MINUS4	/* AT = count with the two low bits cleared */
    493 	move	v0,zero	/* borrow = 0 */
    494 	beqz	AT,.L_bn_sub_words_tail	/* fewer than 4 words: tail only */
    495 
        	/* Main loop: loads, the count update and the pointer updates
        	 * are interleaved up front; stores therefore use negative
        	 * offsets.  AT becomes the loop condition for the bottom. */
    496 .L_bn_sub_words_loop:
    497 	ld	ta0,0(a2)
    498 	subu	a3,4
    499 	ld	t1,8(a1)
    500 	and	AT,a3,MINUS4
    501 	ld	t2,16(a1)
    502 	PTR_ADD	a2,32
    503 	ld	t3,24(a1)
    504 	PTR_ADD	a0,32
    505 	ld	ta1,-24(a2)
    506 	PTR_ADD	a1,32
    507 	ld	ta2,-16(a2)
    508 	ld	ta3,-8(a2)
    509 	sltu	t8,t0,ta0	/* t8 = (a[0] < b[0]) */
    510 	dsubu	t0,ta0	/* t0 = a[0] - b[0] */
    511 	dsubu	ta0,t0,v0	/* ta0 = a[0] - b[0] - borrow-in */
    512 	sd	ta0,-32(a0)
        	/* borrow-out = t8 if t0 != 0, else borrow-in is kept */
    513 	MOVNZ	(t0,v0,t8)
    514 
        	/* word 1 */
    515 	sltu	t9,t1,ta1
    516 	dsubu	t1,ta1
    517 	dsubu	ta1,t1,v0
    518 	sd	ta1,-24(a0)
    519 	MOVNZ	(t1,v0,t9)
    520 
    521 
        	/* word 2 */
    522 	sltu	t8,t2,ta2
    523 	dsubu	t2,ta2
    524 	dsubu	ta2,t2,v0
    525 	sd	ta2,-16(a0)
    526 	MOVNZ	(t2,v0,t8)
    527 
        	/* word 3 */
    528 	sltu	t9,t3,ta3
    529 	dsubu	t3,ta3
    530 	dsubu	ta3,t3,v0
    531 	sd	ta3,-8(a0)
    532 	MOVNZ	(t3,v0,t9)
    533 
    534 	.set	noreorder
    535 	bgtzl	AT,.L_bn_sub_words_loop	/* at least 4 words left */
    536 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    537 
    538 	bnezl	a3,.L_bn_sub_words_tail	/* 1..3 words left */
    539 	ld	t0,0(a1)	/* delay slot: preload next a[0], only if taken */
    540 	.set	reorder
    541 
    542 .L_bn_sub_words_return:
    543 	jr	ra
    544 
        	/* Tail: up to three words, t0 = a[0] already loaded; fixed
        	 * offsets 0, 8 and 16 are used instead of advancing pointers */
    545 .L_bn_sub_words_tail:
    546 	ld	ta0,0(a2)
    547 	subu	a3,1
    548 	sltu	t8,t0,ta0
    549 	dsubu	t0,ta0
    550 	dsubu	ta0,t0,v0
    551 	MOVNZ	(t0,v0,t8)
    552 	sd	ta0,0(a0)
    553 	beqz	a3,.L_bn_sub_words_return
    554 
    555 	ld	t1,8(a1)
    556 	subu	a3,1
    557 	ld	ta1,8(a2)
    558 	sltu	t9,t1,ta1
    559 	dsubu	t1,ta1
    560 	dsubu	ta1,t1,v0
    561 	MOVNZ	(t1,v0,t9)
    562 	sd	ta1,8(a0)
    563 	beqz	a3,.L_bn_sub_words_return
    564 
        	/* third and last possible tail word */
    565 	ld	t2,16(a1)
    566 	ld	ta2,16(a2)
    567 	sltu	t8,t2,ta2
    568 	dsubu	t2,ta2
    569 	dsubu	ta2,t2,v0
    570 	MOVNZ	(t2,v0,t8)
    571 	sd	ta2,16(a0)
    572 	jr	ra
    573 END(bn_sub_words)
    574 
    575 #undef	MINUS4
    576 
    577 .align 5
        /*
         * bn_div_3_words
         *
         * In:   a0 = pointer m into a word array; m[0], m[-1] and m[-2]
         *       (byte offsets 0, -8, -16) are read,
         *       a1 = d1, a2 = d0.
         * Out:  v0 = quotient estimate; all ones (-1) when m[0] == d0.
         *
         * When m[0] != d0 the routine calls bn_div_words(m[0], m[-1], d0)
         * to get a first quotient q (v0) and remainder (v1), forms the
         * 128-bit product d1*q, and then lowers q by one per pass of the
         * correction loop for as long as that product exceeds the 128-bit
         * value remainder:m[-2].  On every pass d1 is subtracted from the
         * product and d0 is added to the remainder; the loop also stops
         * as soon as the remainder wraps around.
         *
         * No stack frame is used: the pointer, d1 and the return address
         * are parked in a3, ta2 and ta3 across the call.
         * Clobbers: a0, a1, a3, v1, t0-t3, t8, t9, ta0-ta3, AT, HI/LO
         *           (including what bn_div_words clobbers).
         */
    578 LEAF(bn_div_3_words)
    579 	.set	reorder
    580 	move	a3,a0		/* we know that bn_div_words doesn't
    581 				 * touch a3, ta2, ta3 and preserves a2
    582 				 * so that we can save two arguments
    583 				 * and return address in registers
    584 				 * instead of stack:-)
    585 				 */
    586 	ld	a0,(a3)	/* a0 = m[0] */
    587 	move	ta2,a1	/* ta2 = d1 */
    588 	ld	a1,-8(a3)	/* a1 = m[-1] */
    589 	bne	a0,a2,.L_bn_div_3_words_proceed
    590 	li	v0,-1	/* m[0] == d0: return all ones */
    591 	jr	ra
    592 .L_bn_div_3_words_proceed:
    593 	move	ta3,ra	/* keep the return address in a register */
    594 	bal	bn_div_words	/* v0 = quotient, v1 = remainder */
    595 	move	ra,ta3
    596 	dmultu	ta2,v0	/* HI:LO = d1 * q */
    597 	ld	t2,-16(a3)	/* t2 = m[-2] */
        	/* NOTE(review): ta0 is rewritten by the sltu inside the loop
        	 * before it is ever read, so this clear looks redundant. */
    598 	move	ta0,zero
    599 	mfhi	t1
    600 	mflo	t0
    601 	sltu	t8,t1,v1	/* t8 = (high half of product < remainder) */
    602 .L_bn_div_3_words_inner_loop:
    603 	bnez	t8,.L_bn_div_3_words_inner_loop_done
    604 	sgeu	AT,t2,t0	/* AT = (m[-2] >= low half of product) */
    605 	seq	t9,t1,v1	/* t9 = (high half == remainder) */
    606 	and	AT,t9	/* AT != 0: product <= remainder:m[-2], q is final */
    607 	sltu	t3,t0,ta2	/* borrow for product -= d1 */
    608 	daddu	v1,a2	/* remainder += d0 */
    609 	dsubu	t1,t3
    610 	dsubu	t0,ta2
    611 	sltu	t8,t1,v1	/* exit test for the next pass ... */
    612 	sltu	ta0,v1,a2	/* ... or remainder wrapped around */
    613 	or	t8,ta0
    614 	.set	noreorder
    615 	beqzl	AT,.L_bn_div_3_words_inner_loop
    616 	dsubu	v0,1	/* delay slot of branch-likely: q--, only if looping */
    617 	.set	reorder
    618 .L_bn_div_3_words_inner_loop_done:
    619 	jr	ra
    620 END(bn_div_3_words)
    621 
    622 .align	5
        /*
         * bn_div_words
         *
         * In:   a0 = high word and a1 = low word of a 128-bit dividend,
         *       a2 = 64-bit divisor.
         * Out:  v0 = 64-bit quotient, v1 = remainder.  a2 holds its entry
         *       value again on return.  When a2 == 0 the routine returns
         *       v0 = -1 immediately.
         *
         * Method, as coded below:
         *  1. Normalise: shift a2 left until its top bit is set, counting
         *     the shifts in t9, and shift a0:a1 left by the same amount.
         *     If non-zero bits of a0 would be shifted out, "break 6" is
         *     executed.
         *  2. If a0 >= a2, subtract a2 from a0 once.
         *  3. Produce the quotient as two 32-bit digits.  Each digit is
         *     estimated by dividing the current partial remainder by the
         *     top 32 bits of the divisor (or taken as 0xffffffff when the
         *     top halves are equal) and then decremented until
         *     divisor*digit no longer exceeds the corresponding dividend
         *     bits.
         *  4. Undo the normalisation shift on the remainder and on a2.
         *
         * Clobbers: a0, a1, t0-t3, t8, t9, ta0 (QT), ta1 (HH), AT, HI/LO.
         * a3, ta2 and ta3 are not touched (bn_div_3_words relies on this).
         */
    623 LEAF(bn_div_words)
    624 	.set	noreorder
    625 	bnezl	a2,.L_bn_div_words_proceed	/* divisor != 0: carry on */
    626 	move	v1,zero	/* delay slot of branch-likely: only if taken */
    627 	jr	ra
    628 	li	v0,-1		/* I'd rather signal div-by-zero
    629 				 * which can be done with 'break 7' */
    630 
    631 .L_bn_div_words_proceed:
    632 	bltz	a2,.L_bn_div_words_body	/* top bit already set: no shift */
    633 	move	t9,v1	/* delay slot: shift count t9 = 0 (v1 is 0 here) */
    634 	dsll	a2,1
    635 	bgtz	a2,.-4	/* back to the dsll until the top bit is set */
    636 	addu	t9,1	/* delay slot: count this shift */
    637 
    638 	.set	reorder
    639 	negu	t1,t9	/* t1 = -t9, used as shift amount */
    640 	li	t2,-1
    641 	dsll	t2,t1
    642 	and	t2,a0	/* bits of a0 that the normalising shift would drop */
    643 	dsrl	AT,a1,t1	/* bits of a1 that move up into a0 */
    644 	.set	noreorder
    645 	bnezl	t2,.+8	/* delay slot runs only if t2 != 0 */
    646 	break	6		/* signal overflow */
    647 	.set	reorder
    648 	dsll	a0,t9
    649 	dsll	a1,t9
    650 	or	a0,AT
    651 
    652 #define	QT	ta0
    653 #define	HH	ta1
    654 #define	DH	v1
    655 .L_bn_div_words_body:
    656 	dsrl	DH,a2,32	/* DH = top 32 bits of the divisor */
    657 	sgeu	AT,a0,a2
    658 	.set	noreorder
    659 	bnezl	AT,.+8	/* delay slot runs only if a0 >= a2 */
    660 	dsubu	a0,a2
    661 	.set	reorder
    662 
        	/* ---- first (upper) 32-bit quotient digit ---- */
    663 	li	QT,-1
    664 	dsrl	HH,a0,32	/* HH = top 32 bits of the partial remainder */
    665 	dsrl	QT,32	/* q=0xffffffff */
    666 	beq	DH,HH,.L_bn_div_words_skip_div1
    667 	ddivu	zero,a0,DH
    668 	mflo	QT	/* estimate: a0 / DH */
    669 .L_bn_div_words_skip_div1:
    670 	dmultu	a2,QT	/* t1:t0 = divisor * estimate */
    671 	dsll	t3,a0,32
    672 	dsrl	AT,a1,32
    673 	or	t3,AT	/* t3 = low 32 of a0 : high 32 of a1 */
    674 	mflo	t0
    675 	mfhi	t1
        	/* while t1:t0 > HH:t3 { t1:t0 -= divisor; QT-- } */
    676 .L_bn_div_words_inner_loop1:
    677 	sltu	t2,t3,t0
    678 	seq	t8,HH,t1
    679 	sltu	AT,HH,t1
    680 	and	t2,t8
    681 	sltu	v0,t0,a2	/* borrow for t1:t0 -= divisor */
    682 	or	AT,t2	/* AT != 0: estimate still too large */
    683 	.set	noreorder
    684 	beqz	AT,.L_bn_div_words_inner_loop1_done
    685 	dsubu	t1,v0	/* delay slot: always executed */
    686 	dsubu	t0,a2
    687 	b	.L_bn_div_words_inner_loop1
    688 	dsubu	QT,1	/* delay slot: lower the estimate */
    689 	.set	reorder
    690 .L_bn_div_words_inner_loop1_done:
    691 
    692 	dsll	a1,32	/* expose the next 32 dividend bits */
    693 	dsubu	a0,t3,t0	/* new partial remainder */
    694 	dsll	v0,QT,32	/* upper quotient digit into place */
    695 
        	/* ---- second (lower) 32-bit quotient digit ---- */
    696 	li	QT,-1
    697 	dsrl	HH,a0,32
    698 	dsrl	QT,32	/* q=0xffffffff */
    699 	beq	DH,HH,.L_bn_div_words_skip_div2
    700 	ddivu	zero,a0,DH
    701 	mflo	QT
    702 .L_bn_div_words_skip_div2:
    703 #undef	DH
    704 	dmultu	a2,QT
    705 	dsll	t3,a0,32
    706 	dsrl	AT,a1,32
    707 	or	t3,AT
    708 	mflo	t0
    709 	mfhi	t1
        	/* same correction loop; v1 (no longer DH) holds the borrow */
    710 .L_bn_div_words_inner_loop2:
    711 	sltu	t2,t3,t0
    712 	seq	t8,HH,t1
    713 	sltu	AT,HH,t1
    714 	and	t2,t8
    715 	sltu	v1,t0,a2
    716 	or	AT,t2
    717 	.set	noreorder
    718 	beqz	AT,.L_bn_div_words_inner_loop2_done
    719 	dsubu	t1,v1	/* delay slot: always executed */
    720 	dsubu	t0,a2
    721 	b	.L_bn_div_words_inner_loop2
    722 	dsubu	QT,1	/* delay slot: lower the estimate */
    723 	.set	reorder
    724 .L_bn_div_words_inner_loop2_done:
    725 #undef	HH
    726 
    727 	dsubu	a0,t3,t0	/* final (still normalised) remainder */
    728 	or	v0,QT	/* v0 = upper digit : lower digit */
    729 	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
    730 	dsrl	a2,t9		/* restore a2 */
    731 	jr	ra
    732 #undef	QT
    733 END(bn_div_words)
    734 
    735 #define	a_0	t0
        /*
         * Register aliases used by the comba multiplication code that
         * follows.  a_N / b_N hold word N of the two operands (loaded from
         * byte offset 8*N of a1 and a2 respectively by bn_mul_comba8).
         * Words 0-3 live in temporaries t0-t3 / ta0-ta3.
         */
    736 #define	a_1	t1
    737 #define	a_2	t2
    738 #define	a_3	t3
    739 #define	b_0	ta0
    740 #define	b_1	ta1
    741 #define	b_2	ta2
    742 #define	b_3	ta3
    743 
        /* Words 4-6 live in s0-s5, which bn_mul_comba8 stores to its stack
         * frame before loading them; word 7 reuses the argument register
         * that held the operand's own pointer. */
    744 #define	a_4	s0
    745 #define	a_5	s2
    746 #define	a_6	s4
    747 #define	a_7	a1	/* once we load a[7] we don't need a anymore */
    748 #define	b_4	s1
    749 #define	b_5	s3
    750 #define	b_6	s5
    751 #define	b_7	a2	/* once we load b[7] we don't need b anymore */
    752 
        /* t_1 / t_2 receive the low (mflo) and high (mfhi) halves of each
         * partial product. */
    753 #define	t_1	t8
    754 #define	t_2	t9
    755 
        /* c_1..c_3 are the three accumulator words named c1..c3 in the
         * mul_add_c(...) comments of the comba code. */
    756 #define	c_1	v0
    757 #define	c_2	v1
    758 #define	c_3	a3
    759 
        /* 48 bytes = six 8-byte slots, matching the s0-s5 stores at
         * 0..40(sp) in bn_mul_comba8. */
    760 #define	FRAME_SIZE	48
    761 
    762 .align	5
    763 LEAF(bn_mul_comba8)
    764 	.set	noreorder
    765 	PTR_SUB	sp,FRAME_SIZE
    766 	.frame	sp,64,ra
    767 	.set	reorder
    768 	ld	a_0,0(a1)	/* If compiled with -mips3 option on
    769 				 * R5000 box assembler barks on this
    770 				 * line with "shouldn't have mult/div
    771 				 * as last instruction in bb (R10K
    772 				 * bug)" warning. If anybody out there
    773 				 * has a clue about how to circumvent
    774 				 * this do send me a note.
    775 				 *		<appro (at) fy.chalmers.se>
    776 				 */
    777 	ld	b_0,0(a2)
    778 	ld	a_1,8(a1)
    779 	ld	a_2,16(a1)
    780 	ld	a_3,24(a1)
    781 	ld	b_1,8(a2)
    782 	ld	b_2,16(a2)
    783 	ld	b_3,24(a2)
    784 	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
    785 	sd	s0,0(sp)
    786 	sd	s1,8(sp)
    787 	sd	s2,16(sp)
    788 	sd	s3,24(sp)
    789 	sd	s4,32(sp)
    790 	sd	s5,40(sp)
    791 	mflo	c_1
    792 	mfhi	c_2
    793 
    794 	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
    795 	ld	a_4,32(a1)
    796 	ld	a_5,40(a1)
    797 	ld	a_6,48(a1)
    798 	ld	a_7,56(a1)
    799 	ld	b_4,32(a2)
    800 	ld	b_5,40(a2)
    801 	mflo	t_1
    802 	mfhi	t_2
    803 	daddu	c_2,t_1
    804 	sltu	AT,c_2,t_1
    805 	daddu	c_3,t_2,AT
    806 	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
    807 	ld	b_6,48(a2)
    808 	ld	b_7,56(a2)
    809 	sd	c_1,0(a0)	/* r[0]=c1; */
    810 	mflo	t_1
    811 	mfhi	t_2
    812 	daddu	c_2,t_1
    813 	sltu	AT,c_2,t_1
    814 	daddu	t_2,AT
    815 	daddu	c_3,t_2
    816 	sltu	c_1,c_3,t_2
    817 	sd	c_2,8(a0)	/* r[1]=c2; */
    818 
    819 	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
    820 	mflo	t_1
    821 	mfhi	t_2
    822 	daddu	c_3,t_1
    823 	sltu	AT,c_3,t_1
    824 	daddu	t_2,AT
    825 	daddu	c_1,t_2
    826 	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
    827 	mflo	t_1
    828 	mfhi	t_2
    829 	daddu	c_3,t_1
    830 	sltu	AT,c_3,t_1
    831 	daddu	t_2,AT
    832 	daddu	c_1,t_2
    833 	sltu	c_2,c_1,t_2
    834 	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
    835 	mflo	t_1
    836 	mfhi	t_2
    837 	daddu	c_3,t_1
    838 	sltu	AT,c_3,t_1
    839 	daddu	t_2,AT
    840 	daddu	c_1,t_2
    841 	sltu	AT,c_1,t_2
    842 	daddu	c_2,AT
    843 	sd	c_3,16(a0)	/* r[2]=c3; */
    844 
    845 	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
    846 	mflo	t_1
    847 	mfhi	t_2
    848 	daddu	c_1,t_1
    849 	sltu	AT,c_1,t_1
    850 	daddu	t_2,AT
    851 	daddu	c_2,t_2
    852 	sltu	c_3,c_2,t_2
    853 	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
    854 	mflo	t_1
    855 	mfhi	t_2
    856 	daddu	c_1,t_1
    857 	sltu	AT,c_1,t_1
    858 	daddu	t_2,AT
    859 	daddu	c_2,t_2
    860 	sltu	AT,c_2,t_2
    861 	daddu	c_3,AT
    862 	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
    863 	mflo	t_1
    864 	mfhi	t_2
    865 	daddu	c_1,t_1
    866 	sltu	AT,c_1,t_1
    867 	daddu	t_2,AT
    868 	daddu	c_2,t_2
    869 	sltu	AT,c_2,t_2
    870 	daddu	c_3,AT
    871 	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
    872 	mflo	t_1
    873 	mfhi	t_2
    874 	daddu	c_1,t_1
    875 	sltu	AT,c_1,t_1
    876 	daddu	t_2,AT
    877 	daddu	c_2,t_2
    878 	sltu	AT,c_2,t_2
    879 	daddu	c_3,AT
    880 	sd	c_1,24(a0)	/* r[3]=c1; */
    881 
    882 	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
    883 	mflo	t_1
    884 	mfhi	t_2
    885 	daddu	c_2,t_1
    886 	sltu	AT,c_2,t_1
    887 	daddu	t_2,AT
    888 	daddu	c_3,t_2
    889 	sltu	c_1,c_3,t_2
    890 	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
    891 	mflo	t_1
    892 	mfhi	t_2
    893 	daddu	c_2,t_1
    894 	sltu	AT,c_2,t_1
    895 	daddu	t_2,AT
    896 	daddu	c_3,t_2
    897 	sltu	AT,c_3,t_2
    898 	daddu	c_1,AT
    899 	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
    900 	mflo	t_1
    901 	mfhi	t_2
    902 	daddu	c_2,t_1
    903 	sltu	AT,c_2,t_1
    904 	daddu	t_2,AT
    905 	daddu	c_3,t_2
    906 	sltu	AT,c_3,t_2
    907 	daddu	c_1,AT
    908 	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
    909 	mflo	t_1
    910 	mfhi	t_2
    911 	daddu	c_2,t_1
    912 	sltu	AT,c_2,t_1
    913 	daddu	t_2,AT
    914 	daddu	c_3,t_2
    915 	sltu	AT,c_3,t_2
    916 	daddu	c_1,AT
    917 	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
    918 	mflo	t_1
    919 	mfhi	t_2
    920 	daddu	c_2,t_1
    921 	sltu	AT,c_2,t_1
    922 	daddu	t_2,AT
    923 	daddu	c_3,t_2
    924 	sltu	AT,c_3,t_2
    925 	daddu	c_1,AT
    926 	sd	c_2,32(a0)	/* r[4]=c2; */
    927 
    928 	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
    929 	mflo	t_1
    930 	mfhi	t_2
    931 	daddu	c_3,t_1
    932 	sltu	AT,c_3,t_1
    933 	daddu	t_2,AT
    934 	daddu	c_1,t_2
    935 	sltu	c_2,c_1,t_2
    936 	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
    937 	mflo	t_1
    938 	mfhi	t_2
    939 	daddu	c_3,t_1
    940 	sltu	AT,c_3,t_1
    941 	daddu	t_2,AT
    942 	daddu	c_1,t_2
    943 	sltu	AT,c_1,t_2
    944 	daddu	c_2,AT
    945 	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
    946 	mflo	t_1
    947 	mfhi	t_2
    948 	daddu	c_3,t_1
    949 	sltu	AT,c_3,t_1
    950 	daddu	t_2,AT
    951 	daddu	c_1,t_2
    952 	sltu	AT,c_1,t_2
    953 	daddu	c_2,AT
    954 	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
    955 	mflo	t_1
    956 	mfhi	t_2
    957 	daddu	c_3,t_1
    958 	sltu	AT,c_3,t_1
    959 	daddu	t_2,AT
    960 	daddu	c_1,t_2
    961 	sltu	AT,c_1,t_2
    962 	daddu	c_2,AT
    963 	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
    964 	mflo	t_1
    965 	mfhi	t_2
    966 	daddu	c_3,t_1
    967 	sltu	AT,c_3,t_1
    968 	daddu	t_2,AT
    969 	daddu	c_1,t_2
    970 	sltu	AT,c_1,t_2
    971 	daddu	c_2,AT
    972 	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
    973 	mflo	t_1
    974 	mfhi	t_2
    975 	daddu	c_3,t_1
    976 	sltu	AT,c_3,t_1
    977 	daddu	t_2,AT
    978 	daddu	c_1,t_2
    979 	sltu	AT,c_1,t_2
    980 	daddu	c_2,AT
    981 	sd	c_3,40(a0)	/* r[5]=c3; */
    982 
    983 	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
    984 	mflo	t_1
    985 	mfhi	t_2
    986 	daddu	c_1,t_1
    987 	sltu	AT,c_1,t_1
    988 	daddu	t_2,AT
    989 	daddu	c_2,t_2
    990 	sltu	c_3,c_2,t_2
    991 	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
    992 	mflo	t_1
    993 	mfhi	t_2
    994 	daddu	c_1,t_1
    995 	sltu	AT,c_1,t_1
    996 	daddu	t_2,AT
    997 	daddu	c_2,t_2
    998 	sltu	AT,c_2,t_2
    999 	daddu	c_3,AT
   1000 	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
   1001 	mflo	t_1
   1002 	mfhi	t_2
   1003 	daddu	c_1,t_1
   1004 	sltu	AT,c_1,t_1
   1005 	daddu	t_2,AT
   1006 	daddu	c_2,t_2
   1007 	sltu	AT,c_2,t_2
   1008 	daddu	c_3,AT
   1009 	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
   1010 	mflo	t_1
   1011 	mfhi	t_2
   1012 	daddu	c_1,t_1
   1013 	sltu	AT,c_1,t_1
   1014 	daddu	t_2,AT
   1015 	daddu	c_2,t_2
   1016 	sltu	AT,c_2,t_2
   1017 	daddu	c_3,AT
   1018 	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
   1019 	mflo	t_1
   1020 	mfhi	t_2
   1021 	daddu	c_1,t_1
   1022 	sltu	AT,c_1,t_1
   1023 	daddu	t_2,AT
   1024 	daddu	c_2,t_2
   1025 	sltu	AT,c_2,t_2
   1026 	daddu	c_3,AT
   1027 	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
   1028 	mflo	t_1
   1029 	mfhi	t_2
   1030 	daddu	c_1,t_1
   1031 	sltu	AT,c_1,t_1
   1032 	daddu	t_2,AT
   1033 	daddu	c_2,t_2
   1034 	sltu	AT,c_2,t_2
   1035 	daddu	c_3,AT
   1036 	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
   1037 	mflo	t_1
   1038 	mfhi	t_2
   1039 	daddu	c_1,t_1
   1040 	sltu	AT,c_1,t_1
   1041 	daddu	t_2,AT
   1042 	daddu	c_2,t_2
   1043 	sltu	AT,c_2,t_2
   1044 	daddu	c_3,AT
   1045 	sd	c_1,48(a0)	/* r[6]=c1; */
   1046 
   1047 	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
   1048 	mflo	t_1
   1049 	mfhi	t_2
   1050 	daddu	c_2,t_1
   1051 	sltu	AT,c_2,t_1
   1052 	daddu	t_2,AT
   1053 	daddu	c_3,t_2
   1054 	sltu	c_1,c_3,t_2
   1055 	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
   1056 	mflo	t_1
   1057 	mfhi	t_2
   1058 	daddu	c_2,t_1
   1059 	sltu	AT,c_2,t_1
   1060 	daddu	t_2,AT
   1061 	daddu	c_3,t_2
   1062 	sltu	AT,c_3,t_2
   1063 	daddu	c_1,AT
   1064 	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
   1065 	mflo	t_1
   1066 	mfhi	t_2
   1067 	daddu	c_2,t_1
   1068 	sltu	AT,c_2,t_1
   1069 	daddu	t_2,AT
   1070 	daddu	c_3,t_2
   1071 	sltu	AT,c_3,t_2
   1072 	daddu	c_1,AT
   1073 	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
   1074 	mflo	t_1
   1075 	mfhi	t_2
   1076 	daddu	c_2,t_1
   1077 	sltu	AT,c_2,t_1
   1078 	daddu	t_2,AT
   1079 	daddu	c_3,t_2
   1080 	sltu	AT,c_3,t_2
   1081 	daddu	c_1,AT
   1082 	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
   1083 	mflo	t_1
   1084 	mfhi	t_2
   1085 	daddu	c_2,t_1
   1086 	sltu	AT,c_2,t_1
   1087 	daddu	t_2,AT
   1088 	daddu	c_3,t_2
   1089 	sltu	AT,c_3,t_2
   1090 	daddu	c_1,AT
   1091 	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
   1092 	mflo	t_1
   1093 	mfhi	t_2
   1094 	daddu	c_2,t_1
   1095 	sltu	AT,c_2,t_1
   1096 	daddu	t_2,AT
   1097 	daddu	c_3,t_2
   1098 	sltu	AT,c_3,t_2
   1099 	daddu	c_1,AT
   1100 	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
   1101 	mflo	t_1
   1102 	mfhi	t_2
   1103 	daddu	c_2,t_1
   1104 	sltu	AT,c_2,t_1
   1105 	daddu	t_2,AT
   1106 	daddu	c_3,t_2
   1107 	sltu	AT,c_3,t_2
   1108 	daddu	c_1,AT
   1109 	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
   1110 	mflo	t_1
   1111 	mfhi	t_2
   1112 	daddu	c_2,t_1
   1113 	sltu	AT,c_2,t_1
   1114 	daddu	t_2,AT
   1115 	daddu	c_3,t_2
   1116 	sltu	AT,c_3,t_2
   1117 	daddu	c_1,AT
   1118 	sd	c_2,56(a0)	/* r[7]=c2; */
   1119 
   1120 	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
   1121 	mflo	t_1
   1122 	mfhi	t_2
   1123 	daddu	c_3,t_1
   1124 	sltu	AT,c_3,t_1
   1125 	daddu	t_2,AT
   1126 	daddu	c_1,t_2
   1127 	sltu	c_2,c_1,t_2
   1128 	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
   1129 	mflo	t_1
   1130 	mfhi	t_2
   1131 	daddu	c_3,t_1
   1132 	sltu	AT,c_3,t_1
   1133 	daddu	t_2,AT
   1134 	daddu	c_1,t_2
   1135 	sltu	AT,c_1,t_2
   1136 	daddu	c_2,AT
   1137 	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
   1138 	mflo	t_1
   1139 	mfhi	t_2
   1140 	daddu	c_3,t_1
   1141 	sltu	AT,c_3,t_1
   1142 	daddu	t_2,AT
   1143 	daddu	c_1,t_2
   1144 	sltu	AT,c_1,t_2
   1145 	daddu	c_2,AT
   1146 	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
   1147 	mflo	t_1
   1148 	mfhi	t_2
   1149 	daddu	c_3,t_1
   1150 	sltu	AT,c_3,t_1
   1151 	daddu	t_2,AT
   1152 	daddu	c_1,t_2
   1153 	sltu	AT,c_1,t_2
   1154 	daddu	c_2,AT
   1155 	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
   1156 	mflo	t_1
   1157 	mfhi	t_2
   1158 	daddu	c_3,t_1
   1159 	sltu	AT,c_3,t_1
   1160 	daddu	t_2,AT
   1161 	daddu	c_1,t_2
   1162 	sltu	AT,c_1,t_2
   1163 	daddu	c_2,AT
   1164 	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
   1165 	mflo	t_1
   1166 	mfhi	t_2
   1167 	daddu	c_3,t_1
   1168 	sltu	AT,c_3,t_1
   1169 	daddu	t_2,AT
   1170 	daddu	c_1,t_2
   1171 	sltu	AT,c_1,t_2
   1172 	daddu	c_2,AT
   1173 	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
   1174 	mflo	t_1
   1175 	mfhi	t_2
   1176 	daddu	c_3,t_1
   1177 	sltu	AT,c_3,t_1
   1178 	daddu	t_2,AT
   1179 	daddu	c_1,t_2
   1180 	sltu	AT,c_1,t_2
   1181 	daddu	c_2,AT
   1182 	sd	c_3,64(a0)	/* r[8]=c3; */
   1183 
   1184 	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
   1185 	mflo	t_1
   1186 	mfhi	t_2
   1187 	daddu	c_1,t_1
   1188 	sltu	AT,c_1,t_1
   1189 	daddu	t_2,AT
   1190 	daddu	c_2,t_2
   1191 	sltu	c_3,c_2,t_2
   1192 	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
   1193 	mflo	t_1
   1194 	mfhi	t_2
   1195 	daddu	c_1,t_1
   1196 	sltu	AT,c_1,t_1
   1197 	daddu	t_2,AT
   1198 	daddu	c_2,t_2
   1199 	sltu	AT,c_2,t_2
   1200 	daddu	c_3,AT
   1201 	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
   1202 	mflo	t_1
   1203 	mfhi	t_2
   1204 	daddu	c_1,t_1
   1205 	sltu	AT,c_1,t_1
   1206 	daddu	t_2,AT
   1207 	daddu	c_2,t_2
   1208 	sltu	AT,c_2,t_2
   1209 	daddu	c_3,AT
   1210 	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
   1211 	mflo	t_1
   1212 	mfhi	t_2
   1213 	daddu	c_1,t_1
   1214 	sltu	AT,c_1,t_1
   1215 	daddu	t_2,AT
   1216 	daddu	c_2,t_2
   1217 	sltu	AT,c_2,t_2
   1218 	daddu	c_3,AT
   1219 	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
   1220 	mflo	t_1
   1221 	mfhi	t_2
   1222 	daddu	c_1,t_1
   1223 	sltu	AT,c_1,t_1
   1224 	daddu	t_2,AT
   1225 	daddu	c_2,t_2
   1226 	sltu	AT,c_2,t_2
   1227 	daddu	c_3,AT
   1228 	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
   1229 	mflo	t_1
   1230 	mfhi	t_2
   1231 	daddu	c_1,t_1
   1232 	sltu	AT,c_1,t_1
   1233 	daddu	t_2,AT
   1234 	daddu	c_2,t_2
   1235 	sltu	AT,c_2,t_2
   1236 	daddu	c_3,AT
   1237 	sd	c_1,72(a0)	/* r[9]=c1; */
   1238 
   1239 	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
   1240 	mflo	t_1
   1241 	mfhi	t_2
   1242 	daddu	c_2,t_1
   1243 	sltu	AT,c_2,t_1
   1244 	daddu	t_2,AT
   1245 	daddu	c_3,t_2
   1246 	sltu	c_1,c_3,t_2
   1247 	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
   1248 	mflo	t_1
   1249 	mfhi	t_2
   1250 	daddu	c_2,t_1
   1251 	sltu	AT,c_2,t_1
   1252 	daddu	t_2,AT
   1253 	daddu	c_3,t_2
   1254 	sltu	AT,c_3,t_2
   1255 	daddu	c_1,AT
   1256 	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
   1257 	mflo	t_1
   1258 	mfhi	t_2
   1259 	daddu	c_2,t_1
   1260 	sltu	AT,c_2,t_1
   1261 	daddu	t_2,AT
   1262 	daddu	c_3,t_2
   1263 	sltu	AT,c_3,t_2
   1264 	daddu	c_1,AT
   1265 	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
   1266 	mflo	t_1
   1267 	mfhi	t_2
   1268 	daddu	c_2,t_1
   1269 	sltu	AT,c_2,t_1
   1270 	daddu	t_2,AT
   1271 	daddu	c_3,t_2
   1272 	sltu	AT,c_3,t_2
   1273 	daddu	c_1,AT
   1274 	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
   1275 	mflo	t_1
   1276 	mfhi	t_2
   1277 	daddu	c_2,t_1
   1278 	sltu	AT,c_2,t_1
   1279 	daddu	t_2,AT
   1280 	daddu	c_3,t_2
   1281 	sltu	AT,c_3,t_2
   1282 	daddu	c_1,AT
   1283 	sd	c_2,80(a0)	/* r[10]=c2; */
   1284 
   1285 	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
   1286 	mflo	t_1
   1287 	mfhi	t_2
   1288 	daddu	c_3,t_1
   1289 	sltu	AT,c_3,t_1
   1290 	daddu	t_2,AT
   1291 	daddu	c_1,t_2
   1292 	sltu	c_2,c_1,t_2
   1293 	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
   1294 	mflo	t_1
   1295 	mfhi	t_2
   1296 	daddu	c_3,t_1
   1297 	sltu	AT,c_3,t_1
   1298 	daddu	t_2,AT
   1299 	daddu	c_1,t_2
   1300 	sltu	AT,c_1,t_2
   1301 	daddu	c_2,AT
   1302 	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
   1303 	mflo	t_1
   1304 	mfhi	t_2
   1305 	daddu	c_3,t_1
   1306 	sltu	AT,c_3,t_1
   1307 	daddu	t_2,AT
   1308 	daddu	c_1,t_2
   1309 	sltu	AT,c_1,t_2
   1310 	daddu	c_2,AT
   1311 	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
   1312 	mflo	t_1
   1313 	mfhi	t_2
   1314 	daddu	c_3,t_1
   1315 	sltu	AT,c_3,t_1
   1316 	daddu	t_2,AT
   1317 	daddu	c_1,t_2
   1318 	sltu	AT,c_1,t_2
   1319 	daddu	c_2,AT
   1320 	sd	c_3,88(a0)	/* r[11]=c3; */
   1321 
   1322 	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
   1323 	mflo	t_1
   1324 	mfhi	t_2
   1325 	daddu	c_1,t_1
   1326 	sltu	AT,c_1,t_1
   1327 	daddu	t_2,AT
   1328 	daddu	c_2,t_2
   1329 	sltu	c_3,c_2,t_2
   1330 	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
   1331 	mflo	t_1
   1332 	mfhi	t_2
   1333 	daddu	c_1,t_1
   1334 	sltu	AT,c_1,t_1
   1335 	daddu	t_2,AT
   1336 	daddu	c_2,t_2
   1337 	sltu	AT,c_2,t_2
   1338 	daddu	c_3,AT
   1339 	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
   1340 	mflo	t_1
   1341 	mfhi	t_2
   1342 	daddu	c_1,t_1
   1343 	sltu	AT,c_1,t_1
   1344 	daddu	t_2,AT
   1345 	daddu	c_2,t_2
   1346 	sltu	AT,c_2,t_2
   1347 	daddu	c_3,AT
   1348 	sd	c_1,96(a0)	/* r[12]=c1; */
   1349 
   1350 	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
   1351 	mflo	t_1
   1352 	mfhi	t_2
   1353 	daddu	c_2,t_1
   1354 	sltu	AT,c_2,t_1
   1355 	daddu	t_2,AT
   1356 	daddu	c_3,t_2
   1357 	sltu	c_1,c_3,t_2
   1358 	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
   1359 	mflo	t_1
   1360 	mfhi	t_2
   1361 	daddu	c_2,t_1
   1362 	sltu	AT,c_2,t_1
   1363 	daddu	t_2,AT
   1364 	daddu	c_3,t_2
   1365 	sltu	AT,c_3,t_2
   1366 	daddu	c_1,AT
   1367 	sd	c_2,104(a0)	/* r[13]=c2; */
   1368 
   1369 	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
   1370 	ld	s0,0(sp)
   1371 	ld	s1,8(sp)
   1372 	ld	s2,16(sp)
   1373 	ld	s3,24(sp)
   1374 	ld	s4,32(sp)
   1375 	ld	s5,40(sp)
   1376 	mflo	t_1
   1377 	mfhi	t_2
   1378 	daddu	c_3,t_1
   1379 	sltu	AT,c_3,t_1
   1380 	daddu	t_2,AT
   1381 	daddu	c_1,t_2
   1382 	sd	c_3,112(a0)	/* r[14]=c3; */
   1383 	sd	c_1,120(a0)	/* r[15]=c1; */
   1384 
   1385 	PTR_ADD	sp,FRAME_SIZE
   1386 
   1387 	jr	ra
   1388 END(bn_mul_comba8)
   1389 
   1390 .align	5
   /*
    * bn_mul_comba4 -- 4x4-word column-wise ("comba") multiplication.
    *
    * Loads four 64-bit words from 0..24(a1) and four from 0..24(a2),
    * and stores their eight-word product to 0..56(a0).  The notes
    * below call these a[], b[] and r[] respectively.
    *
    * Each result column is summed in a rotating triple taken from
    * c_1/c_2/c_3 (low, middle, high word).  t_1/t_2 receive LO/HI of
    * every dmultu and AT carries the single-bit overflow of each add.
    */
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	dmultu	a_0,b_0		/* column 0: a[0]*b[0] */
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0] */

	/* column 1: low=c_2, middle=c_3, high=c_1 */
	dmultu	a_0,b_1		/* + a[0]*b[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* c_3 is initialised here */
	dmultu	a_1,b_0		/* + a[1]*b[0] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2	/* c_1 is initialised with the carry */
	sd	c_2,8(a0)	/* r[1] */

	/* column 2: low=c_3, middle=c_1, high=c_2 */
	dmultu	a_2,b_0		/* + a[2]*b[0] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2	/* no carry-out is taken from this add */
	dmultu	a_1,b_1		/* + a[1]*b[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2	/* c_2 is initialised with the carry */
	dmultu	a_0,b_2		/* + a[0]*b[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,16(a0)	/* r[2] */

	/* column 3: low=c_1, middle=c_2, high=c_3 */
	dmultu	a_0,b_3		/* + a[0]*b[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	c_3,c_2,t_2	/* c_3 is initialised with the carry */
	dmultu	a_1,b_2		/* + a[1]*b[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_2,b_1		/* + a[2]*b[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_3,b_0		/* + a[3]*b[0] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	sd	c_1,24(a0)	/* r[3] */

	/* column 4: low=c_2, middle=c_3, high=c_1 */
	dmultu	a_3,b_1		/* + a[3]*b[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2	/* c_1 is initialised with the carry */
	dmultu	a_2,b_2		/* + a[2]*b[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	dmultu	a_1,b_3		/* + a[1]*b[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	sd	c_2,32(a0)	/* r[4] */

	/* column 5: low=c_3, middle=c_1, high=c_2 */
	dmultu	a_2,b_3		/* + a[2]*b[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2	/* c_2 is initialised with the carry */
	dmultu	a_3,b_2		/* + a[3]*b[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,40(a0)	/* r[5] */

	/* column 6: low=c_1, top word=c_2 */
	dmultu	a_3,b_3		/* + a[3]*b[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sd	c_1,48(a0)	/* r[6] */
	sd	c_2,56(a0)	/* r[7] */

	jr	ra
END(bn_mul_comba4)
   1544 
   /*
    * bn_sqr_comba8 has no b[] operand, so a[4..7] are re-homed onto the
    * b_0..b_3 register names for the squaring code that follows.
    */
#undef	a_4
#undef	a_5
#undef	a_6
#undef	a_7
#define	a_4	b_0
#define	a_5	b_1
#define	a_6	b_2
#define	a_7	b_3

.align	5
/*
 * bn_sqr_comba8 -- 8-word column-wise ("comba") squaring.
 *
 * Loads eight 64-bit words a[0..7] from 0..56(a1) and stores the
 * sixteen-word square r[0..15] to 0..120(a0).  a2 is used as a scratch
 * register.  Each result column is summed in a rotating triple taken
 * from c_1/c_2/c_3 (low, middle, high word); t_1/t_2 receive LO/HI of
 * every dmultu and AT carries the single-bit overflow of each add.
 *
 * mul_add_c2 (a doubled cross term) is performed by adding the
 * undoubled product twice with full carry propagation.  HI of a 64x64
 * product is at most 2^64-2, so "HI + carry" can never wrap.  The
 * earlier shift-by-one form could produce a middle word of 2^64-1 and
 * then lose the carry when the low-word carry was added to it.
 *
 * The first term of each column initialises the column's high word
 * with sltu instead of accumulating into it, because that register
 * still holds the word stored for an earlier column.
 */
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	dmultu	a_0,a_0		/* mul_add_c(a[0],a[0],c1,c2,c3); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0]=c1; */

	dmultu	a_0,a_1		/* mul_add_c2(a[0],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* c3 starts fresh; hi+carry cannot wrap */
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2	/* c1 starts fresh from the carry out of c3 */
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,a_0		/* mul_add_c2(a[2],a[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,a_3		/* mul_add_c2(a[0],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_4,a_0		/* mul_add_c2(a[4],a[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_1		/* mul_add_c2(a[3],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],a[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_0,a_5		/* mul_add_c2(a[0],a[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_4		/* mul_add_c2(a[1],a[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,a_3		/* mul_add_c2(a[2],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_6,a_0		/* mul_add_c2(a[6],a[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,a_1		/* mul_add_c2(a[5],a[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_2		/* mul_add_c2(a[4],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_3		/* mul_add_c(a[3],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6]=c1; */

	dmultu	a_0,a_7		/* mul_add_c2(a[0],a[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,a_6		/* mul_add_c2(a[1],a[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_5		/* mul_add_c2(a[2],a[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_4		/* mul_add_c2(a[3],a[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)	/* r[7]=c2; */

	dmultu	a_7,a_1		/* mul_add_c2(a[7],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,a_2		/* mul_add_c2(a[6],a[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_3		/* mul_add_c2(a[5],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,a_4		/* mul_add_c(a[4],a[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8]=c3; */

	dmultu	a_2,a_7		/* mul_add_c2(a[2],a[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_6		/* mul_add_c2(a[3],a[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_5		/* mul_add_c2(a[4],a[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)	/* r[9]=c1; */

	dmultu	a_7,a_3		/* mul_add_c2(a[7],a[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,a_4		/* mul_add_c2(a[6],a[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,a_5		/* mul_add_c(a[5],a[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10]=c2; */

	dmultu	a_4,a_7		/* mul_add_c2(a[4],a[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_6		/* mul_add_c2(a[5],a[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)	/* r[11]=c3; */

	dmultu	a_7,a_5		/* mul_add_c2(a[7],a[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,a_6		/* mul_add_c(a[6],a[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12]=c1; */

	dmultu	a_6,a_7		/* mul_add_c2(a[6],a[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)	/* r[13]=c2; */

	dmultu	a_7,a_7		/* mul_add_c(a[7],a[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14]=c3; */
	sd	c_1,120(a0)	/* r[15]=c1; */

	jr	ra
END(bn_sqr_comba8)
   2067 
   2068 .align	5
   /*
    * bn_sqr_comba4 -- 4-word column-wise ("comba") squaring.
    *
    * Loads four 64-bit words a[0..3] from 0..24(a1) and stores the
    * eight-word square r[0..7] to 0..56(a0).  a2 is used as a scratch
    * register.  Each result column is summed in a rotating triple taken
    * from c_1/c_2/c_3 (low, middle, high word); t_1/t_2 receive LO/HI
    * of every dmultu and AT carries the single-bit overflow of each add.
    *
    * mul_add_c2 (a doubled cross term) is performed by adding the
    * undoubled product twice with full carry propagation.  HI of a
    * 64x64 product is at most 2^64-2, so "HI + carry" can never wrap.
    * The earlier shift-by-one form could produce a middle word of
    * 2^64-1 and then lose the carry when the low-word carry was added
    * to it, e.g. for a[0]=0xFFFFFFFFFFFFFFFE, a[1]=0x8000000000000001.
    *
    * The first term of each column initialises the column's high word
    * with sltu instead of accumulating into it, because that register
    * still holds the word stored for an earlier column.
    */
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	dmultu	a_0,a_0		/* mul_add_c(a[0],a[0],c1,c2,c3); */
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0]=c1; */

	dmultu	a_0,a_1		/* mul_add_c2(a[0],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* c3 starts fresh; hi+carry cannot wrap */
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2	/* c1 starts fresh from the carry out of c3 */
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,a_0		/* mul_add_c2(a[2],a[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,a_3		/* mul_add_c2(a[0],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_3,a_1		/* mul_add_c2(a[3],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],a[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_2,a_3		/* mul_add_c2(a[2],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_3,a_3		/* mul_add_c(a[3],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)	/* r[6]=c1; */
	sd	c_2,56(a0)	/* r[7]=c2; */

	jr	ra
END(bn_sqr_comba4)
   2202