      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project.
      6 #
      7 # Rights for redistribution and usage in source and binary forms are
      8 # granted according to the OpenSSL license. Warranty of any kind is
      9 # disclaimed.
     10 # ====================================================================
     11 
     12 
     13 # July 1999
     14 #
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
     16 #
     17 # The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x, not only because it doesn't support the new ABIs, but also
# because 5.x kernels put the R4x00 CPU into 32-bit mode, so all the
# 64-bit instructions (daddu, dmultu, etc.) found below would only
# cause illegal instruction exceptions:-(
     23 #
# In addition the code depends on preprocessor flags set up by the
# MIPSpro compiler driver (either as or cc) and therefore (probably?)
# can't be compiled by the GNU assembler. The GNU C driver manages fine
# though, as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as, which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to keep the code looking more regular, as all the
# architecture-dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
     34 #
     35 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
     36 # goes way over 3 times faster!
     37 #
#					<appro@fy.chalmers.se>
     39 
     40 # October 2010
     41 #
# Adapt the module for 32-bit ABIs and other OSes as well. The former
# was achieved by mechanical replacement of 64-bit arithmetic
# instructions such as dmultu, daddu, etc. with their 32-bit
# counterparts and by adjusting offsets denoting multiples of BN_ULONG.
# The above-mentioned >3x performance improvement naturally does not
# apply to 32-bit code [because there is no instruction a 32-bit
# compiler can't use]; one has to be content with a 40-85% improvement,
# depending on benchmark and key length, more for longer keys.
     50 
     51 $flavour = shift;
     52 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     53 open STDOUT,">$output";
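
# Command-line interface: the first argument selects the ABI flavour
# (anything matching "64" or "n32" gets the 64-bit instructions chosen
# below, everything else the 32-bit ones; a "nubi" flavour additionally
# saves/restores $v1,$t0..$t3 around each routine), the second argument
# names the output file. A hypothetical invocation, with purely
# illustrative file names rather than anything mandated by a build
# system, might look like:
#
#	perl bn-mips.pl o32 bn-mips-o32.S
#	perl bn-mips.pl 64  bn-mips-n64.S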
     54 
     55 if ($flavour =~ /64|n32/i) {
     56 	$LD="ld";
     57 	$ST="sd";
     58 	$MULTU="dmultu";
     59 	$DIVU="ddivu";
     60 	$ADDU="daddu";
     61 	$SUBU="dsubu";
     62 	$SRL="dsrl";
     63 	$SLL="dsll";
     64 	$BNSZ=8;
     65 	$PTR_ADD="daddu";
     66 	$PTR_SUB="dsubu";
     67 	$SZREG=8;
     68 	$REG_S="sd";
     69 	$REG_L="ld";
     70 } else {
     71 	$LD="lw";
     72 	$ST="sw";
     73 	$MULTU="multu";
     74 	$DIVU="divu";
     75 	$ADDU="addu";
     76 	$SUBU="subu";
     77 	$SRL="srl";
     78 	$SLL="sll";
     79 	$BNSZ=4;
     80 	$PTR_ADD="addu";
     81 	$PTR_SUB="subu";
     82 	$SZREG=4;
     83 	$REG_S="sw";
     84 	$REG_L="lw";
     85 	$code=".set	mips2\n";
     86 }
     87 
# Below is the N32/N64 register layout used in the original module.
     89 #
     90 ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
     91 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
     92 ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
     93 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
     94 ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
     95 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
     96 #
# No special adaptation is required for O32. NUBI, on the other hand,
# is handled by saving/restoring ($v1,$t0..$t3).
     99 
    100 $gp=$v1 if ($flavour =~ /nubi/i);
    101 
    102 $minus4=$v1;
    103 
    104 $code.=<<___;
    105 .rdata
    106 .asciiz	"mips3.s, Version 1.2"
    107 .asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
    108 
    109 .text
    110 .set	noat
    111 
    112 .align	5
    113 .globl	bn_mul_add_words
    114 .ent	bn_mul_add_words
    115 bn_mul_add_words:
    116 	.set	noreorder
    117 	bgtz	$a2,bn_mul_add_words_internal
    118 	move	$v0,$zero
    119 	jr	$ra
    120 	move	$a0,$v0
    121 .end	bn_mul_add_words
    122 
    123 .align	5
    124 .ent	bn_mul_add_words_internal
    125 bn_mul_add_words_internal:
    126 ___
    127 $code.=<<___ if ($flavour =~ /nubi/i);
    128 	.frame	$sp,6*$SZREG,$ra
    129 	.mask	0x8000f008,-$SZREG
    130 	.set	noreorder
    131 	$PTR_SUB $sp,6*$SZREG
    132 	$REG_S	$ra,5*$SZREG($sp)
    133 	$REG_S	$t3,4*$SZREG($sp)
    134 	$REG_S	$t2,3*$SZREG($sp)
    135 	$REG_S	$t1,2*$SZREG($sp)
    136 	$REG_S	$t0,1*$SZREG($sp)
    137 	$REG_S	$gp,0*$SZREG($sp)
    138 ___
    139 $code.=<<___;
    140 	.set	reorder
    141 	li	$minus4,-4
    142 	and	$ta0,$a2,$minus4
    143 	$LD	$t0,0($a1)
    144 	beqz	$ta0,.L_bn_mul_add_words_tail
    145 
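# The unrolled loop below handles four words per iteration. Each word is
# processed, in C-like terms, roughly as follows (a descriptive sketch of
# what one step of the C reference bn_mul_add_words computes, not of the
# exact instruction scheduling used here):
#
#	t  = r[i] + c;		c  = (t < c);
#	t += LO(a[i]*w);	c += (t < LO(a[i]*w)) + HI(a[i]*w);
#	r[i] = t;		# c is the carry into the next word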
    146 .L_bn_mul_add_words_loop:
    147 	$MULTU	$t0,$a3
    148 	$LD	$t1,0($a0)
    149 	$LD	$t2,$BNSZ($a1)
    150 	$LD	$t3,$BNSZ($a0)
    151 	$LD	$ta0,2*$BNSZ($a1)
    152 	$LD	$ta1,2*$BNSZ($a0)
    153 	$ADDU	$t1,$v0
    154 	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
    155 				# values", but it seems to work fine
    156 				# even on 64-bit registers.
    157 	mflo	$at
    158 	mfhi	$t0
    159 	$ADDU	$t1,$at
    160 	$ADDU	$v0,$t0
    161 	 $MULTU	$t2,$a3
    162 	sltu	$at,$t1,$at
    163 	$ST	$t1,0($a0)
    164 	$ADDU	$v0,$at
    165 
    166 	$LD	$ta2,3*$BNSZ($a1)
    167 	$LD	$ta3,3*$BNSZ($a0)
    168 	$ADDU	$t3,$v0
    169 	sltu	$v0,$t3,$v0
    170 	mflo	$at
    171 	mfhi	$t2
    172 	$ADDU	$t3,$at
    173 	$ADDU	$v0,$t2
    174 	 $MULTU	$ta0,$a3
    175 	sltu	$at,$t3,$at
    176 	$ST	$t3,$BNSZ($a0)
    177 	$ADDU	$v0,$at
    178 
    179 	subu	$a2,4
    180 	$PTR_ADD $a0,4*$BNSZ
    181 	$PTR_ADD $a1,4*$BNSZ
    182 	$ADDU	$ta1,$v0
    183 	sltu	$v0,$ta1,$v0
    184 	mflo	$at
    185 	mfhi	$ta0
    186 	$ADDU	$ta1,$at
    187 	$ADDU	$v0,$ta0
    188 	 $MULTU	$ta2,$a3
    189 	sltu	$at,$ta1,$at
    190 	$ST	$ta1,-2*$BNSZ($a0)
    191 	$ADDU	$v0,$at
    192 
    193 
    194 	and	$ta0,$a2,$minus4
    195 	$ADDU	$ta3,$v0
    196 	sltu	$v0,$ta3,$v0
    197 	mflo	$at
    198 	mfhi	$ta2
    199 	$ADDU	$ta3,$at
    200 	$ADDU	$v0,$ta2
    201 	sltu	$at,$ta3,$at
    202 	$ST	$ta3,-$BNSZ($a0)
    203 	$ADDU	$v0,$at
    204 	.set	noreorder
    205 	bgtzl	$ta0,.L_bn_mul_add_words_loop
    206 	$LD	$t0,0($a1)
    207 
    208 	beqz	$a2,.L_bn_mul_add_words_return
    209 	nop
    210 
    211 .L_bn_mul_add_words_tail:
    212 	.set	reorder
    213 	$LD	$t0,0($a1)
    214 	$MULTU	$t0,$a3
    215 	$LD	$t1,0($a0)
    216 	subu	$a2,1
    217 	$ADDU	$t1,$v0
    218 	sltu	$v0,$t1,$v0
    219 	mflo	$at
    220 	mfhi	$t0
    221 	$ADDU	$t1,$at
    222 	$ADDU	$v0,$t0
    223 	sltu	$at,$t1,$at
    224 	$ST	$t1,0($a0)
    225 	$ADDU	$v0,$at
    226 	beqz	$a2,.L_bn_mul_add_words_return
    227 
    228 	$LD	$t0,$BNSZ($a1)
    229 	$MULTU	$t0,$a3
    230 	$LD	$t1,$BNSZ($a0)
    231 	subu	$a2,1
    232 	$ADDU	$t1,$v0
    233 	sltu	$v0,$t1,$v0
    234 	mflo	$at
    235 	mfhi	$t0
    236 	$ADDU	$t1,$at
    237 	$ADDU	$v0,$t0
    238 	sltu	$at,$t1,$at
    239 	$ST	$t1,$BNSZ($a0)
    240 	$ADDU	$v0,$at
    241 	beqz	$a2,.L_bn_mul_add_words_return
    242 
    243 	$LD	$t0,2*$BNSZ($a1)
    244 	$MULTU	$t0,$a3
    245 	$LD	$t1,2*$BNSZ($a0)
    246 	$ADDU	$t1,$v0
    247 	sltu	$v0,$t1,$v0
    248 	mflo	$at
    249 	mfhi	$t0
    250 	$ADDU	$t1,$at
    251 	$ADDU	$v0,$t0
    252 	sltu	$at,$t1,$at
    253 	$ST	$t1,2*$BNSZ($a0)
    254 	$ADDU	$v0,$at
    255 
    256 .L_bn_mul_add_words_return:
    257 	.set	noreorder
    258 ___
    259 $code.=<<___ if ($flavour =~ /nubi/i);
    260 	$REG_L	$t3,4*$SZREG($sp)
    261 	$REG_L	$t2,3*$SZREG($sp)
    262 	$REG_L	$t1,2*$SZREG($sp)
    263 	$REG_L	$t0,1*$SZREG($sp)
    264 	$REG_L	$gp,0*$SZREG($sp)
    265 	$PTR_ADD $sp,6*$SZREG
    266 ___
    267 $code.=<<___;
    268 	jr	$ra
    269 	move	$a0,$v0
    270 .end	bn_mul_add_words_internal
    271 
    272 .align	5
    273 .globl	bn_mul_words
    274 .ent	bn_mul_words
    275 bn_mul_words:
    276 	.set	noreorder
    277 	bgtz	$a2,bn_mul_words_internal
    278 	move	$v0,$zero
    279 	jr	$ra
    280 	move	$a0,$v0
    281 .end	bn_mul_words
    282 
    283 .align	5
    284 .ent	bn_mul_words_internal
    285 bn_mul_words_internal:
    286 ___
    287 $code.=<<___ if ($flavour =~ /nubi/i);
    288 	.frame	$sp,6*$SZREG,$ra
    289 	.mask	0x8000f008,-$SZREG
    290 	.set	noreorder
    291 	$PTR_SUB $sp,6*$SZREG
    292 	$REG_S	$ra,5*$SZREG($sp)
    293 	$REG_S	$t3,4*$SZREG($sp)
    294 	$REG_S	$t2,3*$SZREG($sp)
    295 	$REG_S	$t1,2*$SZREG($sp)
    296 	$REG_S	$t0,1*$SZREG($sp)
    297 	$REG_S	$gp,0*$SZREG($sp)
    298 ___
    299 $code.=<<___;
    300 	.set	reorder
    301 	li	$minus4,-4
    302 	and	$ta0,$a2,$minus4
    303 	$LD	$t0,0($a1)
    304 	beqz	$ta0,.L_bn_mul_words_tail
    305 
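# Per word, in C-like terms, this computes roughly (a descriptive sketch
# of what one step of the C reference bn_mul_words computes):
#
#	t = LO(a[i]*w) + c;
#	c = (t < LO(a[i]*w)) + HI(a[i]*w);
#	r[i] = t;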
    306 .L_bn_mul_words_loop:
    307 	$MULTU	$t0,$a3
    308 	$LD	$t2,$BNSZ($a1)
    309 	$LD	$ta0,2*$BNSZ($a1)
    310 	$LD	$ta2,3*$BNSZ($a1)
    311 	mflo	$at
    312 	mfhi	$t0
    313 	$ADDU	$v0,$at
    314 	sltu	$t1,$v0,$at
    315 	 $MULTU	$t2,$a3
    316 	$ST	$v0,0($a0)
    317 	$ADDU	$v0,$t1,$t0
    318 
    319 	subu	$a2,4
    320 	$PTR_ADD $a0,4*$BNSZ
    321 	$PTR_ADD $a1,4*$BNSZ
    322 	mflo	$at
    323 	mfhi	$t2
    324 	$ADDU	$v0,$at
    325 	sltu	$t3,$v0,$at
    326 	 $MULTU	$ta0,$a3
    327 	$ST	$v0,-3*$BNSZ($a0)
    328 	$ADDU	$v0,$t3,$t2
    329 
    330 	mflo	$at
    331 	mfhi	$ta0
    332 	$ADDU	$v0,$at
    333 	sltu	$ta1,$v0,$at
    334 	 $MULTU	$ta2,$a3
    335 	$ST	$v0,-2*$BNSZ($a0)
    336 	$ADDU	$v0,$ta1,$ta0
    337 
    338 	and	$ta0,$a2,$minus4
    339 	mflo	$at
    340 	mfhi	$ta2
    341 	$ADDU	$v0,$at
    342 	sltu	$ta3,$v0,$at
    343 	$ST	$v0,-$BNSZ($a0)
    344 	$ADDU	$v0,$ta3,$ta2
    345 	.set	noreorder
    346 	bgtzl	$ta0,.L_bn_mul_words_loop
    347 	$LD	$t0,0($a1)
    348 
    349 	beqz	$a2,.L_bn_mul_words_return
    350 	nop
    351 
    352 .L_bn_mul_words_tail:
    353 	.set	reorder
    354 	$LD	$t0,0($a1)
    355 	$MULTU	$t0,$a3
    356 	subu	$a2,1
    357 	mflo	$at
    358 	mfhi	$t0
    359 	$ADDU	$v0,$at
    360 	sltu	$t1,$v0,$at
    361 	$ST	$v0,0($a0)
    362 	$ADDU	$v0,$t1,$t0
    363 	beqz	$a2,.L_bn_mul_words_return
    364 
    365 	$LD	$t0,$BNSZ($a1)
    366 	$MULTU	$t0,$a3
    367 	subu	$a2,1
    368 	mflo	$at
    369 	mfhi	$t0
    370 	$ADDU	$v0,$at
    371 	sltu	$t1,$v0,$at
    372 	$ST	$v0,$BNSZ($a0)
    373 	$ADDU	$v0,$t1,$t0
    374 	beqz	$a2,.L_bn_mul_words_return
    375 
    376 	$LD	$t0,2*$BNSZ($a1)
    377 	$MULTU	$t0,$a3
    378 	mflo	$at
    379 	mfhi	$t0
    380 	$ADDU	$v0,$at
    381 	sltu	$t1,$v0,$at
    382 	$ST	$v0,2*$BNSZ($a0)
    383 	$ADDU	$v0,$t1,$t0
    384 
    385 .L_bn_mul_words_return:
    386 	.set	noreorder
    387 ___
    388 $code.=<<___ if ($flavour =~ /nubi/i);
    389 	$REG_L	$t3,4*$SZREG($sp)
    390 	$REG_L	$t2,3*$SZREG($sp)
    391 	$REG_L	$t1,2*$SZREG($sp)
    392 	$REG_L	$t0,1*$SZREG($sp)
    393 	$REG_L	$gp,0*$SZREG($sp)
    394 	$PTR_ADD $sp,6*$SZREG
    395 ___
    396 $code.=<<___;
    397 	jr	$ra
    398 	move	$a0,$v0
    399 .end	bn_mul_words_internal
    400 
    401 .align	5
    402 .globl	bn_sqr_words
    403 .ent	bn_sqr_words
    404 bn_sqr_words:
    405 	.set	noreorder
    406 	bgtz	$a2,bn_sqr_words_internal
    407 	move	$v0,$zero
    408 	jr	$ra
    409 	move	$a0,$v0
    410 .end	bn_sqr_words
    411 
    412 .align	5
    413 .ent	bn_sqr_words_internal
    414 bn_sqr_words_internal:
    415 ___
    416 $code.=<<___ if ($flavour =~ /nubi/i);
    417 	.frame	$sp,6*$SZREG,$ra
    418 	.mask	0x8000f008,-$SZREG
    419 	.set	noreorder
    420 	$PTR_SUB $sp,6*$SZREG
    421 	$REG_S	$ra,5*$SZREG($sp)
    422 	$REG_S	$t3,4*$SZREG($sp)
    423 	$REG_S	$t2,3*$SZREG($sp)
    424 	$REG_S	$t1,2*$SZREG($sp)
    425 	$REG_S	$t0,1*$SZREG($sp)
    426 	$REG_S	$gp,0*$SZREG($sp)
    427 ___
    428 $code.=<<___;
    429 	.set	reorder
    430 	li	$minus4,-4
    431 	and	$ta0,$a2,$minus4
    432 	$LD	$t0,0($a1)
    433 	beqz	$ta0,.L_bn_sqr_words_tail
    434 
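# Per word this simply writes out the double-width square, in C-like
# terms (a descriptive sketch of the C reference bn_sqr_words):
#
#	r[2*i]   = LO(a[i]*a[i]);
#	r[2*i+1] = HI(a[i]*a[i]);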
    435 .L_bn_sqr_words_loop:
    436 	$MULTU	$t0,$t0
    437 	$LD	$t2,$BNSZ($a1)
    438 	$LD	$ta0,2*$BNSZ($a1)
    439 	$LD	$ta2,3*$BNSZ($a1)
    440 	mflo	$t1
    441 	mfhi	$t0
    442 	$ST	$t1,0($a0)
    443 	$ST	$t0,$BNSZ($a0)
    444 
    445 	$MULTU	$t2,$t2
    446 	subu	$a2,4
    447 	$PTR_ADD $a0,8*$BNSZ
    448 	$PTR_ADD $a1,4*$BNSZ
    449 	mflo	$t3
    450 	mfhi	$t2
    451 	$ST	$t3,-6*$BNSZ($a0)
    452 	$ST	$t2,-5*$BNSZ($a0)
    453 
    454 	$MULTU	$ta0,$ta0
    455 	mflo	$ta1
    456 	mfhi	$ta0
    457 	$ST	$ta1,-4*$BNSZ($a0)
    458 	$ST	$ta0,-3*$BNSZ($a0)
    459 
    460 
    461 	$MULTU	$ta2,$ta2
    462 	and	$ta0,$a2,$minus4
    463 	mflo	$ta3
    464 	mfhi	$ta2
    465 	$ST	$ta3,-2*$BNSZ($a0)
    466 	$ST	$ta2,-$BNSZ($a0)
    467 
    468 	.set	noreorder
    469 	bgtzl	$ta0,.L_bn_sqr_words_loop
    470 	$LD	$t0,0($a1)
    471 
    472 	beqz	$a2,.L_bn_sqr_words_return
    473 	nop
    474 
    475 .L_bn_sqr_words_tail:
    476 	.set	reorder
    477 	$LD	$t0,0($a1)
    478 	$MULTU	$t0,$t0
    479 	subu	$a2,1
    480 	mflo	$t1
    481 	mfhi	$t0
    482 	$ST	$t1,0($a0)
    483 	$ST	$t0,$BNSZ($a0)
    484 	beqz	$a2,.L_bn_sqr_words_return
    485 
    486 	$LD	$t0,$BNSZ($a1)
    487 	$MULTU	$t0,$t0
    488 	subu	$a2,1
    489 	mflo	$t1
    490 	mfhi	$t0
    491 	$ST	$t1,2*$BNSZ($a0)
    492 	$ST	$t0,3*$BNSZ($a0)
    493 	beqz	$a2,.L_bn_sqr_words_return
    494 
    495 	$LD	$t0,2*$BNSZ($a1)
    496 	$MULTU	$t0,$t0
    497 	mflo	$t1
    498 	mfhi	$t0
    499 	$ST	$t1,4*$BNSZ($a0)
    500 	$ST	$t0,5*$BNSZ($a0)
    501 
    502 .L_bn_sqr_words_return:
    503 	.set	noreorder
    504 ___
    505 $code.=<<___ if ($flavour =~ /nubi/i);
    506 	$REG_L	$t3,4*$SZREG($sp)
    507 	$REG_L	$t2,3*$SZREG($sp)
    508 	$REG_L	$t1,2*$SZREG($sp)
    509 	$REG_L	$t0,1*$SZREG($sp)
    510 	$REG_L	$gp,0*$SZREG($sp)
    511 	$PTR_ADD $sp,6*$SZREG
    512 ___
    513 $code.=<<___;
    514 	jr	$ra
    515 	move	$a0,$v0
    516 
    517 .end	bn_sqr_words_internal
    518 
    519 .align	5
    520 .globl	bn_add_words
    521 .ent	bn_add_words
    522 bn_add_words:
    523 	.set	noreorder
    524 	bgtz	$a3,bn_add_words_internal
    525 	move	$v0,$zero
    526 	jr	$ra
    527 	move	$a0,$v0
    528 .end	bn_add_words
    529 
    530 .align	5
    531 .ent	bn_add_words_internal
    532 bn_add_words_internal:
    533 ___
    534 $code.=<<___ if ($flavour =~ /nubi/i);
    535 	.frame	$sp,6*$SZREG,$ra
    536 	.mask	0x8000f008,-$SZREG
    537 	.set	noreorder
    538 	$PTR_SUB $sp,6*$SZREG
    539 	$REG_S	$ra,5*$SZREG($sp)
    540 	$REG_S	$t3,4*$SZREG($sp)
    541 	$REG_S	$t2,3*$SZREG($sp)
    542 	$REG_S	$t1,2*$SZREG($sp)
    543 	$REG_S	$t0,1*$SZREG($sp)
    544 	$REG_S	$gp,0*$SZREG($sp)
    545 ___
    546 $code.=<<___;
    547 	.set	reorder
    548 	li	$minus4,-4
    549 	and	$at,$a3,$minus4
    550 	$LD	$t0,0($a1)
    551 	beqz	$at,.L_bn_add_words_tail
    552 
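# Per word, in C-like terms, with c being the running carry (a
# descriptive sketch of what one step of the C reference bn_add_words
# computes):
#
#	t1 = a[i] + b[i];	carry  = (t1 < a[i]);
#	t2 = t1 + c;		carry += (t2 < t1);
#	r[i] = t2;		c = carry;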
    553 .L_bn_add_words_loop:
    554 	$LD	$ta0,0($a2)
    555 	subu	$a3,4
    556 	$LD	$t1,$BNSZ($a1)
    557 	and	$at,$a3,$minus4
    558 	$LD	$t2,2*$BNSZ($a1)
    559 	$PTR_ADD $a2,4*$BNSZ
    560 	$LD	$t3,3*$BNSZ($a1)
    561 	$PTR_ADD $a0,4*$BNSZ
    562 	$LD	$ta1,-3*$BNSZ($a2)
    563 	$PTR_ADD $a1,4*$BNSZ
    564 	$LD	$ta2,-2*$BNSZ($a2)
    565 	$LD	$ta3,-$BNSZ($a2)
    566 	$ADDU	$ta0,$t0
    567 	sltu	$t8,$ta0,$t0
    568 	$ADDU	$t0,$ta0,$v0
    569 	sltu	$v0,$t0,$ta0
    570 	$ST	$t0,-4*$BNSZ($a0)
    571 	$ADDU	$v0,$t8
    572 
    573 	$ADDU	$ta1,$t1
    574 	sltu	$t9,$ta1,$t1
    575 	$ADDU	$t1,$ta1,$v0
    576 	sltu	$v0,$t1,$ta1
    577 	$ST	$t1,-3*$BNSZ($a0)
    578 	$ADDU	$v0,$t9
    579 
    580 	$ADDU	$ta2,$t2
    581 	sltu	$t8,$ta2,$t2
    582 	$ADDU	$t2,$ta2,$v0
    583 	sltu	$v0,$t2,$ta2
    584 	$ST	$t2,-2*$BNSZ($a0)
    585 	$ADDU	$v0,$t8
    586 	
    587 	$ADDU	$ta3,$t3
    588 	sltu	$t9,$ta3,$t3
    589 	$ADDU	$t3,$ta3,$v0
    590 	sltu	$v0,$t3,$ta3
    591 	$ST	$t3,-$BNSZ($a0)
    592 	$ADDU	$v0,$t9
    593 	
    594 	.set	noreorder
    595 	bgtzl	$at,.L_bn_add_words_loop
    596 	$LD	$t0,0($a1)
    597 
    598 	beqz	$a3,.L_bn_add_words_return
    599 	nop
    600 
    601 .L_bn_add_words_tail:
    602 	.set	reorder
    603 	$LD	$t0,0($a1)
    604 	$LD	$ta0,0($a2)
    605 	$ADDU	$ta0,$t0
    606 	subu	$a3,1
    607 	sltu	$t8,$ta0,$t0
    608 	$ADDU	$t0,$ta0,$v0
    609 	sltu	$v0,$t0,$ta0
    610 	$ST	$t0,0($a0)
    611 	$ADDU	$v0,$t8
    612 	beqz	$a3,.L_bn_add_words_return
    613 
    614 	$LD	$t1,$BNSZ($a1)
    615 	$LD	$ta1,$BNSZ($a2)
    616 	$ADDU	$ta1,$t1
    617 	subu	$a3,1
    618 	sltu	$t9,$ta1,$t1
    619 	$ADDU	$t1,$ta1,$v0
    620 	sltu	$v0,$t1,$ta1
    621 	$ST	$t1,$BNSZ($a0)
    622 	$ADDU	$v0,$t9
    623 	beqz	$a3,.L_bn_add_words_return
    624 
    625 	$LD	$t2,2*$BNSZ($a1)
    626 	$LD	$ta2,2*$BNSZ($a2)
    627 	$ADDU	$ta2,$t2
    628 	sltu	$t8,$ta2,$t2
    629 	$ADDU	$t2,$ta2,$v0
    630 	sltu	$v0,$t2,$ta2
    631 	$ST	$t2,2*$BNSZ($a0)
    632 	$ADDU	$v0,$t8
    633 
    634 .L_bn_add_words_return:
    635 	.set	noreorder
    636 ___
    637 $code.=<<___ if ($flavour =~ /nubi/i);
    638 	$REG_L	$t3,4*$SZREG($sp)
    639 	$REG_L	$t2,3*$SZREG($sp)
    640 	$REG_L	$t1,2*$SZREG($sp)
    641 	$REG_L	$t0,1*$SZREG($sp)
    642 	$REG_L	$gp,0*$SZREG($sp)
    643 	$PTR_ADD $sp,6*$SZREG
    644 ___
    645 $code.=<<___;
    646 	jr	$ra
    647 	move	$a0,$v0
    648 
    649 .end	bn_add_words_internal
    650 
    651 .align	5
    652 .globl	bn_sub_words
    653 .ent	bn_sub_words
    654 bn_sub_words:
    655 	.set	noreorder
    656 	bgtz	$a3,bn_sub_words_internal
    657 	move	$v0,$zero
    658 	jr	$ra
    659 	move	$a0,$zero
    660 .end	bn_sub_words
    661 
    662 .align	5
    663 .ent	bn_sub_words_internal
    664 bn_sub_words_internal:
    665 ___
    666 $code.=<<___ if ($flavour =~ /nubi/i);
    667 	.frame	$sp,6*$SZREG,$ra
    668 	.mask	0x8000f008,-$SZREG
    669 	.set	noreorder
    670 	$PTR_SUB $sp,6*$SZREG
    671 	$REG_S	$ra,5*$SZREG($sp)
    672 	$REG_S	$t3,4*$SZREG($sp)
    673 	$REG_S	$t2,3*$SZREG($sp)
    674 	$REG_S	$t1,2*$SZREG($sp)
    675 	$REG_S	$t0,1*$SZREG($sp)
    676 	$REG_S	$gp,0*$SZREG($sp)
    677 ___
    678 $code.=<<___;
    679 	.set	reorder
    680 	li	$minus4,-4
    681 	and	$at,$a3,$minus4
    682 	$LD	$t0,0($a1)
    683 	beqz	$at,.L_bn_sub_words_tail
    684 
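# Per word, in C-like terms, with c being the running borrow (a
# descriptive sketch of what one step of the C reference bn_sub_words
# computes):
#
#	borrow  = (a[i] < b[i]);
#	t1 = a[i] - b[i];
#	t2 = t1 - c;		borrow += (t2 > t1);
#	r[i] = t2;		c = borrow;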
    685 .L_bn_sub_words_loop:
    686 	$LD	$ta0,0($a2)
    687 	subu	$a3,4
    688 	$LD	$t1,$BNSZ($a1)
    689 	and	$at,$a3,$minus4
    690 	$LD	$t2,2*$BNSZ($a1)
    691 	$PTR_ADD $a2,4*$BNSZ
    692 	$LD	$t3,3*$BNSZ($a1)
    693 	$PTR_ADD $a0,4*$BNSZ
    694 	$LD	$ta1,-3*$BNSZ($a2)
    695 	$PTR_ADD $a1,4*$BNSZ
    696 	$LD	$ta2,-2*$BNSZ($a2)
    697 	$LD	$ta3,-$BNSZ($a2)
    698 	sltu	$t8,$t0,$ta0
    699 	$SUBU	$ta0,$t0,$ta0
    700 	$SUBU	$t0,$ta0,$v0
    701 	sgtu	$v0,$t0,$ta0
    702 	$ST	$t0,-4*$BNSZ($a0)
    703 	$ADDU	$v0,$t8
    704 
    705 	sltu	$t9,$t1,$ta1
    706 	$SUBU	$ta1,$t1,$ta1
    707 	$SUBU	$t1,$ta1,$v0
    708 	sgtu	$v0,$t1,$ta1
    709 	$ST	$t1,-3*$BNSZ($a0)
    710 	$ADDU	$v0,$t9
    711 
    712 
    713 	sltu	$t8,$t2,$ta2
    714 	$SUBU	$ta2,$t2,$ta2
    715 	$SUBU	$t2,$ta2,$v0
    716 	sgtu	$v0,$t2,$ta2
    717 	$ST	$t2,-2*$BNSZ($a0)
    718 	$ADDU	$v0,$t8
    719 
    720 	sltu	$t9,$t3,$ta3
    721 	$SUBU	$ta3,$t3,$ta3
    722 	$SUBU	$t3,$ta3,$v0
    723 	sgtu	$v0,$t3,$ta3
    724 	$ST	$t3,-$BNSZ($a0)
    725 	$ADDU	$v0,$t9
    726 
    727 	.set	noreorder
    728 	bgtzl	$at,.L_bn_sub_words_loop
    729 	$LD	$t0,0($a1)
    730 
    731 	beqz	$a3,.L_bn_sub_words_return
    732 	nop
    733 
    734 .L_bn_sub_words_tail:
    735 	.set	reorder
    736 	$LD	$t0,0($a1)
    737 	$LD	$ta0,0($a2)
    738 	subu	$a3,1
    739 	sltu	$t8,$t0,$ta0
    740 	$SUBU	$ta0,$t0,$ta0
    741 	$SUBU	$t0,$ta0,$v0
    742 	sgtu	$v0,$t0,$ta0
    743 	$ST	$t0,0($a0)
    744 	$ADDU	$v0,$t8
    745 	beqz	$a3,.L_bn_sub_words_return
    746 
    747 	$LD	$t1,$BNSZ($a1)
    748 	subu	$a3,1
    749 	$LD	$ta1,$BNSZ($a2)
    750 	sltu	$t9,$t1,$ta1
    751 	$SUBU	$ta1,$t1,$ta1
    752 	$SUBU	$t1,$ta1,$v0
    753 	sgtu	$v0,$t1,$ta1
    754 	$ST	$t1,$BNSZ($a0)
    755 	$ADDU	$v0,$t9
    756 	beqz	$a3,.L_bn_sub_words_return
    757 
    758 	$LD	$t2,2*$BNSZ($a1)
    759 	$LD	$ta2,2*$BNSZ($a2)
    760 	sltu	$t8,$t2,$ta2
    761 	$SUBU	$ta2,$t2,$ta2
    762 	$SUBU	$t2,$ta2,$v0
    763 	sgtu	$v0,$t2,$ta2
    764 	$ST	$t2,2*$BNSZ($a0)
    765 	$ADDU	$v0,$t8
    766 
    767 .L_bn_sub_words_return:
    768 	.set	noreorder
    769 ___
    770 $code.=<<___ if ($flavour =~ /nubi/i);
    771 	$REG_L	$t3,4*$SZREG($sp)
    772 	$REG_L	$t2,3*$SZREG($sp)
    773 	$REG_L	$t1,2*$SZREG($sp)
    774 	$REG_L	$t0,1*$SZREG($sp)
    775 	$REG_L	$gp,0*$SZREG($sp)
    776 	$PTR_ADD $sp,6*$SZREG
    777 ___
    778 $code.=<<___;
    779 	jr	$ra
    780 	move	$a0,$v0
    781 .end	bn_sub_words_internal
    782 
    783 .align 5
    784 .globl	bn_div_3_words
    785 .ent	bn_div_3_words
    786 bn_div_3_words:
    787 	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2,
				# so we can keep two arguments and the
				# return address in registers instead
				# of on the stack:-)
    793 				
    794 	$LD	$a0,($a3)
    795 	move	$ta2,$a1
    796 	bne	$a0,$a2,bn_div_3_words_internal
    797 	$LD	$a1,-$BNSZ($a3)
    798 	li	$v0,-1
    799 	jr	$ra
    800 	move	$a0,$v0
    801 .end	bn_div_3_words
    802 
    803 .align	5
    804 .ent	bn_div_3_words_internal
    805 bn_div_3_words_internal:
    806 ___
    807 $code.=<<___ if ($flavour =~ /nubi/i);
    808 	.frame	$sp,6*$SZREG,$ra
    809 	.mask	0x8000f008,-$SZREG
    810 	.set	noreorder
    811 	$PTR_SUB $sp,6*$SZREG
    812 	$REG_S	$ra,5*$SZREG($sp)
    813 	$REG_S	$t3,4*$SZREG($sp)
    814 	$REG_S	$t2,3*$SZREG($sp)
    815 	$REG_S	$t1,2*$SZREG($sp)
    816 	$REG_S	$t0,1*$SZREG($sp)
    817 	$REG_S	$gp,0*$SZREG($sp)
    818 ___
    819 $code.=<<___;
    820 	.set	reorder
    821 	move	$ta3,$ra
    822 	bal	bn_div_words_internal
    823 	move	$ra,$ta3
    824 	$MULTU	$ta2,$v0
    825 	$LD	$t2,-2*$BNSZ($a3)
    826 	move	$ta0,$zero
    827 	mfhi	$t1
    828 	mflo	$t0
    829 	sltu	$t8,$t1,$a1
    830 .L_bn_div_3_words_inner_loop:
    831 	bnez	$t8,.L_bn_div_3_words_inner_loop_done
    832 	sgeu	$at,$t2,$t0
    833 	seq	$t9,$t1,$a1
    834 	and	$at,$t9
    835 	sltu	$t3,$t0,$ta2
    836 	$ADDU	$a1,$a2
    837 	$SUBU	$t1,$t3
    838 	$SUBU	$t0,$ta2
    839 	sltu	$t8,$t1,$a1
    840 	sltu	$ta0,$a1,$a2
    841 	or	$t8,$ta0
    842 	.set	noreorder
    843 	beqzl	$at,.L_bn_div_3_words_inner_loop
    844 	$SUBU	$v0,1
    845 	.set	reorder
    846 .L_bn_div_3_words_inner_loop_done:
    847 	.set	noreorder
    848 ___
    849 $code.=<<___ if ($flavour =~ /nubi/i);
    850 	$REG_L	$t3,4*$SZREG($sp)
    851 	$REG_L	$t2,3*$SZREG($sp)
    852 	$REG_L	$t1,2*$SZREG($sp)
    853 	$REG_L	$t0,1*$SZREG($sp)
    854 	$REG_L	$gp,0*$SZREG($sp)
    855 	$PTR_ADD $sp,6*$SZREG
    856 ___
    857 $code.=<<___;
    858 	jr	$ra
    859 	move	$a0,$v0
    860 .end	bn_div_3_words_internal
    861 
    862 .align	5
    863 .globl	bn_div_words
    864 .ent	bn_div_words
    865 bn_div_words:
    866 	.set	noreorder
    867 	bnez	$a2,bn_div_words_internal
    868 	li	$v0,-1		# I would rather signal div-by-zero
    869 				# which can be done with 'break 7'
    870 	jr	$ra
    871 	move	$a0,$v0
    872 .end	bn_div_words
    873 
    874 .align	5
    875 .ent	bn_div_words_internal
    876 bn_div_words_internal:
    877 ___
    878 $code.=<<___ if ($flavour =~ /nubi/i);
    879 	.frame	$sp,6*$SZREG,$ra
    880 	.mask	0x8000f008,-$SZREG
    881 	.set	noreorder
    882 	$PTR_SUB $sp,6*$SZREG
    883 	$REG_S	$ra,5*$SZREG($sp)
    884 	$REG_S	$t3,4*$SZREG($sp)
    885 	$REG_S	$t2,3*$SZREG($sp)
    886 	$REG_S	$t1,2*$SZREG($sp)
    887 	$REG_S	$t0,1*$SZREG($sp)
    888 	$REG_S	$gp,0*$SZREG($sp)
    889 ___
    890 $code.=<<___;
    891 	move	$v1,$zero
    892 	bltz	$a2,.L_bn_div_words_body
    893 	move	$t9,$v1
    894 	$SLL	$a2,1
    895 	bgtz	$a2,.-4
    896 	addu	$t9,1
    897 
    898 	.set	reorder
    899 	negu	$t1,$t9
    900 	li	$t2,-1
    901 	$SLL	$t2,$t1
    902 	and	$t2,$a0
    903 	$SRL	$at,$a1,$t1
    904 	.set	noreorder
    905 	bnezl	$t2,.+8
    906 	break	6		# signal overflow
    907 	.set	reorder
    908 	$SLL	$a0,$t9
    909 	$SLL	$a1,$t9
    910 	or	$a0,$at
    911 ___
    912 $QT=$ta0;
    913 $HH=$ta1;
    914 $DH=$v1;
    915 $code.=<<___;
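# What follows is schoolbook long division in half-word "digits": the
# top half of the normalized divisor yields a quotient-digit estimate,
# which the inner loop then corrects downwards. In C-like terms each of
# the two rounds below is roughly (a descriptive sketch only, not the
# exact instruction sequence):
#
#	q = (top_half(num) == top_half(div)) ? HALF_MASK
#					     : num / top_half(div);
#	while (div*q > current two-half-word remainder)
#		q--;
#	remainder -= div*q; shift in the next half-word; accumulate q;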
    916 .L_bn_div_words_body:
    917 	$SRL	$DH,$a2,4*$BNSZ	# bits
    918 	sgeu	$at,$a0,$a2
    919 	.set	noreorder
    920 	bnezl	$at,.+8
    921 	$SUBU	$a0,$a2
    922 	.set	reorder
    923 
    924 	li	$QT,-1
    925 	$SRL	$HH,$a0,4*$BNSZ	# bits
    926 	$SRL	$QT,4*$BNSZ	# q=0xffffffff
    927 	beq	$DH,$HH,.L_bn_div_words_skip_div1
    928 	$DIVU	$zero,$a0,$DH
    929 	mflo	$QT
    930 .L_bn_div_words_skip_div1:
    931 	$MULTU	$a2,$QT
    932 	$SLL	$t3,$a0,4*$BNSZ	# bits
    933 	$SRL	$at,$a1,4*$BNSZ	# bits
    934 	or	$t3,$at
    935 	mflo	$t0
    936 	mfhi	$t1
    937 .L_bn_div_words_inner_loop1:
    938 	sltu	$t2,$t3,$t0
    939 	seq	$t8,$HH,$t1
    940 	sltu	$at,$HH,$t1
    941 	and	$t2,$t8
    942 	sltu	$v0,$t0,$a2
    943 	or	$at,$t2
    944 	.set	noreorder
    945 	beqz	$at,.L_bn_div_words_inner_loop1_done
    946 	$SUBU	$t1,$v0
    947 	$SUBU	$t0,$a2
    948 	b	.L_bn_div_words_inner_loop1
    949 	$SUBU	$QT,1
    950 	.set	reorder
    951 .L_bn_div_words_inner_loop1_done:
    952 
    953 	$SLL	$a1,4*$BNSZ	# bits
    954 	$SUBU	$a0,$t3,$t0
    955 	$SLL	$v0,$QT,4*$BNSZ	# bits
    956 
    957 	li	$QT,-1
    958 	$SRL	$HH,$a0,4*$BNSZ	# bits
    959 	$SRL	$QT,4*$BNSZ	# q=0xffffffff
    960 	beq	$DH,$HH,.L_bn_div_words_skip_div2
    961 	$DIVU	$zero,$a0,$DH
    962 	mflo	$QT
    963 .L_bn_div_words_skip_div2:
    964 	$MULTU	$a2,$QT
    965 	$SLL	$t3,$a0,4*$BNSZ	# bits
    966 	$SRL	$at,$a1,4*$BNSZ	# bits
    967 	or	$t3,$at
    968 	mflo	$t0
    969 	mfhi	$t1
    970 .L_bn_div_words_inner_loop2:
    971 	sltu	$t2,$t3,$t0
    972 	seq	$t8,$HH,$t1
    973 	sltu	$at,$HH,$t1
    974 	and	$t2,$t8
    975 	sltu	$v1,$t0,$a2
    976 	or	$at,$t2
    977 	.set	noreorder
    978 	beqz	$at,.L_bn_div_words_inner_loop2_done
    979 	$SUBU	$t1,$v1
    980 	$SUBU	$t0,$a2
    981 	b	.L_bn_div_words_inner_loop2
    982 	$SUBU	$QT,1
    983 	.set	reorder
    984 .L_bn_div_words_inner_loop2_done:
    985 
    986 	$SUBU	$a0,$t3,$t0
    987 	or	$v0,$QT
    988 	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
    989 	$SRL	$a2,$t9		# restore $a2
    990 
    991 	.set	noreorder
    992 	move	$a1,$v1
    993 ___
    994 $code.=<<___ if ($flavour =~ /nubi/i);
    995 	$REG_L	$t3,4*$SZREG($sp)
    996 	$REG_L	$t2,3*$SZREG($sp)
    997 	$REG_L	$t1,2*$SZREG($sp)
    998 	$REG_L	$t0,1*$SZREG($sp)
    999 	$REG_L	$gp,0*$SZREG($sp)
   1000 	$PTR_ADD $sp,6*$SZREG
   1001 ___
   1002 $code.=<<___;
   1003 	jr	$ra
   1004 	move	$a0,$v0
   1005 .end	bn_div_words_internal
   1006 ___
   1007 undef $HH; undef $QT; undef $DH;
   1008 
   1009 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
   1010 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
   1011 
   1012 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
   1013 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
   1014 
   1015 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
   1016 
   1017 $code.=<<___;
   1018 
   1019 .align	5
   1020 .globl	bn_mul_comba8
   1021 .ent	bn_mul_comba8
   1022 bn_mul_comba8:
   1023 	.set	noreorder
   1024 ___
   1025 $code.=<<___ if ($flavour =~ /nubi/i);
   1026 	.frame	$sp,12*$SZREG,$ra
   1027 	.mask	0x803ff008,-$SZREG
   1028 	$PTR_SUB $sp,12*$SZREG
   1029 	$REG_S	$ra,11*$SZREG($sp)
   1030 	$REG_S	$s5,10*$SZREG($sp)
   1031 	$REG_S	$s4,9*$SZREG($sp)
   1032 	$REG_S	$s3,8*$SZREG($sp)
   1033 	$REG_S	$s2,7*$SZREG($sp)
   1034 	$REG_S	$s1,6*$SZREG($sp)
   1035 	$REG_S	$s0,5*$SZREG($sp)
   1036 	$REG_S	$t3,4*$SZREG($sp)
   1037 	$REG_S	$t2,3*$SZREG($sp)
   1038 	$REG_S	$t1,2*$SZREG($sp)
   1039 	$REG_S	$t0,1*$SZREG($sp)
   1040 	$REG_S	$gp,0*$SZREG($sp)
   1041 ___
   1042 $code.=<<___ if ($flavour !~ /nubi/i);
   1043 	.frame	$sp,6*$SZREG,$ra
   1044 	.mask	0x003f0000,-$SZREG
   1045 	$PTR_SUB $sp,6*$SZREG
   1046 	$REG_S	$s5,5*$SZREG($sp)
   1047 	$REG_S	$s4,4*$SZREG($sp)
   1048 	$REG_S	$s3,3*$SZREG($sp)
   1049 	$REG_S	$s2,2*$SZREG($sp)
   1050 	$REG_S	$s1,1*$SZREG($sp)
   1051 	$REG_S	$s0,0*$SZREG($sp)
   1052 ___
   1053 $code.=<<___;
   1054 
   1055 	.set	reorder
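# Comba multiplication: every a[i]*b[j] product is accumulated into a
# rotating triple of carry words. The mul_add_c(a,b,c1,c2,c3) step
# annotated throughout this routine is, in C-like terms, roughly (a
# descriptive sketch of the generic Comba primitive, with c1 the lowest
# of the three accumulators):
#
#	t   = a*b;			# double-width product
#	c1 += LO(t);	carry = (c1 < LO(t));
#	c2 += HI(t) + carry;
#	c3 += (c2 < HI(t) + carry);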
	$LD	$a_0,0($a1)	# If compiled with the -mips3 option on
				# an R5000 box, the assembler barks on
				# this line with a "should not have
				# mult/div as last instruction in bb
				# (R10K bug)" warning. If anybody out
				# there has a clue about how to
				# circumvent this, do send me a note.
				#		<appro\@fy.chalmers.se>
   1064 
   1065 	$LD	$b_0,0($a2)
   1066 	$LD	$a_1,$BNSZ($a1)
   1067 	$LD	$a_2,2*$BNSZ($a1)
   1068 	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
   1069 	$LD	$a_3,3*$BNSZ($a1)
   1070 	$LD	$b_1,$BNSZ($a2)
   1071 	$LD	$b_2,2*$BNSZ($a2)
   1072 	$LD	$b_3,3*$BNSZ($a2)
   1073 	mflo	$c_1
   1074 	mfhi	$c_2
   1075 
   1076 	$LD	$a_4,4*$BNSZ($a1)
   1077 	$LD	$a_5,5*$BNSZ($a1)
   1078 	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
   1079 	$LD	$a_6,6*$BNSZ($a1)
   1080 	$LD	$a_7,7*$BNSZ($a1)
   1081 	$LD	$b_4,4*$BNSZ($a2)
   1082 	$LD	$b_5,5*$BNSZ($a2)
   1083 	mflo	$t_1
   1084 	mfhi	$t_2
   1085 	$ADDU	$c_2,$t_1
   1086 	sltu	$at,$c_2,$t_1
   1087 	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
   1088 	$ADDU	$c_3,$t_2,$at
   1089 	$LD	$b_6,6*$BNSZ($a2)
   1090 	$LD	$b_7,7*$BNSZ($a2)
   1091 	$ST	$c_1,0($a0)	# r[0]=c1;
   1092 	mflo	$t_1
   1093 	mfhi	$t_2
   1094 	$ADDU	$c_2,$t_1
   1095 	sltu	$at,$c_2,$t_1
   1096 	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
   1097 	$ADDU	$t_2,$at
   1098 	$ADDU	$c_3,$t_2
   1099 	sltu	$c_1,$c_3,$t_2
   1100 	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
   1101 
   1102 	mflo	$t_1
   1103 	mfhi	$t_2
   1104 	$ADDU	$c_3,$t_1
   1105 	sltu	$at,$c_3,$t_1
   1106 	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
   1107 	$ADDU	$t_2,$at
   1108 	$ADDU	$c_1,$t_2
   1109 	mflo	$t_1
   1110 	mfhi	$t_2
   1111 	$ADDU	$c_3,$t_1
   1112 	sltu	$at,$c_3,$t_1
   1113 	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
   1114 	$ADDU	$t_2,$at
   1115 	$ADDU	$c_1,$t_2
   1116 	sltu	$c_2,$c_1,$t_2
   1117 	mflo	$t_1
   1118 	mfhi	$t_2
   1119 	$ADDU	$c_3,$t_1
   1120 	sltu	$at,$c_3,$t_1
   1121 	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
   1122 	$ADDU	$t_2,$at
   1123 	$ADDU	$c_1,$t_2
   1124 	sltu	$at,$c_1,$t_2
   1125 	$ADDU	$c_2,$at
   1126 	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
   1127 
   1128 	mflo	$t_1
   1129 	mfhi	$t_2
   1130 	$ADDU	$c_1,$t_1
   1131 	sltu	$at,$c_1,$t_1
   1132 	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
   1133 	$ADDU	$t_2,$at
   1134 	$ADDU	$c_2,$t_2
   1135 	sltu	$c_3,$c_2,$t_2
   1136 	mflo	$t_1
   1137 	mfhi	$t_2
   1138 	$ADDU	$c_1,$t_1
   1139 	sltu	$at,$c_1,$t_1
   1140 	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
   1141 	$ADDU	$t_2,$at
   1142 	$ADDU	$c_2,$t_2
   1143 	sltu	$at,$c_2,$t_2
   1144 	$ADDU	$c_3,$at
   1145 	mflo	$t_1
   1146 	mfhi	$t_2
   1147 	$ADDU	$c_1,$t_1
   1148 	sltu	$at,$c_1,$t_1
   1149 	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
   1150 	$ADDU	$t_2,$at
   1151 	$ADDU	$c_2,$t_2
   1152 	sltu	$at,$c_2,$t_2
   1153 	$ADDU	$c_3,$at
   1154 	mflo	$t_1
   1155 	mfhi	$t_2
   1156 	$ADDU	$c_1,$t_1
   1157 	sltu	$at,$c_1,$t_1
   1158 	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
   1159 	$ADDU	$t_2,$at
   1160 	$ADDU	$c_2,$t_2
   1161 	sltu	$at,$c_2,$t_2
   1162 	$ADDU	$c_3,$at
   1163 	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
   1164 
   1165 	mflo	$t_1
   1166 	mfhi	$t_2
   1167 	$ADDU	$c_2,$t_1
   1168 	sltu	$at,$c_2,$t_1
   1169 	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
   1170 	$ADDU	$t_2,$at
   1171 	$ADDU	$c_3,$t_2
   1172 	sltu	$c_1,$c_3,$t_2
   1173 	mflo	$t_1
   1174 	mfhi	$t_2
   1175 	$ADDU	$c_2,$t_1
   1176 	sltu	$at,$c_2,$t_1
   1177 	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
   1178 	$ADDU	$t_2,$at
   1179 	$ADDU	$c_3,$t_2
   1180 	sltu	$at,$c_3,$t_2
   1181 	$ADDU	$c_1,$at
   1182 	mflo	$t_1
   1183 	mfhi	$t_2
   1184 	$ADDU	$c_2,$t_1
   1185 	sltu	$at,$c_2,$t_1
   1186 	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
   1187 	$ADDU	$t_2,$at
   1188 	$ADDU	$c_3,$t_2
   1189 	sltu	$at,$c_3,$t_2
   1190 	$ADDU	$c_1,$at
   1191 	mflo	$t_1
   1192 	mfhi	$t_2
   1193 	$ADDU	$c_2,$t_1
   1194 	sltu	$at,$c_2,$t_1
   1195 	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
   1196 	$ADDU	$t_2,$at
   1197 	$ADDU	$c_3,$t_2
   1198 	sltu	$at,$c_3,$t_2
   1199 	$ADDU	$c_1,$at
   1200 	mflo	$t_1
   1201 	mfhi	$t_2
   1202 	$ADDU	$c_2,$t_1
   1203 	sltu	$at,$c_2,$t_1
   1204 	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
   1205 	$ADDU	$t_2,$at
   1206 	$ADDU	$c_3,$t_2
   1207 	sltu	$at,$c_3,$t_2
   1208 	$ADDU	$c_1,$at
   1209 	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
   1210 
   1211 	mflo	$t_1
   1212 	mfhi	$t_2
   1213 	$ADDU	$c_3,$t_1
   1214 	sltu	$at,$c_3,$t_1
   1215 	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
   1216 	$ADDU	$t_2,$at
   1217 	$ADDU	$c_1,$t_2
   1218 	sltu	$c_2,$c_1,$t_2
   1219 	mflo	$t_1
   1220 	mfhi	$t_2
   1221 	$ADDU	$c_3,$t_1
   1222 	sltu	$at,$c_3,$t_1
   1223 	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
   1224 	$ADDU	$t_2,$at
   1225 	$ADDU	$c_1,$t_2
   1226 	sltu	$at,$c_1,$t_2
   1227 	$ADDU	$c_2,$at
   1228 	mflo	$t_1
   1229 	mfhi	$t_2
   1230 	$ADDU	$c_3,$t_1
   1231 	sltu	$at,$c_3,$t_1
   1232 	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
   1233 	$ADDU	$t_2,$at
   1234 	$ADDU	$c_1,$t_2
   1235 	sltu	$at,$c_1,$t_2
   1236 	$ADDU	$c_2,$at
   1237 	mflo	$t_1
   1238 	mfhi	$t_2
   1239 	$ADDU	$c_3,$t_1
   1240 	sltu	$at,$c_3,$t_1
   1241 	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
   1242 	$ADDU	$t_2,$at
   1243 	$ADDU	$c_1,$t_2
   1244 	sltu	$at,$c_1,$t_2
   1245 	$ADDU	$c_2,$at
   1246 	mflo	$t_1
   1247 	mfhi	$t_2
   1248 	$ADDU	$c_3,$t_1
   1249 	sltu	$at,$c_3,$t_1
   1250 	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
   1251 	$ADDU	$t_2,$at
   1252 	$ADDU	$c_1,$t_2
   1253 	sltu	$at,$c_1,$t_2
   1254 	$ADDU	$c_2,$at
   1255 	mflo	$t_1
   1256 	mfhi	$t_2
   1257 	$ADDU	$c_3,$t_1
   1258 	sltu	$at,$c_3,$t_1
   1259 	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
   1260 	$ADDU	$t_2,$at
   1261 	$ADDU	$c_1,$t_2
   1262 	sltu	$at,$c_1,$t_2
   1263 	$ADDU	$c_2,$at
   1264 	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
   1265 
   1266 	mflo	$t_1
   1267 	mfhi	$t_2
   1268 	$ADDU	$c_1,$t_1
   1269 	sltu	$at,$c_1,$t_1
   1270 	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
   1271 	$ADDU	$t_2,$at
   1272 	$ADDU	$c_2,$t_2
   1273 	sltu	$c_3,$c_2,$t_2
   1274 	mflo	$t_1
   1275 	mfhi	$t_2
   1276 	$ADDU	$c_1,$t_1
   1277 	sltu	$at,$c_1,$t_1
   1278 	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
   1279 	$ADDU	$t_2,$at
   1280 	$ADDU	$c_2,$t_2
   1281 	sltu	$at,$c_2,$t_2
   1282 	$ADDU	$c_3,$at
   1283 	mflo	$t_1
   1284 	mfhi	$t_2
   1285 	$ADDU	$c_1,$t_1
   1286 	sltu	$at,$c_1,$t_1
   1287 	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
   1288 	$ADDU	$t_2,$at
   1289 	$ADDU	$c_2,$t_2
   1290 	sltu	$at,$c_2,$t_2
   1291 	$ADDU	$c_3,$at
   1292 	mflo	$t_1
   1293 	mfhi	$t_2
   1294 	$ADDU	$c_1,$t_1
   1295 	sltu	$at,$c_1,$t_1
   1296 	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
   1297 	$ADDU	$t_2,$at
   1298 	$ADDU	$c_2,$t_2
   1299 	sltu	$at,$c_2,$t_2
   1300 	$ADDU	$c_3,$at
   1301 	mflo	$t_1
   1302 	mfhi	$t_2
   1303 	$ADDU	$c_1,$t_1
   1304 	sltu	$at,$c_1,$t_1
   1305 	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
   1306 	$ADDU	$t_2,$at
   1307 	$ADDU	$c_2,$t_2
   1308 	sltu	$at,$c_2,$t_2
   1309 	$ADDU	$c_3,$at
   1310 	mflo	$t_1
   1311 	mfhi	$t_2
   1312 	$ADDU	$c_1,$t_1
   1313 	sltu	$at,$c_1,$t_1
   1314 	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
   1315 	$ADDU	$t_2,$at
   1316 	$ADDU	$c_2,$t_2
   1317 	sltu	$at,$c_2,$t_2
   1318 	$ADDU	$c_3,$at
   1319 	mflo	$t_1
   1320 	mfhi	$t_2
   1321 	$ADDU	$c_1,$t_1
   1322 	sltu	$at,$c_1,$t_1
   1323 	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
   1324 	$ADDU	$t_2,$at
   1325 	$ADDU	$c_2,$t_2
   1326 	sltu	$at,$c_2,$t_2
   1327 	$ADDU	$c_3,$at
   1328 	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
   1329 
   1330 	mflo	$t_1
   1331 	mfhi	$t_2
   1332 	$ADDU	$c_2,$t_1
   1333 	sltu	$at,$c_2,$t_1
   1334 	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
   1335 	$ADDU	$t_2,$at
   1336 	$ADDU	$c_3,$t_2
   1337 	sltu	$c_1,$c_3,$t_2
   1338 	mflo	$t_1
   1339 	mfhi	$t_2
   1340 	$ADDU	$c_2,$t_1
   1341 	sltu	$at,$c_2,$t_1
   1342 	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
   1343 	$ADDU	$t_2,$at
   1344 	$ADDU	$c_3,$t_2
   1345 	sltu	$at,$c_3,$t_2
   1346 	$ADDU	$c_1,$at
   1347 	mflo	$t_1
   1348 	mfhi	$t_2
   1349 	$ADDU	$c_2,$t_1
   1350 	sltu	$at,$c_2,$t_1
   1351 	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
   1352 	$ADDU	$t_2,$at
   1353 	$ADDU	$c_3,$t_2
   1354 	sltu	$at,$c_3,$t_2
   1355 	$ADDU	$c_1,$at
   1356 	mflo	$t_1
   1357 	mfhi	$t_2
   1358 	$ADDU	$c_2,$t_1
   1359 	sltu	$at,$c_2,$t_1
   1360 	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
   1361 	$ADDU	$t_2,$at
   1362 	$ADDU	$c_3,$t_2
   1363 	sltu	$at,$c_3,$t_2
   1364 	$ADDU	$c_1,$at
   1365 	mflo	$t_1
   1366 	mfhi	$t_2
   1367 	$ADDU	$c_2,$t_1
   1368 	sltu	$at,$c_2,$t_1
   1369 	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
   1370 	$ADDU	$t_2,$at
   1371 	$ADDU	$c_3,$t_2
   1372 	sltu	$at,$c_3,$t_2
   1373 	$ADDU	$c_1,$at
   1374 	mflo	$t_1
   1375 	mfhi	$t_2
   1376 	$ADDU	$c_2,$t_1
   1377 	sltu	$at,$c_2,$t_1
   1378 	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
   1379 	$ADDU	$t_2,$at
   1380 	$ADDU	$c_3,$t_2
   1381 	sltu	$at,$c_3,$t_2
   1382 	$ADDU	$c_1,$at
   1383 	mflo	$t_1
   1384 	mfhi	$t_2
   1385 	$ADDU	$c_2,$t_1
   1386 	sltu	$at,$c_2,$t_1
   1387 	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
   1388 	$ADDU	$t_2,$at
   1389 	$ADDU	$c_3,$t_2
   1390 	sltu	$at,$c_3,$t_2
   1391 	$ADDU	$c_1,$at
   1392 	mflo	$t_1
   1393 	mfhi	$t_2
   1394 	$ADDU	$c_2,$t_1
   1395 	sltu	$at,$c_2,$t_1
   1396 	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
   1397 	$ADDU	$t_2,$at
   1398 	$ADDU	$c_3,$t_2
   1399 	sltu	$at,$c_3,$t_2
   1400 	$ADDU	$c_1,$at
   1401 	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
   1402 
   1403 	mflo	$t_1
   1404 	mfhi	$t_2
   1405 	$ADDU	$c_3,$t_1
   1406 	sltu	$at,$c_3,$t_1
   1407 	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
   1408 	$ADDU	$t_2,$at
   1409 	$ADDU	$c_1,$t_2
   1410 	sltu	$c_2,$c_1,$t_2
   1411 	mflo	$t_1
   1412 	mfhi	$t_2
   1413 	$ADDU	$c_3,$t_1
   1414 	sltu	$at,$c_3,$t_1
   1415 	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
   1416 	$ADDU	$t_2,$at
   1417 	$ADDU	$c_1,$t_2
   1418 	sltu	$at,$c_1,$t_2
   1419 	$ADDU	$c_2,$at
   1420 	mflo	$t_1
   1421 	mfhi	$t_2
   1422 	$ADDU	$c_3,$t_1
   1423 	sltu	$at,$c_3,$t_1
   1424 	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
   1425 	$ADDU	$t_2,$at
   1426 	$ADDU	$c_1,$t_2
   1427 	sltu	$at,$c_1,$t_2
   1428 	$ADDU	$c_2,$at
   1429 	mflo	$t_1
   1430 	mfhi	$t_2
   1431 	$ADDU	$c_3,$t_1
   1432 	sltu	$at,$c_3,$t_1
   1433 	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
   1434 	$ADDU	$t_2,$at
   1435 	$ADDU	$c_1,$t_2
   1436 	sltu	$at,$c_1,$t_2
   1437 	$ADDU	$c_2,$at
   1438 	mflo	$t_1
   1439 	mfhi	$t_2
   1440 	$ADDU	$c_3,$t_1
   1441 	sltu	$at,$c_3,$t_1
   1442 	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
   1443 	$ADDU	$t_2,$at
   1444 	$ADDU	$c_1,$t_2
   1445 	sltu	$at,$c_1,$t_2
   1446 	$ADDU	$c_2,$at
   1447 	mflo	$t_1
   1448 	mfhi	$t_2
   1449 	$ADDU	$c_3,$t_1
   1450 	sltu	$at,$c_3,$t_1
   1451 	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
   1452 	$ADDU	$t_2,$at
   1453 	$ADDU	$c_1,$t_2
   1454 	sltu	$at,$c_1,$t_2
   1455 	$ADDU	$c_2,$at
   1456 	mflo	$t_1
   1457 	mfhi	$t_2
   1458 	$ADDU	$c_3,$t_1
   1459 	sltu	$at,$c_3,$t_1
   1460 	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
   1461 	$ADDU	$t_2,$at
   1462 	$ADDU	$c_1,$t_2
   1463 	sltu	$at,$c_1,$t_2
   1464 	$ADDU	$c_2,$at
   1465 	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
   1466 
   1467 	mflo	$t_1
   1468 	mfhi	$t_2
   1469 	$ADDU	$c_1,$t_1
   1470 	sltu	$at,$c_1,$t_1
   1471 	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
   1472 	$ADDU	$t_2,$at
   1473 	$ADDU	$c_2,$t_2
   1474 	sltu	$c_3,$c_2,$t_2
   1475 	mflo	$t_1
   1476 	mfhi	$t_2
   1477 	$ADDU	$c_1,$t_1
   1478 	sltu	$at,$c_1,$t_1
   1479 	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
   1480 	$ADDU	$t_2,$at
   1481 	$ADDU	$c_2,$t_2
   1482 	sltu	$at,$c_2,$t_2
   1483 	$ADDU	$c_3,$at
   1484 	mflo	$t_1
   1485 	mfhi	$t_2
   1486 	$ADDU	$c_1,$t_1
   1487 	sltu	$at,$c_1,$t_1
   1488 	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
   1489 	$ADDU	$t_2,$at
   1490 	$ADDU	$c_2,$t_2
   1491 	sltu	$at,$c_2,$t_2
   1492 	$ADDU	$c_3,$at
   1493 	mflo	$t_1
   1494 	mfhi	$t_2
   1495 	$ADDU	$c_1,$t_1
   1496 	sltu	$at,$c_1,$t_1
   1497 	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
   1498 	$ADDU	$t_2,$at
   1499 	$ADDU	$c_2,$t_2
   1500 	sltu	$at,$c_2,$t_2
   1501 	$ADDU	$c_3,$at
   1502 	mflo	$t_1
   1503 	mfhi	$t_2
   1504 	$ADDU	$c_1,$t_1
   1505 	sltu	$at,$c_1,$t_1
   1506 	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
   1507 	$ADDU	$t_2,$at
   1508 	$ADDU	$c_2,$t_2
   1509 	sltu	$at,$c_2,$t_2
   1510 	$ADDU	$c_3,$at
   1511 	mflo	$t_1
   1512 	mfhi	$t_2
   1513 	$ADDU	$c_1,$t_1
   1514 	sltu	$at,$c_1,$t_1
   1515 	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
   1516 	$ADDU	$t_2,$at
   1517 	$ADDU	$c_2,$t_2
   1518 	sltu	$at,$c_2,$t_2
   1519 	$ADDU	$c_3,$at
   1520 	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
   1521 
   1522 	mflo	$t_1
   1523 	mfhi	$t_2
   1524 	$ADDU	$c_2,$t_1
   1525 	sltu	$at,$c_2,$t_1
   1526 	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
   1527 	$ADDU	$t_2,$at
   1528 	$ADDU	$c_3,$t_2
   1529 	sltu	$c_1,$c_3,$t_2
   1530 	mflo	$t_1
   1531 	mfhi	$t_2
   1532 	$ADDU	$c_2,$t_1
   1533 	sltu	$at,$c_2,$t_1
   1534 	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
   1535 	$ADDU	$t_2,$at
   1536 	$ADDU	$c_3,$t_2
   1537 	sltu	$at,$c_3,$t_2
   1538 	$ADDU	$c_1,$at
   1539 	mflo	$t_1
   1540 	mfhi	$t_2
   1541 	$ADDU	$c_2,$t_1
   1542 	sltu	$at,$c_2,$t_1
   1543 	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
   1544 	$ADDU	$t_2,$at
   1545 	$ADDU	$c_3,$t_2
   1546 	sltu	$at,$c_3,$t_2
   1547 	$ADDU	$c_1,$at
   1548 	mflo	$t_1
   1549 	mfhi	$t_2
   1550 	$ADDU	$c_2,$t_1
   1551 	sltu	$at,$c_2,$t_1
   1552 	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
   1553 	$ADDU	$t_2,$at
   1554 	$ADDU	$c_3,$t_2
   1555 	sltu	$at,$c_3,$t_2
   1556 	$ADDU	$c_1,$at
   1557 	mflo	$t_1
   1558 	mfhi	$t_2
   1559 	$ADDU	$c_2,$t_1
   1560 	sltu	$at,$c_2,$t_1
   1561 	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
   1562 	$ADDU	$t_2,$at
   1563 	$ADDU	$c_3,$t_2
   1564 	sltu	$at,$c_3,$t_2
   1565 	$ADDU	$c_1,$at
   1566 	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
   1567 
   1568 	mflo	$t_1
   1569 	mfhi	$t_2
   1570 	$ADDU	$c_3,$t_1
   1571 	sltu	$at,$c_3,$t_1
   1572 	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
   1573 	$ADDU	$t_2,$at
   1574 	$ADDU	$c_1,$t_2
   1575 	sltu	$c_2,$c_1,$t_2
   1576 	mflo	$t_1
   1577 	mfhi	$t_2
   1578 	$ADDU	$c_3,$t_1
   1579 	sltu	$at,$c_3,$t_1
   1580 	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
   1581 	$ADDU	$t_2,$at
   1582 	$ADDU	$c_1,$t_2
   1583 	sltu	$at,$c_1,$t_2
   1584 	$ADDU	$c_2,$at
   1585 	mflo	$t_1
   1586 	mfhi	$t_2
   1587 	$ADDU	$c_3,$t_1
   1588 	sltu	$at,$c_3,$t_1
   1589 	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
   1590 	$ADDU	$t_2,$at
   1591 	$ADDU	$c_1,$t_2
   1592 	sltu	$at,$c_1,$t_2
   1593 	$ADDU	$c_2,$at
   1594 	mflo	$t_1
   1595 	mfhi	$t_2
   1596 	$ADDU	$c_3,$t_1
   1597 	sltu	$at,$c_3,$t_1
   1598 	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
   1599 	$ADDU	$t_2,$at
   1600 	$ADDU	$c_1,$t_2
   1601 	sltu	$at,$c_1,$t_2
   1602 	$ADDU	$c_2,$at
   1603 	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
   1604 
   1605 	mflo	$t_1
   1606 	mfhi	$t_2
   1607 	$ADDU	$c_1,$t_1
   1608 	sltu	$at,$c_1,$t_1
   1609 	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
   1610 	$ADDU	$t_2,$at
   1611 	$ADDU	$c_2,$t_2
   1612 	sltu	$c_3,$c_2,$t_2
   1613 	mflo	$t_1
   1614 	mfhi	$t_2
   1615 	$ADDU	$c_1,$t_1
   1616 	sltu	$at,$c_1,$t_1
   1617 	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
   1618 	$ADDU	$t_2,$at
   1619 	$ADDU	$c_2,$t_2
   1620 	sltu	$at,$c_2,$t_2
   1621 	$ADDU	$c_3,$at
   1622 	mflo	$t_1
   1623 	mfhi	$t_2
   1624 	$ADDU	$c_1,$t_1
   1625 	sltu	$at,$c_1,$t_1
   1626 	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
   1627 	$ADDU	$t_2,$at
   1628 	$ADDU	$c_2,$t_2
   1629 	sltu	$at,$c_2,$t_2
   1630 	$ADDU	$c_3,$at
   1631 	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
   1632 
   1633 	mflo	$t_1
   1634 	mfhi	$t_2
   1635 	$ADDU	$c_2,$t_1
   1636 	sltu	$at,$c_2,$t_1
   1637 	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
   1638 	$ADDU	$t_2,$at
   1639 	$ADDU	$c_3,$t_2
   1640 	sltu	$c_1,$c_3,$t_2
   1641 	mflo	$t_1
   1642 	mfhi	$t_2
   1643 	$ADDU	$c_2,$t_1
   1644 	sltu	$at,$c_2,$t_1
   1645 	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
   1646 	$ADDU	$t_2,$at
   1647 	$ADDU	$c_3,$t_2
   1648 	sltu	$at,$c_3,$t_2
   1649 	$ADDU	$c_1,$at
   1650 	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
   1651 
   1652 	mflo	$t_1
   1653 	mfhi	$t_2
   1654 	$ADDU	$c_3,$t_1
   1655 	sltu	$at,$c_3,$t_1
   1656 	$ADDU	$t_2,$at
   1657 	$ADDU	$c_1,$t_2
   1658 	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
   1659 	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
   1660 
   1661 	.set	noreorder
   1662 ___
   1663 $code.=<<___ if ($flavour =~ /nubi/i);
   1664 	$REG_L	$s5,10*$SZREG($sp)
   1665 	$REG_L	$s4,9*$SZREG($sp)
   1666 	$REG_L	$s3,8*$SZREG($sp)
   1667 	$REG_L	$s2,7*$SZREG($sp)
   1668 	$REG_L	$s1,6*$SZREG($sp)
   1669 	$REG_L	$s0,5*$SZREG($sp)
   1670 	$REG_L	$t3,4*$SZREG($sp)
   1671 	$REG_L	$t2,3*$SZREG($sp)
   1672 	$REG_L	$t1,2*$SZREG($sp)
   1673 	$REG_L	$t0,1*$SZREG($sp)
   1674 	$REG_L	$gp,0*$SZREG($sp)
   1675 	jr	$ra
   1676 	$PTR_ADD $sp,12*$SZREG
   1677 ___
   1678 $code.=<<___ if ($flavour !~ /nubi/i);
   1679 	$REG_L	$s5,5*$SZREG($sp)
   1680 	$REG_L	$s4,4*$SZREG($sp)
   1681 	$REG_L	$s3,3*$SZREG($sp)
   1682 	$REG_L	$s2,2*$SZREG($sp)
   1683 	$REG_L	$s1,1*$SZREG($sp)
   1684 	$REG_L	$s0,0*$SZREG($sp)
   1685 	jr	$ra
   1686 	$PTR_ADD $sp,6*$SZREG
   1687 ___
   1688 $code.=<<___;
   1689 .end	bn_mul_comba8
   1690 
   1691 .align	5
   1692 .globl	bn_mul_comba4
   1693 .ent	bn_mul_comba4
   1694 bn_mul_comba4:
   1695 ___
   1696 $code.=<<___ if ($flavour =~ /nubi/i);
   1697 	.frame	$sp,6*$SZREG,$ra
   1698 	.mask	0x8000f008,-$SZREG
   1699 	.set	noreorder
   1700 	$PTR_SUB $sp,6*$SZREG
   1701 	$REG_S	$ra,5*$SZREG($sp)
   1702 	$REG_S	$t3,4*$SZREG($sp)
   1703 	$REG_S	$t2,3*$SZREG($sp)
   1704 	$REG_S	$t1,2*$SZREG($sp)
   1705 	$REG_S	$t0,1*$SZREG($sp)
   1706 	$REG_S	$gp,0*$SZREG($sp)
   1707 ___
   1708 $code.=<<___;
   1709 	.set	reorder
   1710 	$LD	$a_0,0($a1)
   1711 	$LD	$b_0,0($a2)
   1712 	$LD	$a_1,$BNSZ($a1)
   1713 	$LD	$a_2,2*$BNSZ($a1)
   1714 	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
   1715 	$LD	$a_3,3*$BNSZ($a1)
   1716 	$LD	$b_1,$BNSZ($a2)
   1717 	$LD	$b_2,2*$BNSZ($a2)
   1718 	$LD	$b_3,3*$BNSZ($a2)
   1719 	mflo	$c_1
   1720 	mfhi	$c_2
   1721 	$ST	$c_1,0($a0)
   1722 
   1723 	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
   1724 	mflo	$t_1
   1725 	mfhi	$t_2
   1726 	$ADDU	$c_2,$t_1
   1727 	sltu	$at,$c_2,$t_1
   1728 	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
   1729 	$ADDU	$c_3,$t_2,$at
   1730 	mflo	$t_1
   1731 	mfhi	$t_2
   1732 	$ADDU	$c_2,$t_1
   1733 	sltu	$at,$c_2,$t_1
   1734 	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
   1735 	$ADDU	$t_2,$at
   1736 	$ADDU	$c_3,$t_2
   1737 	sltu	$c_1,$c_3,$t_2
   1738 	$ST	$c_2,$BNSZ($a0)
   1739 
   1740 	mflo	$t_1
   1741 	mfhi	$t_2
   1742 	$ADDU	$c_3,$t_1
   1743 	sltu	$at,$c_3,$t_1
   1744 	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
   1745 	$ADDU	$t_2,$at
   1746 	$ADDU	$c_1,$t_2
   1747 	mflo	$t_1
   1748 	mfhi	$t_2
   1749 	$ADDU	$c_3,$t_1
   1750 	sltu	$at,$c_3,$t_1
   1751 	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
   1752 	$ADDU	$t_2,$at
   1753 	$ADDU	$c_1,$t_2
   1754 	sltu	$c_2,$c_1,$t_2
   1755 	mflo	$t_1
   1756 	mfhi	$t_2
   1757 	$ADDU	$c_3,$t_1
   1758 	sltu	$at,$c_3,$t_1
   1759 	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
   1760 	$ADDU	$t_2,$at
   1761 	$ADDU	$c_1,$t_2
   1762 	sltu	$at,$c_1,$t_2
   1763 	$ADDU	$c_2,$at
   1764 	$ST	$c_3,2*$BNSZ($a0)
   1765 
   1766 	mflo	$t_1
   1767 	mfhi	$t_2
   1768 	$ADDU	$c_1,$t_1
   1769 	sltu	$at,$c_1,$t_1
   1770 	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
   1771 	$ADDU	$t_2,$at
   1772 	$ADDU	$c_2,$t_2
   1773 	sltu	$c_3,$c_2,$t_2
   1774 	mflo	$t_1
   1775 	mfhi	$t_2
   1776 	$ADDU	$c_1,$t_1
   1777 	sltu	$at,$c_1,$t_1
   1778 	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
   1779 	$ADDU	$t_2,$at
   1780 	$ADDU	$c_2,$t_2
   1781 	sltu	$at,$c_2,$t_2
   1782 	$ADDU	$c_3,$at
   1783 	mflo	$t_1
   1784 	mfhi	$t_2
   1785 	$ADDU	$c_1,$t_1
   1786 	sltu	$at,$c_1,$t_1
   1787 	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
   1788 	$ADDU	$t_2,$at
   1789 	$ADDU	$c_2,$t_2
   1790 	sltu	$at,$c_2,$t_2
   1791 	$ADDU	$c_3,$at
   1792 	mflo	$t_1
   1793 	mfhi	$t_2
   1794 	$ADDU	$c_1,$t_1
   1795 	sltu	$at,$c_1,$t_1
   1796 	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
   1797 	$ADDU	$t_2,$at
   1798 	$ADDU	$c_2,$t_2
   1799 	sltu	$at,$c_2,$t_2
   1800 	$ADDU	$c_3,$at
   1801 	$ST	$c_1,3*$BNSZ($a0)
   1802 
   1803 	mflo	$t_1
   1804 	mfhi	$t_2
   1805 	$ADDU	$c_2,$t_1
   1806 	sltu	$at,$c_2,$t_1
   1807 	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
   1808 	$ADDU	$t_2,$at
   1809 	$ADDU	$c_3,$t_2
   1810 	sltu	$c_1,$c_3,$t_2
   1811 	mflo	$t_1
   1812 	mfhi	$t_2
   1813 	$ADDU	$c_2,$t_1
   1814 	sltu	$at,$c_2,$t_1
   1815 	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
   1816 	$ADDU	$t_2,$at
   1817 	$ADDU	$c_3,$t_2
   1818 	sltu	$at,$c_3,$t_2
   1819 	$ADDU	$c_1,$at
   1820 	mflo	$t_1
   1821 	mfhi	$t_2
   1822 	$ADDU	$c_2,$t_1
   1823 	sltu	$at,$c_2,$t_1
   1824 	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
   1825 	$ADDU	$t_2,$at
   1826 	$ADDU	$c_3,$t_2
   1827 	sltu	$at,$c_3,$t_2
   1828 	$ADDU	$c_1,$at
   1829 	$ST	$c_2,4*$BNSZ($a0)
   1830 
   1831 	mflo	$t_1
   1832 	mfhi	$t_2
   1833 	$ADDU	$c_3,$t_1
   1834 	sltu	$at,$c_3,$t_1
   1835 	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
   1836 	$ADDU	$t_2,$at
   1837 	$ADDU	$c_1,$t_2
   1838 	sltu	$c_2,$c_1,$t_2
   1839 	mflo	$t_1
   1840 	mfhi	$t_2
   1841 	$ADDU	$c_3,$t_1
   1842 	sltu	$at,$c_3,$t_1
   1843 	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
   1844 	$ADDU	$t_2,$at
   1845 	$ADDU	$c_1,$t_2
   1846 	sltu	$at,$c_1,$t_2
   1847 	$ADDU	$c_2,$at
   1848 	$ST	$c_3,5*$BNSZ($a0)
   1849 
   1850 	mflo	$t_1
   1851 	mfhi	$t_2
   1852 	$ADDU	$c_1,$t_1
   1853 	sltu	$at,$c_1,$t_1
   1854 	$ADDU	$t_2,$at
   1855 	$ADDU	$c_2,$t_2
   1856 	$ST	$c_1,6*$BNSZ($a0)
   1857 	$ST	$c_2,7*$BNSZ($a0)
   1858 
   1859 	.set	noreorder
   1860 ___
   1861 $code.=<<___ if ($flavour =~ /nubi/i);
   1862 	$REG_L	$t3,4*$SZREG($sp)
   1863 	$REG_L	$t2,3*$SZREG($sp)
   1864 	$REG_L	$t1,2*$SZREG($sp)
   1865 	$REG_L	$t0,1*$SZREG($sp)
   1866 	$REG_L	$gp,0*$SZREG($sp)
   1867 	$PTR_ADD $sp,6*$SZREG
   1868 ___
   1869 $code.=<<___;
   1870 	jr	$ra
   1871 	nop
   1872 .end	bn_mul_comba4
   1873 ___
   1874 
   1875 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
   1876 
   1877 $code.=<<___;
   1878 
   1879 .align	5
   1880 .globl	bn_sqr_comba8
   1881 .ent	bn_sqr_comba8
   1882 bn_sqr_comba8:
   1883 ___
   1884 $code.=<<___ if ($flavour =~ /nubi/i);
   1885 	.frame	$sp,6*$SZREG,$ra
   1886 	.mask	0x8000f008,-$SZREG
   1887 	.set	noreorder
   1888 	$PTR_SUB $sp,6*$SZREG
   1889 	$REG_S	$ra,5*$SZREG($sp)
   1890 	$REG_S	$t3,4*$SZREG($sp)
   1891 	$REG_S	$t2,3*$SZREG($sp)
   1892 	$REG_S	$t1,2*$SZREG($sp)
   1893 	$REG_S	$t0,1*$SZREG($sp)
   1894 	$REG_S	$gp,0*$SZREG($sp)
   1895 ___
   1896 $code.=<<___;
   1897 	.set	reorder
   1898 	$LD	$a_0,0($a1)
   1899 	$LD	$a_1,$BNSZ($a1)
   1900 	$LD	$a_2,2*$BNSZ($a1)
   1901 	$LD	$a_3,3*$BNSZ($a1)
   1902 
   1903 	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
   1904 	$LD	$a_4,4*$BNSZ($a1)
   1905 	$LD	$a_5,5*$BNSZ($a1)
   1906 	$LD	$a_6,6*$BNSZ($a1)
   1907 	$LD	$a_7,7*$BNSZ($a1)
   1908 	mflo	$c_1
   1909 	mfhi	$c_2
   1910 	$ST	$c_1,0($a0)
   1911 
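# Off-diagonal products of the square are needed twice, so the
# mul_add_c2 steps below double the hi/lo product halves before
# accumulating them; in C-like terms, roughly (a descriptive sketch;
# the bit shifted out of hi goes straight into the third carry word):
#
#	t   = a*b;			# double-width product
#	c3 += MSB(HI(t));
#	hi  = (HI(t)<<1) | MSB(LO(t));
#	lo  = LO(t)<<1;
#	... then lo/hi are accumulated into c1/c2/c3 as in mul_add_c.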
   1912 	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
   1913 	mflo	$t_1
   1914 	mfhi	$t_2
   1915 	slt	$c_1,$t_2,$zero
   1916 	$SLL	$t_2,1
   1917 	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
   1918 	slt	$a2,$t_1,$zero
   1919 	$ADDU	$t_2,$a2
   1920 	$SLL	$t_1,1
   1921 	$ADDU	$c_2,$t_1
   1922 	sltu	$at,$c_2,$t_1
   1923 	$ADDU	$c_3,$t_2,$at
   1924 	$ST	$c_2,$BNSZ($a0)
   1925 
   1926 	mflo	$t_1
   1927 	mfhi	$t_2
   1928 	slt	$c_2,$t_2,$zero
   1929 	$SLL	$t_2,1
   1930 	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
   1931 	slt	$a2,$t_1,$zero
   1932 	$ADDU	$t_2,$a2
   1933 	$SLL	$t_1,1
   1934 	$ADDU	$c_3,$t_1
   1935 	sltu	$at,$c_3,$t_1
   1936 	$ADDU	$t_2,$at
   1937 	$ADDU	$c_1,$t_2
   1938 	sltu	$at,$c_1,$t_2
   1939 	$ADDU	$c_2,$at
   1940 	mflo	$t_1
   1941 	mfhi	$t_2
   1942 	$ADDU	$c_3,$t_1
   1943 	sltu	$at,$c_3,$t_1
   1944 	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
   1945 	$ADDU	$t_2,$at
   1946 	$ADDU	$c_1,$t_2
   1947 	sltu	$at,$c_1,$t_2
   1948 	$ADDU	$c_2,$at
   1949 	$ST	$c_3,2*$BNSZ($a0)
   1950 
   1951 	mflo	$t_1
   1952 	mfhi	$t_2
   1953 	slt	$c_3,$t_2,$zero
   1954 	$SLL	$t_2,1
   1955 	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
   1956 	slt	$a2,$t_1,$zero
   1957 	$ADDU	$t_2,$a2
   1958 	$SLL	$t_1,1
   1959 	$ADDU	$c_1,$t_1
   1960 	sltu	$at,$c_1,$t_1
   1961 	$ADDU	$t_2,$at
   1962 	$ADDU	$c_2,$t_2
   1963 	sltu	$at,$c_2,$t_2
   1964 	$ADDU	$c_3,$at
   1965 	mflo	$t_1
   1966 	mfhi	$t_2
   1967 	slt	$at,$t_2,$zero
   1968 	$ADDU	$c_3,$at
   1969 	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
   1970 	$SLL	$t_2,1
   1971 	slt	$a2,$t_1,$zero
   1972 	$ADDU	$t_2,$a2
   1973 	$SLL	$t_1,1
   1974 	$ADDU	$c_1,$t_1
   1975 	sltu	$at,$c_1,$t_1
   1976 	$ADDU	$t_2,$at
   1977 	$ADDU	$c_2,$t_2
   1978 	sltu	$at,$c_2,$t_2
   1979 	$ADDU	$c_3,$at
   1980 	$ST	$c_1,3*$BNSZ($a0)
   1981 
   1982 	mflo	$t_1
   1983 	mfhi	$t_2
   1984 	slt	$c_1,$t_2,$zero
   1985 	$SLL	$t_2,1
   1986 	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
   1987 	slt	$a2,$t_1,$zero
   1988 	$ADDU	$t_2,$a2
   1989 	$SLL	$t_1,1
   1990 	$ADDU	$c_2,$t_1
   1991 	sltu	$at,$c_2,$t_1
   1992 	$ADDU	$t_2,$at
   1993 	$ADDU	$c_3,$t_2
   1994 	sltu	$at,$c_3,$t_2
   1995 	$ADDU	$c_1,$at
   1996 	mflo	$t_1
   1997 	mfhi	$t_2
   1998 	slt	$at,$t_2,$zero
   1999 	$ADDU	$c_1,$at
   2000 	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
   2001 	$SLL	$t_2,1
   2002 	slt	$a2,$t_1,$zero
   2003 	$ADDU	$t_2,$a2
   2004 	$SLL	$t_1,1
   2005 	$ADDU	$c_2,$t_1
   2006 	sltu	$at,$c_2,$t_1
   2007 	$ADDU	$t_2,$at
   2008 	$ADDU	$c_3,$t_2
   2009 	sltu	$at,$c_3,$t_2
   2010 	$ADDU	$c_1,$at
   2011 	mflo	$t_1
   2012 	mfhi	$t_2
   2013 	$ADDU	$c_2,$t_1
   2014 	sltu	$at,$c_2,$t_1
   2015 	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
   2016 	$ADDU	$t_2,$at
   2017 	$ADDU	$c_3,$t_2
   2018 	sltu	$at,$c_3,$t_2
   2019 	$ADDU	$c_1,$at
   2020 	$ST	$c_2,4*$BNSZ($a0)
   2021 
   2022 	mflo	$t_1
   2023 	mfhi	$t_2
   2024 	slt	$c_2,$t_2,$zero
   2025 	$SLL	$t_2,1
   2026 	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
   2027 	slt	$a2,$t_1,$zero
   2028 	$ADDU	$t_2,$a2
   2029 	$SLL	$t_1,1
   2030 	$ADDU	$c_3,$t_1
   2031 	sltu	$at,$c_3,$t_1
   2032 	$ADDU	$t_2,$at
   2033 	$ADDU	$c_1,$t_2
   2034 	sltu	$at,$c_1,$t_2
   2035 	$ADDU	$c_2,$at
   2036 	mflo	$t_1
   2037 	mfhi	$t_2
   2038 	slt	$at,$t_2,$zero
   2039 	$ADDU	$c_2,$at
   2040 	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
   2041 	$SLL	$t_2,1
   2042 	slt	$a2,$t_1,$zero
   2043 	$ADDU	$t_2,$a2
   2044 	$SLL	$t_1,1
   2045 	$ADDU	$c_3,$t_1
   2046 	sltu	$at,$c_3,$t_1
   2047 	$ADDU	$t_2,$at
   2048 	$ADDU	$c_1,$t_2
   2049 	sltu	$at,$c_1,$t_2
   2050 	$ADDU	$c_2,$at
   2051 	mflo	$t_1
   2052 	mfhi	$t_2
   2053 	slt	$at,$t_2,$zero
   2054 	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
   2055 	$ADDU	$c_2,$at
   2056 	$SLL	$t_2,1
   2057 	slt	$a2,$t_1,$zero
   2058 	$ADDU	$t_2,$a2
   2059 	$SLL	$t_1,1
   2060 	$ADDU	$c_3,$t_1
   2061 	sltu	$at,$c_3,$t_1
   2062 	$ADDU	$t_2,$at
   2063 	$ADDU	$c_1,$t_2
   2064 	sltu	$at,$c_1,$t_2
   2065 	$ADDU	$c_2,$at
   2066 	$ST	$c_3,5*$BNSZ($a0)
   2067 
   2068 	mflo	$t_1
   2069 	mfhi	$t_2
   2070 	slt	$c_3,$t_2,$zero
   2071 	$SLL	$t_2,1
   2072 	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
   2073 	slt	$a2,$t_1,$zero
   2074 	$ADDU	$t_2,$a2
   2075 	$SLL	$t_1,1
   2076 	$ADDU	$c_1,$t_1
   2077 	sltu	$at,$c_1,$t_1
   2078 	$ADDU	$t_2,$at
   2079 	$ADDU	$c_2,$t_2
   2080 	sltu	$at,$c_2,$t_2
   2081 	$ADDU	$c_3,$at
   2082 	mflo	$t_1
   2083 	mfhi	$t_2
   2084 	slt	$at,$t_2,$zero
   2085 	$ADDU	$c_3,$at
   2086 	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
   2087 	$SLL	$t_2,1
   2088 	slt	$a2,$t_1,$zero
   2089 	$ADDU	$t_2,$a2
   2090 	$SLL	$t_1,1
   2091 	$ADDU	$c_1,$t_1
   2092 	sltu	$at,$c_1,$t_1
   2093 	$ADDU	$t_2,$at
   2094 	$ADDU	$c_2,$t_2
   2095 	sltu	$at,$c_2,$t_2
   2096 	$ADDU	$c_3,$at
   2097 	mflo	$t_1
   2098 	mfhi	$t_2
   2099 	slt	$at,$t_2,$zero
   2100 	$ADDU	$c_3,$at
   2101 	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
   2102 	$SLL	$t_2,1
   2103 	slt	$a2,$t_1,$zero
   2104 	$ADDU	$t_2,$a2
   2105 	$SLL	$t_1,1
   2106 	$ADDU	$c_1,$t_1
   2107 	sltu	$at,$c_1,$t_1
   2108 	$ADDU	$t_2,$at
   2109 	$ADDU	$c_2,$t_2
   2110 	sltu	$at,$c_2,$t_2
   2111 	$ADDU	$c_3,$at
   2112 	mflo	$t_1
   2113 	mfhi	$t_2
   2114 	$ADDU	$c_1,$t_1
   2115 	sltu	$at,$c_1,$t_1
   2116 	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
   2117 	$ADDU	$t_2,$at
   2118 	$ADDU	$c_2,$t_2
   2119 	sltu	$at,$c_2,$t_2
   2120 	$ADDU	$c_3,$at
   2121 	$ST	$c_1,6*$BNSZ($a0)
   2122 
   2123 	mflo	$t_1
   2124 	mfhi	$t_2
   2125 	slt	$c_1,$t_2,$zero
   2126 	$SLL	$t_2,1
   2127 	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
   2128 	slt	$a2,$t_1,$zero
   2129 	$ADDU	$t_2,$a2
   2130 	$SLL	$t_1,1
   2131 	$ADDU	$c_2,$t_1
   2132 	sltu	$at,$c_2,$t_1
   2133 	$ADDU	$t_2,$at
   2134 	$ADDU	$c_3,$t_2
   2135 	sltu	$at,$c_3,$t_2
   2136 	$ADDU	$c_1,$at
   2137 	mflo	$t_1
   2138 	mfhi	$t_2
   2139 	slt	$at,$t_2,$zero
   2140 	$ADDU	$c_1,$at
   2141 	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
   2142 	$SLL	$t_2,1
   2143 	slt	$a2,$t_1,$zero
   2144 	$ADDU	$t_2,$a2
   2145 	$SLL	$t_1,1
   2146 	$ADDU	$c_2,$t_1
   2147 	sltu	$at,$c_2,$t_1
   2148 	$ADDU	$t_2,$at
   2149 	$ADDU	$c_3,$t_2
   2150 	sltu	$at,$c_3,$t_2
   2151 	$ADDU	$c_1,$at
   2152 	mflo	$t_1
   2153 	mfhi	$t_2
   2154 	slt	$at,$t_2,$zero
   2155 	$ADDU	$c_1,$at
   2156 	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
   2157 	$SLL	$t_2,1
   2158 	slt	$a2,$t_1,$zero
   2159 	$ADDU	$t_2,$a2
   2160 	$SLL	$t_1,1
   2161 	$ADDU	$c_2,$t_1
   2162 	sltu	$at,$c_2,$t_1
   2163 	$ADDU	$t_2,$at
   2164 	$ADDU	$c_3,$t_2
   2165 	sltu	$at,$c_3,$t_2
   2166 	$ADDU	$c_1,$at
   2167 	mflo	$t_1
   2168 	mfhi	$t_2
   2169 	slt	$at,$t_2,$zero
   2170 	$ADDU	$c_1,$at
   2171 	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
   2172 	$SLL	$t_2,1
   2173 	slt	$a2,$t_1,$zero
   2174 	$ADDU	$t_2,$a2
   2175 	$SLL	$t_1,1
   2176 	$ADDU	$c_2,$t_1
   2177 	sltu	$at,$c_2,$t_1
   2178 	$ADDU	$t_2,$at
   2179 	$ADDU	$c_3,$t_2
   2180 	sltu	$at,$c_3,$t_2
   2181 	$ADDU	$c_1,$at
   2182 	$ST	$c_2,7*$BNSZ($a0)
   2183 
   2184 	mflo	$t_1
   2185 	mfhi	$t_2
   2186 	slt	$c_2,$t_2,$zero
   2187 	$SLL	$t_2,1
   2188 	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
   2189 	slt	$a2,$t_1,$zero
   2190 	$ADDU	$t_2,$a2
   2191 	$SLL	$t_1,1
   2192 	$ADDU	$c_3,$t_1
   2193 	sltu	$at,$c_3,$t_1
   2194 	$ADDU	$t_2,$at
   2195 	$ADDU	$c_1,$t_2
   2196 	sltu	$at,$c_1,$t_2
   2197 	$ADDU	$c_2,$at
   2198 	mflo	$t_1
   2199 	mfhi	$t_2
   2200 	slt	$at,$t_2,$zero
   2201 	$ADDU	$c_2,$at
   2202 	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
   2203 	$SLL	$t_2,1
   2204 	slt	$a2,$t_1,$zero
   2205 	$ADDU	$t_2,$a2
   2206 	$SLL	$t_1,1
   2207 	$ADDU	$c_3,$t_1
   2208 	sltu	$at,$c_3,$t_1
   2209 	$ADDU	$t_2,$at
   2210 	$ADDU	$c_1,$t_2
   2211 	sltu	$at,$c_1,$t_2
   2212 	$ADDU	$c_2,$at
   2213 	mflo	$t_1
   2214 	mfhi	$t_2
   2215 	slt	$at,$t_2,$zero
   2216 	$ADDU	$c_2,$at
   2217 	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
   2218 	$SLL	$t_2,1
   2219 	slt	$a2,$t_1,$zero
   2220 	$ADDU	$t_2,$a2
   2221 	$SLL	$t_1,1
   2222 	$ADDU	$c_3,$t_1
   2223 	sltu	$at,$c_3,$t_1
   2224 	$ADDU	$t_2,$at
   2225 	$ADDU	$c_1,$t_2
   2226 	sltu	$at,$c_1,$t_2
   2227 	$ADDU	$c_2,$at
   2228 	mflo	$t_1
   2229 	mfhi	$t_2
   2230 	$ADDU	$c_3,$t_1
   2231 	sltu	$at,$c_3,$t_1
   2232 	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
   2233 	$ADDU	$t_2,$at
   2234 	$ADDU	$c_1,$t_2
   2235 	sltu	$at,$c_1,$t_2
   2236 	$ADDU	$c_2,$at
   2237 	$ST	$c_3,8*$BNSZ($a0)
   2238 
   2239 	mflo	$t_1
   2240 	mfhi	$t_2
   2241 	slt	$c_3,$t_2,$zero
   2242 	$SLL	$t_2,1
   2243 	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
   2244 	slt	$a2,$t_1,$zero
   2245 	$ADDU	$t_2,$a2
   2246 	$SLL	$t_1,1
   2247 	$ADDU	$c_1,$t_1
   2248 	sltu	$at,$c_1,$t_1
   2249 	$ADDU	$t_2,$at
   2250 	$ADDU	$c_2,$t_2
   2251 	sltu	$at,$c_2,$t_2
   2252 	$ADDU	$c_3,$at
   2253 	mflo	$t_1
   2254 	mfhi	$t_2
   2255 	slt	$at,$t_2,$zero
   2256 	$ADDU	$c_3,$at
   2257 	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
   2258 	$SLL	$t_2,1
   2259 	slt	$a2,$t_1,$zero
   2260 	$ADDU	$t_2,$a2
   2261 	$SLL	$t_1,1
   2262 	$ADDU	$c_1,$t_1
   2263 	sltu	$at,$c_1,$t_1
   2264 	$ADDU	$t_2,$at
   2265 	$ADDU	$c_2,$t_2
   2266 	sltu	$at,$c_2,$t_2
   2267 	$ADDU	$c_3,$at
   2268 	mflo	$t_1
   2269 	mfhi	$t_2
   2270 	slt	$at,$t_2,$zero
   2271 	$ADDU	$c_3,$at
   2272 	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
   2273 	$SLL	$t_2,1
   2274 	slt	$a2,$t_1,$zero
   2275 	$ADDU	$t_2,$a2
   2276 	$SLL	$t_1,1
   2277 	$ADDU	$c_1,$t_1
   2278 	sltu	$at,$c_1,$t_1
   2279 	$ADDU	$t_2,$at
   2280 	$ADDU	$c_2,$t_2
   2281 	sltu	$at,$c_2,$t_2
   2282 	$ADDU	$c_3,$at
   2283 	$ST	$c_1,9*$BNSZ($a0)
   2284 
   2285 	mflo	$t_1
   2286 	mfhi	$t_2
   2287 	slt	$c_1,$t_2,$zero
   2288 	$SLL	$t_2,1
   2289 	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
   2290 	slt	$a2,$t_1,$zero
   2291 	$ADDU	$t_2,$a2
   2292 	$SLL	$t_1,1
   2293 	$ADDU	$c_2,$t_1
   2294 	sltu	$at,$c_2,$t_1
   2295 	$ADDU	$t_2,$at
   2296 	$ADDU	$c_3,$t_2
   2297 	sltu	$at,$c_3,$t_2
   2298 	$ADDU	$c_1,$at
   2299 	mflo	$t_1
   2300 	mfhi	$t_2
   2301 	slt	$at,$t_2,$zero
   2302 	$ADDU	$c_1,$at
   2303 	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
   2304 	$SLL	$t_2,1
   2305 	slt	$a2,$t_1,$zero
   2306 	$ADDU	$t_2,$a2
   2307 	$SLL	$t_1,1
   2308 	$ADDU	$c_2,$t_1
   2309 	sltu	$at,$c_2,$t_1
   2310 	$ADDU	$t_2,$at
   2311 	$ADDU	$c_3,$t_2
   2312 	sltu	$at,$c_3,$t_2
   2313 	$ADDU	$c_1,$at
   2314 	mflo	$t_1
   2315 	mfhi	$t_2
   2316 	$ADDU	$c_2,$t_1
   2317 	sltu	$at,$c_2,$t_1
   2318 	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
   2319 	$ADDU	$t_2,$at
   2320 	$ADDU	$c_3,$t_2
   2321 	sltu	$at,$c_3,$t_2
   2322 	$ADDU	$c_1,$at
   2323 	$ST	$c_2,10*$BNSZ($a0)
   2324 
   2325 	mflo	$t_1
   2326 	mfhi	$t_2
   2327 	slt	$c_2,$t_2,$zero
   2328 	$SLL	$t_2,1
   2329 	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
   2330 	slt	$a2,$t_1,$zero
   2331 	$ADDU	$t_2,$a2
   2332 	$SLL	$t_1,1
   2333 	$ADDU	$c_3,$t_1
   2334 	sltu	$at,$c_3,$t_1
   2335 	$ADDU	$t_2,$at
   2336 	$ADDU	$c_1,$t_2
   2337 	sltu	$at,$c_1,$t_2
   2338 	$ADDU	$c_2,$at
   2339 	mflo	$t_1
   2340 	mfhi	$t_2
   2341 	slt	$at,$t_2,$zero
   2342 	$ADDU	$c_2,$at
   2343 	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
   2344 	$SLL	$t_2,1
   2345 	slt	$a2,$t_1,$zero
   2346 	$ADDU	$t_2,$a2
   2347 	$SLL	$t_1,1
   2348 	$ADDU	$c_3,$t_1
   2349 	sltu	$at,$c_3,$t_1
   2350 	$ADDU	$t_2,$at
   2351 	$ADDU	$c_1,$t_2
   2352 	sltu	$at,$c_1,$t_2
   2353 	$ADDU	$c_2,$at
   2354 	$ST	$c_3,11*$BNSZ($a0)
   2355 
   2356 	mflo	$t_1
   2357 	mfhi	$t_2
   2358 	slt	$c_3,$t_2,$zero
   2359 	$SLL	$t_2,1
   2360 	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
   2361 	slt	$a2,$t_1,$zero
   2362 	$ADDU	$t_2,$a2
   2363 	$SLL	$t_1,1
   2364 	$ADDU	$c_1,$t_1
   2365 	sltu	$at,$c_1,$t_1
   2366 	$ADDU	$t_2,$at
   2367 	$ADDU	$c_2,$t_2
   2368 	sltu	$at,$c_2,$t_2
   2369 	$ADDU	$c_3,$at
   2370 	mflo	$t_1
   2371 	mfhi	$t_2
   2372 	$ADDU	$c_1,$t_1
   2373 	sltu	$at,$c_1,$t_1
   2374 	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
   2375 	$ADDU	$t_2,$at
   2376 	$ADDU	$c_2,$t_2
   2377 	sltu	$at,$c_2,$t_2
   2378 	$ADDU	$c_3,$at
   2379 	$ST	$c_1,12*$BNSZ($a0)
   2380 
   2381 	mflo	$t_1
   2382 	mfhi	$t_2
   2383 	slt	$c_1,$t_2,$zero
   2384 	$SLL	$t_2,1
   2385 	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
   2386 	slt	$a2,$t_1,$zero
   2387 	$ADDU	$t_2,$a2
   2388 	$SLL	$t_1,1
   2389 	$ADDU	$c_2,$t_1
   2390 	sltu	$at,$c_2,$t_1
   2391 	$ADDU	$t_2,$at
   2392 	$ADDU	$c_3,$t_2
   2393 	sltu	$at,$c_3,$t_2
   2394 	$ADDU	$c_1,$at
   2395 	$ST	$c_2,13*$BNSZ($a0)
   2396 
   2397 	mflo	$t_1
   2398 	mfhi	$t_2
   2399 	$ADDU	$c_3,$t_1
   2400 	sltu	$at,$c_3,$t_1
   2401 	$ADDU	$t_2,$at
   2402 	$ADDU	$c_1,$t_2
   2403 	$ST	$c_3,14*$BNSZ($a0)
   2404 	$ST	$c_1,15*$BNSZ($a0)
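	# r[14] and r[15] above complete the 16-word result of squaring the
	# 8-word input.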
   2405 
   2406 	.set	noreorder
   2407 ___
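# nubi flavour only: reload the temporaries and $gp saved in the prologue
# and pop the six-register stack frame before the shared return sequence.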
   2408 $code.=<<___ if ($flavour =~ /nubi/i);
   2409 	$REG_L	$t3,4*$SZREG($sp)
   2410 	$REG_L	$t2,3*$SZREG($sp)
   2411 	$REG_L	$t1,2*$SZREG($sp)
   2412 	$REG_L	$t0,1*$SZREG($sp)
   2413 	$REG_L	$gp,0*$SZREG($sp)
   2414 	$PTR_ADD $sp,6*$SZREG
   2415 ___
   2416 $code.=<<___;
   2417 	jr	$ra
   2418 	nop
   2419 .end	bn_sqr_comba8
   2420 
   2421 .align	5
   2422 .globl	bn_sqr_comba4
   2423 .ent	bn_sqr_comba4
   2424 bn_sqr_comba4:
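	# bn_sqr_comba4(r, a): Comba squaring of a 4-word input, writing the
	# 8-word result r[0..7]; the result pointer arrives in a0, the input
	# pointer in a1, and the running 3-word column sum rotates through
	# c_1, c_2 and c_3.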
   2425 ___
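# nubi flavour only: allocate a six-register frame and spill $ra, $t0-$t3
# and $gp so they can be restored on exit.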
   2426 $code.=<<___ if ($flavour =~ /nubi/i);
   2427 	.frame	$sp,6*$SZREG,$ra
   2428 	.mask	0x8000f008,-$SZREG
   2429 	.set	noreorder
   2430 	$PTR_SUB $sp,6*$SZREG
   2431 	$REG_S	$ra,5*$SZREG($sp)
   2432 	$REG_S	$t3,4*$SZREG($sp)
   2433 	$REG_S	$t2,3*$SZREG($sp)
   2434 	$REG_S	$t1,2*$SZREG($sp)
   2435 	$REG_S	$t0,1*$SZREG($sp)
   2436 	$REG_S	$gp,0*$SZREG($sp)
   2437 ___
   2438 $code.=<<___;
   2439 	.set	reorder
   2440 	$LD	$a_0,0($a1)
   2441 	$LD	$a_1,$BNSZ($a1)
   2442 	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
   2443 	$LD	$a_2,2*$BNSZ($a1)
   2444 	$LD	$a_3,3*$BNSZ($a1)
   2445 	mflo	$c_1
   2446 	mfhi	$c_2
   2447 	$ST	$c_1,0($a0)
   2448 
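	# r[0] is simply the low word of a[0]*a[0]; c_2 already holds its high
	# word and c_1 is about to be recycled as the new top carry word.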
   2449 	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
   2450 	mflo	$t_1
   2451 	mfhi	$t_2
   2452 	slt	$c_1,$t_2,$zero
   2453 	$SLL	$t_2,1
   2454 	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
   2455 	slt	$a2,$t_1,$zero
   2456 	$ADDU	$t_2,$a2
   2457 	$SLL	$t_1,1
   2458 	$ADDU	$c_2,$t_1
   2459 	sltu	$at,$c_2,$t_1
   2460 	$ADDU	$c_3,$t_2,$at
   2461 	$ST	$c_2,$BNSZ($a0)
   2462 
   2463 	mflo	$t_1
   2464 	mfhi	$t_2
   2465 	slt	$c_2,$t_2,$zero
   2466 	$SLL	$t_2,1
   2467 	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
   2468 	slt	$a2,$t_1,$zero
   2469 	$ADDU	$t_2,$a2
   2470 	$SLL	$t_1,1
   2471 	$ADDU	$c_3,$t_1
   2472 	sltu	$at,$c_3,$t_1
   2473 	$ADDU	$t_2,$at
   2474 	$ADDU	$c_1,$t_2
   2475 	sltu	$at,$c_1,$t_2
   2476 	$ADDU	$c_2,$at
   2477 	mflo	$t_1
   2478 	mfhi	$t_2
   2479 	$ADDU	$c_3,$t_1
   2480 	sltu	$at,$c_3,$t_1
   2481 	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
   2482 	$ADDU	$t_2,$at
   2483 	$ADDU	$c_1,$t_2
   2484 	sltu	$at,$c_1,$t_2
   2485 	$ADDU	$c_2,$at
   2486 	$ST	$c_3,2*$BNSZ($a0)
   2487 
   2488 	mflo	$t_1
   2489 	mfhi	$t_2
   2490 	slt	$c_3,$t_2,$zero
   2491 	$SLL	$t_2,1
    2492 	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
   2493 	slt	$a2,$t_1,$zero
   2494 	$ADDU	$t_2,$a2
   2495 	$SLL	$t_1,1
   2496 	$ADDU	$c_1,$t_1
   2497 	sltu	$at,$c_1,$t_1
   2498 	$ADDU	$t_2,$at
   2499 	$ADDU	$c_2,$t_2
   2500 	sltu	$at,$c_2,$t_2
   2501 	$ADDU	$c_3,$at
   2502 	mflo	$t_1
   2503 	mfhi	$t_2
   2504 	slt	$at,$t_2,$zero
   2505 	$ADDU	$c_3,$at
   2506 	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
   2507 	$SLL	$t_2,1
   2508 	slt	$a2,$t_1,$zero
   2509 	$ADDU	$t_2,$a2
   2510 	$SLL	$t_1,1
   2511 	$ADDU	$c_1,$t_1
   2512 	sltu	$at,$c_1,$t_1
   2513 	$ADDU	$t_2,$at
   2514 	$ADDU	$c_2,$t_2
   2515 	sltu	$at,$c_2,$t_2
   2516 	$ADDU	$c_3,$at
   2517 	$ST	$c_1,3*$BNSZ($a0)
   2518 
   2519 	mflo	$t_1
   2520 	mfhi	$t_2
   2521 	slt	$c_1,$t_2,$zero
   2522 	$SLL	$t_2,1
   2523 	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
   2524 	slt	$a2,$t_1,$zero
   2525 	$ADDU	$t_2,$a2
   2526 	$SLL	$t_1,1
   2527 	$ADDU	$c_2,$t_1
   2528 	sltu	$at,$c_2,$t_1
   2529 	$ADDU	$t_2,$at
   2530 	$ADDU	$c_3,$t_2
   2531 	sltu	$at,$c_3,$t_2
   2532 	$ADDU	$c_1,$at
   2533 	mflo	$t_1
   2534 	mfhi	$t_2
   2535 	$ADDU	$c_2,$t_1
   2536 	sltu	$at,$c_2,$t_1
   2537 	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
   2538 	$ADDU	$t_2,$at
   2539 	$ADDU	$c_3,$t_2
   2540 	sltu	$at,$c_3,$t_2
   2541 	$ADDU	$c_1,$at
   2542 	$ST	$c_2,4*$BNSZ($a0)
   2543 
   2544 	mflo	$t_1
   2545 	mfhi	$t_2
   2546 	slt	$c_2,$t_2,$zero
   2547 	$SLL	$t_2,1
   2548 	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
   2549 	slt	$a2,$t_1,$zero
   2550 	$ADDU	$t_2,$a2
   2551 	$SLL	$t_1,1
   2552 	$ADDU	$c_3,$t_1
   2553 	sltu	$at,$c_3,$t_1
   2554 	$ADDU	$t_2,$at
   2555 	$ADDU	$c_1,$t_2
   2556 	sltu	$at,$c_1,$t_2
   2557 	$ADDU	$c_2,$at
   2558 	$ST	$c_3,5*$BNSZ($a0)
   2559 
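	# Last column: the low word of a[3]*a[3] completes r[6], and its high
	# word plus the final carry becomes r[7].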
   2560 	mflo	$t_1
   2561 	mfhi	$t_2
   2562 	$ADDU	$c_1,$t_1
   2563 	sltu	$at,$c_1,$t_1
   2564 	$ADDU	$t_2,$at
   2565 	$ADDU	$c_2,$t_2
   2566 	$ST	$c_1,6*$BNSZ($a0)
   2567 	$ST	$c_2,7*$BNSZ($a0)
   2568 
   2569 	.set	noreorder
   2570 ___
   2571 $code.=<<___ if ($flavour =~ /nubi/i);
   2572 	$REG_L	$t3,4*$SZREG($sp)
   2573 	$REG_L	$t2,3*$SZREG($sp)
   2574 	$REG_L	$t1,2*$SZREG($sp)
   2575 	$REG_L	$t0,1*$SZREG($sp)
   2576 	$REG_L	$gp,0*$SZREG($sp)
   2577 	$PTR_ADD $sp,6*$SZREG
   2578 ___
   2579 $code.=<<___;
   2580 	jr	$ra
   2581 	nop
   2582 .end	bn_sqr_comba4
   2583 ___
   2584 print $code;
   2585 close STDOUT;
   2586