#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	Measurements with cc on a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler-
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	The number of operations per second increases by almost 75%.
#
#	Here are performance numbers for 64-bit compiler-
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%.
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code:
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari (at) us.ibm.com
#

$opf = shift;

if ($opf =~ /32\.s/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($opf =~ /64\.s/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $opf"; }

( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL, i.e. to add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'Cleverer' ways of doing these substitutions tend to hide information
# we need to keep obvious.
#
my @items = ("bn_sqr_comba4",
	     "bn_sqr_comba8",
	     "bn_mul_comba4",
	     "bn_mul_comba8",
	     "bn_sub_words",
	     "bn_add_words",
	     "bn_div_words",
	     "bn_sqr_words",
	     "bn_mul_words",
	     "bn_mul_add_words");

if    ($opf =~ /linux/)	{  do_linux();	}
elsif ($opf =~ /aix/)	{  do_aix();	}
elsif ($opf =~ /osx/)	{  do_osx();	}
else			{  do_bsd();	}

sub do_linux {
    $d=&data();

    if ($BITS==64) {
      foreach $t (@items) {
        $d =~ s/\.$t:/\
\t.section\t".opd","aw"\
\t.align\t3\
\t.globl\t$t\
$t:\
\t.quad\t.$t,.TOC.\@tocbase,0\
\t.size\t$t,24\
\t.previous\n\
\t.type\t.$t,\@function\
\t.globl\t.$t\
.$t:/g;
      }
    }
    else {
      foreach $t (@items) {
        $d=~s/\.$t/$t/g;
      }
    }
    # hide internal labels to avoid pollution of name table...
    $d=~s/Lppcasm_/.Lppcasm_/gm;
    print $d;
}
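
# For illustration, the 64-bit substitution above rewrites a label such
# as ".bn_add_words:" into a function descriptor in .opd plus the actual
# code entry point, i.e. roughly:
#
#		.section	".opd","aw"
#		.align	3
#		.globl	bn_add_words
#	bn_add_words:
#		.quad	.bn_add_words,.TOC.@tocbase,0
#		.size	bn_add_words,24
#		.previous
#		.type	.bn_add_words,@function
#		.globl	.bn_add_words
#	.bn_add_words: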

sub do_aix {
    # The AIX assembler is smart enough to please the linker without
    # us doing anything special...
    print &data();
}

# Mac OS X, 32-bit
sub do_osx {
    $d=&data();
    # Change the bn symbol prefix from '.' to '_'
    foreach $t (@items) {
      $d=~s/\.$t/_$t/g;
    }
    # Change .machine to something OS X asm will accept
    $d=~s/\.machine.*/.text/g;
    $d=~s/\#/;/g; # change comment from '#' to ';'
    print $d;
}

# BSD (untested)
sub do_bsd {
    $d=&data();
    foreach $t (@items) {
      $d=~s/\.$t/_$t/g;
    }
    print $d;
}

sub data {
	local($data)=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)        Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works with the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#	        you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines:
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
.set r0,0	# we use it as storage for value of 0
.set SP,1	# preserved
.set RTOC,2	# preserved
.set r3,3	# 1st argument/return value
.set r4,4	# 2nd argument/volatile register
.set r5,5	# 3rd argument/volatile register
.set r6,6	# ...
.set r7,7
.set r8,8
.set r9,9
.set r10,10
.set r11,11
.set r12,12
.set r13,13	# not used, nor any other "below" it...

.set BO_IF_NOT,4
.set BO_IF,12
.set BO_dCTR_NZERO,16
.set BO_dCTR_ZERO,18
.set BO_ALWAYS,20
.set CR0_LT,0
.set CR0_GT,1
.set CR0_EQ,2
.set CR1_FX,4
.set CR1_FEX,5
.set CR1_VX,6
.set LR,8
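
#	Encoding note: BO_ALWAYS (0b10100) means "branch unconditionally",
#	so the "bclr	BO_ALWAYS,CR0_LT" idiom used as a return throughout
#	this file ignores its CR-bit operand entirely.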

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#	        the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	$ISA

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly
#
#
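#	For reference, each sqr_add_c2(a,i,j,...) block below computes the
#	following (a sketch in C-like pseudocode; the three accumulator
#	words rotate among r9,r10,r11 from column to column):
#
#		(hi,lo)     = a[i]*a[j]		# $UMULH,$UMULL -> r8,r7
#		(c,hi,lo)   = 2*(hi,lo)		# addc/adde, addze catches c
#		(c3,c2,c1) += (c,hi,lo)		# addc/adde/addze chain
#
#	sqr_add_c(a,i,...) is the same without the doubling step.
#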
	xor		r0,r0,r0		# set r0 = 0. Used in the addze
						# instructions below

						#sqr_add_c(a,0,c1,c2,c3)
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5
	$UMULH		r10,r5,r5		#in first iteration. No need
						#to add since c1=c2=c3=0.
						# Note c3(r11) is NOT set to 0
						# but will be.

	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde		r8,r8,r8
	addze		r9,r0			# catch carry if any.
						# r9= r0(=0) and carry

	addc		r10,r7,r10		# now add to temp result.
	addze		r11,r8			# r8 added to r11 which is 0
	addze		r9,r9

	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
						#sqr_add_c(a,1,c3,c1,c2)
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10

	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction.
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# The possible optimization of loading all 8 longs of a into
# registers doesn't provide any speedup.
#

	xor		r0,r0,r0		#set r0 = 0. Used in addze
						#instructions below.

						#sqr_add_c(a,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
	$UMULH		r10,r5,r5
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10		#add the two register number
	adde		r11,r8,r0		# (r8,r7) to the three register
	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0

	addc		r10,r7,r10		#add the two register number
	adde		r11,r8,r11		# (r8,r7) to the three register
	addze		r9,r9			# number (r9,r11,r10).

	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2

						#sqr_add_c(a,1,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
						#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
						#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
						#sqr_add_c(a,4,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
						#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD		r5,`4*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
						#sqr_add_c(a,5,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
						#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD		r5,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD		r5,`5*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
						#sqr_add_c(a,6,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;

						#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD		r5,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
						#sqr_add_c(a,7,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST		r9,`15*$BNSZ`(r3)	#r[15]=c1;


	bclr	BO_ALWAYS,CR0_LT

	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
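#	Each mul_add_c(a[i],b[j],c1,c2,c3) block below is, in C terms
#	(a sketch; Lw/Hw are the low/high words of the double-wide product):
#
#		t   = (double-wide)a[i]*b[j];
#		c1 += Lw(t);			# addc
#		c2 += Hw(t) + carry;		# adde
#		c3 += carry;			# addze
#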
	xor	r0,r0,r0		#r0=0. Used in addze below.
					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.

					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
					#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
					#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
					#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
					#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
					#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
					#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
					#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
					#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
					#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
					#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
					#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.
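#
#	In C terms the loop below is the following sketch. Note that on
#	PPC the carry bit CA acts as a "no borrow" flag for subtraction,
#	which the subfc. below pre-sets:
#
#		BN_ULONG ca = 1;			/* subfc. sets CA  */
#		for (i = 0; i < n; i++) {
#			r[i] = a[i] - b[i] - (1 - ca);	/* subfe           */
#			ca   = (a[i] > b[i]) || (a[i] == b[i] && ca);
#		}
#		return 1 - ca;				/* subfze + andi.  */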

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8-1 as we need.
	$STU	r6,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1         # keep only last bit.
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000


#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.
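#
#	In C terms the loop below is simply (a sketch):
#
#		BN_ULONG c = 0;				/* addic. clears CA */
#		for (i = 0; i < n; i++) {
#			r[i] = a[i] + b[i] + c;		/* adde             */
#			c    = (r[i] < a[i]) || (c && r[i] == a[i]);
#		}
#		return c;				/* addze            */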

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	bc	BO_IF,CR0_EQ,Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of a call to num_bits_word. Since this was compiled
#	only at level -O2 it can probably be squeezed more.
#
#	r3 = h
#	r4 = l
#	r5 = d
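#
#	Rough sketch of the algorithm (bn_div_words semantics: return
#	(h*2^BN_BITS2 + l)/d, with the caller guaranteeing h < d):
#
#		i = BN_BITS2 - BN_num_bits_word(d);	/* normalize d     */
#		d <<= i;  h = (h<<i)|(l>>(BN_BITS2-i));  l <<= i;
#		for (count = 2; count > 0; count--) {	/* BN_BITS4 a pass */
#			q = (h>>BN_BITS4 == d>>BN_BITS4)
#			    ? BN_MASK2l : h/(d>>BN_BITS4);
#			while (q*dl overshoots) q--;	/* innerloop below */
#			h = (remainder<<BN_BITS4)|(next half of l);
#			ret = (ret<<BN_BITS4)|q;
#		}
#		return ret;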

	$UCMPI	0,r5,0			# compare r5 and 0
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
	li	r3,-1			# d=0 return -1
	bclr	BO_ALWAYS,CR0_LT
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	bc	BO_IF,CR0_EQ,Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> (BN_BITS2-i))
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h-th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t&BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	1,r6,r7			# compare (tl <= r7)
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	1,r3,r12		#compare h and th
	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9	#if (count==0) break;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	No unrolling done here. Not performance critical.
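#
#	C equivalent (a sketch):
#
#		for (i = 0; i < n; i++) {
#			r[2*i]   = Lw(a[i]*a[i]);	/* $UMULL */
#			r[2*i+1] = Hw(a[i]*a[i]);	/* $UMULH */
#		}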
   1823 
   1824 	addic.	r5,r5,0			#test r5.
   1825 	bc	BO_IF,CR0_EQ,Lppcasm_sqr_adios
   1826 	addi	r4,r4,-$BNSZ
   1827 	addi	r3,r3,-$BNSZ
   1828 	mtctr	r5
   1829 Lppcasm_sqr_mainloop:	
   1830 					#sqr(r[0],r[1],a[0]);
   1831 	$LDU	r6,$BNSZ(r4)
   1832 	$UMULL	r7,r6,r6
   1833 	$UMULH  r8,r6,r6
   1834 	$STU	r7,$BNSZ(r3)
   1835 	$STU	r8,$BNSZ(r3)
   1836 	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
   1837 Lppcasm_sqr_adios:	
   1838 	bclr	BO_ALWAYS,CR0_LT
   1839 	.long	0x00000000
   1840 
   1841 
   1842 #
#	NOTE:	The following label name should be changed to
#		"bn_mul_words", i.e. the leading dot removed,
#		for the gcc compiler. This should be done
#		automatically by the build.
   1847 #
   1848 
   1849 .align	4	
   1850 .bn_mul_words:
   1851 #
   1852 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
   1853 #
   1854 # r3 = rp
   1855 # r4 = ap
   1856 # r5 = num
   1857 # r6 = w
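#
# In rough C terms (a sketch only; mul() stands for the portable
# BN macro that forms the double-width product ap[0]*w and threads
# the carry c1 through):
#
#	c1 = 0;
#	while (num--) { mul(rp[0], ap[0], w, c1); ap++; rp++; }
#	return c1;
#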
   1858 	xor	r0,r0,r0
   1859 	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# r7 = num>>2 (sets CR0)
   1861 	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM
   1862 	mtctr	r7
   1863 Lppcasm_mw_LOOP:	
   1864 					#mul(rp[0],ap[0],w,c1);
   1865 	$LD	r8,`0*$BNSZ`(r4)
   1866 	$UMULL	r9,r6,r8
   1867 	$UMULH  r10,r6,r8
   1868 	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored;
					#multiply leaves it intact and
					#the adde in the second spin
					#below collects it
   1873 	$ST	r9,`0*$BNSZ`(r3)
   1874 					#mul(rp[1],ap[1],w,c1);
   1875 	$LD	r8,`1*$BNSZ`(r4)	
   1876 	$UMULL	r11,r6,r8
   1877 	$UMULH  r12,r6,r8
   1878 	adde	r11,r11,r10
   1879 	#addze	r12,r12
   1880 	$ST	r11,`1*$BNSZ`(r3)
   1881 					#mul(rp[2],ap[2],w,c1);
   1882 	$LD	r8,`2*$BNSZ`(r4)
   1883 	$UMULL	r9,r6,r8
   1884 	$UMULH  r10,r6,r8
   1885 	adde	r9,r9,r12
   1886 	#addze	r10,r10
   1887 	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
   1889 	$LD	r8,`3*$BNSZ`(r4)
   1890 	$UMULL	r11,r6,r8
   1891 	$UMULH  r12,r6,r8
   1892 	adde	r11,r11,r10
   1893 	addze	r12,r12			#this spin we collect carry into
   1894 					#r12
   1895 	$ST	r11,`3*$BNSZ`(r3)
   1896 	
   1897 	addi	r3,r3,`4*$BNSZ`
   1898 	addi	r4,r4,`4*$BNSZ`
   1899 	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
   1900 
   1901 Lppcasm_mw_REM:
	andi.	r5,r5,0x3		# r5 = num&3, leftover words (sets CR0)
   1903 	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
   1904 					#mul(rp[0],ap[0],w,c1);
   1905 	$LD	r8,`0*$BNSZ`(r4)
   1906 	$UMULL	r9,r6,r8
   1907 	$UMULH  r10,r6,r8
   1908 	addc	r9,r9,r12
   1909 	addze	r10,r10
   1910 	$ST	r9,`0*$BNSZ`(r3)
   1911 	addi	r12,r10,0
   1912 	
   1913 	addi	r5,r5,-1
   1914 	cmpli	0,0,r5,0
   1915 	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
   1916 
   1917 	
   1918 					#mul(rp[1],ap[1],w,c1);
   1919 	$LD	r8,`1*$BNSZ`(r4)	
   1920 	$UMULL	r9,r6,r8
   1921 	$UMULH  r10,r6,r8
   1922 	addc	r9,r9,r12
   1923 	addze	r10,r10
   1924 	$ST	r9,`1*$BNSZ`(r3)
   1925 	addi	r12,r10,0
   1926 	
   1927 	addi	r5,r5,-1
   1928 	cmpli	0,0,r5,0
   1929 	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
   1930 	
					#mul(rp[2],ap[2],w,c1);
   1932 	$LD	r8,`2*$BNSZ`(r4)
   1933 	$UMULL	r9,r6,r8
   1934 	$UMULH  r10,r6,r8
   1935 	addc	r9,r9,r12
   1936 	addze	r10,r10
   1937 	$ST	r9,`2*$BNSZ`(r3)
   1938 	addi	r12,r10,0
   1939 		
   1940 Lppcasm_mw_OVER:	
   1941 	addi	r3,r12,0
   1942 	bclr	BO_ALWAYS,CR0_LT
   1943 	.long	0x00000000
   1944 
   1945 #
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words", i.e. the leading dot removed,
#		for the gcc compiler. This should be done
#		automatically by the build.
   1950 #
   1951 
   1952 .align	4
   1953 .bn_mul_add_words:
   1954 #
   1955 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
   1956 #
   1957 # r3 = rp
   1958 # r4 = ap
   1959 # r5 = num
   1960 # r6 = w
   1961 #
# empirical evidence suggests that the unrolled version performs best
   1963 #
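#	In rough C terms (a sketch only; mul_add() stands for the
#	portable BN macro that adds the double-width product ap[0]*w
#	into rp[0], threading the carry c1 through):
#
#	c1 = 0;
#	while (num--) { mul_add(rp[0], ap[0], w, c1); ap++; rp++; }
#	return c1;
#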
   1964 	xor	r0,r0,r0		#r0 = 0
   1965 	xor	r12,r12,r12  		#r12 = 0 . used for carry		
	rlwinm.	r7,r5,30,2,31		# r7 = num>>2 (sets CR0)
	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
   1968 	mtctr	r7
   1969 Lppcasm_maw_mainloop:	
   1970 					#mul_add(rp[0],ap[0],w,c1);
   1971 	$LD	r8,`0*$BNSZ`(r4)
   1972 	$LD	r11,`0*$BNSZ`(r3)
   1973 	$UMULL	r9,r6,r8
   1974 	$UMULH  r10,r6,r8
   1975 	addc	r9,r9,r12		#r12 is carry.
   1976 	addze	r10,r10
   1977 	addc	r9,r9,r11
	#addze	r10,r10
					#the addze above is NOT needed:
					#the carry is not ignored, since
					#multiply leaves it intact and
					#the adde on the next spin
					#collects it
   1984 	$ST	r9,`0*$BNSZ`(r3)
   1985 	
   1986 					#mul_add(rp[1],ap[1],w,c1);
   1987 	$LD	r8,`1*$BNSZ`(r4)	
   1988 	$LD	r9,`1*$BNSZ`(r3)
   1989 	$UMULL	r11,r6,r8
   1990 	$UMULH  r12,r6,r8
   1991 	adde	r11,r11,r10		#r10 is carry.
   1992 	addze	r12,r12
   1993 	addc	r11,r11,r9
   1994 	#addze	r12,r12
   1995 	$ST	r11,`1*$BNSZ`(r3)
   1996 	
   1997 					#mul_add(rp[2],ap[2],w,c1);
   1998 	$LD	r8,`2*$BNSZ`(r4)
   1999 	$UMULL	r9,r6,r8
   2000 	$LD	r11,`2*$BNSZ`(r3)
   2001 	$UMULH  r10,r6,r8
   2002 	adde	r9,r9,r12
   2003 	addze	r10,r10
   2004 	addc	r9,r9,r11
   2005 	#addze	r10,r10
   2006 	$ST	r9,`2*$BNSZ`(r3)
   2007 	
   2008 					#mul_add(rp[3],ap[3],w,c1);
   2009 	$LD	r8,`3*$BNSZ`(r4)
   2010 	$UMULL	r11,r6,r8
   2011 	$LD	r9,`3*$BNSZ`(r3)
   2012 	$UMULH  r12,r6,r8
   2013 	adde	r11,r11,r10
   2014 	addze	r12,r12
   2015 	addc	r11,r11,r9
   2016 	addze	r12,r12
   2017 	$ST	r11,`3*$BNSZ`(r3)
   2018 	addi	r3,r3,`4*$BNSZ`
   2019 	addi	r4,r4,`4*$BNSZ`
   2020 	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
   2021 	
   2022 Lppcasm_maw_leftover:
	andi.	r5,r5,0x3		# r5 = num&3, leftover words (sets CR0)
   2024 	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
   2025 	addi	r3,r3,-$BNSZ
   2026 	addi	r4,r4,-$BNSZ
   2027 					#mul_add(rp[0],ap[0],w,c1);
   2028 	mtctr	r5
   2029 	$LDU	r8,$BNSZ(r4)
   2030 	$UMULL	r9,r6,r8
   2031 	$UMULH  r10,r6,r8
   2032 	$LDU	r11,$BNSZ(r3)
   2033 	addc	r9,r9,r11
   2034 	addze	r10,r10
   2035 	addc	r9,r9,r12
   2036 	addze	r12,r10
   2037 	$ST	r9,0(r3)
   2038 	
   2039 	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
   2040 					#mul_add(rp[1],ap[1],w,c1);
   2041 	$LDU	r8,$BNSZ(r4)	
   2042 	$UMULL	r9,r6,r8
   2043 	$UMULH  r10,r6,r8
   2044 	$LDU	r11,$BNSZ(r3)
   2045 	addc	r9,r9,r11
   2046 	addze	r10,r10
   2047 	addc	r9,r9,r12
   2048 	addze	r12,r10
   2049 	$ST	r9,0(r3)
   2050 	
   2051 	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
   2052 					#mul_add(rp[2],ap[2],w,c1);
   2053 	$LDU	r8,$BNSZ(r4)
   2054 	$UMULL	r9,r6,r8
   2055 	$UMULH  r10,r6,r8
   2056 	$LDU	r11,$BNSZ(r3)
   2057 	addc	r9,r9,r11
   2058 	addze	r10,r10
   2059 	addc	r9,r9,r12
   2060 	addze	r12,r10
   2061 	$ST	r9,0(r3)
   2062 		
   2063 Lppcasm_maw_adios:	
   2064 	addi	r3,r12,0
   2065 	bclr	BO_ALWAYS,CR0_LT
   2066 	.long	0x00000000
   2067 	.align	4
   2068 EOF
   2069 	$data =~ s/\`([^\`]*)\`/eval $1/gem;
   2070 
	# if some assembler chokes on a simplified mnemonic,
   2072 	# this is the spot to fix it up, e.g.:
   2073 	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
   2074 	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
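	# (so e.g. "cmplw 1,r4,r11" is rewritten to "cmpl 1,0,r4,r11")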
   2075 	# assembler X doesn't accept li, load immediate value
   2076 	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
   2077 	return($data);
   2078 }
   2079