#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this Perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case Perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know, there are several PowerPC ABIs in use. Most notably,
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which are ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
     17 #
     18 #       AIX performance
     19 #
#	Measurements with cc on a 200 MHz PowerPC 604e.
     21 #
     22 #	The following is the performance of 32-bit compiler
     23 #	generated code:
     24 #
     25 #	OpenSSL 0.9.6c 21 dec 2001
     26 #	built on: Tue Jun 11 11:06:51 EDT 2002
     27 #	options:bn(64,32) ...
     28 #compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
     29 #                  sign    verify    sign/s verify/s
     30 #rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
     31 #rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
     32 #rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
     33 #rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
     34 #dsa  512 bits   0.0087s   0.0106s    114.3     94.5
     35 #dsa 1024 bits   0.0256s   0.0313s     39.0     32.0	
     36 #
#	Same benchmark with this assembler code:
     38 #
     39 #rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
     40 #rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
     41 #rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
     42 #rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
     43 #dsa  512 bits   0.0052s   0.0062s    191.6    162.0
     44 #dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
     45 #
#	The number of operations per second increases by almost 75%.
     47 #
     48 #	Here are performance numbers for 64-bit compiler
     49 #	generated code:
     50 #
     51 #	OpenSSL 0.9.6g [engine] 9 Aug 2002
     52 #	built on: Fri Apr 18 16:59:20 EDT 2003
     53 #	options:bn(64,64) ...
     54 #	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
     55 #                  sign    verify    sign/s verify/s
     56 #rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
     57 #rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
     58 #rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
     59 #rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
     60 #dsa  512 bits   0.0026s   0.0032s    382.5    313.7
     61 #dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
     62 #
     63 #	Same benchmark with this assembler code:
     64 #
     65 #rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
     66 #rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
     67 #rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
     68 #rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
     69 #dsa  512 bits   0.0016s   0.0020s    610.7    507.1
     70 #dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
     71 #	
#	Again, performance increases by about 75%.
     73 #
#       Mac OS X, Apple G5 1.8GHz (note: this is 32-bit code)
     75 #       OpenSSL 0.9.7c 30 Sep 2003
     76 #
     77 #       Original code.
     78 #
     79 #rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
     80 #rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
     81 #rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
     82 #rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
     83 #dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
     84 #dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
     85 #dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
     86 #
     87 #       Same benchmark with this assembler code:
     88 #
     89 #rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
     90 #rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
     91 #rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
     92 #rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
     93 #dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
     94 #dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
     95 #dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
     96 #
     97 #        Performance increase of ~60%
     98 #
#	If you have comments or suggestions to improve the code, send
#	me a note at schari (at) us.ibm.com
    101 #
    102 
    103 $flavour = shift;
    104 
    105 if ($flavour =~ /32/) {
    106 	$BITS=	32;
    107 	$BNSZ=	$BITS/8;
    108 	$ISA=	"\"ppc\"";
    109 
    110 	$LD=	"lwz";		# load
    111 	$LDU=	"lwzu";		# load and update
    112 	$ST=	"stw";		# store
    113 	$STU=	"stwu";		# store and update
    114 	$UMULL=	"mullw";	# unsigned multiply low
    115 	$UMULH=	"mulhwu";	# unsigned multiply high
    116 	$UDIV=	"divwu";	# unsigned divide
    117 	$UCMPI=	"cmplwi";	# unsigned compare with immediate
    118 	$UCMP=	"cmplw";	# unsigned compare
    119 	$CNTLZ=	"cntlzw";	# count leading zeros
    120 	$SHL=	"slw";		# shift left
    121 	$SHR=	"srw";		# unsigned shift right
    122 	$SHRI=	"srwi";		# unsigned shift right by immediate	
    123 	$SHLI=	"slwi";		# shift left by immediate
    124 	$CLRU=	"clrlwi";	# clear upper bits
    125 	$INSR=	"insrwi";	# insert right
    126 	$ROTL=	"rotlwi";	# rotate left by immediate
    127 	$TR=	"tw";		# conditional trap
    128 } elsif ($flavour =~ /64/) {
    129 	$BITS=	64;
    130 	$BNSZ=	$BITS/8;
    131 	$ISA=	"\"ppc64\"";
    132 
    133 	# same as above, but 64-bit mnemonics...
    134 	$LD=	"ld";		# load
    135 	$LDU=	"ldu";		# load and update
    136 	$ST=	"std";		# store
    137 	$STU=	"stdu";		# store and update
    138 	$UMULL=	"mulld";	# unsigned multiply low
    139 	$UMULH=	"mulhdu";	# unsigned multiply high
    140 	$UDIV=	"divdu";	# unsigned divide
    141 	$UCMPI=	"cmpldi";	# unsigned compare with immediate
    142 	$UCMP=	"cmpld";	# unsigned compare
    143 	$CNTLZ=	"cntlzd";	# count leading zeros
    144 	$SHL=	"sld";		# shift left
    145 	$SHR=	"srd";		# unsigned shift right
    146 	$SHRI=	"srdi";		# unsigned shift right by immediate	
    147 	$SHLI=	"sldi";		# shift left by immediate
    148 	$CLRU=	"clrldi";	# clear upper bits
    149 	$INSR=	"insrdi";	# insert right 
    150 	$ROTL=	"rotldi";	# rotate left by immediate
    151 	$TR=	"td";		# conditional trap
    152 } else { die "nonsense $flavour"; }
    153 
    154 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    155 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
    156 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
    157 die "can't locate ppc-xlate.pl";
    158 
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
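
# Example invocation (a sketch; the actual flavour string and output file
# name come from the build system, so treat these as illustrative):
#
#	perl ppc.pl linux32 bn-ppc.s	# emit 32-bit mnemonics
#	perl ppc.pl linux64 bn-ppc64.s	# emit 64-bit mnemonics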
    160 
    161 $data=<<EOF;
    162 #--------------------------------------------------------------------
    163 #
    164 #
    165 #
    166 #
    167 #	File:		ppc32.s
    168 #
    169 #	Created by:	Suresh Chari
    170 #			IBM Thomas J. Watson Research Library
    171 #			Hawthorne, NY
    172 #
    173 #
    174 #	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
    176 #
    177 #
    178 #	Version History
    179 #
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up the code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)        Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
    188 #
    189 #
#	The following file works with the xlc, cc
#	and gcc compilers.
    192 #
    193 #	NOTE:	To get the file to link correctly with the gcc compiler
    194 #	        you have to change the names of the routines and remove
    195 #		the first .(dot) character. This should automatically
    196 #		be done in the build process.
    197 #
    198 #	Hand optimized assembly code for the following routines
    199 #	
    200 #	bn_sqr_comba4
    201 #	bn_sqr_comba8
    202 #	bn_mul_comba4
    203 #	bn_mul_comba8
    204 #	bn_sub_words
    205 #	bn_add_words
    206 #	bn_div_words
    207 #	bn_sqr_words
    208 #	bn_mul_words
    209 #	bn_mul_add_words
    210 #
#	NOTE:	It is possible to optimize this code further for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
    215 #
    216 #	If you have comments or suggestions to improve code send
    217 #	me a note at schari\@us.ibm.com
    218 #
    219 #--------------------------------------------------------------------------
    220 #
    221 #	Defines to be used in the assembly code.
    222 #	
    223 #.set r0,0	# we use it as storage for value of 0
    224 #.set SP,1	# preserved
    225 #.set RTOC,2	# preserved 
    226 #.set r3,3	# 1st argument/return value
    227 #.set r4,4	# 2nd argument/volatile register
    228 #.set r5,5	# 3rd argument/volatile register
    229 #.set r6,6	# ...
    230 #.set r7,7
    231 #.set r8,8
    232 #.set r9,9
    233 #.set r10,10
    234 #.set r11,11
    235 #.set r12,12
    236 #.set r13,13	# not used, nor any other "below" it...
    237 
    238 #	Declare function names to be global
    239 #	NOTE:	For gcc these names MUST be changed to remove
    240 #	        the first . i.e. for example change ".bn_sqr_comba4"
    241 #		to "bn_sqr_comba4". This should be automatically done
    242 #		in the build.
    243 	
    244 	.globl	.bn_sqr_comba4
    245 	.globl	.bn_sqr_comba8
    246 	.globl	.bn_mul_comba4
    247 	.globl	.bn_mul_comba8
    248 	.globl	.bn_sub_words
    249 	.globl	.bn_add_words
    250 	.globl	.bn_div_words
    251 	.globl	.bn_sqr_words
    252 	.globl	.bn_mul_words
    253 	.globl	.bn_mul_add_words
    254 	
    255 # .text section
    256 	
    257 	.machine	"any"
    258 
    259 #
    260 #	NOTE:	The following label name should be changed to
    261 #		"bn_sqr_comba4" i.e. remove the first dot
    262 #		for the gcc compiler. This should be automatically
    263 #		done in the build
    264 #
    265 
    266 .align	4
    267 .bn_sqr_comba4:
    268 #
    269 # Optimized version of bn_sqr_comba4.
    270 #
    271 # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
    272 # r3 contains r
    273 # r4 contains a
    274 #
    275 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:	
    276 # 
    277 # r5,r6 are the two BN_ULONGs being multiplied.
    278 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
    279 # r9,r10, r11 are the equivalents of c1,c2, c3.
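#
# The #sqr_add_c annotations below refer to the comba primitives of
# OpenSSL's C reference code. As a sketch (not the exact macro text),
# with (c3,c2,c1) a three-word accumulator, c1 least significant:
#
#	sqr_add_c(a,i,c1,c2,c3):    (c3,c2,c1) += a[i]*a[i]   (double-width product)
#	sqr_add_c2(a,i,j,c1,c2,c3): (c3,c2,c1) += 2*a[i]*a[j]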
    280 # Here's the assembly
    281 #
    282 #
    283 	xor		r0,r0,r0		# set r0 = 0. Used in the addze
    284 						# instructions below
    285 	
    286 						#sqr_add_c(a,0,c1,c2,c3)
    287 	$LD		r5,`0*$BNSZ`(r4)		
    288 	$UMULL		r9,r5,r5		
    289 	$UMULH		r10,r5,r5		#in first iteration. No need
    290 						#to add since c1=c2=c3=0.
    291 						# Note c3(r11) is NOT set to 0
    292 						# but will be.
    293 
    294 	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
    295 						# sqr_add_c2(a,1,0,c2,c3,c1);
    296 	$LD		r6,`1*$BNSZ`(r4)		
    297 	$UMULL		r7,r5,r6
    298 	$UMULH		r8,r5,r6
    299 					
    300 	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
    301 	adde		r8,r8,r8
    302 	addze		r9,r0			# catch carry if any.
    303 						# r9= r0(=0) and carry 
    304 	
    305 	addc		r10,r7,r10		# now add to temp result.
    306 	addze		r11,r8                  # r8 added to r11 which is 0 
    307 	addze		r9,r9
    308 	
    309 	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2; 
    310 						#sqr_add_c(a,1,c3,c1,c2)
    311 	$UMULL		r7,r6,r6
    312 	$UMULH		r8,r6,r6
    313 	addc		r11,r7,r11
    314 	adde		r9,r8,r9
    315 	addze		r10,r0
    316 						#sqr_add_c2(a,2,0,c3,c1,c2)
    317 	$LD		r6,`2*$BNSZ`(r4)
    318 	$UMULL		r7,r5,r6
    319 	$UMULH		r8,r5,r6
    320 	
    321 	addc		r7,r7,r7
    322 	adde		r8,r8,r8
    323 	addze		r10,r10
    324 	
    325 	addc		r11,r7,r11
    326 	adde		r9,r8,r9
    327 	addze		r10,r10
    328 	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3 
    329 						#sqr_add_c2(a,3,0,c1,c2,c3);
    330 	$LD		r6,`3*$BNSZ`(r4)		
    331 	$UMULL		r7,r5,r6
    332 	$UMULH		r8,r5,r6
    333 	addc		r7,r7,r7
    334 	adde		r8,r8,r8
    335 	addze		r11,r0
    336 	
    337 	addc		r9,r7,r9
    338 	adde		r10,r8,r10
    339 	addze		r11,r11
    340 						#sqr_add_c2(a,2,1,c1,c2,c3);
    341 	$LD		r5,`1*$BNSZ`(r4)
    342 	$LD		r6,`2*$BNSZ`(r4)
    343 	$UMULL		r7,r5,r6
    344 	$UMULH		r8,r5,r6
    345 	
    346 	addc		r7,r7,r7
    347 	adde		r8,r8,r8
    348 	addze		r11,r11
    349 	addc		r9,r7,r9
    350 	adde		r10,r8,r10
    351 	addze		r11,r11
    352 	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
    353 						#sqr_add_c(a,2,c2,c3,c1);
    354 	$UMULL		r7,r6,r6
    355 	$UMULH		r8,r6,r6
    356 	addc		r10,r7,r10
    357 	adde		r11,r8,r11
    358 	addze		r9,r0
    359 						#sqr_add_c2(a,3,1,c2,c3,c1);
    360 	$LD		r6,`3*$BNSZ`(r4)		
    361 	$UMULL		r7,r5,r6
    362 	$UMULH		r8,r5,r6
    363 	addc		r7,r7,r7
    364 	adde		r8,r8,r8
    365 	addze		r9,r9
    366 	
    367 	addc		r10,r7,r10
    368 	adde		r11,r8,r11
    369 	addze		r9,r9
    370 	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
    371 						#sqr_add_c2(a,3,2,c3,c1,c2);
    372 	$LD		r5,`2*$BNSZ`(r4)		
    373 	$UMULL		r7,r5,r6
    374 	$UMULH		r8,r5,r6
    375 	addc		r7,r7,r7
    376 	adde		r8,r8,r8
    377 	addze		r10,r0
    378 	
    379 	addc		r11,r7,r11
    380 	adde		r9,r8,r9
    381 	addze		r10,r10
    382 	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
    383 						#sqr_add_c(a,3,c1,c2,c3);
    384 	$UMULL		r7,r6,r6		
    385 	$UMULH		r8,r6,r6
    386 	addc		r9,r7,r9
    387 	adde		r10,r8,r10
    388 
    389 	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
    390 	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
    391 	blr
    392 	.long	0x00000000
    393 
    394 #
    395 #	NOTE:	The following label name should be changed to
    396 #		"bn_sqr_comba8" i.e. remove the first dot
    397 #		for the gcc compiler. This should be automatically
    398 #		done in the build
    399 #
    400 	
    401 .align	4
    402 .bn_sqr_comba8:
    403 #
    404 # This is an optimized version of the bn_sqr_comba8 routine.
    405 # Tightly uses the adde instruction
    406 #
    407 #
    408 # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
    409 # r3 contains r
    410 # r4 contains a
    411 #
    412 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:	
    413 # 
    414 # r5,r6 are the two BN_ULONGs being multiplied.
    415 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
    416 # r9,r10, r11 are the equivalents of c1,c2, c3.
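#
# (The sqr_add_c/sqr_add_c2 annotations below are sketched in the
# comments before .bn_sqr_comba4.)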
    417 #
# A possible optimization, loading all 8 words of a into registers,
# doesn't provide any speedup.
    420 # 
    421 
    422 	xor		r0,r0,r0		#set r0 = 0.Used in addze
    423 						#instructions below.
    424 
    425 						#sqr_add_c(a,0,c1,c2,c3);
    426 	$LD		r5,`0*$BNSZ`(r4)
    427 	$UMULL		r9,r5,r5		#1st iteration:	no carries.
    428 	$UMULH		r10,r5,r5
    429 	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
    430 						#sqr_add_c2(a,1,0,c2,c3,c1);
    431 	$LD		r6,`1*$BNSZ`(r4)
    432 	$UMULL		r7,r5,r6
    433 	$UMULH		r8,r5,r6	
    434 	
	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r0 		# (r8,r7) to the three-register
	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0
	
	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r11 		# (r8,r7) to the three-register
	addze		r9,r9			# number (r9,r11,r10).
    442 	
    443 	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
    444 				
    445 						#sqr_add_c(a,1,c3,c1,c2);
    446 	$UMULL		r7,r6,r6
    447 	$UMULH		r8,r6,r6
    448 	addc		r11,r7,r11
    449 	adde		r9,r8,r9
    450 	addze		r10,r0
    451 						#sqr_add_c2(a,2,0,c3,c1,c2);
    452 	$LD		r6,`2*$BNSZ`(r4)
    453 	$UMULL		r7,r5,r6
    454 	$UMULH		r8,r5,r6
    455 	
    456 	addc		r11,r7,r11
    457 	adde		r9,r8,r9
    458 	addze		r10,r10
    459 	
    460 	addc		r11,r7,r11
    461 	adde		r9,r8,r9
    462 	addze		r10,r10
    463 	
    464 	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
    465 						#sqr_add_c2(a,3,0,c1,c2,c3);
    466 	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
    467 	$UMULL		r7,r5,r6
    468 	$UMULH		r8,r5,r6
    469 	
    470 	addc		r9,r7,r9
    471 	adde		r10,r8,r10
    472 	addze		r11,r0
    473 	
    474 	addc		r9,r7,r9
    475 	adde		r10,r8,r10
    476 	addze		r11,r11
    477 						#sqr_add_c2(a,2,1,c1,c2,c3);
    478 	$LD		r5,`1*$BNSZ`(r4)
    479 	$LD		r6,`2*$BNSZ`(r4)
    480 	$UMULL		r7,r5,r6
    481 	$UMULH		r8,r5,r6
    482 	
    483 	addc		r9,r7,r9
    484 	adde		r10,r8,r10
    485 	addze		r11,r11
    486 	
    487 	addc		r9,r7,r9
    488 	adde		r10,r8,r10
    489 	addze		r11,r11
    490 	
    491 	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
    492 						#sqr_add_c(a,2,c2,c3,c1);
    493 	$UMULL		r7,r6,r6
    494 	$UMULH		r8,r6,r6
    495 	
    496 	addc		r10,r7,r10
    497 	adde		r11,r8,r11
    498 	addze		r9,r0
    499 						#sqr_add_c2(a,3,1,c2,c3,c1);
    500 	$LD		r6,`3*$BNSZ`(r4)
    501 	$UMULL		r7,r5,r6
    502 	$UMULH		r8,r5,r6
    503 	
    504 	addc		r10,r7,r10
    505 	adde		r11,r8,r11
    506 	addze		r9,r9
    507 	
    508 	addc		r10,r7,r10
    509 	adde		r11,r8,r11
    510 	addze		r9,r9
    511 						#sqr_add_c2(a,4,0,c2,c3,c1);
    512 	$LD		r5,`0*$BNSZ`(r4)
    513 	$LD		r6,`4*$BNSZ`(r4)
    514 	$UMULL		r7,r5,r6
    515 	$UMULH		r8,r5,r6
    516 	
    517 	addc		r10,r7,r10
    518 	adde		r11,r8,r11
    519 	addze		r9,r9
    520 	
    521 	addc		r10,r7,r10
    522 	adde		r11,r8,r11
    523 	addze		r9,r9
    524 	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
    525 						#sqr_add_c2(a,5,0,c3,c1,c2);
    526 	$LD		r6,`5*$BNSZ`(r4)
    527 	$UMULL		r7,r5,r6
    528 	$UMULH		r8,r5,r6
    529 	
    530 	addc		r11,r7,r11
    531 	adde		r9,r8,r9
    532 	addze		r10,r0
    533 	
    534 	addc		r11,r7,r11
    535 	adde		r9,r8,r9
    536 	addze		r10,r10
    537 						#sqr_add_c2(a,4,1,c3,c1,c2);
    538 	$LD		r5,`1*$BNSZ`(r4)
    539 	$LD		r6,`4*$BNSZ`(r4)
    540 	$UMULL		r7,r5,r6
    541 	$UMULH		r8,r5,r6
    542 	
    543 	addc		r11,r7,r11
    544 	adde		r9,r8,r9
    545 	addze		r10,r10
    546 	
    547 	addc		r11,r7,r11
    548 	adde		r9,r8,r9
    549 	addze		r10,r10
    550 						#sqr_add_c2(a,3,2,c3,c1,c2);
    551 	$LD		r5,`2*$BNSZ`(r4)
    552 	$LD		r6,`3*$BNSZ`(r4)
    553 	$UMULL		r7,r5,r6
    554 	$UMULH		r8,r5,r6
    555 	
    556 	addc		r11,r7,r11
    557 	adde		r9,r8,r9
    558 	addze		r10,r10
    559 	
    560 	addc		r11,r7,r11
    561 	adde		r9,r8,r9
    562 	addze		r10,r10
    563 	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
    564 						#sqr_add_c(a,3,c1,c2,c3);
    565 	$UMULL		r7,r6,r6
    566 	$UMULH		r8,r6,r6
    567 	addc		r9,r7,r9
    568 	adde		r10,r8,r10
    569 	addze		r11,r0
    570 						#sqr_add_c2(a,4,2,c1,c2,c3);
    571 	$LD		r6,`4*$BNSZ`(r4)
    572 	$UMULL		r7,r5,r6
    573 	$UMULH		r8,r5,r6
    574 	
    575 	addc		r9,r7,r9
    576 	adde		r10,r8,r10
    577 	addze		r11,r11
    578 	
    579 	addc		r9,r7,r9
    580 	adde		r10,r8,r10
    581 	addze		r11,r11
    582 						#sqr_add_c2(a,5,1,c1,c2,c3);
    583 	$LD		r5,`1*$BNSZ`(r4)
    584 	$LD		r6,`5*$BNSZ`(r4)
    585 	$UMULL		r7,r5,r6
    586 	$UMULH		r8,r5,r6
    587 	
    588 	addc		r9,r7,r9
    589 	adde		r10,r8,r10
    590 	addze		r11,r11
    591 	
    592 	addc		r9,r7,r9
    593 	adde		r10,r8,r10
    594 	addze		r11,r11
    595 						#sqr_add_c2(a,6,0,c1,c2,c3);
    596 	$LD		r5,`0*$BNSZ`(r4)
    597 	$LD		r6,`6*$BNSZ`(r4)
    598 	$UMULL		r7,r5,r6
    599 	$UMULH		r8,r5,r6
    600 	addc		r9,r7,r9
    601 	adde		r10,r8,r10
    602 	addze		r11,r11
    603 	addc		r9,r7,r9
    604 	adde		r10,r8,r10
    605 	addze		r11,r11
    606 	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
    607 						#sqr_add_c2(a,7,0,c2,c3,c1);
    608 	$LD		r6,`7*$BNSZ`(r4)
    609 	$UMULL		r7,r5,r6
    610 	$UMULH		r8,r5,r6
    611 	
    612 	addc		r10,r7,r10
    613 	adde		r11,r8,r11
    614 	addze		r9,r0
    615 	addc		r10,r7,r10
    616 	adde		r11,r8,r11
    617 	addze		r9,r9
    618 						#sqr_add_c2(a,6,1,c2,c3,c1);
    619 	$LD		r5,`1*$BNSZ`(r4)
    620 	$LD		r6,`6*$BNSZ`(r4)
    621 	$UMULL		r7,r5,r6
    622 	$UMULH		r8,r5,r6
    623 	
    624 	addc		r10,r7,r10
    625 	adde		r11,r8,r11
    626 	addze		r9,r9
    627 	addc		r10,r7,r10
    628 	adde		r11,r8,r11
    629 	addze		r9,r9
    630 						#sqr_add_c2(a,5,2,c2,c3,c1);
    631 	$LD		r5,`2*$BNSZ`(r4)
    632 	$LD		r6,`5*$BNSZ`(r4)
    633 	$UMULL		r7,r5,r6
    634 	$UMULH		r8,r5,r6
    635 	addc		r10,r7,r10
    636 	adde		r11,r8,r11
    637 	addze		r9,r9
    638 	addc		r10,r7,r10
    639 	adde		r11,r8,r11
    640 	addze		r9,r9
    641 						#sqr_add_c2(a,4,3,c2,c3,c1);
    642 	$LD		r5,`3*$BNSZ`(r4)
    643 	$LD		r6,`4*$BNSZ`(r4)
    644 	$UMULL		r7,r5,r6
    645 	$UMULH		r8,r5,r6
    646 	
    647 	addc		r10,r7,r10
    648 	adde		r11,r8,r11
    649 	addze		r9,r9
    650 	addc		r10,r7,r10
    651 	adde		r11,r8,r11
    652 	addze		r9,r9
    653 	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
    654 						#sqr_add_c(a,4,c3,c1,c2);
    655 	$UMULL		r7,r6,r6
    656 	$UMULH		r8,r6,r6
    657 	addc		r11,r7,r11
    658 	adde		r9,r8,r9
    659 	addze		r10,r0
    660 						#sqr_add_c2(a,5,3,c3,c1,c2);
    661 	$LD		r6,`5*$BNSZ`(r4)
    662 	$UMULL		r7,r5,r6
    663 	$UMULH		r8,r5,r6
    664 	addc		r11,r7,r11
    665 	adde		r9,r8,r9
    666 	addze		r10,r10
    667 	addc		r11,r7,r11
    668 	adde		r9,r8,r9
    669 	addze		r10,r10
    670 						#sqr_add_c2(a,6,2,c3,c1,c2);
    671 	$LD		r5,`2*$BNSZ`(r4)
    672 	$LD		r6,`6*$BNSZ`(r4)
    673 	$UMULL		r7,r5,r6
    674 	$UMULH		r8,r5,r6
    675 	addc		r11,r7,r11
    676 	adde		r9,r8,r9
    677 	addze		r10,r10
    678 	
    679 	addc		r11,r7,r11
    680 	adde		r9,r8,r9
    681 	addze		r10,r10
    682 						#sqr_add_c2(a,7,1,c3,c1,c2);
    683 	$LD		r5,`1*$BNSZ`(r4)
    684 	$LD		r6,`7*$BNSZ`(r4)
    685 	$UMULL		r7,r5,r6
    686 	$UMULH		r8,r5,r6
    687 	addc		r11,r7,r11
    688 	adde		r9,r8,r9
    689 	addze		r10,r10
    690 	addc		r11,r7,r11
    691 	adde		r9,r8,r9
    692 	addze		r10,r10
    693 	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
    694 						#sqr_add_c2(a,7,2,c1,c2,c3);
    695 	$LD		r5,`2*$BNSZ`(r4)
    696 	$UMULL		r7,r5,r6
    697 	$UMULH		r8,r5,r6
    698 	
    699 	addc		r9,r7,r9
    700 	adde		r10,r8,r10
    701 	addze		r11,r0
    702 	addc		r9,r7,r9
    703 	adde		r10,r8,r10
    704 	addze		r11,r11
    705 						#sqr_add_c2(a,6,3,c1,c2,c3);
    706 	$LD		r5,`3*$BNSZ`(r4)
    707 	$LD		r6,`6*$BNSZ`(r4)
    708 	$UMULL		r7,r5,r6
    709 	$UMULH		r8,r5,r6
    710 	addc		r9,r7,r9
    711 	adde		r10,r8,r10
    712 	addze		r11,r11
    713 	addc		r9,r7,r9
    714 	adde		r10,r8,r10
    715 	addze		r11,r11
    716 						#sqr_add_c2(a,5,4,c1,c2,c3);
    717 	$LD		r5,`4*$BNSZ`(r4)
    718 	$LD		r6,`5*$BNSZ`(r4)
    719 	$UMULL		r7,r5,r6
    720 	$UMULH		r8,r5,r6
    721 	addc		r9,r7,r9
    722 	adde		r10,r8,r10
    723 	addze		r11,r11
    724 	addc		r9,r7,r9
    725 	adde		r10,r8,r10
    726 	addze		r11,r11
    727 	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
    728 						#sqr_add_c(a,5,c2,c3,c1);
    729 	$UMULL		r7,r6,r6
    730 	$UMULH		r8,r6,r6
    731 	addc		r10,r7,r10
    732 	adde		r11,r8,r11
    733 	addze		r9,r0
    734 						#sqr_add_c2(a,6,4,c2,c3,c1);
    735 	$LD		r6,`6*$BNSZ`(r4)
    736 	$UMULL		r7,r5,r6
    737 	$UMULH		r8,r5,r6
    738 	addc		r10,r7,r10
    739 	adde		r11,r8,r11
    740 	addze		r9,r9
    741 	addc		r10,r7,r10
    742 	adde		r11,r8,r11
    743 	addze		r9,r9
    744 						#sqr_add_c2(a,7,3,c2,c3,c1);
    745 	$LD		r5,`3*$BNSZ`(r4)
    746 	$LD		r6,`7*$BNSZ`(r4)
    747 	$UMULL		r7,r5,r6
    748 	$UMULH		r8,r5,r6
    749 	addc		r10,r7,r10
    750 	adde		r11,r8,r11
    751 	addze		r9,r9
    752 	addc		r10,r7,r10
    753 	adde		r11,r8,r11
    754 	addze		r9,r9
    755 	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
    756 						#sqr_add_c2(a,7,4,c3,c1,c2);
    757 	$LD		r5,`4*$BNSZ`(r4)
    758 	$UMULL		r7,r5,r6
    759 	$UMULH		r8,r5,r6
    760 	addc		r11,r7,r11
    761 	adde		r9,r8,r9
    762 	addze		r10,r0
    763 	addc		r11,r7,r11
    764 	adde		r9,r8,r9
    765 	addze		r10,r10
    766 						#sqr_add_c2(a,6,5,c3,c1,c2);
    767 	$LD		r5,`5*$BNSZ`(r4)
    768 	$LD		r6,`6*$BNSZ`(r4)
    769 	$UMULL		r7,r5,r6
    770 	$UMULH		r8,r5,r6
    771 	addc		r11,r7,r11
    772 	adde		r9,r8,r9
    773 	addze		r10,r10
    774 	addc		r11,r7,r11
    775 	adde		r9,r8,r9
    776 	addze		r10,r10
    777 	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
    778 						#sqr_add_c(a,6,c1,c2,c3);
    779 	$UMULL		r7,r6,r6
    780 	$UMULH		r8,r6,r6
    781 	addc		r9,r7,r9
    782 	adde		r10,r8,r10
    783 	addze		r11,r0
    784 						#sqr_add_c2(a,7,5,c1,c2,c3)
    785 	$LD		r6,`7*$BNSZ`(r4)
    786 	$UMULL		r7,r5,r6
    787 	$UMULH		r8,r5,r6
    788 	addc		r9,r7,r9
    789 	adde		r10,r8,r10
    790 	addze		r11,r11
    791 	addc		r9,r7,r9
    792 	adde		r10,r8,r10
    793 	addze		r11,r11
    794 	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
    795 	
    796 						#sqr_add_c2(a,7,6,c2,c3,c1)
    797 	$LD		r5,`6*$BNSZ`(r4)
    798 	$UMULL		r7,r5,r6
    799 	$UMULH		r8,r5,r6
    800 	addc		r10,r7,r10
    801 	adde		r11,r8,r11
    802 	addze		r9,r0
    803 	addc		r10,r7,r10
    804 	adde		r11,r8,r11
    805 	addze		r9,r9
    806 	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
    807 						#sqr_add_c(a,7,c3,c1,c2);
    808 	$UMULL		r7,r6,r6
    809 	$UMULH		r8,r6,r6
    810 	addc		r11,r7,r11
    811 	adde		r9,r8,r9
    812 	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
    813 	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
    814 
    815 
    816 	blr
    817 
    818 	.long	0x00000000
    819 
    820 #
    821 #	NOTE:	The following label name should be changed to
    822 #		"bn_mul_comba4" i.e. remove the first dot
    823 #		for the gcc compiler. This should be automatically
    824 #		done in the build
    825 #
    826 
    827 .align	4
    828 .bn_mul_comba4:
    829 #
    830 # This is an optimized version of the bn_mul_comba4 routine.
    831 #
    832 # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    833 # r3 contains r
    834 # r4 contains a
    835 # r5 contains b
    836 # r6, r7 are the 2 BN_ULONGs being multiplied.
    837 # r8, r9 are the results of the 32x32 giving 64 multiply.
    838 # r10, r11, r12 are the equivalents of c1, c2, and c3.
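#
# The #mul_add_c annotations below refer to the comba primitive of
# OpenSSL's C reference code. As a sketch (not the exact macro text),
# with (c3,c2,c1) a three-word accumulator, c1 least significant:
#
#	mul_add_c(a,b,c1,c2,c3): (c3,c2,c1) += a*b   (double-width product)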
    839 #
    840 	xor	r0,r0,r0		#r0=0. Used in addze below.
    841 					#mul_add_c(a[0],b[0],c1,c2,c3);
    842 	$LD	r6,`0*$BNSZ`(r4)		
    843 	$LD	r7,`0*$BNSZ`(r5)		
    844 	$UMULL	r10,r6,r7		
    845 	$UMULH	r11,r6,r7		
    846 	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
    847 					#mul_add_c(a[0],b[1],c2,c3,c1);
    848 	$LD	r7,`1*$BNSZ`(r5)		
    849 	$UMULL	r8,r6,r7
    850 	$UMULH	r9,r6,r7
    851 	addc	r11,r8,r11
    852 	adde	r12,r9,r0
    853 	addze	r10,r0
    854 					#mul_add_c(a[1],b[0],c2,c3,c1);
    855 	$LD	r6, `1*$BNSZ`(r4)		
    856 	$LD	r7, `0*$BNSZ`(r5)		
    857 	$UMULL	r8,r6,r7
    858 	$UMULH	r9,r6,r7
    859 	addc	r11,r8,r11
    860 	adde	r12,r9,r12
    861 	addze	r10,r10
    862 	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
    863 					#mul_add_c(a[2],b[0],c3,c1,c2);
    864 	$LD	r6,`2*$BNSZ`(r4)		
    865 	$UMULL	r8,r6,r7
    866 	$UMULH	r9,r6,r7
    867 	addc	r12,r8,r12
    868 	adde	r10,r9,r10
    869 	addze	r11,r0
    870 					#mul_add_c(a[1],b[1],c3,c1,c2);
    871 	$LD	r6,`1*$BNSZ`(r4)		
    872 	$LD	r7,`1*$BNSZ`(r5)		
    873 	$UMULL	r8,r6,r7
    874 	$UMULH	r9,r6,r7
    875 	addc	r12,r8,r12
    876 	adde	r10,r9,r10
    877 	addze	r11,r11
    878 					#mul_add_c(a[0],b[2],c3,c1,c2);
    879 	$LD	r6,`0*$BNSZ`(r4)		
    880 	$LD	r7,`2*$BNSZ`(r5)		
    881 	$UMULL	r8,r6,r7
    882 	$UMULH	r9,r6,r7
    883 	addc	r12,r8,r12
    884 	adde	r10,r9,r10
    885 	addze	r11,r11
    886 	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
    887 					#mul_add_c(a[0],b[3],c1,c2,c3);
    888 	$LD	r7,`3*$BNSZ`(r5)		
    889 	$UMULL	r8,r6,r7
    890 	$UMULH	r9,r6,r7
    891 	addc	r10,r8,r10
    892 	adde	r11,r9,r11
    893 	addze	r12,r0
    894 					#mul_add_c(a[1],b[2],c1,c2,c3);
    895 	$LD	r6,`1*$BNSZ`(r4)
    896 	$LD	r7,`2*$BNSZ`(r5)
    897 	$UMULL	r8,r6,r7
    898 	$UMULH	r9,r6,r7
    899 	addc	r10,r8,r10
    900 	adde	r11,r9,r11
    901 	addze	r12,r12
    902 					#mul_add_c(a[2],b[1],c1,c2,c3);
    903 	$LD	r6,`2*$BNSZ`(r4)
    904 	$LD	r7,`1*$BNSZ`(r5)
    905 	$UMULL	r8,r6,r7
    906 	$UMULH	r9,r6,r7
    907 	addc	r10,r8,r10
    908 	adde	r11,r9,r11
    909 	addze	r12,r12
    910 					#mul_add_c(a[3],b[0],c1,c2,c3);
    911 	$LD	r6,`3*$BNSZ`(r4)
    912 	$LD	r7,`0*$BNSZ`(r5)
    913 	$UMULL	r8,r6,r7
    914 	$UMULH	r9,r6,r7
    915 	addc	r10,r8,r10
    916 	adde	r11,r9,r11
    917 	addze	r12,r12
    918 	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
    919 					#mul_add_c(a[3],b[1],c2,c3,c1);
    920 	$LD	r7,`1*$BNSZ`(r5)		
    921 	$UMULL	r8,r6,r7
    922 	$UMULH	r9,r6,r7
    923 	addc	r11,r8,r11
    924 	adde	r12,r9,r12
    925 	addze	r10,r0
    926 					#mul_add_c(a[2],b[2],c2,c3,c1);
    927 	$LD	r6,`2*$BNSZ`(r4)
    928 	$LD	r7,`2*$BNSZ`(r5)
    929 	$UMULL	r8,r6,r7
    930 	$UMULH	r9,r6,r7
    931 	addc	r11,r8,r11
    932 	adde	r12,r9,r12
    933 	addze	r10,r10
    934 					#mul_add_c(a[1],b[3],c2,c3,c1);
    935 	$LD	r6,`1*$BNSZ`(r4)
    936 	$LD	r7,`3*$BNSZ`(r5)
    937 	$UMULL	r8,r6,r7
    938 	$UMULH	r9,r6,r7
    939 	addc	r11,r8,r11
    940 	adde	r12,r9,r12
    941 	addze	r10,r10
    942 	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
    943 					#mul_add_c(a[2],b[3],c3,c1,c2);
    944 	$LD	r6,`2*$BNSZ`(r4)		
    945 	$UMULL	r8,r6,r7
    946 	$UMULH	r9,r6,r7
    947 	addc	r12,r8,r12
    948 	adde	r10,r9,r10
    949 	addze	r11,r0
    950 					#mul_add_c(a[3],b[2],c3,c1,c2);
    951 	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
    953 	$UMULL	r8,r6,r7
    954 	$UMULH	r9,r6,r7
    955 	addc	r12,r8,r12
    956 	adde	r10,r9,r10
    957 	addze	r11,r11
    958 	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
    959 					#mul_add_c(a[3],b[3],c1,c2,c3);
    960 	$LD	r7,`3*$BNSZ`(r5)		
    961 	$UMULL	r8,r6,r7
    962 	$UMULH	r9,r6,r7
    963 	addc	r10,r8,r10
    964 	adde	r11,r9,r11
    965 
    966 	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
    967 	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
    968 	blr
    969 	.long	0x00000000
    970 
    971 #
    972 #	NOTE:	The following label name should be changed to
    973 #		"bn_mul_comba8" i.e. remove the first dot
    974 #		for the gcc compiler. This should be automatically
    975 #		done in the build
    976 #
    977 	
    978 .align	4
    979 .bn_mul_comba8:
    980 #
    981 # Optimized version of the bn_mul_comba8 routine.
    982 #
    983 # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    984 # r3 contains r
    985 # r4 contains a
    986 # r5 contains b
    987 # r6, r7 are the 2 BN_ULONGs being multiplied.
    988 # r8, r9 are the results of the 32x32 giving 64 multiply.
    989 # r10, r11, r12 are the equivalents of c1, c2, and c3.
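#
# (The mul_add_c annotation is sketched in the comments before
# .bn_mul_comba4.)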
    990 #
    991 	xor	r0,r0,r0		#r0=0. Used in addze below.
    992 	
    993 					#mul_add_c(a[0],b[0],c1,c2,c3);
    994 	$LD	r6,`0*$BNSZ`(r4)	#a[0]
    995 	$LD	r7,`0*$BNSZ`(r5)	#b[0]
    996 	$UMULL	r10,r6,r7
    997 	$UMULH	r11,r6,r7
    998 	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
    999 					#mul_add_c(a[0],b[1],c2,c3,c1);
   1000 	$LD	r7,`1*$BNSZ`(r5)
   1001 	$UMULL	r8,r6,r7
   1002 	$UMULH	r9,r6,r7
   1003 	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
   1005 	addze	r10,r0
   1006 					#mul_add_c(a[1],b[0],c2,c3,c1);
   1007 	$LD	r6,`1*$BNSZ`(r4)
   1008 	$LD	r7,`0*$BNSZ`(r5)
   1009 	$UMULL	r8,r6,r7
   1010 	$UMULH	r9,r6,r7
   1011 	addc	r11,r11,r8
   1012 	adde	r12,r12,r9
   1013 	addze	r10,r10
   1014 	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
   1015 					#mul_add_c(a[2],b[0],c3,c1,c2);
   1016 	$LD	r6,`2*$BNSZ`(r4)
   1017 	$UMULL	r8,r6,r7
   1018 	$UMULH	r9,r6,r7
   1019 	addc	r12,r12,r8
   1020 	adde	r10,r10,r9
   1021 	addze	r11,r0
   1022 					#mul_add_c(a[1],b[1],c3,c1,c2);
   1023 	$LD	r6,`1*$BNSZ`(r4)
   1024 	$LD	r7,`1*$BNSZ`(r5)
   1025 	$UMULL	r8,r6,r7
   1026 	$UMULH	r9,r6,r7
   1027 	addc	r12,r12,r8
   1028 	adde	r10,r10,r9
   1029 	addze	r11,r11
   1030 					#mul_add_c(a[0],b[2],c3,c1,c2);
   1031 	$LD	r6,`0*$BNSZ`(r4)
   1032 	$LD	r7,`2*$BNSZ`(r5)
   1033 	$UMULL	r8,r6,r7
   1034 	$UMULH	r9,r6,r7
   1035 	addc	r12,r12,r8
   1036 	adde	r10,r10,r9
   1037 	addze	r11,r11
   1038 	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
   1039 					#mul_add_c(a[0],b[3],c1,c2,c3);
   1040 	$LD	r7,`3*$BNSZ`(r5)
   1041 	$UMULL	r8,r6,r7
   1042 	$UMULH	r9,r6,r7
   1043 	addc	r10,r10,r8
   1044 	adde	r11,r11,r9
   1045 	addze	r12,r0
   1046 					#mul_add_c(a[1],b[2],c1,c2,c3);
   1047 	$LD	r6,`1*$BNSZ`(r4)
   1048 	$LD	r7,`2*$BNSZ`(r5)
   1049 	$UMULL	r8,r6,r7
   1050 	$UMULH	r9,r6,r7
   1051 	addc	r10,r10,r8
   1052 	adde	r11,r11,r9
   1053 	addze	r12,r12
   1054 		
   1055 					#mul_add_c(a[2],b[1],c1,c2,c3);
   1056 	$LD	r6,`2*$BNSZ`(r4)
   1057 	$LD	r7,`1*$BNSZ`(r5)
   1058 	$UMULL	r8,r6,r7
   1059 	$UMULH	r9,r6,r7
   1060 	addc	r10,r10,r8
   1061 	adde	r11,r11,r9
   1062 	addze	r12,r12
   1063 					#mul_add_c(a[3],b[0],c1,c2,c3);
   1064 	$LD	r6,`3*$BNSZ`(r4)
   1065 	$LD	r7,`0*$BNSZ`(r5)
   1066 	$UMULL	r8,r6,r7
   1067 	$UMULH	r9,r6,r7
   1068 	addc	r10,r10,r8
   1069 	adde	r11,r11,r9
   1070 	addze	r12,r12
   1071 	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
   1072 					#mul_add_c(a[4],b[0],c2,c3,c1);
   1073 	$LD	r6,`4*$BNSZ`(r4)
   1074 	$UMULL	r8,r6,r7
   1075 	$UMULH	r9,r6,r7
   1076 	addc	r11,r11,r8
   1077 	adde	r12,r12,r9
   1078 	addze	r10,r0
   1079 					#mul_add_c(a[3],b[1],c2,c3,c1);
   1080 	$LD	r6,`3*$BNSZ`(r4)
   1081 	$LD	r7,`1*$BNSZ`(r5)
   1082 	$UMULL	r8,r6,r7
   1083 	$UMULH	r9,r6,r7
   1084 	addc	r11,r11,r8
   1085 	adde	r12,r12,r9
   1086 	addze	r10,r10
   1087 					#mul_add_c(a[2],b[2],c2,c3,c1);
   1088 	$LD	r6,`2*$BNSZ`(r4)
   1089 	$LD	r7,`2*$BNSZ`(r5)
   1090 	$UMULL	r8,r6,r7
   1091 	$UMULH	r9,r6,r7
   1092 	addc	r11,r11,r8
   1093 	adde	r12,r12,r9
   1094 	addze	r10,r10
   1095 					#mul_add_c(a[1],b[3],c2,c3,c1);
   1096 	$LD	r6,`1*$BNSZ`(r4)
   1097 	$LD	r7,`3*$BNSZ`(r5)
   1098 	$UMULL	r8,r6,r7
   1099 	$UMULH	r9,r6,r7
   1100 	addc	r11,r11,r8
   1101 	adde	r12,r12,r9
   1102 	addze	r10,r10
   1103 					#mul_add_c(a[0],b[4],c2,c3,c1);
   1104 	$LD	r6,`0*$BNSZ`(r4)
   1105 	$LD	r7,`4*$BNSZ`(r5)
   1106 	$UMULL	r8,r6,r7
   1107 	$UMULH	r9,r6,r7
   1108 	addc	r11,r11,r8
   1109 	adde	r12,r12,r9
   1110 	addze	r10,r10
   1111 	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
   1112 					#mul_add_c(a[0],b[5],c3,c1,c2);
   1113 	$LD	r7,`5*$BNSZ`(r5)
   1114 	$UMULL	r8,r6,r7
   1115 	$UMULH	r9,r6,r7
   1116 	addc	r12,r12,r8
   1117 	adde	r10,r10,r9
   1118 	addze	r11,r0
   1119 					#mul_add_c(a[1],b[4],c3,c1,c2);
   1120 	$LD	r6,`1*$BNSZ`(r4)		
   1121 	$LD	r7,`4*$BNSZ`(r5)
   1122 	$UMULL	r8,r6,r7
   1123 	$UMULH	r9,r6,r7
   1124 	addc	r12,r12,r8
   1125 	adde	r10,r10,r9
   1126 	addze	r11,r11
   1127 					#mul_add_c(a[2],b[3],c3,c1,c2);
   1128 	$LD	r6,`2*$BNSZ`(r4)		
   1129 	$LD	r7,`3*$BNSZ`(r5)
   1130 	$UMULL	r8,r6,r7
   1131 	$UMULH	r9,r6,r7
   1132 	addc	r12,r12,r8
   1133 	adde	r10,r10,r9
   1134 	addze	r11,r11
   1135 					#mul_add_c(a[3],b[2],c3,c1,c2);
   1136 	$LD	r6,`3*$BNSZ`(r4)		
   1137 	$LD	r7,`2*$BNSZ`(r5)
   1138 	$UMULL	r8,r6,r7
   1139 	$UMULH	r9,r6,r7
   1140 	addc	r12,r12,r8
   1141 	adde	r10,r10,r9
   1142 	addze	r11,r11
   1143 					#mul_add_c(a[4],b[1],c3,c1,c2);
   1144 	$LD	r6,`4*$BNSZ`(r4)		
   1145 	$LD	r7,`1*$BNSZ`(r5)
   1146 	$UMULL	r8,r6,r7
   1147 	$UMULH	r9,r6,r7
   1148 	addc	r12,r12,r8
   1149 	adde	r10,r10,r9
   1150 	addze	r11,r11
   1151 					#mul_add_c(a[5],b[0],c3,c1,c2);
   1152 	$LD	r6,`5*$BNSZ`(r4)		
   1153 	$LD	r7,`0*$BNSZ`(r5)
   1154 	$UMULL	r8,r6,r7
   1155 	$UMULH	r9,r6,r7
   1156 	addc	r12,r12,r8
   1157 	adde	r10,r10,r9
   1158 	addze	r11,r11
   1159 	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
   1160 					#mul_add_c(a[6],b[0],c1,c2,c3);
   1161 	$LD	r6,`6*$BNSZ`(r4)
   1162 	$UMULL	r8,r6,r7
   1163 	$UMULH	r9,r6,r7
   1164 	addc	r10,r10,r8
   1165 	adde	r11,r11,r9
   1166 	addze	r12,r0
   1167 					#mul_add_c(a[5],b[1],c1,c2,c3);
   1168 	$LD	r6,`5*$BNSZ`(r4)
   1169 	$LD	r7,`1*$BNSZ`(r5)
   1170 	$UMULL	r8,r6,r7
   1171 	$UMULH	r9,r6,r7
   1172 	addc	r10,r10,r8
   1173 	adde	r11,r11,r9
   1174 	addze	r12,r12
   1175 					#mul_add_c(a[4],b[2],c1,c2,c3);
   1176 	$LD	r6,`4*$BNSZ`(r4)
   1177 	$LD	r7,`2*$BNSZ`(r5)
   1178 	$UMULL	r8,r6,r7
   1179 	$UMULH	r9,r6,r7
   1180 	addc	r10,r10,r8
   1181 	adde	r11,r11,r9
   1182 	addze	r12,r12
   1183 					#mul_add_c(a[3],b[3],c1,c2,c3);
   1184 	$LD	r6,`3*$BNSZ`(r4)
   1185 	$LD	r7,`3*$BNSZ`(r5)
   1186 	$UMULL	r8,r6,r7
   1187 	$UMULH	r9,r6,r7
   1188 	addc	r10,r10,r8
   1189 	adde	r11,r11,r9
   1190 	addze	r12,r12
   1191 					#mul_add_c(a[2],b[4],c1,c2,c3);
   1192 	$LD	r6,`2*$BNSZ`(r4)
   1193 	$LD	r7,`4*$BNSZ`(r5)
   1194 	$UMULL	r8,r6,r7
   1195 	$UMULH	r9,r6,r7
   1196 	addc	r10,r10,r8
   1197 	adde	r11,r11,r9
   1198 	addze	r12,r12
   1199 					#mul_add_c(a[1],b[5],c1,c2,c3);
   1200 	$LD	r6,`1*$BNSZ`(r4)
   1201 	$LD	r7,`5*$BNSZ`(r5)
   1202 	$UMULL	r8,r6,r7
   1203 	$UMULH	r9,r6,r7
   1204 	addc	r10,r10,r8
   1205 	adde	r11,r11,r9
   1206 	addze	r12,r12
   1207 					#mul_add_c(a[0],b[6],c1,c2,c3);
   1208 	$LD	r6,`0*$BNSZ`(r4)
   1209 	$LD	r7,`6*$BNSZ`(r5)
   1210 	$UMULL	r8,r6,r7
   1211 	$UMULH	r9,r6,r7
   1212 	addc	r10,r10,r8
   1213 	adde	r11,r11,r9
   1214 	addze	r12,r12
   1215 	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
   1216 					#mul_add_c(a[0],b[7],c2,c3,c1);
   1217 	$LD	r7,`7*$BNSZ`(r5)
   1218 	$UMULL	r8,r6,r7
   1219 	$UMULH	r9,r6,r7
   1220 	addc	r11,r11,r8
   1221 	adde	r12,r12,r9
   1222 	addze	r10,r0
   1223 					#mul_add_c(a[1],b[6],c2,c3,c1);
   1224 	$LD	r6,`1*$BNSZ`(r4)
   1225 	$LD	r7,`6*$BNSZ`(r5)
   1226 	$UMULL	r8,r6,r7
   1227 	$UMULH	r9,r6,r7
   1228 	addc	r11,r11,r8
   1229 	adde	r12,r12,r9
   1230 	addze	r10,r10
   1231 					#mul_add_c(a[2],b[5],c2,c3,c1);
   1232 	$LD	r6,`2*$BNSZ`(r4)
   1233 	$LD	r7,`5*$BNSZ`(r5)
   1234 	$UMULL	r8,r6,r7
   1235 	$UMULH	r9,r6,r7
   1236 	addc	r11,r11,r8
   1237 	adde	r12,r12,r9
   1238 	addze	r10,r10
   1239 					#mul_add_c(a[3],b[4],c2,c3,c1);
   1240 	$LD	r6,`3*$BNSZ`(r4)
   1241 	$LD	r7,`4*$BNSZ`(r5)
   1242 	$UMULL	r8,r6,r7
   1243 	$UMULH	r9,r6,r7
   1244 	addc	r11,r11,r8
   1245 	adde	r12,r12,r9
   1246 	addze	r10,r10
   1247 					#mul_add_c(a[4],b[3],c2,c3,c1);
   1248 	$LD	r6,`4*$BNSZ`(r4)
   1249 	$LD	r7,`3*$BNSZ`(r5)
   1250 	$UMULL	r8,r6,r7
   1251 	$UMULH	r9,r6,r7
   1252 	addc	r11,r11,r8
   1253 	adde	r12,r12,r9
   1254 	addze	r10,r10
   1255 					#mul_add_c(a[5],b[2],c2,c3,c1);
   1256 	$LD	r6,`5*$BNSZ`(r4)
   1257 	$LD	r7,`2*$BNSZ`(r5)
   1258 	$UMULL	r8,r6,r7
   1259 	$UMULH	r9,r6,r7
   1260 	addc	r11,r11,r8
   1261 	adde	r12,r12,r9
   1262 	addze	r10,r10
   1263 					#mul_add_c(a[6],b[1],c2,c3,c1);
   1264 	$LD	r6,`6*$BNSZ`(r4)
   1265 	$LD	r7,`1*$BNSZ`(r5)
   1266 	$UMULL	r8,r6,r7
   1267 	$UMULH	r9,r6,r7
   1268 	addc	r11,r11,r8
   1269 	adde	r12,r12,r9
   1270 	addze	r10,r10
   1271 					#mul_add_c(a[7],b[0],c2,c3,c1);
   1272 	$LD	r6,`7*$BNSZ`(r4)
   1273 	$LD	r7,`0*$BNSZ`(r5)
   1274 	$UMULL	r8,r6,r7
   1275 	$UMULH	r9,r6,r7
   1276 	addc	r11,r11,r8
   1277 	adde	r12,r12,r9
   1278 	addze	r10,r10
   1279 	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
   1280 					#mul_add_c(a[7],b[1],c3,c1,c2);
   1281 	$LD	r7,`1*$BNSZ`(r5)
   1282 	$UMULL	r8,r6,r7
   1283 	$UMULH	r9,r6,r7
   1284 	addc	r12,r12,r8
   1285 	adde	r10,r10,r9
   1286 	addze	r11,r0
   1287 					#mul_add_c(a[6],b[2],c3,c1,c2);
   1288 	$LD	r6,`6*$BNSZ`(r4)
   1289 	$LD	r7,`2*$BNSZ`(r5)
   1290 	$UMULL	r8,r6,r7
   1291 	$UMULH	r9,r6,r7
   1292 	addc	r12,r12,r8
   1293 	adde	r10,r10,r9
   1294 	addze	r11,r11
   1295 					#mul_add_c(a[5],b[3],c3,c1,c2);
   1296 	$LD	r6,`5*$BNSZ`(r4)
   1297 	$LD	r7,`3*$BNSZ`(r5)
   1298 	$UMULL	r8,r6,r7
   1299 	$UMULH	r9,r6,r7
   1300 	addc	r12,r12,r8
   1301 	adde	r10,r10,r9
   1302 	addze	r11,r11
   1303 					#mul_add_c(a[4],b[4],c3,c1,c2);
   1304 	$LD	r6,`4*$BNSZ`(r4)
   1305 	$LD	r7,`4*$BNSZ`(r5)
   1306 	$UMULL	r8,r6,r7
   1307 	$UMULH	r9,r6,r7
   1308 	addc	r12,r12,r8
   1309 	adde	r10,r10,r9
   1310 	addze	r11,r11
   1311 					#mul_add_c(a[3],b[5],c3,c1,c2);
   1312 	$LD	r6,`3*$BNSZ`(r4)
   1313 	$LD	r7,`5*$BNSZ`(r5)
   1314 	$UMULL	r8,r6,r7
   1315 	$UMULH	r9,r6,r7
   1316 	addc	r12,r12,r8
   1317 	adde	r10,r10,r9
   1318 	addze	r11,r11
   1319 					#mul_add_c(a[2],b[6],c3,c1,c2);
   1320 	$LD	r6,`2*$BNSZ`(r4)
   1321 	$LD	r7,`6*$BNSZ`(r5)
   1322 	$UMULL	r8,r6,r7
   1323 	$UMULH	r9,r6,r7
   1324 	addc	r12,r12,r8
   1325 	adde	r10,r10,r9
   1326 	addze	r11,r11
   1327 					#mul_add_c(a[1],b[7],c3,c1,c2);
   1328 	$LD	r6,`1*$BNSZ`(r4)
   1329 	$LD	r7,`7*$BNSZ`(r5)
   1330 	$UMULL	r8,r6,r7
   1331 	$UMULH	r9,r6,r7
   1332 	addc	r12,r12,r8
   1333 	adde	r10,r10,r9
   1334 	addze	r11,r11
   1335 	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
   1336 					#mul_add_c(a[2],b[7],c1,c2,c3);
   1337 	$LD	r6,`2*$BNSZ`(r4)
   1338 	$UMULL	r8,r6,r7
   1339 	$UMULH	r9,r6,r7
   1340 	addc	r10,r10,r8
   1341 	adde	r11,r11,r9
   1342 	addze	r12,r0
   1343 					#mul_add_c(a[3],b[6],c1,c2,c3);
   1344 	$LD	r6,`3*$BNSZ`(r4)
   1345 	$LD	r7,`6*$BNSZ`(r5)
   1346 	$UMULL	r8,r6,r7
   1347 	$UMULH	r9,r6,r7
   1348 	addc	r10,r10,r8
   1349 	adde	r11,r11,r9
   1350 	addze	r12,r12
   1351 					#mul_add_c(a[4],b[5],c1,c2,c3);
   1352 	$LD	r6,`4*$BNSZ`(r4)
   1353 	$LD	r7,`5*$BNSZ`(r5)
   1354 	$UMULL	r8,r6,r7
   1355 	$UMULH	r9,r6,r7
   1356 	addc	r10,r10,r8
   1357 	adde	r11,r11,r9
   1358 	addze	r12,r12
   1359 					#mul_add_c(a[5],b[4],c1,c2,c3);
   1360 	$LD	r6,`5*$BNSZ`(r4)
   1361 	$LD	r7,`4*$BNSZ`(r5)
   1362 	$UMULL	r8,r6,r7
   1363 	$UMULH	r9,r6,r7
   1364 	addc	r10,r10,r8
   1365 	adde	r11,r11,r9
   1366 	addze	r12,r12
   1367 					#mul_add_c(a[6],b[3],c1,c2,c3);
   1368 	$LD	r6,`6*$BNSZ`(r4)
   1369 	$LD	r7,`3*$BNSZ`(r5)
   1370 	$UMULL	r8,r6,r7
   1371 	$UMULH	r9,r6,r7
   1372 	addc	r10,r10,r8
   1373 	adde	r11,r11,r9
   1374 	addze	r12,r12
   1375 					#mul_add_c(a[7],b[2],c1,c2,c3);
   1376 	$LD	r6,`7*$BNSZ`(r4)
   1377 	$LD	r7,`2*$BNSZ`(r5)
   1378 	$UMULL	r8,r6,r7
   1379 	$UMULH	r9,r6,r7
   1380 	addc	r10,r10,r8
   1381 	adde	r11,r11,r9
   1382 	addze	r12,r12
   1383 	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
   1384 					#mul_add_c(a[7],b[3],c2,c3,c1);
   1385 	$LD	r7,`3*$BNSZ`(r5)
   1386 	$UMULL	r8,r6,r7
   1387 	$UMULH	r9,r6,r7
   1388 	addc	r11,r11,r8
   1389 	adde	r12,r12,r9
   1390 	addze	r10,r0
   1391 					#mul_add_c(a[6],b[4],c2,c3,c1);
   1392 	$LD	r6,`6*$BNSZ`(r4)
   1393 	$LD	r7,`4*$BNSZ`(r5)
   1394 	$UMULL	r8,r6,r7
   1395 	$UMULH	r9,r6,r7
   1396 	addc	r11,r11,r8
   1397 	adde	r12,r12,r9
   1398 	addze	r10,r10
   1399 					#mul_add_c(a[5],b[5],c2,c3,c1);
   1400 	$LD	r6,`5*$BNSZ`(r4)
   1401 	$LD	r7,`5*$BNSZ`(r5)
   1402 	$UMULL	r8,r6,r7
   1403 	$UMULH	r9,r6,r7
   1404 	addc	r11,r11,r8
   1405 	adde	r12,r12,r9
   1406 	addze	r10,r10
   1407 					#mul_add_c(a[4],b[6],c2,c3,c1);
   1408 	$LD	r6,`4*$BNSZ`(r4)
   1409 	$LD	r7,`6*$BNSZ`(r5)
   1410 	$UMULL	r8,r6,r7
   1411 	$UMULH	r9,r6,r7
   1412 	addc	r11,r11,r8
   1413 	adde	r12,r12,r9
   1414 	addze	r10,r10
   1415 					#mul_add_c(a[3],b[7],c2,c3,c1);
   1416 	$LD	r6,`3*$BNSZ`(r4)
   1417 	$LD	r7,`7*$BNSZ`(r5)
   1418 	$UMULL	r8,r6,r7
   1419 	$UMULH	r9,r6,r7
   1420 	addc	r11,r11,r8
   1421 	adde	r12,r12,r9
   1422 	addze	r10,r10
   1423 	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
   1424 					#mul_add_c(a[4],b[7],c3,c1,c2);
   1425 	$LD	r6,`4*$BNSZ`(r4)
   1426 	$UMULL	r8,r6,r7
   1427 	$UMULH	r9,r6,r7
   1428 	addc	r12,r12,r8
   1429 	adde	r10,r10,r9
   1430 	addze	r11,r0
   1431 					#mul_add_c(a[5],b[6],c3,c1,c2);
   1432 	$LD	r6,`5*$BNSZ`(r4)
   1433 	$LD	r7,`6*$BNSZ`(r5)
   1434 	$UMULL	r8,r6,r7
   1435 	$UMULH	r9,r6,r7
   1436 	addc	r12,r12,r8
   1437 	adde	r10,r10,r9
   1438 	addze	r11,r11
   1439 					#mul_add_c(a[6],b[5],c3,c1,c2);
   1440 	$LD	r6,`6*$BNSZ`(r4)
   1441 	$LD	r7,`5*$BNSZ`(r5)
   1442 	$UMULL	r8,r6,r7
   1443 	$UMULH	r9,r6,r7
   1444 	addc	r12,r12,r8
   1445 	adde	r10,r10,r9
   1446 	addze	r11,r11
   1447 					#mul_add_c(a[7],b[4],c3,c1,c2);
   1448 	$LD	r6,`7*$BNSZ`(r4)
   1449 	$LD	r7,`4*$BNSZ`(r5)
   1450 	$UMULL	r8,r6,r7
   1451 	$UMULH	r9,r6,r7
   1452 	addc	r12,r12,r8
   1453 	adde	r10,r10,r9
   1454 	addze	r11,r11
   1455 	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
   1456 					#mul_add_c(a[7],b[5],c1,c2,c3);
   1457 	$LD	r7,`5*$BNSZ`(r5)
   1458 	$UMULL	r8,r6,r7
   1459 	$UMULH	r9,r6,r7
   1460 	addc	r10,r10,r8
   1461 	adde	r11,r11,r9
   1462 	addze	r12,r0
   1463 					#mul_add_c(a[6],b[6],c1,c2,c3);
   1464 	$LD	r6,`6*$BNSZ`(r4)
   1465 	$LD	r7,`6*$BNSZ`(r5)
   1466 	$UMULL	r8,r6,r7
   1467 	$UMULH	r9,r6,r7
   1468 	addc	r10,r10,r8
   1469 	adde	r11,r11,r9
   1470 	addze	r12,r12
   1471 					#mul_add_c(a[5],b[7],c1,c2,c3);
   1472 	$LD	r6,`5*$BNSZ`(r4)
   1473 	$LD	r7,`7*$BNSZ`(r5)
   1474 	$UMULL	r8,r6,r7
   1475 	$UMULH	r9,r6,r7
   1476 	addc	r10,r10,r8
   1477 	adde	r11,r11,r9
   1478 	addze	r12,r12
   1479 	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
   1480 					#mul_add_c(a[6],b[7],c2,c3,c1);
   1481 	$LD	r6,`6*$BNSZ`(r4)
   1482 	$UMULL	r8,r6,r7
   1483 	$UMULH	r9,r6,r7
   1484 	addc	r11,r11,r8
   1485 	adde	r12,r12,r9
   1486 	addze	r10,r0
   1487 					#mul_add_c(a[7],b[6],c2,c3,c1);
   1488 	$LD	r6,`7*$BNSZ`(r4)
   1489 	$LD	r7,`6*$BNSZ`(r5)
   1490 	$UMULL	r8,r6,r7
   1491 	$UMULH	r9,r6,r7
   1492 	addc	r11,r11,r8
   1493 	adde	r12,r12,r9
   1494 	addze	r10,r10
   1495 	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
   1496 					#mul_add_c(a[7],b[7],c3,c1,c2);
   1497 	$LD	r7,`7*$BNSZ`(r5)
   1498 	$UMULL	r8,r6,r7
   1499 	$UMULH	r9,r6,r7
   1500 	addc	r12,r12,r8
   1501 	adde	r10,r10,r9
   1502 	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
   1503 	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
   1504 	blr
   1505 	.long	0x00000000
   1506 
   1507 #
   1508 #	NOTE:	The following label name should be changed to
   1509 #		"bn_sub_words" i.e. remove the first dot
   1510 #		for the gcc compiler. This should be automatically
   1511 #		done in the build
   1512 #
   1513 #
   1514 .align	4
   1515 .bn_sub_words:
   1516 #
   1517 #	Handcoded version of bn_sub_words
   1518 #
   1519 #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
   1520 #
   1521 #	r3 = r
   1522 #	r4 = a
   1523 #	r5 = b
   1524 #	r6 = n
   1525 #
   1526 #       Note:	No loop unrolling done since this is not a performance
   1527 #               critical loop.
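#
#	For reference, a C sketch of the loop below (illustration only;
#	names are ad hoc). Note that the PPC carry bit is the inverse of
#	the borrow, which is why subfc./subfe/subfze are used:
#
#	    BN_ULONG t, c = 0;			/* c is the borrow */
#	    while (n--) {
#	        t = *a++;
#	        *r++ = t - *b - c;
#	        c = (t < *b) || (c && t == *b);
#	        b++;
#	    }
#	    return c;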
   1528 
   1529 	xor	r0,r0,r0	#set r0 = 0
   1530 #
   1531 #	check for r6 = 0 AND set carry bit.
   1532 #
   1533 	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
   1534 				# if r6 > 0 then result !=0
   1535 				# In either case carry bit is set.
   1536 	beq	Lppcasm_sub_adios
   1537 	addi	r4,r4,-$BNSZ
   1538 	addi	r3,r3,-$BNSZ
   1539 	addi	r5,r5,-$BNSZ
   1540 	mtctr	r6
   1541 Lppcasm_sub_mainloop:	
   1542 	$LDU	r7,$BNSZ(r4)
   1543 	$LDU	r8,$BNSZ(r5)
   1544 	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
   1545 				# if carry = 1 this is r7-r8. Else it
   1546 				# is r7-r8 -1 as we need.
   1547 	$STU	r6,$BNSZ(r3)
   1548 	bdnz-	Lppcasm_sub_mainloop
   1549 Lppcasm_sub_adios:	
   1550 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
   1551 	andi.	r3,r3,1         # keep only last bit.
   1552 	blr
   1553 	.long	0x00000000
   1554 
   1555 
   1556 #
   1557 #	NOTE:	The following label name should be changed to
   1558 #		"bn_add_words" i.e. remove the first dot
   1559 #		for the gcc compiler. This should be automatically
   1560 #		done in the build
   1561 #
   1562 
   1563 .align	4
   1564 .bn_add_words:
   1565 #
   1566 #	Handcoded version of bn_add_words
   1567 #
   1568 #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
   1569 #
   1570 #	r3 = r
   1571 #	r4 = a
   1572 #	r5 = b
   1573 #	r6 = n
   1574 #
   1575 #       Note:	No loop unrolling done since this is not a performance
   1576 #               critical loop.
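#
#	For reference, a C sketch of the loop below (illustration only;
#	names are ad hoc):
#
#	    BN_ULONG t, c = 0;			/* c is the carry */
#	    while (n--) {
#	        t = *a++ + *b++ + c;
#	        c = c ? (t <= a[-1]) : (t < a[-1]);
#	        *r++ = t;
#	    }
#	    return c;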
   1577 
   1578 	xor	r0,r0,r0
   1579 #
   1580 #	check for r6 = 0. Is this needed?
   1581 #
   1582 	addic.	r6,r6,0		#test r6 and clear carry bit.
   1583 	beq	Lppcasm_add_adios
   1584 	addi	r4,r4,-$BNSZ
   1585 	addi	r3,r3,-$BNSZ
   1586 	addi	r5,r5,-$BNSZ
   1587 	mtctr	r6
   1588 Lppcasm_add_mainloop:	
   1589 	$LDU	r7,$BNSZ(r4)
   1590 	$LDU	r8,$BNSZ(r5)
   1591 	adde	r8,r7,r8
   1592 	$STU	r8,$BNSZ(r3)
   1593 	bdnz-	Lppcasm_add_mainloop
   1594 Lppcasm_add_adios:	
   1595 	addze	r3,r0			#return carry bit.
   1596 	blr
   1597 	.long	0x00000000
   1598 
   1599 #
   1600 #	NOTE:	The following label name should be changed to
   1601 #		"bn_div_words" i.e. remove the first dot
   1602 #		for the gcc compiler. This should be automatically
   1603 #		done in the build
   1604 #
   1605 
   1606 .align	4
   1607 .bn_div_words:
   1608 #
#	This is a cleaned-up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of a call to num_bits_word. Since this was compiled
#	only at level -O2, it could probably be squeezed further.
#	
   1615 #	r3 = h
   1616 #	r4 = l
   1617 #	r5 = d
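#
#	Semantically, a sketch (assuming a double-width type were
#	available, and h < d after the checks below):
#
#	    return (BN_ULONG)((((double-width)h << BN_BITS2) | l) / d);
#
#	The code below gets the same result with single-width arithmetic:
#	normalize so the top bit of d is set, then compute two half-word
#	quotient digits using the upper half of d as the trial divisor.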
   1618 	
   1619 	$UCMPI	0,r5,0			# compare r5 and 0
   1620 	bne	Lppcasm_div1		# proceed if d!=0
   1621 	li	r3,-1			# d=0 return -1
   1622 	blr
   1623 Lppcasm_div1:
   1624 	xor	r0,r0,r0		#r0=0
   1625 	li	r8,$BITS
   1626 	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
   1627 	beq	Lppcasm_div2		#proceed if no leading zeros
   1628 	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
   1629 	$SHR.	r9,r3,r8		#are there any bits above r8'th?
   1630 	$TR	16,r9,r0		#if there're, signal to dump core...
   1631 Lppcasm_div2:
   1632 	$UCMP	0,r3,r5			#h>=d?
   1633 	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
   1634 	subf	r3,r5,r3		#h-=d ; 
   1635 Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
   1636 	cmpi	0,0,r7,0		# is (i == 0)?
   1637 	beq	Lppcasm_div4
   1638 	$SHL	r3,r3,r7		# h = (h<< i)
   1639 	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
   1640 	$SHL	r5,r5,r7		# d<<=i
   1641 	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
   1642 	$SHL	r4,r4,r7		# l <<=i
   1643 Lppcasm_div4:
   1644 	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
   1645 					# dl will be computed when needed
   1646 					# as it saves registers.
   1647 	li	r6,2			#r6=2
   1648 	mtctr	r6			#counter will be in count.
   1649 Lppcasm_divouterloop: 
   1650 	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
   1651 	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
   1652 					# compute here for innerloop.
   1653 	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
   1654 	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
   1655 
   1656 	li	r8,-1
   1657 	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l 
   1658 	b	Lppcasm_div6
   1659 Lppcasm_div5:
   1660 	$UDIV	r8,r3,r9		#q = h/dh
   1661 Lppcasm_div6:
   1662 	$UMULL	r12,r9,r8		#th = q*dh
   1663 	$CLRU	r10,r5,`$BITS/2`	#r10=dl
   1664 	$UMULL	r6,r8,r10		#tl = q*dl
   1665 	
   1666 Lppcasm_divinnerloop:
   1667 	subf	r10,r12,r3		#t = h -th
   1668 	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
   1669 	addic.	r7,r7,0			#test if r7 == 0. used below.
   1670 					# now want to compute
   1671 					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
   1672 					# the following 2 instructions do that
   1673 	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
   1674 	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
   1675 	$UCMP	cr1,r6,r7		# compare (tl <= r7)
   1676 	bne	Lppcasm_divinnerexit
   1677 	ble	cr1,Lppcasm_divinnerexit
   1678 	addi	r8,r8,-1		#q--
   1679 	subf	r12,r9,r12		#th -=dh
   1680 	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
   1681 	subf	r6,r10,r6		#tl -=dl
   1682 	b	Lppcasm_divinnerloop
   1683 Lppcasm_divinnerexit:
   1684 	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
   1685 	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
   1686 	$UCMP	cr1,r4,r11		# compare l and tl
   1687 	add	r12,r12,r10		# th+=t
   1688 	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
   1689 	addi	r12,r12,1		# th++
   1690 Lppcasm_div7:
   1691 	subf	r11,r11,r4		#r11=l-tl
   1692 	$UCMP	cr1,r3,r12		#compare h and th
   1693 	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
   1694 	addi	r8,r8,-1		# q--
   1695 	add	r3,r5,r3		# h+=d
   1696 Lppcasm_div8:
   1697 	subf	r12,r12,r3		#r12 = h-th
   1698 	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
   1699 					# want to compute
   1700 					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
   1701 					# the following 2 instructions will do this.
   1702 	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
   1703 	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
   1704 	bdz	Lppcasm_div9		#if (count==0) break ;
   1705 	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
   1706 	b	Lppcasm_divouterloop
   1707 Lppcasm_div9:
   1708 	or	r3,r8,r0
   1709 	blr
   1710 	.long	0x00000000
   1711 
   1712 #
   1713 #	NOTE:	The following label name should be changed to
   1714 #		"bn_sqr_words" i.e. remove the first dot
   1715 #		for the gcc compiler. This should be automatically
   1716 #		done in the build
   1717 #
   1718 .align	4
   1719 .bn_sqr_words:
   1720 #
   1721 #	Optimized version of bn_sqr_words
   1722 #
   1723 #	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
   1724 #
   1725 #	r3 = r
   1726 #	r4 = a
   1727 #	r5 = n
   1728 #
   1729 #	r6 = a[i].
   1730 #	r7,r8 = product.
   1731 #
   1732 #	No unrolling done here. Not performance critical.
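#
#	C sketch of the loop below (illustration only): each word is
#	squared into a double-width product stored in two result words:
#
#	    for (i = 0; i < n; i++) {
#	        /* (high,low) = a[i]*a[i], double-width */
#	        r[2*i]   = low;
#	        r[2*i+1] = high;
#	    }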
   1733 
   1734 	addic.	r5,r5,0			#test r5.
   1735 	beq	Lppcasm_sqr_adios
   1736 	addi	r4,r4,-$BNSZ
   1737 	addi	r3,r3,-$BNSZ
   1738 	mtctr	r5
   1739 Lppcasm_sqr_mainloop:	
   1740 					#sqr(r[0],r[1],a[0]);
   1741 	$LDU	r6,$BNSZ(r4)
   1742 	$UMULL	r7,r6,r6
   1743 	$UMULH  r8,r6,r6
   1744 	$STU	r7,$BNSZ(r3)
   1745 	$STU	r8,$BNSZ(r3)
   1746 	bdnz-	Lppcasm_sqr_mainloop
   1747 Lppcasm_sqr_adios:	
   1748 	blr
   1749 	.long	0x00000000
   1750 
   1751 
   1752 #
   1753 #	NOTE:	The following label name should be changed to
   1754 #		"bn_mul_words" i.e. remove the first dot
   1755 #		for the gcc compiler. This should be automatically
   1756 #		done in the build
   1757 #
   1758 
   1759 .align	4	
   1760 .bn_mul_words:
   1761 #
   1762 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
   1763 #
   1764 # r3 = rp
   1765 # r4 = ap
   1766 # r5 = num
   1767 # r6 = w
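#
# C sketch of the computation (illustration only; "hi:lo" stands for the
# double-width product):
#
#	BN_ULONG carry = 0;
#	for (i = 0; i < num; i++) {
#		hi:lo = ap[i] * w;
#		lo += carry;  hi += (lo < carry);	/* propagate carry */
#		rp[i] = lo;   carry = hi;
#	}
#	return carry;
#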
   1768 	xor	r0,r0,r0
   1769 	xor	r12,r12,r12		# used for carry
   1770 	rlwinm.	r7,r5,30,2,31		# num >> 2
   1771 	beq	Lppcasm_mw_REM
   1772 	mtctr	r7
   1773 Lppcasm_mw_LOOP:	
   1774 					#mul(rp[0],ap[0],w,c1);
   1775 	$LD	r8,`0*$BNSZ`(r4)
   1776 	$UMULL	r9,r6,r8
   1777 	$UMULH  r10,r6,r8
   1778 	addc	r9,r9,r12
   1779 	#addze	r10,r10			#carry is NOT ignored.
   1780 					#will be taken care of
   1781 					#in second spin below
   1782 					#using adde.
   1783 	$ST	r9,`0*$BNSZ`(r3)
   1784 					#mul(rp[1],ap[1],w,c1);
   1785 	$LD	r8,`1*$BNSZ`(r4)	
   1786 	$UMULL	r11,r6,r8
   1787 	$UMULH  r12,r6,r8
   1788 	adde	r11,r11,r10
   1789 	#addze	r12,r12
   1790 	$ST	r11,`1*$BNSZ`(r3)
   1791 					#mul(rp[2],ap[2],w,c1);
   1792 	$LD	r8,`2*$BNSZ`(r4)
   1793 	$UMULL	r9,r6,r8
   1794 	$UMULH  r10,r6,r8
   1795 	adde	r9,r9,r12
   1796 	#addze	r10,r10
   1797 	$ST	r9,`2*$BNSZ`(r3)
   1798 					#mul_add(rp[3],ap[3],w,c1);
   1799 	$LD	r8,`3*$BNSZ`(r4)
   1800 	$UMULL	r11,r6,r8
   1801 	$UMULH  r12,r6,r8
   1802 	adde	r11,r11,r10
   1803 	addze	r12,r12			#this spin we collect carry into
   1804 					#r12
   1805 	$ST	r11,`3*$BNSZ`(r3)
   1806 	
   1807 	addi	r3,r3,`4*$BNSZ`
   1808 	addi	r4,r4,`4*$BNSZ`
   1809 	bdnz-	Lppcasm_mw_LOOP
   1810 
   1811 Lppcasm_mw_REM:
   1812 	andi.	r5,r5,0x3
   1813 	beq	Lppcasm_mw_OVER
   1814 					#mul(rp[0],ap[0],w,c1);
   1815 	$LD	r8,`0*$BNSZ`(r4)
   1816 	$UMULL	r9,r6,r8
   1817 	$UMULH  r10,r6,r8
   1818 	addc	r9,r9,r12
   1819 	addze	r10,r10
   1820 	$ST	r9,`0*$BNSZ`(r3)
   1821 	addi	r12,r10,0
   1822 	
   1823 	addi	r5,r5,-1
   1824 	cmpli	0,0,r5,0
   1825 	beq	Lppcasm_mw_OVER
   1826 
   1827 	
   1828 					#mul(rp[1],ap[1],w,c1);
   1829 	$LD	r8,`1*$BNSZ`(r4)	
   1830 	$UMULL	r9,r6,r8
   1831 	$UMULH  r10,r6,r8
   1832 	addc	r9,r9,r12
   1833 	addze	r10,r10
   1834 	$ST	r9,`1*$BNSZ`(r3)
   1835 	addi	r12,r10,0
   1836 	
   1837 	addi	r5,r5,-1
   1838 	cmpli	0,0,r5,0
   1839 	beq	Lppcasm_mw_OVER
   1840 	
   1841 					#mul_add(rp[2],ap[2],w,c1);
   1842 	$LD	r8,`2*$BNSZ`(r4)
   1843 	$UMULL	r9,r6,r8
   1844 	$UMULH  r10,r6,r8
   1845 	addc	r9,r9,r12
   1846 	addze	r10,r10
   1847 	$ST	r9,`2*$BNSZ`(r3)
   1848 	addi	r12,r10,0
   1849 		
   1850 Lppcasm_mw_OVER:	
   1851 	addi	r3,r12,0
   1852 	blr
   1853 	.long	0x00000000
   1854 
   1855 #
   1856 #	NOTE:	The following label name should be changed to
   1857 #		"bn_mul_add_words" i.e. remove the first dot
   1858 #		for the gcc compiler. This should be automatically
   1859 #		done in the build
   1860 #
   1861 
   1862 .align	4
   1863 .bn_mul_add_words:
   1864 #
   1865 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
   1866 #
   1867 # r3 = rp
   1868 # r4 = ap
   1869 # r5 = num
   1870 # r6 = w
   1871 #
# empirical evidence suggests that the unrolled version performs best!
   1873 #
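# C sketch of the computation (illustration only; "hi:lo" stands for the
# double-width product):
#
#	BN_ULONG carry = 0;
#	for (i = 0; i < num; i++) {
#		hi:lo = ap[i] * w;
#		lo += carry;  hi += (lo < carry);	/* add carry in */
#		lo += rp[i];  hi += (lo < rp[i]);	/* add rp[i] */
#		rp[i] = lo;   carry = hi;
#	}
#	return carry;
#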
   1874 	xor	r0,r0,r0		#r0 = 0
   1875 	xor	r12,r12,r12  		#r12 = 0 . used for carry		
   1876 	rlwinm.	r7,r5,30,2,31		# num >> 2
   1877 	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
   1878 	mtctr	r7
   1879 Lppcasm_maw_mainloop:	
   1880 					#mul_add(rp[0],ap[0],w,c1);
   1881 	$LD	r8,`0*$BNSZ`(r4)
   1882 	$LD	r11,`0*$BNSZ`(r3)
   1883 	$UMULL	r9,r6,r8
   1884 	$UMULH  r10,r6,r8
   1885 	addc	r9,r9,r12		#r12 is carry.
   1886 	addze	r10,r10
   1887 	addc	r9,r9,r11
   1888 	#addze	r10,r10
   1889 					#the above instruction addze
   1890 					#is NOT needed. Carry will NOT
   1891 					#be ignored. It's not affected
   1892 					#by multiply and will be collected
   1893 					#in the next spin
   1894 	$ST	r9,`0*$BNSZ`(r3)
   1895 	
   1896 					#mul_add(rp[1],ap[1],w,c1);
   1897 	$LD	r8,`1*$BNSZ`(r4)	
   1898 	$LD	r9,`1*$BNSZ`(r3)
   1899 	$UMULL	r11,r6,r8
   1900 	$UMULH  r12,r6,r8
   1901 	adde	r11,r11,r10		#r10 is carry.
   1902 	addze	r12,r12
   1903 	addc	r11,r11,r9
   1904 	#addze	r12,r12
   1905 	$ST	r11,`1*$BNSZ`(r3)
   1906 	
   1907 					#mul_add(rp[2],ap[2],w,c1);
   1908 	$LD	r8,`2*$BNSZ`(r4)
   1909 	$UMULL	r9,r6,r8
   1910 	$LD	r11,`2*$BNSZ`(r3)
   1911 	$UMULH  r10,r6,r8
   1912 	adde	r9,r9,r12
   1913 	addze	r10,r10
   1914 	addc	r9,r9,r11
   1915 	#addze	r10,r10
   1916 	$ST	r9,`2*$BNSZ`(r3)
   1917 	
   1918 					#mul_add(rp[3],ap[3],w,c1);
   1919 	$LD	r8,`3*$BNSZ`(r4)
   1920 	$UMULL	r11,r6,r8
   1921 	$LD	r9,`3*$BNSZ`(r3)
   1922 	$UMULH  r12,r6,r8
   1923 	adde	r11,r11,r10
   1924 	addze	r12,r12
   1925 	addc	r11,r11,r9
   1926 	addze	r12,r12
   1927 	$ST	r11,`3*$BNSZ`(r3)
   1928 	addi	r3,r3,`4*$BNSZ`
   1929 	addi	r4,r4,`4*$BNSZ`
   1930 	bdnz-	Lppcasm_maw_mainloop
   1931 	
   1932 Lppcasm_maw_leftover:
   1933 	andi.	r5,r5,0x3
   1934 	beq	Lppcasm_maw_adios
   1935 	addi	r3,r3,-$BNSZ
   1936 	addi	r4,r4,-$BNSZ
   1937 					#mul_add(rp[0],ap[0],w,c1);
   1938 	mtctr	r5
   1939 	$LDU	r8,$BNSZ(r4)
   1940 	$UMULL	r9,r6,r8
   1941 	$UMULH  r10,r6,r8
   1942 	$LDU	r11,$BNSZ(r3)
   1943 	addc	r9,r9,r11
   1944 	addze	r10,r10
   1945 	addc	r9,r9,r12
   1946 	addze	r12,r10
   1947 	$ST	r9,0(r3)
   1948 	
   1949 	bdz	Lppcasm_maw_adios
   1950 					#mul_add(rp[1],ap[1],w,c1);
   1951 	$LDU	r8,$BNSZ(r4)	
   1952 	$UMULL	r9,r6,r8
   1953 	$UMULH  r10,r6,r8
   1954 	$LDU	r11,$BNSZ(r3)
   1955 	addc	r9,r9,r11
   1956 	addze	r10,r10
   1957 	addc	r9,r9,r12
   1958 	addze	r12,r10
   1959 	$ST	r9,0(r3)
   1960 	
   1961 	bdz	Lppcasm_maw_adios
   1962 					#mul_add(rp[2],ap[2],w,c1);
   1963 	$LDU	r8,$BNSZ(r4)
   1964 	$UMULL	r9,r6,r8
   1965 	$UMULH  r10,r6,r8
   1966 	$LDU	r11,$BNSZ(r3)
   1967 	addc	r9,r9,r11
   1968 	addze	r10,r10
   1969 	addc	r9,r9,r12
   1970 	addze	r12,r10
   1971 	$ST	r9,0(r3)
   1972 		
   1973 Lppcasm_maw_adios:	
   1974 	addi	r3,r12,0
   1975 	blr
   1976 	.long	0x00000000
   1977 	.align	4
   1978 EOF
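
# Evaluate the `...` constructs embedded in the template above
# (e.g. `2*$BNSZ`) into numeric literals before emitting the code.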
   1979 $data =~ s/\`([^\`]*)\`/eval $1/gem;
   1980 print $data;
   1981 close STDOUT;
   1982