#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
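#
# For example, the 32-bit Linux module would typically be produced
# with something like "perl ppc.pl linux32 bn-ppc.s" and the 64-bit
# AIX one with "perl ppc.pl aix64 bn-ppc64.s" (the flavour names are
# whatever ppc-xlate.pl recognizes; the output file names here are
# only illustrative).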
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these
# ABIs are similar enough to implement leaf(!) functions, which would be
# ABI neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	MEASUREMENTS WITH cc ON A 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 Dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%
#
#	If you have comments or suggestions to improve code send
#	me a note at schari@us.ibm.com
#

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }
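# To illustrate: with a 32-bit flavour a template line such as
#	$LD	r5,`1*$BNSZ`(r4)
# comes out as "lwz r5,4(r4)", while a 64-bit flavour yields
# "ld r5,8(r4)"; the `...` arithmetic is evaluated by the
# s/\`([^\`]*)\`/eval $1/gem pass at the bottom of this file.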

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$data=<<EOF;
    161 $data=<<EOF;
    162 #--------------------------------------------------------------------
    163 #
    164 #
    165 #
    166 #
    167 #	File:		ppc32.s
    168 #
    169 #	Created by:	Suresh Chari
    170 #			IBM Thomas J. Watson Research Library
    171 #			Hawthorne, NY
    172 #
    173 #
    174 #	Description:	Optimized assembly routines for OpenSSL crypto
    175 #			on the 32 bitPowerPC platform.
    176 #
    177 #
    178 #	Version History
    179 #
    180 #	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
    181 #	   cleaned up code. Also made a single version which can
    182 #	   be used for both the AIX and Linux compilers. See NOTE
    183 #	   below.
    184 #				12/05/03		Suresh Chari
    185 #			(with lots of help from)        Andy Polyakov
    186 ##	
    187 #	1. Initial version	10/20/02		Suresh Chari
    188 #
    189 #
    190 #	The following file works for the xlc,cc
    191 #	and gcc compilers.
    192 #
    193 #	NOTE:	To get the file to link correctly with the gcc compiler
    194 #	        you have to change the names of the routines and remove
    195 #		the first .(dot) character. This should automatically
    196 #		be done in the build process.
    197 #
    198 #	Hand optimized assembly code for the following routines
    199 #	
    200 #	bn_sqr_comba4
    201 #	bn_sqr_comba8
    202 #	bn_mul_comba4
    203 #	bn_mul_comba8
    204 #	bn_sub_words
    205 #	bn_add_words
    206 #	bn_div_words
    207 #	bn_sqr_words
    208 #	bn_mul_words
    209 #	bn_mul_add_words
    210 #
    211 #	NOTE:	It is possible to optimize this code more for
    212 #	specific PowerPC or Power architectures. On the Northstar
    213 #	architecture the optimizations in this file do
    214 #	 NOT provide much improvement.
    215 #
    216 #	If you have comments or suggestions to improve code send
    217 #	me a note at schari\@us.ibm.com
    218 #
    219 #--------------------------------------------------------------------------
    220 #
    221 #	Defines to be used in the assembly code.
    222 #	
    223 #.set r0,0	# we use it as storage for value of 0
    224 #.set SP,1	# preserved
    225 #.set RTOC,2	# preserved 
    226 #.set r3,3	# 1st argument/return value
    227 #.set r4,4	# 2nd argument/volatile register
    228 #.set r5,5	# 3rd argument/volatile register
    229 #.set r6,6	# ...
    230 #.set r7,7
    231 #.set r8,8
    232 #.set r9,9
    233 #.set r10,10
    234 #.set r11,11
    235 #.set r12,12
    236 #.set r13,13	# not used, nor any other "below" it...
    237 
#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#	        the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly:
#
#
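# For reference, the C macros being hand-expanded throughout this
# routine behave roughly as follows (three-word accumulator kept in
# the registers noted above):
#
#	sqr_add_c(a,i,c1,c2,c3):    (c3,c2,c1) += a[i]*a[i]
#	sqr_add_c2(a,i,j,c1,c2,c3): (c3,c2,c1) += 2*a[i]*a[j]
#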
	xor		r0,r0,r0		# set r0 = 0. Used in the addze
						# instructions below

						#sqr_add_c(a,0,c1,c2,c3)
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5
	$UMULH		r10,r5,r5		#in first iteration. No need
						#to add since c1=c2=c3=0.
						# Note c3(r11) is NOT set to 0
						# but will be.

	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde		r8,r8,r8
	addze		r9,r0			# catch carry if any.
						# r9= r0(=0) and carry

	addc		r10,r7,r10		# now add to temp result.
	addze		r11,r8			# r8 added to r11 which is 0
	addze		r9,r9

	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
						#sqr_add_c(a,1,c3,c1,c2)
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10

	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# A possible optimization of loading all 8 longs of a into
# registers doesn't provide any speedup.
#

	xor		r0,r0,r0		#set r0 = 0. Used in addze
						#instructions below.

						#sqr_add_c(a,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
	$UMULH		r10,r5,r5
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r0		# (r8,r7) to the three-register
	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0

	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r11		# (r8,r7) to the three-register
	addze		r9,r9			# number (r9,r11,r10).

	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2

						#sqr_add_c(a,1,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
						#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
						#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
						#sqr_add_c(a,4,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
						#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD		r5,`4*$BNSZ`(r4)
	$LD		r6,`5*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
						#sqr_add_c(a,5,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
						#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD		r5,`3*$BNSZ`(r4)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
						#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD		r5,`4*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
						#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD		r5,`5*$BNSZ`(r4)
	$LD		r6,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
						#sqr_add_c(a,6,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r0
						#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD		r6,`7*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;

						#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD		r5,`6*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
						#sqr_add_c(a,7,c3,c1,c2);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
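# For reference, the C macro hand-expanded throughout this routine
# behaves roughly as
#
#	mul_add_c(a,b,c1,c2,c3):	(c3,c2,c1) += a*b
#
# i.e. each word product is accumulated into a three-word carry
# chain, and completed result words are stored one at a time.
#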
	xor	r0,r0,r0		#r0=0. Used in addze below.
					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.

					#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
					#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
					#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
					#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
					#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

					#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
					#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
					#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
					#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
					#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
					#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
					#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
					#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
					#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
					#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
					#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
					#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
					#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
					#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
					#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
					#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
					#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.
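#
#	In C terms the loop computes, for i = 0..n-1,
#		r[i] = a[i] - b[i] - borrow
#	and returns the final borrow (0 or 1). Note that on PPC the
#	carry bit is the complement of the borrow, which is why the
#	code below uses subfc/subfe and finishes with subfze/andi.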

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
				# If r6 > 0 then result != 0.
				# In either case the carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7 + carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8-1, as we need.
	$STU	r6,$BNSZ(r3)
	bdnz-	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1         # keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.
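#
#	In C terms: r[i] = a[i] + b[i] + carry for i = 0..n-1,
#	returning the final carry (0 or 1).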

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of a call to num_bits_word. Since the original was
#	compiled at only -O2, it could probably be squeezed more.
#
#	r3 = h
#	r4 = l
#	r5 = d
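#
#	In C terms this computes the one-word quotient of the
#	double-word value (h,l) divided by d, roughly
#		return ((h << BN_BITS2) | l) / d;
#	using two rounds of schoolbook division by the top half
#	of d. The caller is expected to ensure the quotient fits
#	in a single word; the conditional trap below fires if h
#	has significant bits above BN_num_bits_word(d).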

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> (BN_BITS2-i))
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#loop counter goes in the CTR.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h-th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t&BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret = q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	No unrolling done here. Not performance critical.
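#
#	In C terms: for i = 0..n-1,
#		r[2*i]   = low  word of a[i]*a[i]
#		r[2*i+1] = high word of a[i]*a[i]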

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
					#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
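#
# In C terms: carry-propagating multiply of a num-word array by a
# single word, i.e. for i = 0..num-1
#	(c1, rp[i]) = ap[i]*w + c1
# returning the final carry word c1. The main loop below is
# unrolled four-fold, with the carry word alternating between
# r10 and r12.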
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# empirical evidence suggests that the unrolled version performs best!!
#
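# In C terms: for i = 0..num-1
#	(c1, rp[i]) = ap[i]*w + rp[i] + c1
# returning the final carry word c1.
#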
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0. used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
					#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

					#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

					#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

					#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
					#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
					#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
					#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
	.align	4
EOF
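# expand the `...` constructs, i.e. compile-time arithmetic such as
# `2*$BNSZ`, before the result is fed to ppc-xlate.pl via STDOUT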
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;