Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # December 2007
     11 
     12 # The reason for undertaken effort is basically following. Even though
     13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
     14 # performance was observed to be less than impressive, essentially as
     15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
     16 # Well, it's not surprising that IBM had to make some sacrifices to
     17 # boost the clock frequency that much, but no overall improvement?
     18 # Having observed how much difference did switching to FPU make on
     19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate...
     20 # Unfortunately the resulting performance improvement is not as
     21 # impressive, ~30%, and in absolute terms is still very far from what
     22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
     23 # something wrong, but in the lack of assembler level micro-profiling
     24 # data or at least decent platform guide I can't tell... Or better
     25 # results might be achieved with VMX... Anyway, this module provides
     26 # *worse* performance on other PowerPC implementations, ~40-15% slower
     27 # on PPC970 depending on key length and ~40% slower on Power 5 for all
     28 # key lengths. As it's obviously inappropriate as "best all-round"
     29 # alternative, it has to be complemented with run-time CPU family
     30 # detection. Oh! It should also be noted that unlike other PowerPC
     31 # implementation IALU ppc-mont.pl module performs *suboptimaly* on
     32 # >=1024-bit key lengths on Power 6. It should also be noted that
     33 # *everything* said so far applies to 64-bit builds! As far as 32-bit
     34 # application executed on 64-bit CPU goes, this module is likely to
     35 # become preferred choice, because it's easy to adapt it for such
     36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
     37 
     38 # February 2008
     39 
     40 # Micro-profiling assisted optimization results in ~15% improvement
     41 # over original ppc64-mont.pl version, or overall ~50% improvement
     42 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
     43 # Power 6 CPU, this module is 5-150% faster depending on key length,
     44 # [hereafter] more for longer keys. But if compared to ppc-mont.pl
     45 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
     46 # in absolute terms, but it's apparently the way Power 6 is...
     47 
     48 $flavour = shift;
     49 
     50 if ($flavour =~ /32/) {
     51 	$SIZE_T=4;
     52 	$RZONE=	224;
     53 	$FRAME=	$SIZE_T*12+8*12;
     54 	$fname=	"bn_mul_mont_ppc64";
     55 
     56 	$STUX=	"stwux";	# store indexed and update
     57 	$PUSH=	"stw";
     58 	$POP=	"lwz";
     59 	die "not implemented yet";
     60 } elsif ($flavour =~ /64/) {
     61 	$SIZE_T=8;
     62 	$RZONE=	288;
     63 	$FRAME=	$SIZE_T*12+8*12;
     64 	$fname=	"bn_mul_mont";
     65 
     66 	# same as above, but 64-bit mnemonics...
     67 	$STUX=	"stdux";	# store indexed and update
     68 	$PUSH=	"std";
     69 	$POP=	"ld";
     70 } else { die "nonsense $flavour"; }
     71 
     72 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     73 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
     74 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
     75 die "can't locate ppc-xlate.pl";
     76 
     77 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
     78 
     79 $FRAME=($FRAME+63)&~63;
     80 $TRANSFER=16*8;
     81 
     82 $carry="r0";
     83 $sp="r1";
     84 $toc="r2";
     85 $rp="r3";	$ovf="r3";
     86 $ap="r4";
     87 $bp="r5";
     88 $np="r6";
     89 $n0="r7";
     90 $num="r8";
     91 $rp="r9";	# $rp is reassigned
     92 $tp="r10";
     93 $j="r11";
     94 $i="r12";
     95 # non-volatile registers
     96 $nap_d="r14";	# interleaved ap and np in double format
     97 $a0="r15";	# ap[0]
     98 $t0="r16";	# temporary registers
     99 $t1="r17";
    100 $t2="r18";
    101 $t3="r19";
    102 $t4="r20";
    103 $t5="r21";
    104 $t6="r22";
    105 $t7="r23";
    106 
    107 # PPC offers enough register bank capacity to unroll inner loops twice
    108 #
    109 #     ..A3A2A1A0
    110 #           dcba
    111 #    -----------
    112 #            A0a
    113 #           A0b
    114 #          A0c
    115 #         A0d
    116 #          A1a
    117 #         A1b
    118 #        A1c
    119 #       A1d
    120 #        A2a
    121 #       A2b
    122 #      A2c
    123 #     A2d
    124 #      A3a
    125 #     A3b
    126 #    A3c
    127 #   A3d
    128 #    ..a
    129 #   ..b
    130 #
    131 $ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
    132 $na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
    133 $dota="f8";	$dotb="f9";
    134 $A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
    135 $N0="f14";	$N1="f15";	$N2="f16";	$N3="f17";
    136 $T0a="f18";	$T0b="f19";
    137 $T1a="f20";	$T1b="f21";
    138 $T2a="f22";	$T2b="f23";
    139 $T3a="f24";	$T3b="f25";
    140 
    142 # sp----------->+-------------------------------+
    143 #		| saved sp			|
    144 #		+-------------------------------+
    145 #		|				|
    146 #		+-------------------------------+
    147 #		| 10 saved gpr, r14-r23		|
    148 #		.				.
    149 #		.				.
    150 #   +12*size_t	+-------------------------------+
    151 #		| 12 saved fpr, f14-f25		|
    152 #		.				.
    153 #		.				.
    154 #   +12*8	+-------------------------------+
    155 #		| padding to 64 byte boundary	|
    156 #		.				.
    157 #   +X		+-------------------------------+
    158 #		| 16 gpr<->fpr transfer zone	|
    159 #		.				.
    160 #		.				.
    161 #   +16*8	+-------------------------------+
    162 #		| __int64 tmp[-1]		|
    163 #		+-------------------------------+
    164 #		| __int64 tmp[num]		|
    165 #		.				.
    166 #		.				.
    167 #		.				.
    168 #   +(num+1)*8	+-------------------------------+
    169 #		| padding to 64 byte boundary	|
    170 #		.				.
    171 #   +X		+-------------------------------+
    172 #		| double nap_d[4*num]		|
    173 #		.				.
    174 #		.				.
    175 #		.				.
    176 #		+-------------------------------+
    177 
    179 $code=<<___;
    180 .machine "any"
    181 .text
    182 
    183 .globl	.$fname
    184 .align	5
    185 .$fname:
    186 	cmpwi	$num,4
    187 	mr	$rp,r3		; $rp is reassigned
    188 	li	r3,0		; possible "not handled" return code
    189 	bltlr-
    190 	andi.	r0,$num,1	; $num has to be even
    191 	bnelr-
    192 
    193 	slwi	$num,$num,3	; num*=8
    194 	li	$i,-4096
    195 	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
    196 	add	$tp,$tp,$num	; place for tp[num+1]
    197 	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
    198 	subf	$tp,$tp,$sp	; $sp-$tp
    199 	and	$tp,$tp,$i	; minimize TLB usage
    200 	subf	$tp,$sp,$tp	; $tp-$sp
    201 	$STUX	$sp,$sp,$tp	; alloca
    202 
    203 	$PUSH	r14,`2*$SIZE_T`($sp)
    204 	$PUSH	r15,`3*$SIZE_T`($sp)
    205 	$PUSH	r16,`4*$SIZE_T`($sp)
    206 	$PUSH	r17,`5*$SIZE_T`($sp)
    207 	$PUSH	r18,`6*$SIZE_T`($sp)
    208 	$PUSH	r19,`7*$SIZE_T`($sp)
    209 	$PUSH	r20,`8*$SIZE_T`($sp)
    210 	$PUSH	r21,`9*$SIZE_T`($sp)
    211 	$PUSH	r22,`10*$SIZE_T`($sp)
    212 	$PUSH	r23,`11*$SIZE_T`($sp)
    213 	stfd	f14,`12*$SIZE_T+0`($sp)
    214 	stfd	f15,`12*$SIZE_T+8`($sp)
    215 	stfd	f16,`12*$SIZE_T+16`($sp)
    216 	stfd	f17,`12*$SIZE_T+24`($sp)
    217 	stfd	f18,`12*$SIZE_T+32`($sp)
    218 	stfd	f19,`12*$SIZE_T+40`($sp)
    219 	stfd	f20,`12*$SIZE_T+48`($sp)
    220 	stfd	f21,`12*$SIZE_T+56`($sp)
    221 	stfd	f22,`12*$SIZE_T+64`($sp)
    222 	stfd	f23,`12*$SIZE_T+72`($sp)
    223 	stfd	f24,`12*$SIZE_T+80`($sp)
    224 	stfd	f25,`12*$SIZE_T+88`($sp)
    225 
    226 	ld	$a0,0($ap)	; pull ap[0] value
    227 	ld	$n0,0($n0)	; pull n0[0] value
    228 	ld	$t3,0($bp)	; bp[0]
    229 
    230 	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
    231 	li	$i,-64
    232 	add	$nap_d,$tp,$num
    233 	and	$nap_d,$nap_d,$i	; align to 64 bytes
    234 
    236 	mulld	$t7,$a0,$t3	; ap[0]*bp[0]
    237 	; nap_d is off by 1, because it's used with stfdu/lfdu
    238 	addi	$nap_d,$nap_d,-8
    239 	srwi	$j,$num,`3+1`	; counter register, num/2
    240 	mulld	$t7,$t7,$n0	; tp[0]*n0
    241 	addi	$j,$j,-1
    242 	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
    243 	li	$carry,0
    244 	mtctr	$j
    245 
    246 	; transfer bp[0] to FPU as 4x16-bit values
    247 	extrdi	$t0,$t3,16,48
    248 	extrdi	$t1,$t3,16,32
    249 	extrdi	$t2,$t3,16,16
    250 	extrdi	$t3,$t3,16,0
    251 	std	$t0,`$FRAME+0`($sp)
    252 	std	$t1,`$FRAME+8`($sp)
    253 	std	$t2,`$FRAME+16`($sp)
    254 	std	$t3,`$FRAME+24`($sp)
    255 	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
    256 	extrdi	$t4,$t7,16,48
    257 	extrdi	$t5,$t7,16,32
    258 	extrdi	$t6,$t7,16,16
    259 	extrdi	$t7,$t7,16,0
    260 	std	$t4,`$FRAME+32`($sp)
    261 	std	$t5,`$FRAME+40`($sp)
    262 	std	$t6,`$FRAME+48`($sp)
    263 	std	$t7,`$FRAME+56`($sp)
    264 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
    265 	lwz	$t1,0($ap)
    266 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
    267 	lwz	$t3,8($ap)
    268 	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
    269 	lwz	$t5,0($np)
    270 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
    271 	lwz	$t7,8($np)
    272 	lfd	$ba,`$FRAME+0`($sp)
    273 	lfd	$bb,`$FRAME+8`($sp)
    274 	lfd	$bc,`$FRAME+16`($sp)
    275 	lfd	$bd,`$FRAME+24`($sp)
    276 	lfd	$na,`$FRAME+32`($sp)
    277 	lfd	$nb,`$FRAME+40`($sp)
    278 	lfd	$nc,`$FRAME+48`($sp)
    279 	lfd	$nd,`$FRAME+56`($sp)
    280 	std	$t0,`$FRAME+64`($sp)
    281 	std	$t1,`$FRAME+72`($sp)
    282 	std	$t2,`$FRAME+80`($sp)
    283 	std	$t3,`$FRAME+88`($sp)
    284 	std	$t4,`$FRAME+96`($sp)
    285 	std	$t5,`$FRAME+104`($sp)
    286 	std	$t6,`$FRAME+112`($sp)
    287 	std	$t7,`$FRAME+120`($sp)
    288 	fcfid	$ba,$ba
    289 	fcfid	$bb,$bb
    290 	fcfid	$bc,$bc
    291 	fcfid	$bd,$bd
    292 	fcfid	$na,$na
    293 	fcfid	$nb,$nb
    294 	fcfid	$nc,$nc
    295 	fcfid	$nd,$nd
    296 
    297 	lfd	$A0,`$FRAME+64`($sp)
    298 	lfd	$A1,`$FRAME+72`($sp)
    299 	lfd	$A2,`$FRAME+80`($sp)
    300 	lfd	$A3,`$FRAME+88`($sp)
    301 	lfd	$N0,`$FRAME+96`($sp)
    302 	lfd	$N1,`$FRAME+104`($sp)
    303 	lfd	$N2,`$FRAME+112`($sp)
    304 	lfd	$N3,`$FRAME+120`($sp)
    305 	fcfid	$A0,$A0
    306 	fcfid	$A1,$A1
    307 	fcfid	$A2,$A2
    308 	fcfid	$A3,$A3
    309 	fcfid	$N0,$N0
    310 	fcfid	$N1,$N1
    311 	fcfid	$N2,$N2
    312 	fcfid	$N3,$N3
    313 	addi	$ap,$ap,16
    314 	addi	$np,$np,16
    315 
    316 	fmul	$T1a,$A1,$ba
    317 	fmul	$T1b,$A1,$bb
    318 	stfd	$A0,8($nap_d)		; save a[j] in double format
    319 	stfd	$A1,16($nap_d)
    320 	fmul	$T2a,$A2,$ba
    321 	fmul	$T2b,$A2,$bb
    322 	stfd	$A2,24($nap_d)		; save a[j+1] in double format
    323 	stfd	$A3,32($nap_d)
    324 	fmul	$T3a,$A3,$ba
    325 	fmul	$T3b,$A3,$bb
    326 	stfd	$N0,40($nap_d)		; save n[j] in double format
    327 	stfd	$N1,48($nap_d)
    328 	fmul	$T0a,$A0,$ba
    329 	fmul	$T0b,$A0,$bb
    330 	stfd	$N2,56($nap_d)		; save n[j+1] in double format
    331 	stfdu	$N3,64($nap_d)
    332 
    333 	fmadd	$T1a,$A0,$bc,$T1a
    334 	fmadd	$T1b,$A0,$bd,$T1b
    335 	fmadd	$T2a,$A1,$bc,$T2a
    336 	fmadd	$T2b,$A1,$bd,$T2b
    337 	fmadd	$T3a,$A2,$bc,$T3a
    338 	fmadd	$T3b,$A2,$bd,$T3b
    339 	fmul	$dota,$A3,$bc
    340 	fmul	$dotb,$A3,$bd
    341 
    342 	fmadd	$T1a,$N1,$na,$T1a
    343 	fmadd	$T1b,$N1,$nb,$T1b
    344 	fmadd	$T2a,$N2,$na,$T2a
    345 	fmadd	$T2b,$N2,$nb,$T2b
    346 	fmadd	$T3a,$N3,$na,$T3a
    347 	fmadd	$T3b,$N3,$nb,$T3b
    348 	fmadd	$T0a,$N0,$na,$T0a
    349 	fmadd	$T0b,$N0,$nb,$T0b
    350 
    351 	fmadd	$T1a,$N0,$nc,$T1a
    352 	fmadd	$T1b,$N0,$nd,$T1b
    353 	fmadd	$T2a,$N1,$nc,$T2a
    354 	fmadd	$T2b,$N1,$nd,$T2b
    355 	fmadd	$T3a,$N2,$nc,$T3a
    356 	fmadd	$T3b,$N2,$nd,$T3b
    357 	fmadd	$dota,$N3,$nc,$dota
    358 	fmadd	$dotb,$N3,$nd,$dotb
    359 
    360 	fctid	$T0a,$T0a
    361 	fctid	$T0b,$T0b
    362 	fctid	$T1a,$T1a
    363 	fctid	$T1b,$T1b
    364 	fctid	$T2a,$T2a
    365 	fctid	$T2b,$T2b
    366 	fctid	$T3a,$T3a
    367 	fctid	$T3b,$T3b
    368 
    369 	stfd	$T0a,`$FRAME+0`($sp)
    370 	stfd	$T0b,`$FRAME+8`($sp)
    371 	stfd	$T1a,`$FRAME+16`($sp)
    372 	stfd	$T1b,`$FRAME+24`($sp)
    373 	stfd	$T2a,`$FRAME+32`($sp)
    374 	stfd	$T2b,`$FRAME+40`($sp)
    375 	stfd	$T3a,`$FRAME+48`($sp)
    376 	stfd	$T3b,`$FRAME+56`($sp)
    377 
    379 .align	5
    380 L1st:
    381 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
    382 	lwz	$t1,0($ap)
    383 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
    384 	lwz	$t3,8($ap)
    385 	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
    386 	lwz	$t5,0($np)
    387 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
    388 	lwz	$t7,8($np)
    389 	std	$t0,`$FRAME+64`($sp)
    390 	std	$t1,`$FRAME+72`($sp)
    391 	std	$t2,`$FRAME+80`($sp)
    392 	std	$t3,`$FRAME+88`($sp)
    393 	std	$t4,`$FRAME+96`($sp)
    394 	std	$t5,`$FRAME+104`($sp)
    395 	std	$t6,`$FRAME+112`($sp)
    396 	std	$t7,`$FRAME+120`($sp)
    397 	ld	$t0,`$FRAME+0`($sp)
    398 	ld	$t1,`$FRAME+8`($sp)
    399 	ld	$t2,`$FRAME+16`($sp)
    400 	ld	$t3,`$FRAME+24`($sp)
    401 	ld	$t4,`$FRAME+32`($sp)
    402 	ld	$t5,`$FRAME+40`($sp)
    403 	ld	$t6,`$FRAME+48`($sp)
    404 	ld	$t7,`$FRAME+56`($sp)
    405 	lfd	$A0,`$FRAME+64`($sp)
    406 	lfd	$A1,`$FRAME+72`($sp)
    407 	lfd	$A2,`$FRAME+80`($sp)
    408 	lfd	$A3,`$FRAME+88`($sp)
    409 	lfd	$N0,`$FRAME+96`($sp)
    410 	lfd	$N1,`$FRAME+104`($sp)
    411 	lfd	$N2,`$FRAME+112`($sp)
    412 	lfd	$N3,`$FRAME+120`($sp)
    413 	fcfid	$A0,$A0
    414 	fcfid	$A1,$A1
    415 	fcfid	$A2,$A2
    416 	fcfid	$A3,$A3
    417 	fcfid	$N0,$N0
    418 	fcfid	$N1,$N1
    419 	fcfid	$N2,$N2
    420 	fcfid	$N3,$N3
    421 	addi	$ap,$ap,16
    422 	addi	$np,$np,16
    423 
    424 	fmul	$T1a,$A1,$ba
    425 	fmul	$T1b,$A1,$bb
    426 	fmul	$T2a,$A2,$ba
    427 	fmul	$T2b,$A2,$bb
    428 	stfd	$A0,8($nap_d)		; save a[j] in double format
    429 	stfd	$A1,16($nap_d)
    430 	fmul	$T3a,$A3,$ba
    431 	fmul	$T3b,$A3,$bb
    432 	fmadd	$T0a,$A0,$ba,$dota
    433 	fmadd	$T0b,$A0,$bb,$dotb
    434 	stfd	$A2,24($nap_d)		; save a[j+1] in double format
    435 	stfd	$A3,32($nap_d)
    436 
    437 	fmadd	$T1a,$A0,$bc,$T1a
    438 	fmadd	$T1b,$A0,$bd,$T1b
    439 	fmadd	$T2a,$A1,$bc,$T2a
    440 	fmadd	$T2b,$A1,$bd,$T2b
    441 	stfd	$N0,40($nap_d)		; save n[j] in double format
    442 	stfd	$N1,48($nap_d)
    443 	fmadd	$T3a,$A2,$bc,$T3a
    444 	fmadd	$T3b,$A2,$bd,$T3b
    445 	 add	$t0,$t0,$carry		; can not overflow
    446 	fmul	$dota,$A3,$bc
    447 	fmul	$dotb,$A3,$bd
    448 	stfd	$N2,56($nap_d)		; save n[j+1] in double format
    449 	stfdu	$N3,64($nap_d)
    450 	 srdi	$carry,$t0,16
    451 	 add	$t1,$t1,$carry
    452 	 srdi	$carry,$t1,16
    453 
    454 	fmadd	$T1a,$N1,$na,$T1a
    455 	fmadd	$T1b,$N1,$nb,$T1b
    456 	 insrdi	$t0,$t1,16,32
    457 	fmadd	$T2a,$N2,$na,$T2a
    458 	fmadd	$T2b,$N2,$nb,$T2b
    459 	 add	$t2,$t2,$carry
    460 	fmadd	$T3a,$N3,$na,$T3a
    461 	fmadd	$T3b,$N3,$nb,$T3b
    462 	 srdi	$carry,$t2,16
    463 	fmadd	$T0a,$N0,$na,$T0a
    464 	fmadd	$T0b,$N0,$nb,$T0b
    465 	 insrdi	$t0,$t2,16,16
    466 	 add	$t3,$t3,$carry
    467 	 srdi	$carry,$t3,16
    468 
    469 	fmadd	$T1a,$N0,$nc,$T1a
    470 	fmadd	$T1b,$N0,$nd,$T1b
    471 	 insrdi	$t0,$t3,16,0		; 0..63 bits
    472 	fmadd	$T2a,$N1,$nc,$T2a
    473 	fmadd	$T2b,$N1,$nd,$T2b
    474 	 add	$t4,$t4,$carry
    475 	fmadd	$T3a,$N2,$nc,$T3a
    476 	fmadd	$T3b,$N2,$nd,$T3b
    477 	 srdi	$carry,$t4,16
    478 	fmadd	$dota,$N3,$nc,$dota
    479 	fmadd	$dotb,$N3,$nd,$dotb
    480 	 add	$t5,$t5,$carry
    481 	 srdi	$carry,$t5,16
    482 	 insrdi	$t4,$t5,16,32
    483 
    484 	fctid	$T0a,$T0a
    485 	fctid	$T0b,$T0b
    486 	 add	$t6,$t6,$carry
    487 	fctid	$T1a,$T1a
    488 	fctid	$T1b,$T1b
    489 	 srdi	$carry,$t6,16
    490 	fctid	$T2a,$T2a
    491 	fctid	$T2b,$T2b
    492 	 insrdi	$t4,$t6,16,16
    493 	fctid	$T3a,$T3a
    494 	fctid	$T3b,$T3b
    495 	 add	$t7,$t7,$carry
    496 	 insrdi	$t4,$t7,16,0		; 64..127 bits
    497 	 srdi	$carry,$t7,16		; upper 33 bits
    498 
    499 	stfd	$T0a,`$FRAME+0`($sp)
    500 	stfd	$T0b,`$FRAME+8`($sp)
    501 	stfd	$T1a,`$FRAME+16`($sp)
    502 	stfd	$T1b,`$FRAME+24`($sp)
    503 	stfd	$T2a,`$FRAME+32`($sp)
    504 	stfd	$T2b,`$FRAME+40`($sp)
    505 	stfd	$T3a,`$FRAME+48`($sp)
    506 	stfd	$T3b,`$FRAME+56`($sp)
    507 	 std	$t0,8($tp)		; tp[j-1]
    508 	 stdu	$t4,16($tp)		; tp[j]
    509 	bdnz-	L1st
    510 
    512 	fctid	$dota,$dota
    513 	fctid	$dotb,$dotb
    514 
    515 	ld	$t0,`$FRAME+0`($sp)
    516 	ld	$t1,`$FRAME+8`($sp)
    517 	ld	$t2,`$FRAME+16`($sp)
    518 	ld	$t3,`$FRAME+24`($sp)
    519 	ld	$t4,`$FRAME+32`($sp)
    520 	ld	$t5,`$FRAME+40`($sp)
    521 	ld	$t6,`$FRAME+48`($sp)
    522 	ld	$t7,`$FRAME+56`($sp)
    523 	stfd	$dota,`$FRAME+64`($sp)
    524 	stfd	$dotb,`$FRAME+72`($sp)
    525 
    526 	add	$t0,$t0,$carry		; can not overflow
    527 	srdi	$carry,$t0,16
    528 	add	$t1,$t1,$carry
    529 	srdi	$carry,$t1,16
    530 	insrdi	$t0,$t1,16,32
    531 	add	$t2,$t2,$carry
    532 	srdi	$carry,$t2,16
    533 	insrdi	$t0,$t2,16,16
    534 	add	$t3,$t3,$carry
    535 	srdi	$carry,$t3,16
    536 	insrdi	$t0,$t3,16,0		; 0..63 bits
    537 	add	$t4,$t4,$carry
    538 	srdi	$carry,$t4,16
    539 	add	$t5,$t5,$carry
    540 	srdi	$carry,$t5,16
    541 	insrdi	$t4,$t5,16,32
    542 	add	$t6,$t6,$carry
    543 	srdi	$carry,$t6,16
    544 	insrdi	$t4,$t6,16,16
    545 	add	$t7,$t7,$carry
    546 	insrdi	$t4,$t7,16,0		; 64..127 bits
    547 	srdi	$carry,$t7,16		; upper 33 bits
    548 	ld	$t6,`$FRAME+64`($sp)
    549 	ld	$t7,`$FRAME+72`($sp)
    550 
    551 	std	$t0,8($tp)		; tp[j-1]
    552 	stdu	$t4,16($tp)		; tp[j]
    553 
    554 	add	$t6,$t6,$carry		; can not overflow
    555 	srdi	$carry,$t6,16
    556 	add	$t7,$t7,$carry
    557 	insrdi	$t6,$t7,48,0
    558 	srdi	$ovf,$t7,48
    559 	std	$t6,8($tp)		; tp[num-1]
    560 
    561 	slwi	$t7,$num,2
    562 	subf	$nap_d,$t7,$nap_d	; rewind pointer
    563 
    565 	li	$i,8			; i=1
    566 .align	5
    567 Louter:
    568 	ldx	$t3,$bp,$i	; bp[i]
    569 	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
    570 	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
    571 
    572 	addi	$tp,$sp,`$FRAME+$TRANSFER`
    573 	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
    574 	li	$carry,0
    575 	mulld	$t7,$t7,$n0	; tp[0]*n0
    576 	mtctr	$j
    577 
    578 	; transfer bp[i] to FPU as 4x16-bit values
    579 	extrdi	$t0,$t3,16,48
    580 	extrdi	$t1,$t3,16,32
    581 	extrdi	$t2,$t3,16,16
    582 	extrdi	$t3,$t3,16,0
    583 	std	$t0,`$FRAME+0`($sp)
    584 	std	$t1,`$FRAME+8`($sp)
    585 	std	$t2,`$FRAME+16`($sp)
    586 	std	$t3,`$FRAME+24`($sp)
    587 	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
    588 	extrdi	$t4,$t7,16,48
    589 	extrdi	$t5,$t7,16,32
    590 	extrdi	$t6,$t7,16,16
    591 	extrdi	$t7,$t7,16,0
    592 	std	$t4,`$FRAME+32`($sp)
    593 	std	$t5,`$FRAME+40`($sp)
    594 	std	$t6,`$FRAME+48`($sp)
    595 	std	$t7,`$FRAME+56`($sp)
    596 
    597 	lfd	$A0,8($nap_d)		; load a[j] in double format
    598 	lfd	$A1,16($nap_d)
    599 	lfd	$A2,24($nap_d)		; load a[j+1] in double format
    600 	lfd	$A3,32($nap_d)
    601 	lfd	$N0,40($nap_d)		; load n[j] in double format
    602 	lfd	$N1,48($nap_d)
    603 	lfd	$N2,56($nap_d)		; load n[j+1] in double format
    604 	lfdu	$N3,64($nap_d)
    605 
    606 	lfd	$ba,`$FRAME+0`($sp)
    607 	lfd	$bb,`$FRAME+8`($sp)
    608 	lfd	$bc,`$FRAME+16`($sp)
    609 	lfd	$bd,`$FRAME+24`($sp)
    610 	lfd	$na,`$FRAME+32`($sp)
    611 	lfd	$nb,`$FRAME+40`($sp)
    612 	lfd	$nc,`$FRAME+48`($sp)
    613 	lfd	$nd,`$FRAME+56`($sp)
    614 
    615 	fcfid	$ba,$ba
    616 	fcfid	$bb,$bb
    617 	fcfid	$bc,$bc
    618 	fcfid	$bd,$bd
    619 	fcfid	$na,$na
    620 	fcfid	$nb,$nb
    621 	fcfid	$nc,$nc
    622 	fcfid	$nd,$nd
    623 
    624 	fmul	$T1a,$A1,$ba
    625 	fmul	$T1b,$A1,$bb
    626 	fmul	$T2a,$A2,$ba
    627 	fmul	$T2b,$A2,$bb
    628 	fmul	$T3a,$A3,$ba
    629 	fmul	$T3b,$A3,$bb
    630 	fmul	$T0a,$A0,$ba
    631 	fmul	$T0b,$A0,$bb
    632 
    633 	fmadd	$T1a,$A0,$bc,$T1a
    634 	fmadd	$T1b,$A0,$bd,$T1b
    635 	fmadd	$T2a,$A1,$bc,$T2a
    636 	fmadd	$T2b,$A1,$bd,$T2b
    637 	fmadd	$T3a,$A2,$bc,$T3a
    638 	fmadd	$T3b,$A2,$bd,$T3b
    639 	fmul	$dota,$A3,$bc
    640 	fmul	$dotb,$A3,$bd
    641 
    642 	fmadd	$T1a,$N1,$na,$T1a
    643 	fmadd	$T1b,$N1,$nb,$T1b
    644 	 lfd	$A0,8($nap_d)		; load a[j] in double format
    645 	 lfd	$A1,16($nap_d)
    646 	fmadd	$T2a,$N2,$na,$T2a
    647 	fmadd	$T2b,$N2,$nb,$T2b
    648 	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
    649 	 lfd	$A3,32($nap_d)
    650 	fmadd	$T3a,$N3,$na,$T3a
    651 	fmadd	$T3b,$N3,$nb,$T3b
    652 	fmadd	$T0a,$N0,$na,$T0a
    653 	fmadd	$T0b,$N0,$nb,$T0b
    654 
    655 	fmadd	$T1a,$N0,$nc,$T1a
    656 	fmadd	$T1b,$N0,$nd,$T1b
    657 	fmadd	$T2a,$N1,$nc,$T2a
    658 	fmadd	$T2b,$N1,$nd,$T2b
    659 	fmadd	$T3a,$N2,$nc,$T3a
    660 	fmadd	$T3b,$N2,$nd,$T3b
    661 	fmadd	$dota,$N3,$nc,$dota
    662 	fmadd	$dotb,$N3,$nd,$dotb
    663 
    664 	fctid	$T0a,$T0a
    665 	fctid	$T0b,$T0b
    666 	fctid	$T1a,$T1a
    667 	fctid	$T1b,$T1b
    668 	fctid	$T2a,$T2a
    669 	fctid	$T2b,$T2b
    670 	fctid	$T3a,$T3a
    671 	fctid	$T3b,$T3b
    672 
    673 	stfd	$T0a,`$FRAME+0`($sp)
    674 	stfd	$T0b,`$FRAME+8`($sp)
    675 	stfd	$T1a,`$FRAME+16`($sp)
    676 	stfd	$T1b,`$FRAME+24`($sp)
    677 	stfd	$T2a,`$FRAME+32`($sp)
    678 	stfd	$T2b,`$FRAME+40`($sp)
    679 	stfd	$T3a,`$FRAME+48`($sp)
    680 	stfd	$T3b,`$FRAME+56`($sp)
    681 
    683 .align	5
    684 Linner:
    685 	fmul	$T1a,$A1,$ba
    686 	fmul	$T1b,$A1,$bb
    687 	fmul	$T2a,$A2,$ba
    688 	fmul	$T2b,$A2,$bb
    689 	lfd	$N0,40($nap_d)		; load n[j] in double format
    690 	lfd	$N1,48($nap_d)
    691 	fmul	$T3a,$A3,$ba
    692 	fmul	$T3b,$A3,$bb
    693 	fmadd	$T0a,$A0,$ba,$dota
    694 	fmadd	$T0b,$A0,$bb,$dotb
    695 	lfd	$N2,56($nap_d)		; load n[j+1] in double format
    696 	lfdu	$N3,64($nap_d)
    697 
    698 	fmadd	$T1a,$A0,$bc,$T1a
    699 	fmadd	$T1b,$A0,$bd,$T1b
    700 	fmadd	$T2a,$A1,$bc,$T2a
    701 	fmadd	$T2b,$A1,$bd,$T2b
    702 	 lfd	$A0,8($nap_d)		; load a[j] in double format
    703 	 lfd	$A1,16($nap_d)
    704 	fmadd	$T3a,$A2,$bc,$T3a
    705 	fmadd	$T3b,$A2,$bd,$T3b
    706 	fmul	$dota,$A3,$bc
    707 	fmul	$dotb,$A3,$bd
    708 	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
    709 	 lfd	$A3,32($nap_d)
    710 
    711 	fmadd	$T1a,$N1,$na,$T1a
    712 	fmadd	$T1b,$N1,$nb,$T1b
    713 	 ld	$t0,`$FRAME+0`($sp)
    714 	 ld	$t1,`$FRAME+8`($sp)
    715 	fmadd	$T2a,$N2,$na,$T2a
    716 	fmadd	$T2b,$N2,$nb,$T2b
    717 	 ld	$t2,`$FRAME+16`($sp)
    718 	 ld	$t3,`$FRAME+24`($sp)
    719 	fmadd	$T3a,$N3,$na,$T3a
    720 	fmadd	$T3b,$N3,$nb,$T3b
    721 	 add	$t0,$t0,$carry		; can not overflow
    722 	 ld	$t4,`$FRAME+32`($sp)
    723 	 ld	$t5,`$FRAME+40`($sp)
    724 	fmadd	$T0a,$N0,$na,$T0a
    725 	fmadd	$T0b,$N0,$nb,$T0b
    726 	 srdi	$carry,$t0,16
    727 	 add	$t1,$t1,$carry
    728 	 srdi	$carry,$t1,16
    729 	 ld	$t6,`$FRAME+48`($sp)
    730 	 ld	$t7,`$FRAME+56`($sp)
    731 
    732 	fmadd	$T1a,$N0,$nc,$T1a
    733 	fmadd	$T1b,$N0,$nd,$T1b
    734 	 insrdi	$t0,$t1,16,32
    735 	 ld	$t1,8($tp)		; tp[j]
    736 	fmadd	$T2a,$N1,$nc,$T2a
    737 	fmadd	$T2b,$N1,$nd,$T2b
    738 	 add	$t2,$t2,$carry
    739 	fmadd	$T3a,$N2,$nc,$T3a
    740 	fmadd	$T3b,$N2,$nd,$T3b
    741 	 srdi	$carry,$t2,16
    742 	 insrdi	$t0,$t2,16,16
    743 	fmadd	$dota,$N3,$nc,$dota
    744 	fmadd	$dotb,$N3,$nd,$dotb
    745 	 add	$t3,$t3,$carry
    746 	 ldu	$t2,16($tp)		; tp[j+1]
    747 	 srdi	$carry,$t3,16
    748 	 insrdi	$t0,$t3,16,0		; 0..63 bits
    749 	 add	$t4,$t4,$carry
    750 
    751 	fctid	$T0a,$T0a
    752 	fctid	$T0b,$T0b
    753 	 srdi	$carry,$t4,16
    754 	fctid	$T1a,$T1a
    755 	fctid	$T1b,$T1b
    756 	 add	$t5,$t5,$carry
    757 	fctid	$T2a,$T2a
    758 	fctid	$T2b,$T2b
    759 	 srdi	$carry,$t5,16
    760 	 insrdi	$t4,$t5,16,32
    761 	fctid	$T3a,$T3a
    762 	fctid	$T3b,$T3b
    763 	 add	$t6,$t6,$carry
    764 	 srdi	$carry,$t6,16
    765 	 insrdi	$t4,$t6,16,16
    766 
    767 	stfd	$T0a,`$FRAME+0`($sp)
    768 	stfd	$T0b,`$FRAME+8`($sp)
    769 	 add	$t7,$t7,$carry
    770 	 addc	$t3,$t0,$t1
    771 	stfd	$T1a,`$FRAME+16`($sp)
    772 	stfd	$T1b,`$FRAME+24`($sp)
    773 	 insrdi	$t4,$t7,16,0		; 64..127 bits
    774 	 srdi	$carry,$t7,16		; upper 33 bits
    775 	stfd	$T2a,`$FRAME+32`($sp)
    776 	stfd	$T2b,`$FRAME+40`($sp)
    777 	 adde	$t5,$t4,$t2
    778 	stfd	$T3a,`$FRAME+48`($sp)
    779 	stfd	$T3b,`$FRAME+56`($sp)
    780 	 addze	$carry,$carry
    781 	 std	$t3,-16($tp)		; tp[j-1]
    782 	 std	$t5,-8($tp)		; tp[j]
    783 	bdnz-	Linner
    784 
    786 	fctid	$dota,$dota
    787 	fctid	$dotb,$dotb
    788 	ld	$t0,`$FRAME+0`($sp)
    789 	ld	$t1,`$FRAME+8`($sp)
    790 	ld	$t2,`$FRAME+16`($sp)
    791 	ld	$t3,`$FRAME+24`($sp)
    792 	ld	$t4,`$FRAME+32`($sp)
    793 	ld	$t5,`$FRAME+40`($sp)
    794 	ld	$t6,`$FRAME+48`($sp)
    795 	ld	$t7,`$FRAME+56`($sp)
    796 	stfd	$dota,`$FRAME+64`($sp)
    797 	stfd	$dotb,`$FRAME+72`($sp)
    798 
    799 	add	$t0,$t0,$carry		; can not overflow
    800 	srdi	$carry,$t0,16
    801 	add	$t1,$t1,$carry
    802 	srdi	$carry,$t1,16
    803 	insrdi	$t0,$t1,16,32
    804 	add	$t2,$t2,$carry
    805 	ld	$t1,8($tp)		; tp[j]
    806 	srdi	$carry,$t2,16
    807 	insrdi	$t0,$t2,16,16
    808 	add	$t3,$t3,$carry
    809 	ldu	$t2,16($tp)		; tp[j+1]
    810 	srdi	$carry,$t3,16
    811 	insrdi	$t0,$t3,16,0		; 0..63 bits
    812 	add	$t4,$t4,$carry
    813 	srdi	$carry,$t4,16
    814 	add	$t5,$t5,$carry
    815 	srdi	$carry,$t5,16
    816 	insrdi	$t4,$t5,16,32
    817 	add	$t6,$t6,$carry
    818 	srdi	$carry,$t6,16
    819 	insrdi	$t4,$t6,16,16
    820 	add	$t7,$t7,$carry
    821 	insrdi	$t4,$t7,16,0		; 64..127 bits
    822 	srdi	$carry,$t7,16		; upper 33 bits
    823 	ld	$t6,`$FRAME+64`($sp)
    824 	ld	$t7,`$FRAME+72`($sp)
    825 
    826 	addc	$t3,$t0,$t1
    827 	adde	$t5,$t4,$t2
    828 	addze	$carry,$carry
    829 
    830 	std	$t3,-16($tp)		; tp[j-1]
    831 	std	$t5,-8($tp)		; tp[j]
    832 
    833 	add	$carry,$carry,$ovf	; comsume upmost overflow
    834 	add	$t6,$t6,$carry		; can not overflow
    835 	srdi	$carry,$t6,16
    836 	add	$t7,$t7,$carry
    837 	insrdi	$t6,$t7,48,0
    838 	srdi	$ovf,$t7,48
    839 	std	$t6,0($tp)		; tp[num-1]
    840 
    841 	slwi	$t7,$num,2
    842 	addi	$i,$i,8
    843 	subf	$nap_d,$t7,$nap_d	; rewind pointer
    844 	cmpw	$i,$num
    845 	blt-	Louter
    846 
    848 	subf	$np,$num,$np	; rewind np
    849 	addi	$j,$j,1		; restore counter
    850 	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
    851 	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
    852 	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
    853 	addi	$t5,$np,8
    854 	addi	$t6,$rp,8
    855 	mtctr	$j
    856 
    857 .align	4
    858 Lsub:	ldx	$t0,$tp,$i
    859 	ldx	$t1,$np,$i
    860 	ldx	$t2,$t4,$i
    861 	ldx	$t3,$t5,$i
    862 	subfe	$t0,$t1,$t0	; tp[j]-np[j]
    863 	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
    864 	stdx	$t0,$rp,$i
    865 	stdx	$t2,$t6,$i
    866 	addi	$i,$i,16
    867 	bdnz-	Lsub
    868 
    869 	li	$i,0
    870 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
    871 	and	$ap,$tp,$ovf
    872 	andc	$np,$rp,$ovf
    873 	or	$ap,$ap,$np	; ap=borrow?tp:rp
    874 	addi	$t7,$ap,8
    875 	mtctr	$j
    876 
    877 .align	4
    878 Lcopy:				; copy or in-place refresh
    879 	ldx	$t0,$ap,$i
    880 	ldx	$t1,$t7,$i
    881 	std	$i,8($nap_d)	; zap nap_d
    882 	std	$i,16($nap_d)
    883 	std	$i,24($nap_d)
    884 	std	$i,32($nap_d)
    885 	std	$i,40($nap_d)
    886 	std	$i,48($nap_d)
    887 	std	$i,56($nap_d)
    888 	stdu	$i,64($nap_d)
    889 	stdx	$t0,$rp,$i
    890 	stdx	$t1,$t6,$i
    891 	stdx	$i,$tp,$i	; zap tp at once
    892 	stdx	$i,$t4,$i
    893 	addi	$i,$i,16
    894 	bdnz-	Lcopy
    895 
    897 	$POP	r14,`2*$SIZE_T`($sp)
    898 	$POP	r15,`3*$SIZE_T`($sp)
    899 	$POP	r16,`4*$SIZE_T`($sp)
    900 	$POP	r17,`5*$SIZE_T`($sp)
    901 	$POP	r18,`6*$SIZE_T`($sp)
    902 	$POP	r19,`7*$SIZE_T`($sp)
    903 	$POP	r20,`8*$SIZE_T`($sp)
    904 	$POP	r21,`9*$SIZE_T`($sp)
    905 	$POP	r22,`10*$SIZE_T`($sp)
    906 	$POP	r23,`11*$SIZE_T`($sp)
    907 	lfd	f14,`12*$SIZE_T+0`($sp)
    908 	lfd	f15,`12*$SIZE_T+8`($sp)
    909 	lfd	f16,`12*$SIZE_T+16`($sp)
    910 	lfd	f17,`12*$SIZE_T+24`($sp)
    911 	lfd	f18,`12*$SIZE_T+32`($sp)
    912 	lfd	f19,`12*$SIZE_T+40`($sp)
    913 	lfd	f20,`12*$SIZE_T+48`($sp)
    914 	lfd	f21,`12*$SIZE_T+56`($sp)
    915 	lfd	f22,`12*$SIZE_T+64`($sp)
    916 	lfd	f23,`12*$SIZE_T+72`($sp)
    917 	lfd	f24,`12*$SIZE_T+80`($sp)
    918 	lfd	f25,`12*$SIZE_T+88`($sp)
    919 	$POP	$sp,0($sp)
    920 	li	r3,1	; signal "handled"
    921 	blr
    922 	.long	0
    923 .asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
    924 ___
    925 
    926 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    927 print $code;
    928 close STDOUT;
    929