Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # December 2007
     11 
     12 # The reason for undertaken effort is basically following. Even though
     13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
     14 # performance was observed to be less than impressive, essentially as
     15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
     16 # Well, it's not surprising that IBM had to make some sacrifices to
     17 # boost the clock frequency that much, but no overall improvement?
     18 # Having observed how much difference did switching to FPU make on
     19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate...
     20 # Unfortunately the resulting performance improvement is not as
     21 # impressive, ~30%, and in absolute terms is still very far from what
     22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
     23 # something wrong, but in the lack of assembler level micro-profiling
     24 # data or at least decent platform guide I can't tell... Or better
     25 # results might be achieved with VMX... Anyway, this module provides
     26 # *worse* performance on other PowerPC implementations, ~40-15% slower
     27 # on PPC970 depending on key length and ~40% slower on Power 5 for all
     28 # key lengths. As it's obviously inappropriate as "best all-round"
     29 # alternative, it has to be complemented with run-time CPU family
     30 # detection. Oh! It should also be noted that unlike other PowerPC
     31 # implementation IALU ppc-mont.pl module performs *suboptimaly* on
     32 # >=1024-bit key lengths on Power 6. It should also be noted that
     33 # *everything* said so far applies to 64-bit builds! As far as 32-bit
     34 # application executed on 64-bit CPU goes, this module is likely to
     35 # become preferred choice, because it's easy to adapt it for such
     36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
     37 
     38 # February 2008
     39 
     40 # Micro-profiling assisted optimization results in ~15% improvement
     41 # over original ppc64-mont.pl version, or overall ~50% improvement
     42 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
     43 # Power 6 CPU, this module is 5-150% faster depending on key length,
     44 # [hereafter] more for longer keys. But if compared to ppc-mont.pl
     45 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
     46 # in absolute terms, but it's apparently the way Power 6 is...
     47 
     48 # December 2009
     49 
     50 # Adapted for 32-bit build this module delivers 25-120%, yes, more
     51 # than *twice* for longer keys, performance improvement over 32-bit
     52 # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
     53 # even 64-bit integer operations and the trouble is that most PPC
     54 # operating systems don't preserve upper halves of general purpose
     55 # registers upon 32-bit signal delivery. They do preserve them upon
     56 # context switch, but not signalling:-( This means that asynchronous
     57 # signals have to be blocked upon entry to this subroutine. Signal
     58 # masking (and of course complementary unmasking) has quite an impact
     59 # on performance, naturally larger for shorter keys. It's so severe
     60 # that 512-bit key performance can be as low as 1/3 of expected one.
     61 # This is why this routine can be engaged for longer key operations
     62 # only on these OSes, see crypto/ppccap.c for further details. MacOS X
     63 # is an exception from this and doesn't require signal masking, and
     64 # that's where above improvement coefficients were collected. For
     65 # others alternative would be to break dependence on upper halves of
     66 # GPRs by sticking to 32-bit integer operations...
     67 
     68 $flavour = shift;
     69 
     70 if ($flavour =~ /32/) {
     71 	$SIZE_T=4;
     72 	$RZONE=	224;
     73 	$fname=	"bn_mul_mont_fpu64";
     74 
     75 	$STUX=	"stwux";	# store indexed and update
     76 	$PUSH=	"stw";
     77 	$POP=	"lwz";
     78 } elsif ($flavour =~ /64/) {
     79 	$SIZE_T=8;
     80 	$RZONE=	288;
     81 	$fname=	"bn_mul_mont_fpu64";
     82 
     83 	# same as above, but 64-bit mnemonics...
     84 	$STUX=	"stdux";	# store indexed and update
     85 	$PUSH=	"std";
     86 	$POP=	"ld";
     87 } else { die "nonsense $flavour"; }
     88 
     89 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     90 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
     91 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
     92 die "can't locate ppc-xlate.pl";
     93 
     94 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
     95 
     96 $FRAME=64;	# padded frame header
     97 $TRANSFER=16*8;
     98 
     99 $carry="r0";
    100 $sp="r1";
    101 $toc="r2";
    102 $rp="r3";	$ovf="r3";
    103 $ap="r4";
    104 $bp="r5";
    105 $np="r6";
    106 $n0="r7";
    107 $num="r8";
    108 $rp="r9";	# $rp is reassigned
    109 $tp="r10";
    110 $j="r11";
    111 $i="r12";
    112 # non-volatile registers
    113 $nap_d="r22";	# interleaved ap and np in double format
    114 $a0="r23";	# ap[0]
    115 $t0="r24";	# temporary registers
    116 $t1="r25";
    117 $t2="r26";
    118 $t3="r27";
    119 $t4="r28";
    120 $t5="r29";
    121 $t6="r30";
    122 $t7="r31";
    123 
    124 # PPC offers enough register bank capacity to unroll inner loops twice
    125 #
    126 #     ..A3A2A1A0
    127 #           dcba
    128 #    -----------
    129 #            A0a
    130 #           A0b
    131 #          A0c
    132 #         A0d
    133 #          A1a
    134 #         A1b
    135 #        A1c
    136 #       A1d
    137 #        A2a
    138 #       A2b
    139 #      A2c
    140 #     A2d
    141 #      A3a
    142 #     A3b
    143 #    A3c
    144 #   A3d
    145 #    ..a
    146 #   ..b
    147 #
    148 $ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
    149 $na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
    150 $dota="f8";	$dotb="f9";
    151 $A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
    152 $N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
    153 $T0a="f24";	$T0b="f25";
    154 $T1a="f26";	$T1b="f27";
    155 $T2a="f28";	$T2b="f29";
    156 $T3a="f30";	$T3b="f31";
    157 
    159 # sp----------->+-------------------------------+
    160 #		| saved sp			|
    161 #		+-------------------------------+
    162 #		.				.
    163 #   +64		+-------------------------------+
    164 #		| 16 gpr<->fpr transfer zone	|
    165 #		.				.
    166 #		.				.
    167 #   +16*8	+-------------------------------+
    168 #		| __int64 tmp[-1]		|
    169 #		+-------------------------------+
    170 #		| __int64 tmp[num]		|
    171 #		.				.
    172 #		.				.
    173 #		.				.
    174 #   +(num+1)*8	+-------------------------------+
    175 #		| padding to 64 byte boundary	|
    176 #		.				.
    177 #   +X		+-------------------------------+
    178 #		| double nap_d[4*num]		|
    179 #		.				.
    180 #		.				.
    181 #		.				.
    182 #		+-------------------------------+
    183 #		.				.
    184 #   -12*size_t	+-------------------------------+
    185 #		| 10 saved gpr, r22-r31		|
    186 #		.				.
    187 #		.				.
    188 #   -12*8	+-------------------------------+
    189 #		| 12 saved fpr, f20-f31		|
    190 #		.				.
    191 #		.				.
    192 #		+-------------------------------+
    193 
    195 $code=<<___;
    196 .machine "any"
    197 .text
    198 
    199 .globl	.$fname
    200 .align	5
    201 .$fname:
    202 	cmpwi	$num,`3*8/$SIZE_T`
    203 	mr	$rp,r3		; $rp is reassigned
    204 	li	r3,0		; possible "not handled" return code
    205 	bltlr-
    206 	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
    207 	bnelr-
    208 
    209 	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
    210 	li	$i,-4096
    211 	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
    212 	add	$tp,$tp,$num	; place for tp[num+1]
    213 	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
    214 	subf	$tp,$tp,$sp	; $sp-$tp
    215 	and	$tp,$tp,$i	; minimize TLB usage
    216 	subf	$tp,$sp,$tp	; $tp-$sp
    217 	mr	$i,$sp
    218 	$STUX	$sp,$sp,$tp	; alloca
    219 
    220 	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
    221 	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
    222 	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
    223 	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
    224 	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
    225 	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
    226 	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
    227 	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
    228 	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
    229 	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
    230 	stfd	f20,`-12*8`($i)
    231 	stfd	f21,`-11*8`($i)
    232 	stfd	f22,`-10*8`($i)
    233 	stfd	f23,`-9*8`($i)
    234 	stfd	f24,`-8*8`($i)
    235 	stfd	f25,`-7*8`($i)
    236 	stfd	f26,`-6*8`($i)
    237 	stfd	f27,`-5*8`($i)
    238 	stfd	f28,`-4*8`($i)
    239 	stfd	f29,`-3*8`($i)
    240 	stfd	f30,`-2*8`($i)
    241 	stfd	f31,`-1*8`($i)
    242 ___
    243 $code.=<<___ if ($SIZE_T==8);
    244 	ld	$a0,0($ap)	; pull ap[0] value
    245 	ld	$n0,0($n0)	; pull n0[0] value
    246 	ld	$t3,0($bp)	; bp[0]
    247 ___
    248 $code.=<<___ if ($SIZE_T==4);
    249 	mr	$t1,$n0
    250 	lwz	$a0,0($ap)	; pull ap[0,1] value
    251 	lwz	$t0,4($ap)
    252 	lwz	$n0,0($t1)	; pull n0[0,1] value
    253 	lwz	$t1,4($t1)
    254 	lwz	$t3,0($bp)	; bp[0,1]
    255 	lwz	$t2,4($bp)
    256 	insrdi	$a0,$t0,32,0
    257 	insrdi	$n0,$t1,32,0
    258 	insrdi	$t3,$t2,32,0
    259 ___
    260 $code.=<<___;
    261 	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
    262 	li	$i,-64
    263 	add	$nap_d,$tp,$num
    264 	and	$nap_d,$nap_d,$i	; align to 64 bytes
    265 
    267 	mulld	$t7,$a0,$t3	; ap[0]*bp[0]
    268 	; nap_d is off by 1, because it's used with stfdu/lfdu
    269 	addi	$nap_d,$nap_d,-8
    270 	srwi	$j,$num,`3+1`	; counter register, num/2
    271 	mulld	$t7,$t7,$n0	; tp[0]*n0
    272 	addi	$j,$j,-1
    273 	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
    274 	li	$carry,0
    275 	mtctr	$j
    276 
    277 	; transfer bp[0] to FPU as 4x16-bit values
    278 	extrdi	$t0,$t3,16,48
    279 	extrdi	$t1,$t3,16,32
    280 	extrdi	$t2,$t3,16,16
    281 	extrdi	$t3,$t3,16,0
    282 	std	$t0,`$FRAME+0`($sp)
    283 	std	$t1,`$FRAME+8`($sp)
    284 	std	$t2,`$FRAME+16`($sp)
    285 	std	$t3,`$FRAME+24`($sp)
    286 	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
    287 	extrdi	$t4,$t7,16,48
    288 	extrdi	$t5,$t7,16,32
    289 	extrdi	$t6,$t7,16,16
    290 	extrdi	$t7,$t7,16,0
    291 	std	$t4,`$FRAME+32`($sp)
    292 	std	$t5,`$FRAME+40`($sp)
    293 	std	$t6,`$FRAME+48`($sp)
    294 	std	$t7,`$FRAME+56`($sp)
    295 ___
    296 $code.=<<___ if ($SIZE_T==8);
    297 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
    298 	lwz	$t1,0($ap)
    299 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
    300 	lwz	$t3,8($ap)
    301 	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
    302 	lwz	$t5,0($np)
    303 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
    304 	lwz	$t7,8($np)
    305 ___
    306 $code.=<<___ if ($SIZE_T==4);
    307 	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
    308 	lwz	$t1,4($ap)
    309 	lwz	$t2,8($ap)
    310 	lwz	$t3,12($ap)
    311 	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
    312 	lwz	$t5,4($np)
    313 	lwz	$t6,8($np)
    314 	lwz	$t7,12($np)
    315 ___
    316 $code.=<<___;
    317 	lfd	$ba,`$FRAME+0`($sp)
    318 	lfd	$bb,`$FRAME+8`($sp)
    319 	lfd	$bc,`$FRAME+16`($sp)
    320 	lfd	$bd,`$FRAME+24`($sp)
    321 	lfd	$na,`$FRAME+32`($sp)
    322 	lfd	$nb,`$FRAME+40`($sp)
    323 	lfd	$nc,`$FRAME+48`($sp)
    324 	lfd	$nd,`$FRAME+56`($sp)
    325 	std	$t0,`$FRAME+64`($sp)
    326 	std	$t1,`$FRAME+72`($sp)
    327 	std	$t2,`$FRAME+80`($sp)
    328 	std	$t3,`$FRAME+88`($sp)
    329 	std	$t4,`$FRAME+96`($sp)
    330 	std	$t5,`$FRAME+104`($sp)
    331 	std	$t6,`$FRAME+112`($sp)
    332 	std	$t7,`$FRAME+120`($sp)
    333 	fcfid	$ba,$ba
    334 	fcfid	$bb,$bb
    335 	fcfid	$bc,$bc
    336 	fcfid	$bd,$bd
    337 	fcfid	$na,$na
    338 	fcfid	$nb,$nb
    339 	fcfid	$nc,$nc
    340 	fcfid	$nd,$nd
    341 
    342 	lfd	$A0,`$FRAME+64`($sp)
    343 	lfd	$A1,`$FRAME+72`($sp)
    344 	lfd	$A2,`$FRAME+80`($sp)
    345 	lfd	$A3,`$FRAME+88`($sp)
    346 	lfd	$N0,`$FRAME+96`($sp)
    347 	lfd	$N1,`$FRAME+104`($sp)
    348 	lfd	$N2,`$FRAME+112`($sp)
    349 	lfd	$N3,`$FRAME+120`($sp)
    350 	fcfid	$A0,$A0
    351 	fcfid	$A1,$A1
    352 	fcfid	$A2,$A2
    353 	fcfid	$A3,$A3
    354 	fcfid	$N0,$N0
    355 	fcfid	$N1,$N1
    356 	fcfid	$N2,$N2
    357 	fcfid	$N3,$N3
    358 	addi	$ap,$ap,16
    359 	addi	$np,$np,16
    360 
    361 	fmul	$T1a,$A1,$ba
    362 	fmul	$T1b,$A1,$bb
    363 	stfd	$A0,8($nap_d)		; save a[j] in double format
    364 	stfd	$A1,16($nap_d)
    365 	fmul	$T2a,$A2,$ba
    366 	fmul	$T2b,$A2,$bb
    367 	stfd	$A2,24($nap_d)		; save a[j+1] in double format
    368 	stfd	$A3,32($nap_d)
    369 	fmul	$T3a,$A3,$ba
    370 	fmul	$T3b,$A3,$bb
    371 	stfd	$N0,40($nap_d)		; save n[j] in double format
    372 	stfd	$N1,48($nap_d)
    373 	fmul	$T0a,$A0,$ba
    374 	fmul	$T0b,$A0,$bb
    375 	stfd	$N2,56($nap_d)		; save n[j+1] in double format
    376 	stfdu	$N3,64($nap_d)
    377 
    378 	fmadd	$T1a,$A0,$bc,$T1a
    379 	fmadd	$T1b,$A0,$bd,$T1b
    380 	fmadd	$T2a,$A1,$bc,$T2a
    381 	fmadd	$T2b,$A1,$bd,$T2b
    382 	fmadd	$T3a,$A2,$bc,$T3a
    383 	fmadd	$T3b,$A2,$bd,$T3b
    384 	fmul	$dota,$A3,$bc
    385 	fmul	$dotb,$A3,$bd
    386 
    387 	fmadd	$T1a,$N1,$na,$T1a
    388 	fmadd	$T1b,$N1,$nb,$T1b
    389 	fmadd	$T2a,$N2,$na,$T2a
    390 	fmadd	$T2b,$N2,$nb,$T2b
    391 	fmadd	$T3a,$N3,$na,$T3a
    392 	fmadd	$T3b,$N3,$nb,$T3b
    393 	fmadd	$T0a,$N0,$na,$T0a
    394 	fmadd	$T0b,$N0,$nb,$T0b
    395 
    396 	fmadd	$T1a,$N0,$nc,$T1a
    397 	fmadd	$T1b,$N0,$nd,$T1b
    398 	fmadd	$T2a,$N1,$nc,$T2a
    399 	fmadd	$T2b,$N1,$nd,$T2b
    400 	fmadd	$T3a,$N2,$nc,$T3a
    401 	fmadd	$T3b,$N2,$nd,$T3b
    402 	fmadd	$dota,$N3,$nc,$dota
    403 	fmadd	$dotb,$N3,$nd,$dotb
    404 
    405 	fctid	$T0a,$T0a
    406 	fctid	$T0b,$T0b
    407 	fctid	$T1a,$T1a
    408 	fctid	$T1b,$T1b
    409 	fctid	$T2a,$T2a
    410 	fctid	$T2b,$T2b
    411 	fctid	$T3a,$T3a
    412 	fctid	$T3b,$T3b
    413 
    414 	stfd	$T0a,`$FRAME+0`($sp)
    415 	stfd	$T0b,`$FRAME+8`($sp)
    416 	stfd	$T1a,`$FRAME+16`($sp)
    417 	stfd	$T1b,`$FRAME+24`($sp)
    418 	stfd	$T2a,`$FRAME+32`($sp)
    419 	stfd	$T2b,`$FRAME+40`($sp)
    420 	stfd	$T3a,`$FRAME+48`($sp)
    421 	stfd	$T3b,`$FRAME+56`($sp)
    422 
    424 .align	5
    425 L1st:
    426 ___
    427 $code.=<<___ if ($SIZE_T==8);
    428 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
    429 	lwz	$t1,0($ap)
    430 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
    431 	lwz	$t3,8($ap)
    432 	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
    433 	lwz	$t5,0($np)
    434 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
    435 	lwz	$t7,8($np)
    436 ___
    437 $code.=<<___ if ($SIZE_T==4);
    438 	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
    439 	lwz	$t1,4($ap)
    440 	lwz	$t2,8($ap)
    441 	lwz	$t3,12($ap)
    442 	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
    443 	lwz	$t5,4($np)
    444 	lwz	$t6,8($np)
    445 	lwz	$t7,12($np)
    446 ___
    447 $code.=<<___;
    448 	std	$t0,`$FRAME+64`($sp)
    449 	std	$t1,`$FRAME+72`($sp)
    450 	std	$t2,`$FRAME+80`($sp)
    451 	std	$t3,`$FRAME+88`($sp)
    452 	std	$t4,`$FRAME+96`($sp)
    453 	std	$t5,`$FRAME+104`($sp)
    454 	std	$t6,`$FRAME+112`($sp)
    455 	std	$t7,`$FRAME+120`($sp)
    456 	ld	$t0,`$FRAME+0`($sp)
    457 	ld	$t1,`$FRAME+8`($sp)
    458 	ld	$t2,`$FRAME+16`($sp)
    459 	ld	$t3,`$FRAME+24`($sp)
    460 	ld	$t4,`$FRAME+32`($sp)
    461 	ld	$t5,`$FRAME+40`($sp)
    462 	ld	$t6,`$FRAME+48`($sp)
    463 	ld	$t7,`$FRAME+56`($sp)
    464 	lfd	$A0,`$FRAME+64`($sp)
    465 	lfd	$A1,`$FRAME+72`($sp)
    466 	lfd	$A2,`$FRAME+80`($sp)
    467 	lfd	$A3,`$FRAME+88`($sp)
    468 	lfd	$N0,`$FRAME+96`($sp)
    469 	lfd	$N1,`$FRAME+104`($sp)
    470 	lfd	$N2,`$FRAME+112`($sp)
    471 	lfd	$N3,`$FRAME+120`($sp)
    472 	fcfid	$A0,$A0
    473 	fcfid	$A1,$A1
    474 	fcfid	$A2,$A2
    475 	fcfid	$A3,$A3
    476 	fcfid	$N0,$N0
    477 	fcfid	$N1,$N1
    478 	fcfid	$N2,$N2
    479 	fcfid	$N3,$N3
    480 	addi	$ap,$ap,16
    481 	addi	$np,$np,16
    482 
    483 	fmul	$T1a,$A1,$ba
    484 	fmul	$T1b,$A1,$bb
    485 	fmul	$T2a,$A2,$ba
    486 	fmul	$T2b,$A2,$bb
    487 	stfd	$A0,8($nap_d)		; save a[j] in double format
    488 	stfd	$A1,16($nap_d)
    489 	fmul	$T3a,$A3,$ba
    490 	fmul	$T3b,$A3,$bb
    491 	fmadd	$T0a,$A0,$ba,$dota
    492 	fmadd	$T0b,$A0,$bb,$dotb
    493 	stfd	$A2,24($nap_d)		; save a[j+1] in double format
    494 	stfd	$A3,32($nap_d)
    495 
    496 	fmadd	$T1a,$A0,$bc,$T1a
    497 	fmadd	$T1b,$A0,$bd,$T1b
    498 	fmadd	$T2a,$A1,$bc,$T2a
    499 	fmadd	$T2b,$A1,$bd,$T2b
    500 	stfd	$N0,40($nap_d)		; save n[j] in double format
    501 	stfd	$N1,48($nap_d)
    502 	fmadd	$T3a,$A2,$bc,$T3a
    503 	fmadd	$T3b,$A2,$bd,$T3b
    504 	 add	$t0,$t0,$carry		; can not overflow
    505 	fmul	$dota,$A3,$bc
    506 	fmul	$dotb,$A3,$bd
    507 	stfd	$N2,56($nap_d)		; save n[j+1] in double format
    508 	stfdu	$N3,64($nap_d)
    509 	 srdi	$carry,$t0,16
    510 	 add	$t1,$t1,$carry
    511 	 srdi	$carry,$t1,16
    512 
    513 	fmadd	$T1a,$N1,$na,$T1a
    514 	fmadd	$T1b,$N1,$nb,$T1b
    515 	 insrdi	$t0,$t1,16,32
    516 	fmadd	$T2a,$N2,$na,$T2a
    517 	fmadd	$T2b,$N2,$nb,$T2b
    518 	 add	$t2,$t2,$carry
    519 	fmadd	$T3a,$N3,$na,$T3a
    520 	fmadd	$T3b,$N3,$nb,$T3b
    521 	 srdi	$carry,$t2,16
    522 	fmadd	$T0a,$N0,$na,$T0a
    523 	fmadd	$T0b,$N0,$nb,$T0b
    524 	 insrdi	$t0,$t2,16,16
    525 	 add	$t3,$t3,$carry
    526 	 srdi	$carry,$t3,16
    527 
    528 	fmadd	$T1a,$N0,$nc,$T1a
    529 	fmadd	$T1b,$N0,$nd,$T1b
    530 	 insrdi	$t0,$t3,16,0		; 0..63 bits
    531 	fmadd	$T2a,$N1,$nc,$T2a
    532 	fmadd	$T2b,$N1,$nd,$T2b
    533 	 add	$t4,$t4,$carry
    534 	fmadd	$T3a,$N2,$nc,$T3a
    535 	fmadd	$T3b,$N2,$nd,$T3b
    536 	 srdi	$carry,$t4,16
    537 	fmadd	$dota,$N3,$nc,$dota
    538 	fmadd	$dotb,$N3,$nd,$dotb
    539 	 add	$t5,$t5,$carry
    540 	 srdi	$carry,$t5,16
    541 	 insrdi	$t4,$t5,16,32
    542 
    543 	fctid	$T0a,$T0a
    544 	fctid	$T0b,$T0b
    545 	 add	$t6,$t6,$carry
    546 	fctid	$T1a,$T1a
    547 	fctid	$T1b,$T1b
    548 	 srdi	$carry,$t6,16
    549 	fctid	$T2a,$T2a
    550 	fctid	$T2b,$T2b
    551 	 insrdi	$t4,$t6,16,16
    552 	fctid	$T3a,$T3a
    553 	fctid	$T3b,$T3b
    554 	 add	$t7,$t7,$carry
    555 	 insrdi	$t4,$t7,16,0		; 64..127 bits
    556 	 srdi	$carry,$t7,16		; upper 33 bits
    557 
    558 	stfd	$T0a,`$FRAME+0`($sp)
    559 	stfd	$T0b,`$FRAME+8`($sp)
    560 	stfd	$T1a,`$FRAME+16`($sp)
    561 	stfd	$T1b,`$FRAME+24`($sp)
    562 	stfd	$T2a,`$FRAME+32`($sp)
    563 	stfd	$T2b,`$FRAME+40`($sp)
    564 	stfd	$T3a,`$FRAME+48`($sp)
    565 	stfd	$T3b,`$FRAME+56`($sp)
    566 	 std	$t0,8($tp)		; tp[j-1]
    567 	 stdu	$t4,16($tp)		; tp[j]
    568 	bdnz-	L1st
    569 
    571 	fctid	$dota,$dota
    572 	fctid	$dotb,$dotb
    573 
    574 	ld	$t0,`$FRAME+0`($sp)
    575 	ld	$t1,`$FRAME+8`($sp)
    576 	ld	$t2,`$FRAME+16`($sp)
    577 	ld	$t3,`$FRAME+24`($sp)
    578 	ld	$t4,`$FRAME+32`($sp)
    579 	ld	$t5,`$FRAME+40`($sp)
    580 	ld	$t6,`$FRAME+48`($sp)
    581 	ld	$t7,`$FRAME+56`($sp)
    582 	stfd	$dota,`$FRAME+64`($sp)
    583 	stfd	$dotb,`$FRAME+72`($sp)
    584 
    585 	add	$t0,$t0,$carry		; can not overflow
    586 	srdi	$carry,$t0,16
    587 	add	$t1,$t1,$carry
    588 	srdi	$carry,$t1,16
    589 	insrdi	$t0,$t1,16,32
    590 	add	$t2,$t2,$carry
    591 	srdi	$carry,$t2,16
    592 	insrdi	$t0,$t2,16,16
    593 	add	$t3,$t3,$carry
    594 	srdi	$carry,$t3,16
    595 	insrdi	$t0,$t3,16,0		; 0..63 bits
    596 	add	$t4,$t4,$carry
    597 	srdi	$carry,$t4,16
    598 	add	$t5,$t5,$carry
    599 	srdi	$carry,$t5,16
    600 	insrdi	$t4,$t5,16,32
    601 	add	$t6,$t6,$carry
    602 	srdi	$carry,$t6,16
    603 	insrdi	$t4,$t6,16,16
    604 	add	$t7,$t7,$carry
    605 	insrdi	$t4,$t7,16,0		; 64..127 bits
    606 	srdi	$carry,$t7,16		; upper 33 bits
    607 	ld	$t6,`$FRAME+64`($sp)
    608 	ld	$t7,`$FRAME+72`($sp)
    609 
    610 	std	$t0,8($tp)		; tp[j-1]
    611 	stdu	$t4,16($tp)		; tp[j]
    612 
    613 	add	$t6,$t6,$carry		; can not overflow
    614 	srdi	$carry,$t6,16
    615 	add	$t7,$t7,$carry
    616 	insrdi	$t6,$t7,48,0
    617 	srdi	$ovf,$t7,48
    618 	std	$t6,8($tp)		; tp[num-1]
    619 
    620 	slwi	$t7,$num,2
    621 	subf	$nap_d,$t7,$nap_d	; rewind pointer
    622 
    624 	li	$i,8			; i=1
    625 .align	5
    626 Louter:
    627 ___
    628 $code.=<<___ if ($SIZE_T==8);
    629 	ldx	$t3,$bp,$i	; bp[i]
    630 ___
    631 $code.=<<___ if ($SIZE_T==4);
    632 	add	$t0,$bp,$i
    633 	lwz	$t3,0($t0)		; bp[i,i+1]
    634 	lwz	$t0,4($t0)
    635 	insrdi	$t3,$t0,32,0
    636 ___
    637 $code.=<<___;
    638 	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
    639 	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
    640 
    641 	addi	$tp,$sp,`$FRAME+$TRANSFER`
    642 	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
    643 	li	$carry,0
    644 	mulld	$t7,$t7,$n0	; tp[0]*n0
    645 	mtctr	$j
    646 
    647 	; transfer bp[i] to FPU as 4x16-bit values
    648 	extrdi	$t0,$t3,16,48
    649 	extrdi	$t1,$t3,16,32
    650 	extrdi	$t2,$t3,16,16
    651 	extrdi	$t3,$t3,16,0
    652 	std	$t0,`$FRAME+0`($sp)
    653 	std	$t1,`$FRAME+8`($sp)
    654 	std	$t2,`$FRAME+16`($sp)
    655 	std	$t3,`$FRAME+24`($sp)
    656 	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
    657 	extrdi	$t4,$t7,16,48
    658 	extrdi	$t5,$t7,16,32
    659 	extrdi	$t6,$t7,16,16
    660 	extrdi	$t7,$t7,16,0
    661 	std	$t4,`$FRAME+32`($sp)
    662 	std	$t5,`$FRAME+40`($sp)
    663 	std	$t6,`$FRAME+48`($sp)
    664 	std	$t7,`$FRAME+56`($sp)
    665 
    666 	lfd	$A0,8($nap_d)		; load a[j] in double format
    667 	lfd	$A1,16($nap_d)
    668 	lfd	$A2,24($nap_d)		; load a[j+1] in double format
    669 	lfd	$A3,32($nap_d)
    670 	lfd	$N0,40($nap_d)		; load n[j] in double format
    671 	lfd	$N1,48($nap_d)
    672 	lfd	$N2,56($nap_d)		; load n[j+1] in double format
    673 	lfdu	$N3,64($nap_d)
    674 
    675 	lfd	$ba,`$FRAME+0`($sp)
    676 	lfd	$bb,`$FRAME+8`($sp)
    677 	lfd	$bc,`$FRAME+16`($sp)
    678 	lfd	$bd,`$FRAME+24`($sp)
    679 	lfd	$na,`$FRAME+32`($sp)
    680 	lfd	$nb,`$FRAME+40`($sp)
    681 	lfd	$nc,`$FRAME+48`($sp)
    682 	lfd	$nd,`$FRAME+56`($sp)
    683 
    684 	fcfid	$ba,$ba
    685 	fcfid	$bb,$bb
    686 	fcfid	$bc,$bc
    687 	fcfid	$bd,$bd
    688 	fcfid	$na,$na
    689 	fcfid	$nb,$nb
    690 	fcfid	$nc,$nc
    691 	fcfid	$nd,$nd
    692 
    693 	fmul	$T1a,$A1,$ba
    694 	fmul	$T1b,$A1,$bb
    695 	fmul	$T2a,$A2,$ba
    696 	fmul	$T2b,$A2,$bb
    697 	fmul	$T3a,$A3,$ba
    698 	fmul	$T3b,$A3,$bb
    699 	fmul	$T0a,$A0,$ba
    700 	fmul	$T0b,$A0,$bb
    701 
    702 	fmadd	$T1a,$A0,$bc,$T1a
    703 	fmadd	$T1b,$A0,$bd,$T1b
    704 	fmadd	$T2a,$A1,$bc,$T2a
    705 	fmadd	$T2b,$A1,$bd,$T2b
    706 	fmadd	$T3a,$A2,$bc,$T3a
    707 	fmadd	$T3b,$A2,$bd,$T3b
    708 	fmul	$dota,$A3,$bc
    709 	fmul	$dotb,$A3,$bd
    710 
    711 	fmadd	$T1a,$N1,$na,$T1a
    712 	fmadd	$T1b,$N1,$nb,$T1b
    713 	 lfd	$A0,8($nap_d)		; load a[j] in double format
    714 	 lfd	$A1,16($nap_d)
    715 	fmadd	$T2a,$N2,$na,$T2a
    716 	fmadd	$T2b,$N2,$nb,$T2b
    717 	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
    718 	 lfd	$A3,32($nap_d)
    719 	fmadd	$T3a,$N3,$na,$T3a
    720 	fmadd	$T3b,$N3,$nb,$T3b
    721 	fmadd	$T0a,$N0,$na,$T0a
    722 	fmadd	$T0b,$N0,$nb,$T0b
    723 
    724 	fmadd	$T1a,$N0,$nc,$T1a
    725 	fmadd	$T1b,$N0,$nd,$T1b
    726 	fmadd	$T2a,$N1,$nc,$T2a
    727 	fmadd	$T2b,$N1,$nd,$T2b
    728 	fmadd	$T3a,$N2,$nc,$T3a
    729 	fmadd	$T3b,$N2,$nd,$T3b
    730 	fmadd	$dota,$N3,$nc,$dota
    731 	fmadd	$dotb,$N3,$nd,$dotb
    732 
    733 	fctid	$T0a,$T0a
    734 	fctid	$T0b,$T0b
    735 	fctid	$T1a,$T1a
    736 	fctid	$T1b,$T1b
    737 	fctid	$T2a,$T2a
    738 	fctid	$T2b,$T2b
    739 	fctid	$T3a,$T3a
    740 	fctid	$T3b,$T3b
    741 
    742 	stfd	$T0a,`$FRAME+0`($sp)
    743 	stfd	$T0b,`$FRAME+8`($sp)
    744 	stfd	$T1a,`$FRAME+16`($sp)
    745 	stfd	$T1b,`$FRAME+24`($sp)
    746 	stfd	$T2a,`$FRAME+32`($sp)
    747 	stfd	$T2b,`$FRAME+40`($sp)
    748 	stfd	$T3a,`$FRAME+48`($sp)
    749 	stfd	$T3b,`$FRAME+56`($sp)
    750 
    752 .align	5
    753 Linner:
    754 	fmul	$T1a,$A1,$ba
    755 	fmul	$T1b,$A1,$bb
    756 	fmul	$T2a,$A2,$ba
    757 	fmul	$T2b,$A2,$bb
    758 	lfd	$N0,40($nap_d)		; load n[j] in double format
    759 	lfd	$N1,48($nap_d)
    760 	fmul	$T3a,$A3,$ba
    761 	fmul	$T3b,$A3,$bb
    762 	fmadd	$T0a,$A0,$ba,$dota
    763 	fmadd	$T0b,$A0,$bb,$dotb
    764 	lfd	$N2,56($nap_d)		; load n[j+1] in double format
    765 	lfdu	$N3,64($nap_d)
    766 
    767 	fmadd	$T1a,$A0,$bc,$T1a
    768 	fmadd	$T1b,$A0,$bd,$T1b
    769 	fmadd	$T2a,$A1,$bc,$T2a
    770 	fmadd	$T2b,$A1,$bd,$T2b
    771 	 lfd	$A0,8($nap_d)		; load a[j] in double format
    772 	 lfd	$A1,16($nap_d)
    773 	fmadd	$T3a,$A2,$bc,$T3a
    774 	fmadd	$T3b,$A2,$bd,$T3b
    775 	fmul	$dota,$A3,$bc
    776 	fmul	$dotb,$A3,$bd
    777 	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
    778 	 lfd	$A3,32($nap_d)
    779 
    780 	fmadd	$T1a,$N1,$na,$T1a
    781 	fmadd	$T1b,$N1,$nb,$T1b
    782 	 ld	$t0,`$FRAME+0`($sp)
    783 	 ld	$t1,`$FRAME+8`($sp)
    784 	fmadd	$T2a,$N2,$na,$T2a
    785 	fmadd	$T2b,$N2,$nb,$T2b
    786 	 ld	$t2,`$FRAME+16`($sp)
    787 	 ld	$t3,`$FRAME+24`($sp)
    788 	fmadd	$T3a,$N3,$na,$T3a
    789 	fmadd	$T3b,$N3,$nb,$T3b
    790 	 add	$t0,$t0,$carry		; can not overflow
    791 	 ld	$t4,`$FRAME+32`($sp)
    792 	 ld	$t5,`$FRAME+40`($sp)
    793 	fmadd	$T0a,$N0,$na,$T0a
    794 	fmadd	$T0b,$N0,$nb,$T0b
    795 	 srdi	$carry,$t0,16
    796 	 add	$t1,$t1,$carry
    797 	 srdi	$carry,$t1,16
    798 	 ld	$t6,`$FRAME+48`($sp)
    799 	 ld	$t7,`$FRAME+56`($sp)
    800 
    801 	fmadd	$T1a,$N0,$nc,$T1a
    802 	fmadd	$T1b,$N0,$nd,$T1b
    803 	 insrdi	$t0,$t1,16,32
    804 	 ld	$t1,8($tp)		; tp[j]
    805 	fmadd	$T2a,$N1,$nc,$T2a
    806 	fmadd	$T2b,$N1,$nd,$T2b
    807 	 add	$t2,$t2,$carry
    808 	fmadd	$T3a,$N2,$nc,$T3a
    809 	fmadd	$T3b,$N2,$nd,$T3b
    810 	 srdi	$carry,$t2,16
    811 	 insrdi	$t0,$t2,16,16
    812 	fmadd	$dota,$N3,$nc,$dota
    813 	fmadd	$dotb,$N3,$nd,$dotb
    814 	 add	$t3,$t3,$carry
    815 	 ldu	$t2,16($tp)		; tp[j+1]
    816 	 srdi	$carry,$t3,16
    817 	 insrdi	$t0,$t3,16,0		; 0..63 bits
    818 	 add	$t4,$t4,$carry
    819 
    820 	fctid	$T0a,$T0a
    821 	fctid	$T0b,$T0b
    822 	 srdi	$carry,$t4,16
    823 	fctid	$T1a,$T1a
    824 	fctid	$T1b,$T1b
    825 	 add	$t5,$t5,$carry
    826 	fctid	$T2a,$T2a
    827 	fctid	$T2b,$T2b
    828 	 srdi	$carry,$t5,16
    829 	 insrdi	$t4,$t5,16,32
    830 	fctid	$T3a,$T3a
    831 	fctid	$T3b,$T3b
    832 	 add	$t6,$t6,$carry
    833 	 srdi	$carry,$t6,16
    834 	 insrdi	$t4,$t6,16,16
    835 
    836 	stfd	$T0a,`$FRAME+0`($sp)
    837 	stfd	$T0b,`$FRAME+8`($sp)
    838 	 add	$t7,$t7,$carry
    839 	 addc	$t3,$t0,$t1
    840 ___
    841 $code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
    842 	extrdi	$t0,$t0,32,0
    843 	extrdi	$t1,$t1,32,0
    844 	adde	$t0,$t0,$t1
    845 ___
    846 $code.=<<___;
    847 	stfd	$T1a,`$FRAME+16`($sp)
    848 	stfd	$T1b,`$FRAME+24`($sp)
    849 	 insrdi	$t4,$t7,16,0		; 64..127 bits
    850 	 srdi	$carry,$t7,16		; upper 33 bits
    851 	stfd	$T2a,`$FRAME+32`($sp)
    852 	stfd	$T2b,`$FRAME+40`($sp)
    853 	 adde	$t5,$t4,$t2
    854 ___
    855 $code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
    856 	extrdi	$t4,$t4,32,0
    857 	extrdi	$t2,$t2,32,0
    858 	adde	$t4,$t4,$t2
    859 ___
    860 $code.=<<___;
    861 	stfd	$T3a,`$FRAME+48`($sp)
    862 	stfd	$T3b,`$FRAME+56`($sp)
    863 	 addze	$carry,$carry
    864 	 std	$t3,-16($tp)		; tp[j-1]
    865 	 std	$t5,-8($tp)		; tp[j]
    866 	bdnz-	Linner
    867 
    869 	fctid	$dota,$dota
    870 	fctid	$dotb,$dotb
    871 	ld	$t0,`$FRAME+0`($sp)
    872 	ld	$t1,`$FRAME+8`($sp)
    873 	ld	$t2,`$FRAME+16`($sp)
    874 	ld	$t3,`$FRAME+24`($sp)
    875 	ld	$t4,`$FRAME+32`($sp)
    876 	ld	$t5,`$FRAME+40`($sp)
    877 	ld	$t6,`$FRAME+48`($sp)
    878 	ld	$t7,`$FRAME+56`($sp)
    879 	stfd	$dota,`$FRAME+64`($sp)
    880 	stfd	$dotb,`$FRAME+72`($sp)
    881 
    882 	add	$t0,$t0,$carry		; can not overflow
    883 	srdi	$carry,$t0,16
    884 	add	$t1,$t1,$carry
    885 	srdi	$carry,$t1,16
    886 	insrdi	$t0,$t1,16,32
    887 	add	$t2,$t2,$carry
    888 	ld	$t1,8($tp)		; tp[j]
    889 	srdi	$carry,$t2,16
    890 	insrdi	$t0,$t2,16,16
    891 	add	$t3,$t3,$carry
    892 	ldu	$t2,16($tp)		; tp[j+1]
    893 	srdi	$carry,$t3,16
    894 	insrdi	$t0,$t3,16,0		; 0..63 bits
    895 	add	$t4,$t4,$carry
    896 	srdi	$carry,$t4,16
    897 	add	$t5,$t5,$carry
    898 	srdi	$carry,$t5,16
    899 	insrdi	$t4,$t5,16,32
    900 	add	$t6,$t6,$carry
    901 	srdi	$carry,$t6,16
    902 	insrdi	$t4,$t6,16,16
    903 	add	$t7,$t7,$carry
    904 	insrdi	$t4,$t7,16,0		; 64..127 bits
    905 	srdi	$carry,$t7,16		; upper 33 bits
    906 	ld	$t6,`$FRAME+64`($sp)
    907 	ld	$t7,`$FRAME+72`($sp)
    908 
    909 	addc	$t3,$t0,$t1
    910 ___
    911 $code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
    912 	extrdi	$t0,$t0,32,0
    913 	extrdi	$t1,$t1,32,0
    914 	adde	$t0,$t0,$t1
    915 ___
    916 $code.=<<___;
    917 	adde	$t5,$t4,$t2
    918 ___
    919 $code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
    920 	extrdi	$t4,$t4,32,0
    921 	extrdi	$t2,$t2,32,0
    922 	adde	$t4,$t4,$t2
    923 ___
    924 $code.=<<___;
    925 	addze	$carry,$carry
    926 
    927 	std	$t3,-16($tp)		; tp[j-1]
    928 	std	$t5,-8($tp)		; tp[j]
    929 
    930 	add	$carry,$carry,$ovf	; comsume upmost overflow
    931 	add	$t6,$t6,$carry		; can not overflow
    932 	srdi	$carry,$t6,16
    933 	add	$t7,$t7,$carry
    934 	insrdi	$t6,$t7,48,0
    935 	srdi	$ovf,$t7,48
    936 	std	$t6,0($tp)		; tp[num-1]
    937 
    938 	slwi	$t7,$num,2
    939 	addi	$i,$i,8
    940 	subf	$nap_d,$t7,$nap_d	; rewind pointer
    941 	cmpw	$i,$num
    942 	blt-	Louter
    943 ___
    944 
    946 $code.=<<___ if ($SIZE_T==8);
    947 	subf	$np,$num,$np	; rewind np
    948 	addi	$j,$j,1		; restore counter
    949 	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
    950 	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
    951 	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
    952 	addi	$t5,$np,8
    953 	addi	$t6,$rp,8
    954 	mtctr	$j
    955 
    956 .align	4
    957 Lsub:	ldx	$t0,$tp,$i
    958 	ldx	$t1,$np,$i
    959 	ldx	$t2,$t4,$i
    960 	ldx	$t3,$t5,$i
    961 	subfe	$t0,$t1,$t0	; tp[j]-np[j]
    962 	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
    963 	stdx	$t0,$rp,$i
    964 	stdx	$t2,$t6,$i
    965 	addi	$i,$i,16
    966 	bdnz-	Lsub
    967 
    968 	li	$i,0
    969 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
    970 	and	$ap,$tp,$ovf
    971 	andc	$np,$rp,$ovf
    972 	or	$ap,$ap,$np	; ap=borrow?tp:rp
    973 	addi	$t7,$ap,8
    974 	mtctr	$j
    975 
    976 .align	4
    977 Lcopy:				; copy or in-place refresh
    978 	ldx	$t0,$ap,$i
    979 	ldx	$t1,$t7,$i
    980 	std	$i,8($nap_d)	; zap nap_d
    981 	std	$i,16($nap_d)
    982 	std	$i,24($nap_d)
    983 	std	$i,32($nap_d)
    984 	std	$i,40($nap_d)
    985 	std	$i,48($nap_d)
    986 	std	$i,56($nap_d)
    987 	stdu	$i,64($nap_d)
    988 	stdx	$t0,$rp,$i
    989 	stdx	$t1,$t6,$i
    990 	stdx	$i,$tp,$i	; zap tp at once
    991 	stdx	$i,$t4,$i
    992 	addi	$i,$i,16
    993 	bdnz-	Lcopy
    994 ___
    995 $code.=<<___ if ($SIZE_T==4);
    996 	subf	$np,$num,$np	; rewind np
    997 	addi	$j,$j,1		; restore counter
    998 	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
    999 	addi	$tp,$sp,`$FRAME+$TRANSFER`
   1000 	addi	$np,$np,-4
   1001 	addi	$rp,$rp,-4
   1002 	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
   1003 	mtctr	$j
   1004 
   1005 .align	4
   1006 Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
   1007 	ldu	$t2,16($tp)
   1008 	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
   1009 	lwz	$t5,8($np)
   1010 	lwz	$t6,12($np)
   1011 	lwzu	$t7,16($np)
   1012 	extrdi	$t1,$t0,32,0
   1013 	extrdi	$t3,$t2,32,0
   1014 	subfe	$t4,$t4,$t0	; tp[j]-np[j]
   1015 	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
   1016 	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
   1017 	 stw	$t1,8($ap)
   1018 	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
   1019 	 stw	$t2,12($ap)
   1020 	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
   1021 	 stwu	$t3,16($ap)
   1022 	stw	$t4,4($rp)
   1023 	stw	$t5,8($rp)
   1024 	stw	$t6,12($rp)
   1025 	stwu	$t7,16($rp)
   1026 	bdnz-	Lsub
   1027 
   1028 	li	$i,0
   1029 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
   1030 	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
   1031 	subf	$rp,$num,$rp	; rewind rp
   1032 	and	$ap,$tp,$ovf
   1033 	andc	$np,$rp,$ovf
   1034 	or	$ap,$ap,$np	; ap=borrow?tp:rp
   1035 	addi	$tp,$sp,`$FRAME+$TRANSFER`
   1036 	mtctr	$j
   1037 
   1038 .align	4
   1039 Lcopy:				; copy or in-place refresh
   1040 	lwz	$t0,4($ap)
   1041 	lwz	$t1,8($ap)
   1042 	lwz	$t2,12($ap)
   1043 	lwzu	$t3,16($ap)
   1044 	std	$i,8($nap_d)	; zap nap_d
   1045 	std	$i,16($nap_d)
   1046 	std	$i,24($nap_d)
   1047 	std	$i,32($nap_d)
   1048 	std	$i,40($nap_d)
   1049 	std	$i,48($nap_d)
   1050 	std	$i,56($nap_d)
   1051 	stdu	$i,64($nap_d)
   1052 	stw	$t0,4($rp)
   1053 	stw	$t1,8($rp)
   1054 	stw	$t2,12($rp)
   1055 	stwu	$t3,16($rp)
   1056 	std	$i,8($tp)	; zap tp at once
   1057 	stdu	$i,16($tp)
   1058 	bdnz-	Lcopy
   1059 ___
   1060 
   1062 $code.=<<___;
   1063 	$POP	$i,0($sp)
   1064 	li	r3,1	; signal "handled"
   1065 	$POP	r22,`-12*8-10*$SIZE_T`($i)
   1066 	$POP	r23,`-12*8-9*$SIZE_T`($i)
   1067 	$POP	r24,`-12*8-8*$SIZE_T`($i)
   1068 	$POP	r25,`-12*8-7*$SIZE_T`($i)
   1069 	$POP	r26,`-12*8-6*$SIZE_T`($i)
   1070 	$POP	r27,`-12*8-5*$SIZE_T`($i)
   1071 	$POP	r28,`-12*8-4*$SIZE_T`($i)
   1072 	$POP	r29,`-12*8-3*$SIZE_T`($i)
   1073 	$POP	r30,`-12*8-2*$SIZE_T`($i)
   1074 	$POP	r31,`-12*8-1*$SIZE_T`($i)
   1075 	lfd	f20,`-12*8`($i)
   1076 	lfd	f21,`-11*8`($i)
   1077 	lfd	f22,`-10*8`($i)
   1078 	lfd	f23,`-9*8`($i)
   1079 	lfd	f24,`-8*8`($i)
   1080 	lfd	f25,`-7*8`($i)
   1081 	lfd	f26,`-6*8`($i)
   1082 	lfd	f27,`-5*8`($i)
   1083 	lfd	f28,`-4*8`($i)
   1084 	lfd	f29,`-3*8`($i)
   1085 	lfd	f30,`-2*8`($i)
   1086 	lfd	f31,`-1*8`($i)
   1087 	mr	$sp,$i
   1088 	blr
   1089 	.long	0
   1090 	.byte	0,12,4,0,0x8c,10,6,0
   1091 	.long	0
   1092 
   1093 .asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
   1094 ___
   1095 
   1096 $code =~ s/\`([^\`]*)\`/eval $1/gem;
   1097 print $code;
   1098 close STDOUT;
   1099