1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 10 # December 2007 11 12 # The reason for undertaken effort is basically following. Even though 13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI 14 # performance was observed to be less than impressive, essentially as 15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. 16 # Well, it's not surprising that IBM had to make some sacrifices to 17 # boost the clock frequency that much, but no overall improvement? 18 # Having observed how much difference did switching to FPU make on 19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate... 20 # Unfortunately the resulting performance improvement is not as 21 # impressive, ~30%, and in absolute terms is still very far from what 22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing 23 # something wrong, but in the lack of assembler level micro-profiling 24 # data or at least decent platform guide I can't tell... Or better 25 # results might be achieved with VMX... Anyway, this module provides 26 # *worse* performance on other PowerPC implementations, ~40-15% slower 27 # on PPC970 depending on key length and ~40% slower on Power 5 for all 28 # key lengths. As it's obviously inappropriate as "best all-round" 29 # alternative, it has to be complemented with run-time CPU family 30 # detection. Oh! It should also be noted that unlike other PowerPC 31 # implementation IALU ppc-mont.pl module performs *suboptimaly* on 32 # >=1024-bit key lengths on Power 6. It should also be noted that 33 # *everything* said so far applies to 64-bit builds! As far as 32-bit 34 # application executed on 64-bit CPU goes, this module is likely to 35 # become preferred choice, because it's easy to adapt it for such 36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors. 37 38 # February 2008 39 40 # Micro-profiling assisted optimization results in ~15% improvement 41 # over original ppc64-mont.pl version, or overall ~50% improvement 42 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same 43 # Power 6 CPU, this module is 5-150% faster depending on key length, 44 # [hereafter] more for longer keys. But if compared to ppc-mont.pl 45 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive 46 # in absolute terms, but it's apparently the way Power 6 is... 47 48 # December 2009 49 50 # Adapted for 32-bit build this module delivers 25-120%, yes, more 51 # than *twice* for longer keys, performance improvement over 32-bit 52 # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes 53 # even 64-bit integer operations and the trouble is that most PPC 54 # operating systems don't preserve upper halves of general purpose 55 # registers upon 32-bit signal delivery. They do preserve them upon 56 # context switch, but not signalling:-( This means that asynchronous 57 # signals have to be blocked upon entry to this subroutine. Signal 58 # masking (and of course complementary unmasking) has quite an impact 59 # on performance, naturally larger for shorter keys. It's so severe 60 # that 512-bit key performance can be as low as 1/3 of expected one. 61 # This is why this routine can be engaged for longer key operations 62 # only on these OSes, see crypto/ppccap.c for further details. MacOS X 63 # is an exception from this and doesn't require signal masking, and 64 # that's where above improvement coefficients were collected. For 65 # others alternative would be to break dependence on upper halves of 66 # GPRs by sticking to 32-bit integer operations... 67 68 $flavour = shift; 69 70 if ($flavour =~ /32/) { 71 $SIZE_T=4; 72 $RZONE= 224; 73 $fname= "bn_mul_mont_fpu64"; 74 75 $STUX= "stwux"; # store indexed and update 76 $PUSH= "stw"; 77 $POP= "lwz"; 78 } elsif ($flavour =~ /64/) { 79 $SIZE_T=8; 80 $RZONE= 288; 81 $fname= "bn_mul_mont_fpu64"; 82 83 # same as above, but 64-bit mnemonics... 84 $STUX= "stdux"; # store indexed and update 85 $PUSH= "std"; 86 $POP= "ld"; 87 } else { die "nonsense $flavour"; } 88 89 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 90 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 91 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 92 die "can't locate ppc-xlate.pl"; 93 94 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 95 96 $FRAME=64; # padded frame header 97 $TRANSFER=16*8; 98 99 $carry="r0"; 100 $sp="r1"; 101 $toc="r2"; 102 $rp="r3"; $ovf="r3"; 103 $ap="r4"; 104 $bp="r5"; 105 $np="r6"; 106 $n0="r7"; 107 $num="r8"; 108 $rp="r9"; # $rp is reassigned 109 $tp="r10"; 110 $j="r11"; 111 $i="r12"; 112 # non-volatile registers 113 $nap_d="r22"; # interleaved ap and np in double format 114 $a0="r23"; # ap[0] 115 $t0="r24"; # temporary registers 116 $t1="r25"; 117 $t2="r26"; 118 $t3="r27"; 119 $t4="r28"; 120 $t5="r29"; 121 $t6="r30"; 122 $t7="r31"; 123 124 # PPC offers enough register bank capacity to unroll inner loops twice 125 # 126 # ..A3A2A1A0 127 # dcba 128 # ----------- 129 # A0a 130 # A0b 131 # A0c 132 # A0d 133 # A1a 134 # A1b 135 # A1c 136 # A1d 137 # A2a 138 # A2b 139 # A2c 140 # A2d 141 # A3a 142 # A3b 143 # A3c 144 # A3d 145 # ..a 146 # ..b 147 # 148 $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; 149 $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; 150 $dota="f8"; $dotb="f9"; 151 $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; 152 $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; 153 $T0a="f24"; $T0b="f25"; 154 $T1a="f26"; $T1b="f27"; 155 $T2a="f28"; $T2b="f29"; 156 $T3a="f30"; $T3b="f31"; 157 159 # sp----------->+-------------------------------+ 160 # | saved sp | 161 # +-------------------------------+ 162 # . . 163 # +64 +-------------------------------+ 164 # | 16 gpr<->fpr transfer zone | 165 # . . 166 # . . 167 # +16*8 +-------------------------------+ 168 # | __int64 tmp[-1] | 169 # +-------------------------------+ 170 # | __int64 tmp[num] | 171 # . . 172 # . . 173 # . . 174 # +(num+1)*8 +-------------------------------+ 175 # | padding to 64 byte boundary | 176 # . . 177 # +X +-------------------------------+ 178 # | double nap_d[4*num] | 179 # . . 180 # . . 181 # . . 182 # +-------------------------------+ 183 # . . 184 # -12*size_t +-------------------------------+ 185 # | 10 saved gpr, r22-r31 | 186 # . . 187 # . . 188 # -12*8 +-------------------------------+ 189 # | 12 saved fpr, f20-f31 | 190 # . . 191 # . . 192 # +-------------------------------+ 193 195 $code=<<___; 196 .machine "any" 197 .text 198 199 .globl .$fname 200 .align 5 201 .$fname: 202 cmpwi $num,`3*8/$SIZE_T` 203 mr $rp,r3 ; $rp is reassigned 204 li r3,0 ; possible "not handled" return code 205 bltlr- 206 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" 207 bnelr- 208 209 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) 210 li $i,-4096 211 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num 212 add $tp,$tp,$num ; place for tp[num+1] 213 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` 214 subf $tp,$tp,$sp ; $sp-$tp 215 and $tp,$tp,$i ; minimize TLB usage 216 subf $tp,$sp,$tp ; $tp-$sp 217 mr $i,$sp 218 $STUX $sp,$sp,$tp ; alloca 219 220 $PUSH r22,`-12*8-10*$SIZE_T`($i) 221 $PUSH r23,`-12*8-9*$SIZE_T`($i) 222 $PUSH r24,`-12*8-8*$SIZE_T`($i) 223 $PUSH r25,`-12*8-7*$SIZE_T`($i) 224 $PUSH r26,`-12*8-6*$SIZE_T`($i) 225 $PUSH r27,`-12*8-5*$SIZE_T`($i) 226 $PUSH r28,`-12*8-4*$SIZE_T`($i) 227 $PUSH r29,`-12*8-3*$SIZE_T`($i) 228 $PUSH r30,`-12*8-2*$SIZE_T`($i) 229 $PUSH r31,`-12*8-1*$SIZE_T`($i) 230 stfd f20,`-12*8`($i) 231 stfd f21,`-11*8`($i) 232 stfd f22,`-10*8`($i) 233 stfd f23,`-9*8`($i) 234 stfd f24,`-8*8`($i) 235 stfd f25,`-7*8`($i) 236 stfd f26,`-6*8`($i) 237 stfd f27,`-5*8`($i) 238 stfd f28,`-4*8`($i) 239 stfd f29,`-3*8`($i) 240 stfd f30,`-2*8`($i) 241 stfd f31,`-1*8`($i) 242 ___ 243 $code.=<<___ if ($SIZE_T==8); 244 ld $a0,0($ap) ; pull ap[0] value 245 ld $n0,0($n0) ; pull n0[0] value 246 ld $t3,0($bp) ; bp[0] 247 ___ 248 $code.=<<___ if ($SIZE_T==4); 249 mr $t1,$n0 250 lwz $a0,0($ap) ; pull ap[0,1] value 251 lwz $t0,4($ap) 252 lwz $n0,0($t1) ; pull n0[0,1] value 253 lwz $t1,4($t1) 254 lwz $t3,0($bp) ; bp[0,1] 255 lwz $t2,4($bp) 256 insrdi $a0,$t0,32,0 257 insrdi $n0,$t1,32,0 258 insrdi $t3,$t2,32,0 259 ___ 260 $code.=<<___; 261 addi $tp,$sp,`$FRAME+$TRANSFER+8+64` 262 li $i,-64 263 add $nap_d,$tp,$num 264 and $nap_d,$nap_d,$i ; align to 64 bytes 265 267 mulld $t7,$a0,$t3 ; ap[0]*bp[0] 268 ; nap_d is off by 1, because it's used with stfdu/lfdu 269 addi $nap_d,$nap_d,-8 270 srwi $j,$num,`3+1` ; counter register, num/2 271 mulld $t7,$t7,$n0 ; tp[0]*n0 272 addi $j,$j,-1 273 addi $tp,$sp,`$FRAME+$TRANSFER-8` 274 li $carry,0 275 mtctr $j 276 277 ; transfer bp[0] to FPU as 4x16-bit values 278 extrdi $t0,$t3,16,48 279 extrdi $t1,$t3,16,32 280 extrdi $t2,$t3,16,16 281 extrdi $t3,$t3,16,0 282 std $t0,`$FRAME+0`($sp) 283 std $t1,`$FRAME+8`($sp) 284 std $t2,`$FRAME+16`($sp) 285 std $t3,`$FRAME+24`($sp) 286 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 287 extrdi $t4,$t7,16,48 288 extrdi $t5,$t7,16,32 289 extrdi $t6,$t7,16,16 290 extrdi $t7,$t7,16,0 291 std $t4,`$FRAME+32`($sp) 292 std $t5,`$FRAME+40`($sp) 293 std $t6,`$FRAME+48`($sp) 294 std $t7,`$FRAME+56`($sp) 295 ___ 296 $code.=<<___ if ($SIZE_T==8); 297 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 298 lwz $t1,0($ap) 299 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 300 lwz $t3,8($ap) 301 lwz $t4,4($np) ; load n[j] as 32-bit word pair 302 lwz $t5,0($np) 303 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 304 lwz $t7,8($np) 305 ___ 306 $code.=<<___ if ($SIZE_T==4); 307 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 308 lwz $t1,4($ap) 309 lwz $t2,8($ap) 310 lwz $t3,12($ap) 311 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 312 lwz $t5,4($np) 313 lwz $t6,8($np) 314 lwz $t7,12($np) 315 ___ 316 $code.=<<___; 317 lfd $ba,`$FRAME+0`($sp) 318 lfd $bb,`$FRAME+8`($sp) 319 lfd $bc,`$FRAME+16`($sp) 320 lfd $bd,`$FRAME+24`($sp) 321 lfd $na,`$FRAME+32`($sp) 322 lfd $nb,`$FRAME+40`($sp) 323 lfd $nc,`$FRAME+48`($sp) 324 lfd $nd,`$FRAME+56`($sp) 325 std $t0,`$FRAME+64`($sp) 326 std $t1,`$FRAME+72`($sp) 327 std $t2,`$FRAME+80`($sp) 328 std $t3,`$FRAME+88`($sp) 329 std $t4,`$FRAME+96`($sp) 330 std $t5,`$FRAME+104`($sp) 331 std $t6,`$FRAME+112`($sp) 332 std $t7,`$FRAME+120`($sp) 333 fcfid $ba,$ba 334 fcfid $bb,$bb 335 fcfid $bc,$bc 336 fcfid $bd,$bd 337 fcfid $na,$na 338 fcfid $nb,$nb 339 fcfid $nc,$nc 340 fcfid $nd,$nd 341 342 lfd $A0,`$FRAME+64`($sp) 343 lfd $A1,`$FRAME+72`($sp) 344 lfd $A2,`$FRAME+80`($sp) 345 lfd $A3,`$FRAME+88`($sp) 346 lfd $N0,`$FRAME+96`($sp) 347 lfd $N1,`$FRAME+104`($sp) 348 lfd $N2,`$FRAME+112`($sp) 349 lfd $N3,`$FRAME+120`($sp) 350 fcfid $A0,$A0 351 fcfid $A1,$A1 352 fcfid $A2,$A2 353 fcfid $A3,$A3 354 fcfid $N0,$N0 355 fcfid $N1,$N1 356 fcfid $N2,$N2 357 fcfid $N3,$N3 358 addi $ap,$ap,16 359 addi $np,$np,16 360 361 fmul $T1a,$A1,$ba 362 fmul $T1b,$A1,$bb 363 stfd $A0,8($nap_d) ; save a[j] in double format 364 stfd $A1,16($nap_d) 365 fmul $T2a,$A2,$ba 366 fmul $T2b,$A2,$bb 367 stfd $A2,24($nap_d) ; save a[j+1] in double format 368 stfd $A3,32($nap_d) 369 fmul $T3a,$A3,$ba 370 fmul $T3b,$A3,$bb 371 stfd $N0,40($nap_d) ; save n[j] in double format 372 stfd $N1,48($nap_d) 373 fmul $T0a,$A0,$ba 374 fmul $T0b,$A0,$bb 375 stfd $N2,56($nap_d) ; save n[j+1] in double format 376 stfdu $N3,64($nap_d) 377 378 fmadd $T1a,$A0,$bc,$T1a 379 fmadd $T1b,$A0,$bd,$T1b 380 fmadd $T2a,$A1,$bc,$T2a 381 fmadd $T2b,$A1,$bd,$T2b 382 fmadd $T3a,$A2,$bc,$T3a 383 fmadd $T3b,$A2,$bd,$T3b 384 fmul $dota,$A3,$bc 385 fmul $dotb,$A3,$bd 386 387 fmadd $T1a,$N1,$na,$T1a 388 fmadd $T1b,$N1,$nb,$T1b 389 fmadd $T2a,$N2,$na,$T2a 390 fmadd $T2b,$N2,$nb,$T2b 391 fmadd $T3a,$N3,$na,$T3a 392 fmadd $T3b,$N3,$nb,$T3b 393 fmadd $T0a,$N0,$na,$T0a 394 fmadd $T0b,$N0,$nb,$T0b 395 396 fmadd $T1a,$N0,$nc,$T1a 397 fmadd $T1b,$N0,$nd,$T1b 398 fmadd $T2a,$N1,$nc,$T2a 399 fmadd $T2b,$N1,$nd,$T2b 400 fmadd $T3a,$N2,$nc,$T3a 401 fmadd $T3b,$N2,$nd,$T3b 402 fmadd $dota,$N3,$nc,$dota 403 fmadd $dotb,$N3,$nd,$dotb 404 405 fctid $T0a,$T0a 406 fctid $T0b,$T0b 407 fctid $T1a,$T1a 408 fctid $T1b,$T1b 409 fctid $T2a,$T2a 410 fctid $T2b,$T2b 411 fctid $T3a,$T3a 412 fctid $T3b,$T3b 413 414 stfd $T0a,`$FRAME+0`($sp) 415 stfd $T0b,`$FRAME+8`($sp) 416 stfd $T1a,`$FRAME+16`($sp) 417 stfd $T1b,`$FRAME+24`($sp) 418 stfd $T2a,`$FRAME+32`($sp) 419 stfd $T2b,`$FRAME+40`($sp) 420 stfd $T3a,`$FRAME+48`($sp) 421 stfd $T3b,`$FRAME+56`($sp) 422 424 .align 5 425 L1st: 426 ___ 427 $code.=<<___ if ($SIZE_T==8); 428 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 429 lwz $t1,0($ap) 430 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 431 lwz $t3,8($ap) 432 lwz $t4,4($np) ; load n[j] as 32-bit word pair 433 lwz $t5,0($np) 434 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 435 lwz $t7,8($np) 436 ___ 437 $code.=<<___ if ($SIZE_T==4); 438 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 439 lwz $t1,4($ap) 440 lwz $t2,8($ap) 441 lwz $t3,12($ap) 442 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 443 lwz $t5,4($np) 444 lwz $t6,8($np) 445 lwz $t7,12($np) 446 ___ 447 $code.=<<___; 448 std $t0,`$FRAME+64`($sp) 449 std $t1,`$FRAME+72`($sp) 450 std $t2,`$FRAME+80`($sp) 451 std $t3,`$FRAME+88`($sp) 452 std $t4,`$FRAME+96`($sp) 453 std $t5,`$FRAME+104`($sp) 454 std $t6,`$FRAME+112`($sp) 455 std $t7,`$FRAME+120`($sp) 456 ld $t0,`$FRAME+0`($sp) 457 ld $t1,`$FRAME+8`($sp) 458 ld $t2,`$FRAME+16`($sp) 459 ld $t3,`$FRAME+24`($sp) 460 ld $t4,`$FRAME+32`($sp) 461 ld $t5,`$FRAME+40`($sp) 462 ld $t6,`$FRAME+48`($sp) 463 ld $t7,`$FRAME+56`($sp) 464 lfd $A0,`$FRAME+64`($sp) 465 lfd $A1,`$FRAME+72`($sp) 466 lfd $A2,`$FRAME+80`($sp) 467 lfd $A3,`$FRAME+88`($sp) 468 lfd $N0,`$FRAME+96`($sp) 469 lfd $N1,`$FRAME+104`($sp) 470 lfd $N2,`$FRAME+112`($sp) 471 lfd $N3,`$FRAME+120`($sp) 472 fcfid $A0,$A0 473 fcfid $A1,$A1 474 fcfid $A2,$A2 475 fcfid $A3,$A3 476 fcfid $N0,$N0 477 fcfid $N1,$N1 478 fcfid $N2,$N2 479 fcfid $N3,$N3 480 addi $ap,$ap,16 481 addi $np,$np,16 482 483 fmul $T1a,$A1,$ba 484 fmul $T1b,$A1,$bb 485 fmul $T2a,$A2,$ba 486 fmul $T2b,$A2,$bb 487 stfd $A0,8($nap_d) ; save a[j] in double format 488 stfd $A1,16($nap_d) 489 fmul $T3a,$A3,$ba 490 fmul $T3b,$A3,$bb 491 fmadd $T0a,$A0,$ba,$dota 492 fmadd $T0b,$A0,$bb,$dotb 493 stfd $A2,24($nap_d) ; save a[j+1] in double format 494 stfd $A3,32($nap_d) 495 496 fmadd $T1a,$A0,$bc,$T1a 497 fmadd $T1b,$A0,$bd,$T1b 498 fmadd $T2a,$A1,$bc,$T2a 499 fmadd $T2b,$A1,$bd,$T2b 500 stfd $N0,40($nap_d) ; save n[j] in double format 501 stfd $N1,48($nap_d) 502 fmadd $T3a,$A2,$bc,$T3a 503 fmadd $T3b,$A2,$bd,$T3b 504 add $t0,$t0,$carry ; can not overflow 505 fmul $dota,$A3,$bc 506 fmul $dotb,$A3,$bd 507 stfd $N2,56($nap_d) ; save n[j+1] in double format 508 stfdu $N3,64($nap_d) 509 srdi $carry,$t0,16 510 add $t1,$t1,$carry 511 srdi $carry,$t1,16 512 513 fmadd $T1a,$N1,$na,$T1a 514 fmadd $T1b,$N1,$nb,$T1b 515 insrdi $t0,$t1,16,32 516 fmadd $T2a,$N2,$na,$T2a 517 fmadd $T2b,$N2,$nb,$T2b 518 add $t2,$t2,$carry 519 fmadd $T3a,$N3,$na,$T3a 520 fmadd $T3b,$N3,$nb,$T3b 521 srdi $carry,$t2,16 522 fmadd $T0a,$N0,$na,$T0a 523 fmadd $T0b,$N0,$nb,$T0b 524 insrdi $t0,$t2,16,16 525 add $t3,$t3,$carry 526 srdi $carry,$t3,16 527 528 fmadd $T1a,$N0,$nc,$T1a 529 fmadd $T1b,$N0,$nd,$T1b 530 insrdi $t0,$t3,16,0 ; 0..63 bits 531 fmadd $T2a,$N1,$nc,$T2a 532 fmadd $T2b,$N1,$nd,$T2b 533 add $t4,$t4,$carry 534 fmadd $T3a,$N2,$nc,$T3a 535 fmadd $T3b,$N2,$nd,$T3b 536 srdi $carry,$t4,16 537 fmadd $dota,$N3,$nc,$dota 538 fmadd $dotb,$N3,$nd,$dotb 539 add $t5,$t5,$carry 540 srdi $carry,$t5,16 541 insrdi $t4,$t5,16,32 542 543 fctid $T0a,$T0a 544 fctid $T0b,$T0b 545 add $t6,$t6,$carry 546 fctid $T1a,$T1a 547 fctid $T1b,$T1b 548 srdi $carry,$t6,16 549 fctid $T2a,$T2a 550 fctid $T2b,$T2b 551 insrdi $t4,$t6,16,16 552 fctid $T3a,$T3a 553 fctid $T3b,$T3b 554 add $t7,$t7,$carry 555 insrdi $t4,$t7,16,0 ; 64..127 bits 556 srdi $carry,$t7,16 ; upper 33 bits 557 558 stfd $T0a,`$FRAME+0`($sp) 559 stfd $T0b,`$FRAME+8`($sp) 560 stfd $T1a,`$FRAME+16`($sp) 561 stfd $T1b,`$FRAME+24`($sp) 562 stfd $T2a,`$FRAME+32`($sp) 563 stfd $T2b,`$FRAME+40`($sp) 564 stfd $T3a,`$FRAME+48`($sp) 565 stfd $T3b,`$FRAME+56`($sp) 566 std $t0,8($tp) ; tp[j-1] 567 stdu $t4,16($tp) ; tp[j] 568 bdnz- L1st 569 571 fctid $dota,$dota 572 fctid $dotb,$dotb 573 574 ld $t0,`$FRAME+0`($sp) 575 ld $t1,`$FRAME+8`($sp) 576 ld $t2,`$FRAME+16`($sp) 577 ld $t3,`$FRAME+24`($sp) 578 ld $t4,`$FRAME+32`($sp) 579 ld $t5,`$FRAME+40`($sp) 580 ld $t6,`$FRAME+48`($sp) 581 ld $t7,`$FRAME+56`($sp) 582 stfd $dota,`$FRAME+64`($sp) 583 stfd $dotb,`$FRAME+72`($sp) 584 585 add $t0,$t0,$carry ; can not overflow 586 srdi $carry,$t0,16 587 add $t1,$t1,$carry 588 srdi $carry,$t1,16 589 insrdi $t0,$t1,16,32 590 add $t2,$t2,$carry 591 srdi $carry,$t2,16 592 insrdi $t0,$t2,16,16 593 add $t3,$t3,$carry 594 srdi $carry,$t3,16 595 insrdi $t0,$t3,16,0 ; 0..63 bits 596 add $t4,$t4,$carry 597 srdi $carry,$t4,16 598 add $t5,$t5,$carry 599 srdi $carry,$t5,16 600 insrdi $t4,$t5,16,32 601 add $t6,$t6,$carry 602 srdi $carry,$t6,16 603 insrdi $t4,$t6,16,16 604 add $t7,$t7,$carry 605 insrdi $t4,$t7,16,0 ; 64..127 bits 606 srdi $carry,$t7,16 ; upper 33 bits 607 ld $t6,`$FRAME+64`($sp) 608 ld $t7,`$FRAME+72`($sp) 609 610 std $t0,8($tp) ; tp[j-1] 611 stdu $t4,16($tp) ; tp[j] 612 613 add $t6,$t6,$carry ; can not overflow 614 srdi $carry,$t6,16 615 add $t7,$t7,$carry 616 insrdi $t6,$t7,48,0 617 srdi $ovf,$t7,48 618 std $t6,8($tp) ; tp[num-1] 619 620 slwi $t7,$num,2 621 subf $nap_d,$t7,$nap_d ; rewind pointer 622 624 li $i,8 ; i=1 625 .align 5 626 Louter: 627 ___ 628 $code.=<<___ if ($SIZE_T==8); 629 ldx $t3,$bp,$i ; bp[i] 630 ___ 631 $code.=<<___ if ($SIZE_T==4); 632 add $t0,$bp,$i 633 lwz $t3,0($t0) ; bp[i,i+1] 634 lwz $t0,4($t0) 635 insrdi $t3,$t0,32,0 636 ___ 637 $code.=<<___; 638 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 639 mulld $t7,$a0,$t3 ; ap[0]*bp[i] 640 641 addi $tp,$sp,`$FRAME+$TRANSFER` 642 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 643 li $carry,0 644 mulld $t7,$t7,$n0 ; tp[0]*n0 645 mtctr $j 646 647 ; transfer bp[i] to FPU as 4x16-bit values 648 extrdi $t0,$t3,16,48 649 extrdi $t1,$t3,16,32 650 extrdi $t2,$t3,16,16 651 extrdi $t3,$t3,16,0 652 std $t0,`$FRAME+0`($sp) 653 std $t1,`$FRAME+8`($sp) 654 std $t2,`$FRAME+16`($sp) 655 std $t3,`$FRAME+24`($sp) 656 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 657 extrdi $t4,$t7,16,48 658 extrdi $t5,$t7,16,32 659 extrdi $t6,$t7,16,16 660 extrdi $t7,$t7,16,0 661 std $t4,`$FRAME+32`($sp) 662 std $t5,`$FRAME+40`($sp) 663 std $t6,`$FRAME+48`($sp) 664 std $t7,`$FRAME+56`($sp) 665 666 lfd $A0,8($nap_d) ; load a[j] in double format 667 lfd $A1,16($nap_d) 668 lfd $A2,24($nap_d) ; load a[j+1] in double format 669 lfd $A3,32($nap_d) 670 lfd $N0,40($nap_d) ; load n[j] in double format 671 lfd $N1,48($nap_d) 672 lfd $N2,56($nap_d) ; load n[j+1] in double format 673 lfdu $N3,64($nap_d) 674 675 lfd $ba,`$FRAME+0`($sp) 676 lfd $bb,`$FRAME+8`($sp) 677 lfd $bc,`$FRAME+16`($sp) 678 lfd $bd,`$FRAME+24`($sp) 679 lfd $na,`$FRAME+32`($sp) 680 lfd $nb,`$FRAME+40`($sp) 681 lfd $nc,`$FRAME+48`($sp) 682 lfd $nd,`$FRAME+56`($sp) 683 684 fcfid $ba,$ba 685 fcfid $bb,$bb 686 fcfid $bc,$bc 687 fcfid $bd,$bd 688 fcfid $na,$na 689 fcfid $nb,$nb 690 fcfid $nc,$nc 691 fcfid $nd,$nd 692 693 fmul $T1a,$A1,$ba 694 fmul $T1b,$A1,$bb 695 fmul $T2a,$A2,$ba 696 fmul $T2b,$A2,$bb 697 fmul $T3a,$A3,$ba 698 fmul $T3b,$A3,$bb 699 fmul $T0a,$A0,$ba 700 fmul $T0b,$A0,$bb 701 702 fmadd $T1a,$A0,$bc,$T1a 703 fmadd $T1b,$A0,$bd,$T1b 704 fmadd $T2a,$A1,$bc,$T2a 705 fmadd $T2b,$A1,$bd,$T2b 706 fmadd $T3a,$A2,$bc,$T3a 707 fmadd $T3b,$A2,$bd,$T3b 708 fmul $dota,$A3,$bc 709 fmul $dotb,$A3,$bd 710 711 fmadd $T1a,$N1,$na,$T1a 712 fmadd $T1b,$N1,$nb,$T1b 713 lfd $A0,8($nap_d) ; load a[j] in double format 714 lfd $A1,16($nap_d) 715 fmadd $T2a,$N2,$na,$T2a 716 fmadd $T2b,$N2,$nb,$T2b 717 lfd $A2,24($nap_d) ; load a[j+1] in double format 718 lfd $A3,32($nap_d) 719 fmadd $T3a,$N3,$na,$T3a 720 fmadd $T3b,$N3,$nb,$T3b 721 fmadd $T0a,$N0,$na,$T0a 722 fmadd $T0b,$N0,$nb,$T0b 723 724 fmadd $T1a,$N0,$nc,$T1a 725 fmadd $T1b,$N0,$nd,$T1b 726 fmadd $T2a,$N1,$nc,$T2a 727 fmadd $T2b,$N1,$nd,$T2b 728 fmadd $T3a,$N2,$nc,$T3a 729 fmadd $T3b,$N2,$nd,$T3b 730 fmadd $dota,$N3,$nc,$dota 731 fmadd $dotb,$N3,$nd,$dotb 732 733 fctid $T0a,$T0a 734 fctid $T0b,$T0b 735 fctid $T1a,$T1a 736 fctid $T1b,$T1b 737 fctid $T2a,$T2a 738 fctid $T2b,$T2b 739 fctid $T3a,$T3a 740 fctid $T3b,$T3b 741 742 stfd $T0a,`$FRAME+0`($sp) 743 stfd $T0b,`$FRAME+8`($sp) 744 stfd $T1a,`$FRAME+16`($sp) 745 stfd $T1b,`$FRAME+24`($sp) 746 stfd $T2a,`$FRAME+32`($sp) 747 stfd $T2b,`$FRAME+40`($sp) 748 stfd $T3a,`$FRAME+48`($sp) 749 stfd $T3b,`$FRAME+56`($sp) 750 752 .align 5 753 Linner: 754 fmul $T1a,$A1,$ba 755 fmul $T1b,$A1,$bb 756 fmul $T2a,$A2,$ba 757 fmul $T2b,$A2,$bb 758 lfd $N0,40($nap_d) ; load n[j] in double format 759 lfd $N1,48($nap_d) 760 fmul $T3a,$A3,$ba 761 fmul $T3b,$A3,$bb 762 fmadd $T0a,$A0,$ba,$dota 763 fmadd $T0b,$A0,$bb,$dotb 764 lfd $N2,56($nap_d) ; load n[j+1] in double format 765 lfdu $N3,64($nap_d) 766 767 fmadd $T1a,$A0,$bc,$T1a 768 fmadd $T1b,$A0,$bd,$T1b 769 fmadd $T2a,$A1,$bc,$T2a 770 fmadd $T2b,$A1,$bd,$T2b 771 lfd $A0,8($nap_d) ; load a[j] in double format 772 lfd $A1,16($nap_d) 773 fmadd $T3a,$A2,$bc,$T3a 774 fmadd $T3b,$A2,$bd,$T3b 775 fmul $dota,$A3,$bc 776 fmul $dotb,$A3,$bd 777 lfd $A2,24($nap_d) ; load a[j+1] in double format 778 lfd $A3,32($nap_d) 779 780 fmadd $T1a,$N1,$na,$T1a 781 fmadd $T1b,$N1,$nb,$T1b 782 ld $t0,`$FRAME+0`($sp) 783 ld $t1,`$FRAME+8`($sp) 784 fmadd $T2a,$N2,$na,$T2a 785 fmadd $T2b,$N2,$nb,$T2b 786 ld $t2,`$FRAME+16`($sp) 787 ld $t3,`$FRAME+24`($sp) 788 fmadd $T3a,$N3,$na,$T3a 789 fmadd $T3b,$N3,$nb,$T3b 790 add $t0,$t0,$carry ; can not overflow 791 ld $t4,`$FRAME+32`($sp) 792 ld $t5,`$FRAME+40`($sp) 793 fmadd $T0a,$N0,$na,$T0a 794 fmadd $T0b,$N0,$nb,$T0b 795 srdi $carry,$t0,16 796 add $t1,$t1,$carry 797 srdi $carry,$t1,16 798 ld $t6,`$FRAME+48`($sp) 799 ld $t7,`$FRAME+56`($sp) 800 801 fmadd $T1a,$N0,$nc,$T1a 802 fmadd $T1b,$N0,$nd,$T1b 803 insrdi $t0,$t1,16,32 804 ld $t1,8($tp) ; tp[j] 805 fmadd $T2a,$N1,$nc,$T2a 806 fmadd $T2b,$N1,$nd,$T2b 807 add $t2,$t2,$carry 808 fmadd $T3a,$N2,$nc,$T3a 809 fmadd $T3b,$N2,$nd,$T3b 810 srdi $carry,$t2,16 811 insrdi $t0,$t2,16,16 812 fmadd $dota,$N3,$nc,$dota 813 fmadd $dotb,$N3,$nd,$dotb 814 add $t3,$t3,$carry 815 ldu $t2,16($tp) ; tp[j+1] 816 srdi $carry,$t3,16 817 insrdi $t0,$t3,16,0 ; 0..63 bits 818 add $t4,$t4,$carry 819 820 fctid $T0a,$T0a 821 fctid $T0b,$T0b 822 srdi $carry,$t4,16 823 fctid $T1a,$T1a 824 fctid $T1b,$T1b 825 add $t5,$t5,$carry 826 fctid $T2a,$T2a 827 fctid $T2b,$T2b 828 srdi $carry,$t5,16 829 insrdi $t4,$t5,16,32 830 fctid $T3a,$T3a 831 fctid $T3b,$T3b 832 add $t6,$t6,$carry 833 srdi $carry,$t6,16 834 insrdi $t4,$t6,16,16 835 836 stfd $T0a,`$FRAME+0`($sp) 837 stfd $T0b,`$FRAME+8`($sp) 838 add $t7,$t7,$carry 839 addc $t3,$t0,$t1 840 ___ 841 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 842 extrdi $t0,$t0,32,0 843 extrdi $t1,$t1,32,0 844 adde $t0,$t0,$t1 845 ___ 846 $code.=<<___; 847 stfd $T1a,`$FRAME+16`($sp) 848 stfd $T1b,`$FRAME+24`($sp) 849 insrdi $t4,$t7,16,0 ; 64..127 bits 850 srdi $carry,$t7,16 ; upper 33 bits 851 stfd $T2a,`$FRAME+32`($sp) 852 stfd $T2b,`$FRAME+40`($sp) 853 adde $t5,$t4,$t2 854 ___ 855 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 856 extrdi $t4,$t4,32,0 857 extrdi $t2,$t2,32,0 858 adde $t4,$t4,$t2 859 ___ 860 $code.=<<___; 861 stfd $T3a,`$FRAME+48`($sp) 862 stfd $T3b,`$FRAME+56`($sp) 863 addze $carry,$carry 864 std $t3,-16($tp) ; tp[j-1] 865 std $t5,-8($tp) ; tp[j] 866 bdnz- Linner 867 869 fctid $dota,$dota 870 fctid $dotb,$dotb 871 ld $t0,`$FRAME+0`($sp) 872 ld $t1,`$FRAME+8`($sp) 873 ld $t2,`$FRAME+16`($sp) 874 ld $t3,`$FRAME+24`($sp) 875 ld $t4,`$FRAME+32`($sp) 876 ld $t5,`$FRAME+40`($sp) 877 ld $t6,`$FRAME+48`($sp) 878 ld $t7,`$FRAME+56`($sp) 879 stfd $dota,`$FRAME+64`($sp) 880 stfd $dotb,`$FRAME+72`($sp) 881 882 add $t0,$t0,$carry ; can not overflow 883 srdi $carry,$t0,16 884 add $t1,$t1,$carry 885 srdi $carry,$t1,16 886 insrdi $t0,$t1,16,32 887 add $t2,$t2,$carry 888 ld $t1,8($tp) ; tp[j] 889 srdi $carry,$t2,16 890 insrdi $t0,$t2,16,16 891 add $t3,$t3,$carry 892 ldu $t2,16($tp) ; tp[j+1] 893 srdi $carry,$t3,16 894 insrdi $t0,$t3,16,0 ; 0..63 bits 895 add $t4,$t4,$carry 896 srdi $carry,$t4,16 897 add $t5,$t5,$carry 898 srdi $carry,$t5,16 899 insrdi $t4,$t5,16,32 900 add $t6,$t6,$carry 901 srdi $carry,$t6,16 902 insrdi $t4,$t6,16,16 903 add $t7,$t7,$carry 904 insrdi $t4,$t7,16,0 ; 64..127 bits 905 srdi $carry,$t7,16 ; upper 33 bits 906 ld $t6,`$FRAME+64`($sp) 907 ld $t7,`$FRAME+72`($sp) 908 909 addc $t3,$t0,$t1 910 ___ 911 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 912 extrdi $t0,$t0,32,0 913 extrdi $t1,$t1,32,0 914 adde $t0,$t0,$t1 915 ___ 916 $code.=<<___; 917 adde $t5,$t4,$t2 918 ___ 919 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 920 extrdi $t4,$t4,32,0 921 extrdi $t2,$t2,32,0 922 adde $t4,$t4,$t2 923 ___ 924 $code.=<<___; 925 addze $carry,$carry 926 927 std $t3,-16($tp) ; tp[j-1] 928 std $t5,-8($tp) ; tp[j] 929 930 add $carry,$carry,$ovf ; comsume upmost overflow 931 add $t6,$t6,$carry ; can not overflow 932 srdi $carry,$t6,16 933 add $t7,$t7,$carry 934 insrdi $t6,$t7,48,0 935 srdi $ovf,$t7,48 936 std $t6,0($tp) ; tp[num-1] 937 938 slwi $t7,$num,2 939 addi $i,$i,8 940 subf $nap_d,$t7,$nap_d ; rewind pointer 941 cmpw $i,$num 942 blt- Louter 943 ___ 944 946 $code.=<<___ if ($SIZE_T==8); 947 subf $np,$num,$np ; rewind np 948 addi $j,$j,1 ; restore counter 949 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 950 addi $tp,$sp,`$FRAME+$TRANSFER+8` 951 addi $t4,$sp,`$FRAME+$TRANSFER+16` 952 addi $t5,$np,8 953 addi $t6,$rp,8 954 mtctr $j 955 956 .align 4 957 Lsub: ldx $t0,$tp,$i 958 ldx $t1,$np,$i 959 ldx $t2,$t4,$i 960 ldx $t3,$t5,$i 961 subfe $t0,$t1,$t0 ; tp[j]-np[j] 962 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] 963 stdx $t0,$rp,$i 964 stdx $t2,$t6,$i 965 addi $i,$i,16 966 bdnz- Lsub 967 968 li $i,0 969 subfe $ovf,$i,$ovf ; handle upmost overflow bit 970 and $ap,$tp,$ovf 971 andc $np,$rp,$ovf 972 or $ap,$ap,$np ; ap=borrow?tp:rp 973 addi $t7,$ap,8 974 mtctr $j 975 976 .align 4 977 Lcopy: ; copy or in-place refresh 978 ldx $t0,$ap,$i 979 ldx $t1,$t7,$i 980 std $i,8($nap_d) ; zap nap_d 981 std $i,16($nap_d) 982 std $i,24($nap_d) 983 std $i,32($nap_d) 984 std $i,40($nap_d) 985 std $i,48($nap_d) 986 std $i,56($nap_d) 987 stdu $i,64($nap_d) 988 stdx $t0,$rp,$i 989 stdx $t1,$t6,$i 990 stdx $i,$tp,$i ; zap tp at once 991 stdx $i,$t4,$i 992 addi $i,$i,16 993 bdnz- Lcopy 994 ___ 995 $code.=<<___ if ($SIZE_T==4); 996 subf $np,$num,$np ; rewind np 997 addi $j,$j,1 ; restore counter 998 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 999 addi $tp,$sp,`$FRAME+$TRANSFER` 1000 addi $np,$np,-4 1001 addi $rp,$rp,-4 1002 addi $ap,$sp,`$FRAME+$TRANSFER+4` 1003 mtctr $j 1004 1005 .align 4 1006 Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order 1007 ldu $t2,16($tp) 1008 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order 1009 lwz $t5,8($np) 1010 lwz $t6,12($np) 1011 lwzu $t7,16($np) 1012 extrdi $t1,$t0,32,0 1013 extrdi $t3,$t2,32,0 1014 subfe $t4,$t4,$t0 ; tp[j]-np[j] 1015 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order 1016 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] 1017 stw $t1,8($ap) 1018 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] 1019 stw $t2,12($ap) 1020 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] 1021 stwu $t3,16($ap) 1022 stw $t4,4($rp) 1023 stw $t5,8($rp) 1024 stw $t6,12($rp) 1025 stwu $t7,16($rp) 1026 bdnz- Lsub 1027 1028 li $i,0 1029 subfe $ovf,$i,$ovf ; handle upmost overflow bit 1030 addi $tp,$sp,`$FRAME+$TRANSFER+4` 1031 subf $rp,$num,$rp ; rewind rp 1032 and $ap,$tp,$ovf 1033 andc $np,$rp,$ovf 1034 or $ap,$ap,$np ; ap=borrow?tp:rp 1035 addi $tp,$sp,`$FRAME+$TRANSFER` 1036 mtctr $j 1037 1038 .align 4 1039 Lcopy: ; copy or in-place refresh 1040 lwz $t0,4($ap) 1041 lwz $t1,8($ap) 1042 lwz $t2,12($ap) 1043 lwzu $t3,16($ap) 1044 std $i,8($nap_d) ; zap nap_d 1045 std $i,16($nap_d) 1046 std $i,24($nap_d) 1047 std $i,32($nap_d) 1048 std $i,40($nap_d) 1049 std $i,48($nap_d) 1050 std $i,56($nap_d) 1051 stdu $i,64($nap_d) 1052 stw $t0,4($rp) 1053 stw $t1,8($rp) 1054 stw $t2,12($rp) 1055 stwu $t3,16($rp) 1056 std $i,8($tp) ; zap tp at once 1057 stdu $i,16($tp) 1058 bdnz- Lcopy 1059 ___ 1060 1062 $code.=<<___; 1063 $POP $i,0($sp) 1064 li r3,1 ; signal "handled" 1065 $POP r22,`-12*8-10*$SIZE_T`($i) 1066 $POP r23,`-12*8-9*$SIZE_T`($i) 1067 $POP r24,`-12*8-8*$SIZE_T`($i) 1068 $POP r25,`-12*8-7*$SIZE_T`($i) 1069 $POP r26,`-12*8-6*$SIZE_T`($i) 1070 $POP r27,`-12*8-5*$SIZE_T`($i) 1071 $POP r28,`-12*8-4*$SIZE_T`($i) 1072 $POP r29,`-12*8-3*$SIZE_T`($i) 1073 $POP r30,`-12*8-2*$SIZE_T`($i) 1074 $POP r31,`-12*8-1*$SIZE_T`($i) 1075 lfd f20,`-12*8`($i) 1076 lfd f21,`-11*8`($i) 1077 lfd f22,`-10*8`($i) 1078 lfd f23,`-9*8`($i) 1079 lfd f24,`-8*8`($i) 1080 lfd f25,`-7*8`($i) 1081 lfd f26,`-6*8`($i) 1082 lfd f27,`-5*8`($i) 1083 lfd f28,`-4*8`($i) 1084 lfd f29,`-3*8`($i) 1085 lfd f30,`-2*8`($i) 1086 lfd f31,`-1*8`($i) 1087 mr $sp,$i 1088 blr 1089 .long 0 1090 .byte 0,12,4,0,0x8c,10,6,0 1091 .long 0 1092 1093 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" 1094 ___ 1095 1096 $code =~ s/\`([^\`]*)\`/eval $1/gem; 1097 print $code; 1098 close STDOUT; 1099