1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 10 # December 2007 11 12 # The reason for undertaken effort is basically following. Even though 13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI 14 # performance was observed to be less than impressive, essentially as 15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. 16 # Well, it's not surprising that IBM had to make some sacrifices to 17 # boost the clock frequency that much, but no overall improvement? 18 # Having observed how much difference did switching to FPU make on 19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate... 20 # Unfortunately the resulting performance improvement is not as 21 # impressive, ~30%, and in absolute terms is still very far from what 22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing 23 # something wrong, but in the lack of assembler level micro-profiling 24 # data or at least decent platform guide I can't tell... Or better 25 # results might be achieved with VMX... Anyway, this module provides 26 # *worse* performance on other PowerPC implementations, ~40-15% slower 27 # on PPC970 depending on key length and ~40% slower on Power 5 for all 28 # key lengths. As it's obviously inappropriate as "best all-round" 29 # alternative, it has to be complemented with run-time CPU family 30 # detection. Oh! It should also be noted that unlike other PowerPC 31 # implementation IALU ppc-mont.pl module performs *suboptimaly* on 32 # >=1024-bit key lengths on Power 6. It should also be noted that 33 # *everything* said so far applies to 64-bit builds! As far as 32-bit 34 # application executed on 64-bit CPU goes, this module is likely to 35 # become preferred choice, because it's easy to adapt it for such 36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors. 37 38 # February 2008 39 40 # Micro-profiling assisted optimization results in ~15% improvement 41 # over original ppc64-mont.pl version, or overall ~50% improvement 42 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same 43 # Power 6 CPU, this module is 5-150% faster depending on key length, 44 # [hereafter] more for longer keys. But if compared to ppc-mont.pl 45 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive 46 # in absolute terms, but it's apparently the way Power 6 is... 47 48 $flavour = shift; 49 50 if ($flavour =~ /32/) { 51 $SIZE_T=4; 52 $RZONE= 224; 53 $FRAME= $SIZE_T*12+8*12; 54 $fname= "bn_mul_mont_ppc64"; 55 56 $STUX= "stwux"; # store indexed and update 57 $PUSH= "stw"; 58 $POP= "lwz"; 59 die "not implemented yet"; 60 } elsif ($flavour =~ /64/) { 61 $SIZE_T=8; 62 $RZONE= 288; 63 $FRAME= $SIZE_T*12+8*12; 64 $fname= "bn_mul_mont"; 65 66 # same as above, but 64-bit mnemonics... 67 $STUX= "stdux"; # store indexed and update 68 $PUSH= "std"; 69 $POP= "ld"; 70 } else { die "nonsense $flavour"; } 71 72 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 73 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 74 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 75 die "can't locate ppc-xlate.pl"; 76 77 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 78 79 $FRAME=($FRAME+63)&~63; 80 $TRANSFER=16*8; 81 82 $carry="r0"; 83 $sp="r1"; 84 $toc="r2"; 85 $rp="r3"; $ovf="r3"; 86 $ap="r4"; 87 $bp="r5"; 88 $np="r6"; 89 $n0="r7"; 90 $num="r8"; 91 $rp="r9"; # $rp is reassigned 92 $tp="r10"; 93 $j="r11"; 94 $i="r12"; 95 # non-volatile registers 96 $nap_d="r14"; # interleaved ap and np in double format 97 $a0="r15"; # ap[0] 98 $t0="r16"; # temporary registers 99 $t1="r17"; 100 $t2="r18"; 101 $t3="r19"; 102 $t4="r20"; 103 $t5="r21"; 104 $t6="r22"; 105 $t7="r23"; 106 107 # PPC offers enough register bank capacity to unroll inner loops twice 108 # 109 # ..A3A2A1A0 110 # dcba 111 # ----------- 112 # A0a 113 # A0b 114 # A0c 115 # A0d 116 # A1a 117 # A1b 118 # A1c 119 # A1d 120 # A2a 121 # A2b 122 # A2c 123 # A2d 124 # A3a 125 # A3b 126 # A3c 127 # A3d 128 # ..a 129 # ..b 130 # 131 $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; 132 $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; 133 $dota="f8"; $dotb="f9"; 134 $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; 135 $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; 136 $T0a="f18"; $T0b="f19"; 137 $T1a="f20"; $T1b="f21"; 138 $T2a="f22"; $T2b="f23"; 139 $T3a="f24"; $T3b="f25"; 140 142 # sp----------->+-------------------------------+ 143 # | saved sp | 144 # +-------------------------------+ 145 # | | 146 # +-------------------------------+ 147 # | 10 saved gpr, r14-r23 | 148 # . . 149 # . . 150 # +12*size_t +-------------------------------+ 151 # | 12 saved fpr, f14-f25 | 152 # . . 153 # . . 154 # +12*8 +-------------------------------+ 155 # | padding to 64 byte boundary | 156 # . . 157 # +X +-------------------------------+ 158 # | 16 gpr<->fpr transfer zone | 159 # . . 160 # . . 161 # +16*8 +-------------------------------+ 162 # | __int64 tmp[-1] | 163 # +-------------------------------+ 164 # | __int64 tmp[num] | 165 # . . 166 # . . 167 # . . 168 # +(num+1)*8 +-------------------------------+ 169 # | padding to 64 byte boundary | 170 # . . 171 # +X +-------------------------------+ 172 # | double nap_d[4*num] | 173 # . . 174 # . . 175 # . . 176 # +-------------------------------+ 177 179 $code=<<___; 180 .machine "any" 181 .text 182 183 .globl .$fname 184 .align 5 185 .$fname: 186 cmpwi $num,4 187 mr $rp,r3 ; $rp is reassigned 188 li r3,0 ; possible "not handled" return code 189 bltlr- 190 andi. r0,$num,1 ; $num has to be even 191 bnelr- 192 193 slwi $num,$num,3 ; num*=8 194 li $i,-4096 195 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num 196 add $tp,$tp,$num ; place for tp[num+1] 197 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` 198 subf $tp,$tp,$sp ; $sp-$tp 199 and $tp,$tp,$i ; minimize TLB usage 200 subf $tp,$sp,$tp ; $tp-$sp 201 $STUX $sp,$sp,$tp ; alloca 202 203 $PUSH r14,`2*$SIZE_T`($sp) 204 $PUSH r15,`3*$SIZE_T`($sp) 205 $PUSH r16,`4*$SIZE_T`($sp) 206 $PUSH r17,`5*$SIZE_T`($sp) 207 $PUSH r18,`6*$SIZE_T`($sp) 208 $PUSH r19,`7*$SIZE_T`($sp) 209 $PUSH r20,`8*$SIZE_T`($sp) 210 $PUSH r21,`9*$SIZE_T`($sp) 211 $PUSH r22,`10*$SIZE_T`($sp) 212 $PUSH r23,`11*$SIZE_T`($sp) 213 stfd f14,`12*$SIZE_T+0`($sp) 214 stfd f15,`12*$SIZE_T+8`($sp) 215 stfd f16,`12*$SIZE_T+16`($sp) 216 stfd f17,`12*$SIZE_T+24`($sp) 217 stfd f18,`12*$SIZE_T+32`($sp) 218 stfd f19,`12*$SIZE_T+40`($sp) 219 stfd f20,`12*$SIZE_T+48`($sp) 220 stfd f21,`12*$SIZE_T+56`($sp) 221 stfd f22,`12*$SIZE_T+64`($sp) 222 stfd f23,`12*$SIZE_T+72`($sp) 223 stfd f24,`12*$SIZE_T+80`($sp) 224 stfd f25,`12*$SIZE_T+88`($sp) 225 226 ld $a0,0($ap) ; pull ap[0] value 227 ld $n0,0($n0) ; pull n0[0] value 228 ld $t3,0($bp) ; bp[0] 229 230 addi $tp,$sp,`$FRAME+$TRANSFER+8+64` 231 li $i,-64 232 add $nap_d,$tp,$num 233 and $nap_d,$nap_d,$i ; align to 64 bytes 234 236 mulld $t7,$a0,$t3 ; ap[0]*bp[0] 237 ; nap_d is off by 1, because it's used with stfdu/lfdu 238 addi $nap_d,$nap_d,-8 239 srwi $j,$num,`3+1` ; counter register, num/2 240 mulld $t7,$t7,$n0 ; tp[0]*n0 241 addi $j,$j,-1 242 addi $tp,$sp,`$FRAME+$TRANSFER-8` 243 li $carry,0 244 mtctr $j 245 246 ; transfer bp[0] to FPU as 4x16-bit values 247 extrdi $t0,$t3,16,48 248 extrdi $t1,$t3,16,32 249 extrdi $t2,$t3,16,16 250 extrdi $t3,$t3,16,0 251 std $t0,`$FRAME+0`($sp) 252 std $t1,`$FRAME+8`($sp) 253 std $t2,`$FRAME+16`($sp) 254 std $t3,`$FRAME+24`($sp) 255 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 256 extrdi $t4,$t7,16,48 257 extrdi $t5,$t7,16,32 258 extrdi $t6,$t7,16,16 259 extrdi $t7,$t7,16,0 260 std $t4,`$FRAME+32`($sp) 261 std $t5,`$FRAME+40`($sp) 262 std $t6,`$FRAME+48`($sp) 263 std $t7,`$FRAME+56`($sp) 264 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 265 lwz $t1,0($ap) 266 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 267 lwz $t3,8($ap) 268 lwz $t4,4($np) ; load n[j] as 32-bit word pair 269 lwz $t5,0($np) 270 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 271 lwz $t7,8($np) 272 lfd $ba,`$FRAME+0`($sp) 273 lfd $bb,`$FRAME+8`($sp) 274 lfd $bc,`$FRAME+16`($sp) 275 lfd $bd,`$FRAME+24`($sp) 276 lfd $na,`$FRAME+32`($sp) 277 lfd $nb,`$FRAME+40`($sp) 278 lfd $nc,`$FRAME+48`($sp) 279 lfd $nd,`$FRAME+56`($sp) 280 std $t0,`$FRAME+64`($sp) 281 std $t1,`$FRAME+72`($sp) 282 std $t2,`$FRAME+80`($sp) 283 std $t3,`$FRAME+88`($sp) 284 std $t4,`$FRAME+96`($sp) 285 std $t5,`$FRAME+104`($sp) 286 std $t6,`$FRAME+112`($sp) 287 std $t7,`$FRAME+120`($sp) 288 fcfid $ba,$ba 289 fcfid $bb,$bb 290 fcfid $bc,$bc 291 fcfid $bd,$bd 292 fcfid $na,$na 293 fcfid $nb,$nb 294 fcfid $nc,$nc 295 fcfid $nd,$nd 296 297 lfd $A0,`$FRAME+64`($sp) 298 lfd $A1,`$FRAME+72`($sp) 299 lfd $A2,`$FRAME+80`($sp) 300 lfd $A3,`$FRAME+88`($sp) 301 lfd $N0,`$FRAME+96`($sp) 302 lfd $N1,`$FRAME+104`($sp) 303 lfd $N2,`$FRAME+112`($sp) 304 lfd $N3,`$FRAME+120`($sp) 305 fcfid $A0,$A0 306 fcfid $A1,$A1 307 fcfid $A2,$A2 308 fcfid $A3,$A3 309 fcfid $N0,$N0 310 fcfid $N1,$N1 311 fcfid $N2,$N2 312 fcfid $N3,$N3 313 addi $ap,$ap,16 314 addi $np,$np,16 315 316 fmul $T1a,$A1,$ba 317 fmul $T1b,$A1,$bb 318 stfd $A0,8($nap_d) ; save a[j] in double format 319 stfd $A1,16($nap_d) 320 fmul $T2a,$A2,$ba 321 fmul $T2b,$A2,$bb 322 stfd $A2,24($nap_d) ; save a[j+1] in double format 323 stfd $A3,32($nap_d) 324 fmul $T3a,$A3,$ba 325 fmul $T3b,$A3,$bb 326 stfd $N0,40($nap_d) ; save n[j] in double format 327 stfd $N1,48($nap_d) 328 fmul $T0a,$A0,$ba 329 fmul $T0b,$A0,$bb 330 stfd $N2,56($nap_d) ; save n[j+1] in double format 331 stfdu $N3,64($nap_d) 332 333 fmadd $T1a,$A0,$bc,$T1a 334 fmadd $T1b,$A0,$bd,$T1b 335 fmadd $T2a,$A1,$bc,$T2a 336 fmadd $T2b,$A1,$bd,$T2b 337 fmadd $T3a,$A2,$bc,$T3a 338 fmadd $T3b,$A2,$bd,$T3b 339 fmul $dota,$A3,$bc 340 fmul $dotb,$A3,$bd 341 342 fmadd $T1a,$N1,$na,$T1a 343 fmadd $T1b,$N1,$nb,$T1b 344 fmadd $T2a,$N2,$na,$T2a 345 fmadd $T2b,$N2,$nb,$T2b 346 fmadd $T3a,$N3,$na,$T3a 347 fmadd $T3b,$N3,$nb,$T3b 348 fmadd $T0a,$N0,$na,$T0a 349 fmadd $T0b,$N0,$nb,$T0b 350 351 fmadd $T1a,$N0,$nc,$T1a 352 fmadd $T1b,$N0,$nd,$T1b 353 fmadd $T2a,$N1,$nc,$T2a 354 fmadd $T2b,$N1,$nd,$T2b 355 fmadd $T3a,$N2,$nc,$T3a 356 fmadd $T3b,$N2,$nd,$T3b 357 fmadd $dota,$N3,$nc,$dota 358 fmadd $dotb,$N3,$nd,$dotb 359 360 fctid $T0a,$T0a 361 fctid $T0b,$T0b 362 fctid $T1a,$T1a 363 fctid $T1b,$T1b 364 fctid $T2a,$T2a 365 fctid $T2b,$T2b 366 fctid $T3a,$T3a 367 fctid $T3b,$T3b 368 369 stfd $T0a,`$FRAME+0`($sp) 370 stfd $T0b,`$FRAME+8`($sp) 371 stfd $T1a,`$FRAME+16`($sp) 372 stfd $T1b,`$FRAME+24`($sp) 373 stfd $T2a,`$FRAME+32`($sp) 374 stfd $T2b,`$FRAME+40`($sp) 375 stfd $T3a,`$FRAME+48`($sp) 376 stfd $T3b,`$FRAME+56`($sp) 377 379 .align 5 380 L1st: 381 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 382 lwz $t1,0($ap) 383 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 384 lwz $t3,8($ap) 385 lwz $t4,4($np) ; load n[j] as 32-bit word pair 386 lwz $t5,0($np) 387 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 388 lwz $t7,8($np) 389 std $t0,`$FRAME+64`($sp) 390 std $t1,`$FRAME+72`($sp) 391 std $t2,`$FRAME+80`($sp) 392 std $t3,`$FRAME+88`($sp) 393 std $t4,`$FRAME+96`($sp) 394 std $t5,`$FRAME+104`($sp) 395 std $t6,`$FRAME+112`($sp) 396 std $t7,`$FRAME+120`($sp) 397 ld $t0,`$FRAME+0`($sp) 398 ld $t1,`$FRAME+8`($sp) 399 ld $t2,`$FRAME+16`($sp) 400 ld $t3,`$FRAME+24`($sp) 401 ld $t4,`$FRAME+32`($sp) 402 ld $t5,`$FRAME+40`($sp) 403 ld $t6,`$FRAME+48`($sp) 404 ld $t7,`$FRAME+56`($sp) 405 lfd $A0,`$FRAME+64`($sp) 406 lfd $A1,`$FRAME+72`($sp) 407 lfd $A2,`$FRAME+80`($sp) 408 lfd $A3,`$FRAME+88`($sp) 409 lfd $N0,`$FRAME+96`($sp) 410 lfd $N1,`$FRAME+104`($sp) 411 lfd $N2,`$FRAME+112`($sp) 412 lfd $N3,`$FRAME+120`($sp) 413 fcfid $A0,$A0 414 fcfid $A1,$A1 415 fcfid $A2,$A2 416 fcfid $A3,$A3 417 fcfid $N0,$N0 418 fcfid $N1,$N1 419 fcfid $N2,$N2 420 fcfid $N3,$N3 421 addi $ap,$ap,16 422 addi $np,$np,16 423 424 fmul $T1a,$A1,$ba 425 fmul $T1b,$A1,$bb 426 fmul $T2a,$A2,$ba 427 fmul $T2b,$A2,$bb 428 stfd $A0,8($nap_d) ; save a[j] in double format 429 stfd $A1,16($nap_d) 430 fmul $T3a,$A3,$ba 431 fmul $T3b,$A3,$bb 432 fmadd $T0a,$A0,$ba,$dota 433 fmadd $T0b,$A0,$bb,$dotb 434 stfd $A2,24($nap_d) ; save a[j+1] in double format 435 stfd $A3,32($nap_d) 436 437 fmadd $T1a,$A0,$bc,$T1a 438 fmadd $T1b,$A0,$bd,$T1b 439 fmadd $T2a,$A1,$bc,$T2a 440 fmadd $T2b,$A1,$bd,$T2b 441 stfd $N0,40($nap_d) ; save n[j] in double format 442 stfd $N1,48($nap_d) 443 fmadd $T3a,$A2,$bc,$T3a 444 fmadd $T3b,$A2,$bd,$T3b 445 add $t0,$t0,$carry ; can not overflow 446 fmul $dota,$A3,$bc 447 fmul $dotb,$A3,$bd 448 stfd $N2,56($nap_d) ; save n[j+1] in double format 449 stfdu $N3,64($nap_d) 450 srdi $carry,$t0,16 451 add $t1,$t1,$carry 452 srdi $carry,$t1,16 453 454 fmadd $T1a,$N1,$na,$T1a 455 fmadd $T1b,$N1,$nb,$T1b 456 insrdi $t0,$t1,16,32 457 fmadd $T2a,$N2,$na,$T2a 458 fmadd $T2b,$N2,$nb,$T2b 459 add $t2,$t2,$carry 460 fmadd $T3a,$N3,$na,$T3a 461 fmadd $T3b,$N3,$nb,$T3b 462 srdi $carry,$t2,16 463 fmadd $T0a,$N0,$na,$T0a 464 fmadd $T0b,$N0,$nb,$T0b 465 insrdi $t0,$t2,16,16 466 add $t3,$t3,$carry 467 srdi $carry,$t3,16 468 469 fmadd $T1a,$N0,$nc,$T1a 470 fmadd $T1b,$N0,$nd,$T1b 471 insrdi $t0,$t3,16,0 ; 0..63 bits 472 fmadd $T2a,$N1,$nc,$T2a 473 fmadd $T2b,$N1,$nd,$T2b 474 add $t4,$t4,$carry 475 fmadd $T3a,$N2,$nc,$T3a 476 fmadd $T3b,$N2,$nd,$T3b 477 srdi $carry,$t4,16 478 fmadd $dota,$N3,$nc,$dota 479 fmadd $dotb,$N3,$nd,$dotb 480 add $t5,$t5,$carry 481 srdi $carry,$t5,16 482 insrdi $t4,$t5,16,32 483 484 fctid $T0a,$T0a 485 fctid $T0b,$T0b 486 add $t6,$t6,$carry 487 fctid $T1a,$T1a 488 fctid $T1b,$T1b 489 srdi $carry,$t6,16 490 fctid $T2a,$T2a 491 fctid $T2b,$T2b 492 insrdi $t4,$t6,16,16 493 fctid $T3a,$T3a 494 fctid $T3b,$T3b 495 add $t7,$t7,$carry 496 insrdi $t4,$t7,16,0 ; 64..127 bits 497 srdi $carry,$t7,16 ; upper 33 bits 498 499 stfd $T0a,`$FRAME+0`($sp) 500 stfd $T0b,`$FRAME+8`($sp) 501 stfd $T1a,`$FRAME+16`($sp) 502 stfd $T1b,`$FRAME+24`($sp) 503 stfd $T2a,`$FRAME+32`($sp) 504 stfd $T2b,`$FRAME+40`($sp) 505 stfd $T3a,`$FRAME+48`($sp) 506 stfd $T3b,`$FRAME+56`($sp) 507 std $t0,8($tp) ; tp[j-1] 508 stdu $t4,16($tp) ; tp[j] 509 bdnz- L1st 510 512 fctid $dota,$dota 513 fctid $dotb,$dotb 514 515 ld $t0,`$FRAME+0`($sp) 516 ld $t1,`$FRAME+8`($sp) 517 ld $t2,`$FRAME+16`($sp) 518 ld $t3,`$FRAME+24`($sp) 519 ld $t4,`$FRAME+32`($sp) 520 ld $t5,`$FRAME+40`($sp) 521 ld $t6,`$FRAME+48`($sp) 522 ld $t7,`$FRAME+56`($sp) 523 stfd $dota,`$FRAME+64`($sp) 524 stfd $dotb,`$FRAME+72`($sp) 525 526 add $t0,$t0,$carry ; can not overflow 527 srdi $carry,$t0,16 528 add $t1,$t1,$carry 529 srdi $carry,$t1,16 530 insrdi $t0,$t1,16,32 531 add $t2,$t2,$carry 532 srdi $carry,$t2,16 533 insrdi $t0,$t2,16,16 534 add $t3,$t3,$carry 535 srdi $carry,$t3,16 536 insrdi $t0,$t3,16,0 ; 0..63 bits 537 add $t4,$t4,$carry 538 srdi $carry,$t4,16 539 add $t5,$t5,$carry 540 srdi $carry,$t5,16 541 insrdi $t4,$t5,16,32 542 add $t6,$t6,$carry 543 srdi $carry,$t6,16 544 insrdi $t4,$t6,16,16 545 add $t7,$t7,$carry 546 insrdi $t4,$t7,16,0 ; 64..127 bits 547 srdi $carry,$t7,16 ; upper 33 bits 548 ld $t6,`$FRAME+64`($sp) 549 ld $t7,`$FRAME+72`($sp) 550 551 std $t0,8($tp) ; tp[j-1] 552 stdu $t4,16($tp) ; tp[j] 553 554 add $t6,$t6,$carry ; can not overflow 555 srdi $carry,$t6,16 556 add $t7,$t7,$carry 557 insrdi $t6,$t7,48,0 558 srdi $ovf,$t7,48 559 std $t6,8($tp) ; tp[num-1] 560 561 slwi $t7,$num,2 562 subf $nap_d,$t7,$nap_d ; rewind pointer 563 565 li $i,8 ; i=1 566 .align 5 567 Louter: 568 ldx $t3,$bp,$i ; bp[i] 569 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 570 mulld $t7,$a0,$t3 ; ap[0]*bp[i] 571 572 addi $tp,$sp,`$FRAME+$TRANSFER` 573 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 574 li $carry,0 575 mulld $t7,$t7,$n0 ; tp[0]*n0 576 mtctr $j 577 578 ; transfer bp[i] to FPU as 4x16-bit values 579 extrdi $t0,$t3,16,48 580 extrdi $t1,$t3,16,32 581 extrdi $t2,$t3,16,16 582 extrdi $t3,$t3,16,0 583 std $t0,`$FRAME+0`($sp) 584 std $t1,`$FRAME+8`($sp) 585 std $t2,`$FRAME+16`($sp) 586 std $t3,`$FRAME+24`($sp) 587 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 588 extrdi $t4,$t7,16,48 589 extrdi $t5,$t7,16,32 590 extrdi $t6,$t7,16,16 591 extrdi $t7,$t7,16,0 592 std $t4,`$FRAME+32`($sp) 593 std $t5,`$FRAME+40`($sp) 594 std $t6,`$FRAME+48`($sp) 595 std $t7,`$FRAME+56`($sp) 596 597 lfd $A0,8($nap_d) ; load a[j] in double format 598 lfd $A1,16($nap_d) 599 lfd $A2,24($nap_d) ; load a[j+1] in double format 600 lfd $A3,32($nap_d) 601 lfd $N0,40($nap_d) ; load n[j] in double format 602 lfd $N1,48($nap_d) 603 lfd $N2,56($nap_d) ; load n[j+1] in double format 604 lfdu $N3,64($nap_d) 605 606 lfd $ba,`$FRAME+0`($sp) 607 lfd $bb,`$FRAME+8`($sp) 608 lfd $bc,`$FRAME+16`($sp) 609 lfd $bd,`$FRAME+24`($sp) 610 lfd $na,`$FRAME+32`($sp) 611 lfd $nb,`$FRAME+40`($sp) 612 lfd $nc,`$FRAME+48`($sp) 613 lfd $nd,`$FRAME+56`($sp) 614 615 fcfid $ba,$ba 616 fcfid $bb,$bb 617 fcfid $bc,$bc 618 fcfid $bd,$bd 619 fcfid $na,$na 620 fcfid $nb,$nb 621 fcfid $nc,$nc 622 fcfid $nd,$nd 623 624 fmul $T1a,$A1,$ba 625 fmul $T1b,$A1,$bb 626 fmul $T2a,$A2,$ba 627 fmul $T2b,$A2,$bb 628 fmul $T3a,$A3,$ba 629 fmul $T3b,$A3,$bb 630 fmul $T0a,$A0,$ba 631 fmul $T0b,$A0,$bb 632 633 fmadd $T1a,$A0,$bc,$T1a 634 fmadd $T1b,$A0,$bd,$T1b 635 fmadd $T2a,$A1,$bc,$T2a 636 fmadd $T2b,$A1,$bd,$T2b 637 fmadd $T3a,$A2,$bc,$T3a 638 fmadd $T3b,$A2,$bd,$T3b 639 fmul $dota,$A3,$bc 640 fmul $dotb,$A3,$bd 641 642 fmadd $T1a,$N1,$na,$T1a 643 fmadd $T1b,$N1,$nb,$T1b 644 lfd $A0,8($nap_d) ; load a[j] in double format 645 lfd $A1,16($nap_d) 646 fmadd $T2a,$N2,$na,$T2a 647 fmadd $T2b,$N2,$nb,$T2b 648 lfd $A2,24($nap_d) ; load a[j+1] in double format 649 lfd $A3,32($nap_d) 650 fmadd $T3a,$N3,$na,$T3a 651 fmadd $T3b,$N3,$nb,$T3b 652 fmadd $T0a,$N0,$na,$T0a 653 fmadd $T0b,$N0,$nb,$T0b 654 655 fmadd $T1a,$N0,$nc,$T1a 656 fmadd $T1b,$N0,$nd,$T1b 657 fmadd $T2a,$N1,$nc,$T2a 658 fmadd $T2b,$N1,$nd,$T2b 659 fmadd $T3a,$N2,$nc,$T3a 660 fmadd $T3b,$N2,$nd,$T3b 661 fmadd $dota,$N3,$nc,$dota 662 fmadd $dotb,$N3,$nd,$dotb 663 664 fctid $T0a,$T0a 665 fctid $T0b,$T0b 666 fctid $T1a,$T1a 667 fctid $T1b,$T1b 668 fctid $T2a,$T2a 669 fctid $T2b,$T2b 670 fctid $T3a,$T3a 671 fctid $T3b,$T3b 672 673 stfd $T0a,`$FRAME+0`($sp) 674 stfd $T0b,`$FRAME+8`($sp) 675 stfd $T1a,`$FRAME+16`($sp) 676 stfd $T1b,`$FRAME+24`($sp) 677 stfd $T2a,`$FRAME+32`($sp) 678 stfd $T2b,`$FRAME+40`($sp) 679 stfd $T3a,`$FRAME+48`($sp) 680 stfd $T3b,`$FRAME+56`($sp) 681 683 .align 5 684 Linner: 685 fmul $T1a,$A1,$ba 686 fmul $T1b,$A1,$bb 687 fmul $T2a,$A2,$ba 688 fmul $T2b,$A2,$bb 689 lfd $N0,40($nap_d) ; load n[j] in double format 690 lfd $N1,48($nap_d) 691 fmul $T3a,$A3,$ba 692 fmul $T3b,$A3,$bb 693 fmadd $T0a,$A0,$ba,$dota 694 fmadd $T0b,$A0,$bb,$dotb 695 lfd $N2,56($nap_d) ; load n[j+1] in double format 696 lfdu $N3,64($nap_d) 697 698 fmadd $T1a,$A0,$bc,$T1a 699 fmadd $T1b,$A0,$bd,$T1b 700 fmadd $T2a,$A1,$bc,$T2a 701 fmadd $T2b,$A1,$bd,$T2b 702 lfd $A0,8($nap_d) ; load a[j] in double format 703 lfd $A1,16($nap_d) 704 fmadd $T3a,$A2,$bc,$T3a 705 fmadd $T3b,$A2,$bd,$T3b 706 fmul $dota,$A3,$bc 707 fmul $dotb,$A3,$bd 708 lfd $A2,24($nap_d) ; load a[j+1] in double format 709 lfd $A3,32($nap_d) 710 711 fmadd $T1a,$N1,$na,$T1a 712 fmadd $T1b,$N1,$nb,$T1b 713 ld $t0,`$FRAME+0`($sp) 714 ld $t1,`$FRAME+8`($sp) 715 fmadd $T2a,$N2,$na,$T2a 716 fmadd $T2b,$N2,$nb,$T2b 717 ld $t2,`$FRAME+16`($sp) 718 ld $t3,`$FRAME+24`($sp) 719 fmadd $T3a,$N3,$na,$T3a 720 fmadd $T3b,$N3,$nb,$T3b 721 add $t0,$t0,$carry ; can not overflow 722 ld $t4,`$FRAME+32`($sp) 723 ld $t5,`$FRAME+40`($sp) 724 fmadd $T0a,$N0,$na,$T0a 725 fmadd $T0b,$N0,$nb,$T0b 726 srdi $carry,$t0,16 727 add $t1,$t1,$carry 728 srdi $carry,$t1,16 729 ld $t6,`$FRAME+48`($sp) 730 ld $t7,`$FRAME+56`($sp) 731 732 fmadd $T1a,$N0,$nc,$T1a 733 fmadd $T1b,$N0,$nd,$T1b 734 insrdi $t0,$t1,16,32 735 ld $t1,8($tp) ; tp[j] 736 fmadd $T2a,$N1,$nc,$T2a 737 fmadd $T2b,$N1,$nd,$T2b 738 add $t2,$t2,$carry 739 fmadd $T3a,$N2,$nc,$T3a 740 fmadd $T3b,$N2,$nd,$T3b 741 srdi $carry,$t2,16 742 insrdi $t0,$t2,16,16 743 fmadd $dota,$N3,$nc,$dota 744 fmadd $dotb,$N3,$nd,$dotb 745 add $t3,$t3,$carry 746 ldu $t2,16($tp) ; tp[j+1] 747 srdi $carry,$t3,16 748 insrdi $t0,$t3,16,0 ; 0..63 bits 749 add $t4,$t4,$carry 750 751 fctid $T0a,$T0a 752 fctid $T0b,$T0b 753 srdi $carry,$t4,16 754 fctid $T1a,$T1a 755 fctid $T1b,$T1b 756 add $t5,$t5,$carry 757 fctid $T2a,$T2a 758 fctid $T2b,$T2b 759 srdi $carry,$t5,16 760 insrdi $t4,$t5,16,32 761 fctid $T3a,$T3a 762 fctid $T3b,$T3b 763 add $t6,$t6,$carry 764 srdi $carry,$t6,16 765 insrdi $t4,$t6,16,16 766 767 stfd $T0a,`$FRAME+0`($sp) 768 stfd $T0b,`$FRAME+8`($sp) 769 add $t7,$t7,$carry 770 addc $t3,$t0,$t1 771 stfd $T1a,`$FRAME+16`($sp) 772 stfd $T1b,`$FRAME+24`($sp) 773 insrdi $t4,$t7,16,0 ; 64..127 bits 774 srdi $carry,$t7,16 ; upper 33 bits 775 stfd $T2a,`$FRAME+32`($sp) 776 stfd $T2b,`$FRAME+40`($sp) 777 adde $t5,$t4,$t2 778 stfd $T3a,`$FRAME+48`($sp) 779 stfd $T3b,`$FRAME+56`($sp) 780 addze $carry,$carry 781 std $t3,-16($tp) ; tp[j-1] 782 std $t5,-8($tp) ; tp[j] 783 bdnz- Linner 784 786 fctid $dota,$dota 787 fctid $dotb,$dotb 788 ld $t0,`$FRAME+0`($sp) 789 ld $t1,`$FRAME+8`($sp) 790 ld $t2,`$FRAME+16`($sp) 791 ld $t3,`$FRAME+24`($sp) 792 ld $t4,`$FRAME+32`($sp) 793 ld $t5,`$FRAME+40`($sp) 794 ld $t6,`$FRAME+48`($sp) 795 ld $t7,`$FRAME+56`($sp) 796 stfd $dota,`$FRAME+64`($sp) 797 stfd $dotb,`$FRAME+72`($sp) 798 799 add $t0,$t0,$carry ; can not overflow 800 srdi $carry,$t0,16 801 add $t1,$t1,$carry 802 srdi $carry,$t1,16 803 insrdi $t0,$t1,16,32 804 add $t2,$t2,$carry 805 ld $t1,8($tp) ; tp[j] 806 srdi $carry,$t2,16 807 insrdi $t0,$t2,16,16 808 add $t3,$t3,$carry 809 ldu $t2,16($tp) ; tp[j+1] 810 srdi $carry,$t3,16 811 insrdi $t0,$t3,16,0 ; 0..63 bits 812 add $t4,$t4,$carry 813 srdi $carry,$t4,16 814 add $t5,$t5,$carry 815 srdi $carry,$t5,16 816 insrdi $t4,$t5,16,32 817 add $t6,$t6,$carry 818 srdi $carry,$t6,16 819 insrdi $t4,$t6,16,16 820 add $t7,$t7,$carry 821 insrdi $t4,$t7,16,0 ; 64..127 bits 822 srdi $carry,$t7,16 ; upper 33 bits 823 ld $t6,`$FRAME+64`($sp) 824 ld $t7,`$FRAME+72`($sp) 825 826 addc $t3,$t0,$t1 827 adde $t5,$t4,$t2 828 addze $carry,$carry 829 830 std $t3,-16($tp) ; tp[j-1] 831 std $t5,-8($tp) ; tp[j] 832 833 add $carry,$carry,$ovf ; comsume upmost overflow 834 add $t6,$t6,$carry ; can not overflow 835 srdi $carry,$t6,16 836 add $t7,$t7,$carry 837 insrdi $t6,$t7,48,0 838 srdi $ovf,$t7,48 839 std $t6,0($tp) ; tp[num-1] 840 841 slwi $t7,$num,2 842 addi $i,$i,8 843 subf $nap_d,$t7,$nap_d ; rewind pointer 844 cmpw $i,$num 845 blt- Louter 846 848 subf $np,$num,$np ; rewind np 849 addi $j,$j,1 ; restore counter 850 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 851 addi $tp,$sp,`$FRAME+$TRANSFER+8` 852 addi $t4,$sp,`$FRAME+$TRANSFER+16` 853 addi $t5,$np,8 854 addi $t6,$rp,8 855 mtctr $j 856 857 .align 4 858 Lsub: ldx $t0,$tp,$i 859 ldx $t1,$np,$i 860 ldx $t2,$t4,$i 861 ldx $t3,$t5,$i 862 subfe $t0,$t1,$t0 ; tp[j]-np[j] 863 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] 864 stdx $t0,$rp,$i 865 stdx $t2,$t6,$i 866 addi $i,$i,16 867 bdnz- Lsub 868 869 li $i,0 870 subfe $ovf,$i,$ovf ; handle upmost overflow bit 871 and $ap,$tp,$ovf 872 andc $np,$rp,$ovf 873 or $ap,$ap,$np ; ap=borrow?tp:rp 874 addi $t7,$ap,8 875 mtctr $j 876 877 .align 4 878 Lcopy: ; copy or in-place refresh 879 ldx $t0,$ap,$i 880 ldx $t1,$t7,$i 881 std $i,8($nap_d) ; zap nap_d 882 std $i,16($nap_d) 883 std $i,24($nap_d) 884 std $i,32($nap_d) 885 std $i,40($nap_d) 886 std $i,48($nap_d) 887 std $i,56($nap_d) 888 stdu $i,64($nap_d) 889 stdx $t0,$rp,$i 890 stdx $t1,$t6,$i 891 stdx $i,$tp,$i ; zap tp at once 892 stdx $i,$t4,$i 893 addi $i,$i,16 894 bdnz- Lcopy 895 897 $POP r14,`2*$SIZE_T`($sp) 898 $POP r15,`3*$SIZE_T`($sp) 899 $POP r16,`4*$SIZE_T`($sp) 900 $POP r17,`5*$SIZE_T`($sp) 901 $POP r18,`6*$SIZE_T`($sp) 902 $POP r19,`7*$SIZE_T`($sp) 903 $POP r20,`8*$SIZE_T`($sp) 904 $POP r21,`9*$SIZE_T`($sp) 905 $POP r22,`10*$SIZE_T`($sp) 906 $POP r23,`11*$SIZE_T`($sp) 907 lfd f14,`12*$SIZE_T+0`($sp) 908 lfd f15,`12*$SIZE_T+8`($sp) 909 lfd f16,`12*$SIZE_T+16`($sp) 910 lfd f17,`12*$SIZE_T+24`($sp) 911 lfd f18,`12*$SIZE_T+32`($sp) 912 lfd f19,`12*$SIZE_T+40`($sp) 913 lfd f20,`12*$SIZE_T+48`($sp) 914 lfd f21,`12*$SIZE_T+56`($sp) 915 lfd f22,`12*$SIZE_T+64`($sp) 916 lfd f23,`12*$SIZE_T+72`($sp) 917 lfd f24,`12*$SIZE_T+80`($sp) 918 lfd f25,`12*$SIZE_T+88`($sp) 919 $POP $sp,0($sp) 920 li r3,1 ; signal "handled" 921 blr 922 .long 0 923 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" 924 ___ 925 926 $code =~ s/\`([^\`]*)\`/eval $1/gem; 927 print $code; 928 close STDOUT; 929