#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike the integer multiplier, which simply stalls the whole
# CPU, the FPU is fully pipelined and can effectively emit a 48-bit
# partial product every cycle. Why not blended SPARC v9? One can argue
# that making this module dependent on the UltraSPARC VIS extension
# limits its binary compatibility. Well yes, it does exclude pre-V(!)
# SPARC64 implementations from the compatibility matrix. But the rest,
# the whole Sun UltraSPARC family and Fujitsu's brand new SPARC64 V,
# all support the VIS extension instructions used in this module. This
# is considered good enough not to care about HAL SPARC64 users [if
# any], who have the integer-only pure SPARCv9 module to "fall down" to.

# USI&II cores currently exhibit a uniform 2x improvement [over the
# pre-bn_mul_mont codebase] for all key lengths and benchmarks. On
# USIII performance improves by a few percent for shorter keys and
# worsens by a few percent for longer keys. This is because the USIII
# integer multiplier is >3x faster than the USI&II one, which is
# harder to match [but see the TODO list below]. It should also be
# noted that SPARC64 V features out-of-order execution, which *might*
# mean that its integer multiplier is pipelined, which in turn *might*
# be impossible to match... On an additional note, SPARC64 V implements
# an FP Multiply-Add instruction, which is perfectly usable in this
# context... In other words, as far as Fujitsu SPARC64 V goes, talk to
# the author:-)

# The implementation implies the following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, it simply
# doesn't give any performance gain.

# TODO:
# - modulo-schedule the inner loop for better performance (on an
#   in-order execution core such as UltraSPARC this shall result in a
#   further noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow floating point and integer
# instructions to be interleaved and minimize Read-After-Write
# penalties. This results in a *further* 20-50% performance improvement
# [depending on key length, more for longer keys] on USI&II cores and
# 30-80% on USIII&IV.

$fname="bn_mul_mont_fpu";
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

if ($bits==64) {
	$bias=2047;
	$frame=192;
} else {
	$bias=0;
	$frame=128;	# 96 rounded up to largest known cache-line
}
$locals=64;

# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
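
# For orientation, what the generated routine computes is plain Montgomery
# multiplication, rp = ap*bp*2^(-32*num) mod np, reduced one 64-bit word at
# a time. The sketch below is illustrative only and is never called by this
# generator; it assumes Math::BigInt arguments and that n0 already holds
# -np^-1 mod 2^64 as prepared by the caller (BN_MONT_CTX).
sub mont_mul_ref {
	require Math::BigInt;
	my ($ap,$bp,$np,$n0,$num) = @_;	# Math::BigInt values; $num 32-bit words, even, >=4
	my $w = Math::BigInt->bone->blsft(64);		# one 64-bit word
	my $t = $ap->copy->bmul($bp);			# t = ap*bp
	for (1..$num/2) {				# num/2 64-bit reduction steps
		my $m = $t->copy->bmod($w)->bmul($n0)->bmod($w);
		$t->badd($m->bmul($np))->brsft(64);	# t = (t + m*np)/2^64
	}
	$t->bsub($np) if ($t->bcmp($np) >= 0);		# conditional final subtraction
	return $t;					# == ap*bp*2^(-32*num) mod np
}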
# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$tp="%l0";	# t[num]
$ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2";	# to these four vectors as double-precision FP values.
$np_l="%l3";	# This way a bunch of fxtods are eliminated in the
$np_h="%l4";	# second loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7";	# 16-bit mask, 0xffff

$n0="%g4";	# reassigned(!) to "64-bit" register
$carry="%i4";	# %i4 reused(!) for a carry bit

# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#    ..a
#   ..b
$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";

$dota="%f24"; $dotb="%f26";

$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";

$ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load

$code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	save	%sp,-$frame-$locals,%sp

	cmp	$num,4
	bl,a,pn	%icc,.Lret
	clr	%i0
	andcc	$num,1,%g0	! $num has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0		! signal "unsupported input value"

	srl	$num,1,$num
	sethi	%hi(0xffff),$mask
	ld	[%i4+0],$n0	! $n0 reassigned, remember?
	or	$mask,%lo(0xffff),$mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0	! $n0=n0[1].n0[0]

	sll	$num,3,$num	! num*=8

	add	%sp,$bias,%o0	! real top of stack
	sll	$num,2,%o1
	add	%o1,$num,%o1	! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0	! optimize TLB utilization
	sub	%o0,$bias,%sp	! alloca(5*num*8)

	rd	%asi,%o7	! save %asi
	add	%sp,$bias+$frame+$locals,$tp
	add	$tp,$num,$ap_l
	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
	add	$ap_l,$num,$ap_h
	add	$ap_h,$num,$np_l
	add	$np_l,$num,$np_h

	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads

	add	$rp,$num,$rp	! readjust input pointers to point
	add	$ap,$num,$ap	! at the ends too...
	add	$bp,$num,$bp
	add	$np,$num,$np

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i	! i=-num
	sub	%g0,$num,$j	! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1	! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0	! ap[0]*bp[0]
	mulx	$n0,%o0,%o0	! ap[0]*bp[0]*n0
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

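	! b[i] and the n0 product are split into 16-bit limbs so that each
	! fmuld below is at most a 32x16-bit multiplication, i.e. an exact
	! 48-bit partial product, which still fits the 53-bit double
	! mantissa after the pairwise faddd accumulations.
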
	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fxtod	$na,$na
	std	$ahi,[$ap_h+$j]
	fxtod	$nb,$nb
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fxtod	$nc,$nc
	std	$nhi,[$np_h+$j]
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	add	$j,8,$j
	std	$nlob,[%sp+$bias+$frame+8]
	add	$ap,$j,%o4
	std	$nloc,[%sp+$bias+$frame+16]
	add	$np,$j,%o5
	std	$nlod,[%sp+$bias+$frame+24]

	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	!and	%o0,$mask,%o0
	!and	%o1,$mask,%o1
	!and	%o2,$mask,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0	! 64-bit result
	srlx	%o3,16,%g1	! 34-bit carry
	fmuld	$ahi,$bb,$ahib

	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid
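	! $dota/$dotb accumulate the two uppermost partial sums and are
	! carried into the next j iteration (flushed in .L1stskip).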

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.L1stskip
	std	$nlod,[%sp+$bias+$frame+24]

.align	32			! incidentally already aligned !
.L1st:
	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	fmuld	$ahi,$bb,$ahib
	sllx	%o1,16,%o1
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	sllx	%o2,32,%o2
	fmuld	$ahi,$bc,$ahic
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	or	%o2,%o0,%o0
	fmuld	$ahi,$bd,$ahid
	or	%o7,%o0,%o0	! 64-bit result
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	addcc	%g1,%o0,%o0
	faddd	$dota,$nloa,$nloa
	srlx	%o3,16,%g1	! 34-bit carry
	faddd	$dotb,$nlob,$nlob
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]=

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp

.L1stskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+$bias+$frame+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+$bias+$frame+40],%o5
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]=
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,$carry
	stx	%o4,[$tp]	! tp[num-1]=

	ba	.Louter
	add	$i,8,$i
.align	32
.Louter:
	sub	%g0,$num,$j	! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1	! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2	! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0	! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+$bias+$frame+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	fxtod	$na,$na
	ldd	[$ap_h+$j],$ahi
	fxtod	$nb,$nb
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	fxtod	$nc,$nc
	ldd	[$np_h+$j],$nhi
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	add	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]

	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
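	! (the block below reassembles the lowest 64-bit word and folds in
	! tp[0]; by choice of n0 that sum is 0 mod 2^64, so only the carry
	! left in %g1 survives)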
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[$tp],%o7
	faddd	$nloc,$nhia,$nloc
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.Linnerskip
	std	$nlod,[%sp+$bias+$frame+24]

	ba	.Linner
	nop
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0	! 64-bit result
	faddd	$nloc,$nhia,$nloc
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7	! tp[j]
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	$nlob,$nlob
	addcc	%o7,%o0,%o0
	fdtox	$nloc,$nloc
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	addcc	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp

.Linnerskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+$bias+$frame+32],%o4
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+$bias+$frame+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7	! tp[j]
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	$carry,%o4,%o4
	stx	%o4,[$tp]	! tp[num-1]
	mov	%g1,$carry
	bcs,a	%xcc,.+8
	add	$carry,1,$carry

	addcc	$i,8,$i
	bnz	%icc,.Louter
	nop

	add	$tp,8,$tp	! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,$num,%o7	! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c

.align	32
.Lsub:
	ldx	[$tp+%o7],%o0
	add	$np,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	$rp,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	$carry,0,%g4
	sub	%g0,$num,%o7	! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[$tp+%o7],%o0
	add	$rp,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[$tp+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,$num,%o7	! n=-num

.Lzap:
	stx	%g0,[$ap_l+%o7]
	stx	%g0,[$ap_h+%o7]
	stx	%g0,[$np_l+%o7]
	stx	%g0,[$np_h+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+$bias+$frame+48],%o7
	wr	%g0,%o7,%asi	! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# The substitution below makes it possible to compile without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this because VIS capability is detected at run-time now and
# this routine is not called on CPUs not capable of executing it. Do
# note that fzeros is not the only VIS dependency! Another dependency is
# implicit and is just _a_ numerical value loaded into the %asi register,
# which the assembler can't recognize as VIS-specific...
$code =~ s/fzeros\s+%f([0-9]+)/
	sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
	/gem;

print $code;
# flush
close STDOUT;