#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
#   stalls after ldf8, xma and getf.sig outside inner loop and
#   improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
#   once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
#   acute interest, because upcoming Tukwila's individual cores are
#   reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
#                   sign    verify    sign/s verify/s
# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.

use strict;
use warnings;

# Usage: ia64-mont.pl [output-file] [compiler flags ...]
#
# The generated assembly is written to the first argument when one is
# given, otherwise to STDOUT. All arguments are scanned for a 64-bit
# data model flag (see below).

# $ADDP is the instruction used to form every pointer from an incoming
# argument register. On HP-UX it defaults to "addp4"; it is switched to
# plain "add" when a +DD64 or -mlp64 flag is present on the command
# line. Everywhere else plain "add" is used.
#
# The pattern is a real alternation. The former character class
# [\+DD|\-mlp] matched any single one of "+D|-mlp" followed by "64",
# which also fired on unrelated arguments (e.g. a file name containing
# "mul64").
my $ADDP = "add";
if ($^O eq "hpux") {
    $ADDP = "addp4";
    for my $arg (@ARGV) {
        $ADDP = "add" if $arg =~ /(?:\+DD|-mlp)64/;
    }
}

# --------------------------------------------------------------------
# bn_mul_mont: public entry point, a pure dispatcher on num (r37).
#   num < 2       -> returns immediately with ret0=0;
#   2 <= num <= 8 -> branches to bn_mul_mont_8;
#   num > 8       -> branches to bn_mul_mont_general.
# Both workers set ret0=1 before returning (their comment: signal
# "handled").
# --------------------------------------------------------------------
my $code = <<___;
.explicit
.text

// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
//                  const BN_ULONG *bp,const BN_ULONG *np,
//                  const BN_ULONG *n0p,int num);
.align  64
.global bn_mul_mont#
.proc   bn_mul_mont#
bn_mul_mont:
        .prologue
        .body
{ .mmi; cmp4.le         p6,p7=2,r37;;
(p6)    cmp4.lt.unc     p8,p9=8,r37
        mov             ret0=r0         };;
{ .bbb;
(p9)    br.cond.dptk.many       bn_mul_mont_8
(p8)    br.cond.dpnt.many       bn_mul_mont_general
(p7)    br.ret.spnt.many        b0      };;
.endp   bn_mul_mont#

___

# --------------------------------------------------------------------
# bn_mul_mont_general: worker for num > 8.
# Carves a temporary vector tp out of the stack ("alloca"), runs a
# first pass (.L1st_ctop) for bp[0], then the outer loop (.Louter)
# with its software-pipelined inner loop (.Linner_ctop) for the
# remaining bp[i]. Afterwards .Lsub_ctop stores tp-np into rp, and
# .Lcopy_ctop picks either tp or rp as the source (masked by topbit),
# copies it to rp and zeroes tp. Restores ar.lc, sp and predicates,
# returns 1.
# --------------------------------------------------------------------
$code .= <<___;
prevfs=r2;      prevpr=r3;      prevlc=r10;     prevsp=r11;

rptr=r8;        aptr=r9;        bptr=r14;       nptr=r15;
tptr=r16;       // &tp[0]
tp_1=r17;       // &tp[-1]
num=r18;        len=r19;        lc=r20;
topbit=r21;     // carry bit from tmp[num]

n0=f6;
m0=f7;
bi=f8;

.align  64
.local  bn_mul_mont_general#
.proc   bn_mul_mont_general#
bn_mul_mont_general:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,6,2,0,8
        $ADDP   aptr=0,in1
        .save   ar.lc,prevlc
        mov     prevlc=ar.lc            }
{ .mmi; .vframe prevsp
        mov     prevsp=sp
        $ADDP   bptr=0,in2
        .save   pr,prevpr
        mov     prevpr=pr               };;

        .body
        .rotf           alo[6],nlo[4],ahi[8],nhi[6]
        .rotr           a[3],n[3],t[2]

{ .mmi; ldf8            bi=[bptr],8             // (*bp++)
        ldf8            alo[4]=[aptr],16        // ap[0]
        $ADDP           r30=8,in1       };;
{ .mmi; ldf8            alo[3]=[r30],16         // ap[1]
        ldf8            alo[2]=[aptr],16        // ap[2]
        $ADDP           in4=0,in4       };;
{ .mmi; ldf8            alo[1]=[r30]            // ap[3]
        ldf8            n0=[in4]                // n0
        $ADDP           rptr=0,in0              }
{ .mmi; $ADDP           nptr=0,in3
        mov             r31=16
        zxt4            num=in5         };;
{ .mmi; ldf8            nlo[2]=[nptr],8         // np[0]
        shladd          len=num,3,r0
        shladd          r31=num,3,r31   };;
{ .mmi; ldf8            nlo[1]=[nptr],8         // np[1]
        add             lc=-5,num
        sub             r31=sp,r31      };;
{ .mfb; and             sp=-16,r31              // alloca
        xmpy.hu         ahi[2]=alo[4],bi        // ap[0]*bp[0]
        nop.b           0               }
{ .mfb; nop.m           0
        xmpy.lu         alo[4]=alo[4],bi
        brp.loop.imp    .L1st_ctop,.L1st_cend-16
                                        };;
{ .mfi; nop.m           0
        xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
        add             tp_1=8,sp               }
{ .mfi; nop.m           0
        xma.lu          alo[3]=alo[3],bi,ahi[2]
        mov             pr.rot=0x20001f<<16
                        // ------^----- (p40) at first (p23)
                        // ----------^^ p[16:20]=1
                                        };;
{ .mfi; nop.m           0
        xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[0])*n0
        mov             ar.lc=lc                }
{ .mfi; nop.m           0
        fcvt.fxu.s1     nhi[1]=f0
        mov             ar.ec=8         };;

.align  32
.L1st_ctop:
.pred.rel       "mutex",p40,p42
{ .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
        (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
        (p40)   add             n[2]=n[2],a[2]          }   // (p23) }
{ .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)(p16)
        (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
        (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
{ .mfi; (p21)   getf.sig        a[0]=alo[5]
        (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
        (p42)   cmp.leu         p41,p39=n[2],a[2]       }   // (p23)
{ .mfi; (p23)   st8             [tp_1]=n[2],8
        (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
        (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
{ .mmb; (p21)   getf.sig        n[0]=nlo[3]
        (p16)   nop.m           0
        br.ctop.sptk    .L1st_ctop                      };;
.L1st_cend:

{ .mmi; getf.sig        a[0]=ahi[6]             // (p24)
        getf.sig        n[0]=nhi[4]
        add             num=-1,num      };;     // num--
{ .mmi; .pred.rel       "mutex",p40,p42
(p40)   add             n[0]=n[0],a[0]
(p42)   add             n[0]=n[0],a[0],1
        sub             aptr=aptr,len   };;     // rewind
{ .mmi; .pred.rel       "mutex",p40,p42
(p40)   cmp.ltu         p41,p39=n[0],a[0]
(p42)   cmp.leu         p41,p39=n[0],a[0]
        sub             nptr=nptr,len   };;
{ .mmi; .pred.rel       "mutex",p39,p41
(p39)   add             topbit=r0,r0
(p41)   add             topbit=r0,r0,1
        nop.i           0               }
{ .mmi; st8             [tp_1]=n[0]
        add             tptr=16,sp
        add             tp_1=8,sp       };;

.Louter:
{ .mmi; ldf8            bi=[bptr],8             // (*bp++)
        ldf8            ahi[3]=[tptr]           // tp[0]
        add             r30=8,aptr      };;
{ .mmi; ldf8            alo[4]=[aptr],16        // ap[0]
        ldf8            alo[3]=[r30],16         // ap[1]
        add             r31=8,nptr      };;
{ .mfb; ldf8            alo[2]=[aptr],16        // ap[2]
        xma.hu          ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
        brp.loop.imp    .Linner_ctop,.Linner_cend-16
                                        }
{ .mfb; ldf8            alo[1]=[r30]            // ap[3]
        xma.lu          alo[4]=alo[4],bi,ahi[3]
        clrrrb.pr                       };;
{ .mfi; ldf8            nlo[2]=[nptr],16        // np[0]
        xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
        nop.i           0               }
{ .mfi; ldf8            nlo[1]=[r31]            // np[1]
        xma.lu          alo[3]=alo[3],bi,ahi[2]
        mov             pr.rot=0x20101f<<16
                        // ------^----- (p40) at first (p23)
                        // --------^--- (p30) at first (p22)
                        // ----------^^ p[16:20]=1
                                        };;
{ .mfi; st8             [tptr]=r0               // tp[0] is already accounted
        xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[i]+tp[0])*n0
        mov             ar.lc=lc                }
{ .mfi;
        fcvt.fxu.s1     nhi[1]=f0
        mov             ar.ec=8         };;

// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align  32
.Linner_ctop:
.pred.rel       "mutex",p40,p42
.pred.rel       "mutex",p30,p32
{ .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
        (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
        (p40)   add             n[2]=n[2],a[2]          }   // (p23)
{ .mfi; (p16)   nop.m           0
        (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
        (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
{ .mfi; (p21)   getf.sig        a[0]=alo[5]
        (p16)   nop.f           0
        (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
{ .mfi; (p21)   ld8             t[0]=[tptr],8
        (p16)   nop.f           0
        (p42)   cmp.leu         p41,p39=n[2],a[2]       };; // (p23)
{ .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)
        (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
        (p30)   add             a[1]=a[1],t[1]          }   // (p22)
{ .mfi; (p16)   nop.m           0
        (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
        (p32)   add             a[1]=a[1],t[1],1        };; // (p22)
{ .mmi; (p21)   getf.sig        n[0]=nlo[3]
        (p16)   nop.m           0
        (p30)   cmp.ltu         p31,p29=a[1],t[1]       }   // (p22)
{ .mmb; (p23)   st8             [tp_1]=n[2],8
        (p32)   cmp.leu         p31,p29=a[1],t[1]           // (p22)
        br.ctop.sptk    .Linner_ctop                    };;
.Linner_cend:

{ .mmi; getf.sig        a[0]=ahi[6]             // (p24)
        getf.sig        n[0]=nhi[4]
        nop.i           0               };;

{ .mmi; .pred.rel       "mutex",p31,p33
(p31)   add             a[0]=a[0],topbit
(p33)   add             a[0]=a[0],topbit,1
        mov             topbit=r0       };;
{ .mfi; .pred.rel       "mutex",p31,p33
(p31)   cmp.ltu         p32,p30=a[0],topbit
(p33)   cmp.leu         p32,p30=a[0],topbit
                                        }
{ .mfi; .pred.rel       "mutex",p40,p42
(p40)   add             n[0]=n[0],a[0]
(p42)   add             n[0]=n[0],a[0],1
                                        };;
{ .mmi; .pred.rel       "mutex",p44,p46
(p40)   cmp.ltu         p41,p39=n[0],a[0]
(p42)   cmp.leu         p41,p39=n[0],a[0]
(p32)   add             topbit=r0,r0,1  }

{ .mmi; st8             [tp_1]=n[0],8
        cmp4.ne         p6,p0=1,num
        sub             aptr=aptr,len   };;     // rewind
{ .mmi; sub             nptr=nptr,len
(p41)   add             topbit=r0,r0,1
        add             tptr=16,sp      }
{ .mmb; add             tp_1=8,sp
        add             num=-1,num              // num--
(p6)    br.cond.sptk.many       .Louter };;

{ .mbb; add             lc=4,lc
        brp.loop.imp    .Lsub_ctop,.Lsub_cend-16
        clrrrb.pr                       };;
{ .mii; nop.m           0
        mov             pr.rot=0x10001<<16
                        // ------^---- (p33) at first (p17)
        mov             ar.lc=lc        }
{ .mii; nop.m           0
        mov             ar.ec=3
        nop.i           0               };;

.Lsub_ctop:
.pred.rel       "mutex",p33,p35
{ .mfi; (p16)   ld8             t[0]=[tptr],8               // t=*(tp++)
        (p16)   nop.f           0
        (p33)   sub             n[1]=t[1],n[1]          }   // (p17)
{ .mfi; (p16)   ld8             n[0]=[nptr],8               // n=*(np++)
        (p16)   nop.f           0
        (p35)   sub             n[1]=t[1],n[1],1        };; // (p17)
{ .mib; (p18)   st8             [rptr]=n[2],8               // *(rp++)=r
        (p33)   cmp.gtu         p34,p32=n[1],t[1]           // (p17)
        (p18)   nop.b           0                       }
{ .mib; (p18)   nop.m           0
        (p35)   cmp.geu         p34,p32=n[1],t[1]           // (p17)
        br.ctop.sptk    .Lsub_ctop                      };;
.Lsub_cend:

{ .mmb; .pred.rel       "mutex",p34,p36
(p34)   sub     topbit=topbit,r0                // (p19)
(p36)   sub     topbit=topbit,r0,1
        brp.loop.imp    .Lcopy_ctop,.Lcopy_cend-16
                                        }
{ .mmb; sub     rptr=rptr,len                   // rewind
        sub     tptr=tptr,len
        clrrrb.pr                       };;
{ .mmi; and     aptr=tptr,topbit
        andcm   bptr=rptr,topbit
        mov     pr.rot=1<<16            };;
{ .mii; or      nptr=aptr,bptr
        mov     ar.lc=lc
        mov     ar.ec=3                 };;

.Lcopy_ctop:
{ .mmb; (p16)   ld8     n[0]=[nptr],8
        (p18)   st8     [tptr]=r0,8
        (p16)   nop.b   0               }
{ .mmb; (p16)   nop.m   0
        (p18)   st8     [rptr]=n[2],8
        br.ctop.sptk    .Lcopy_ctop     };;
.Lcopy_cend:

{ .mmi; mov             ret0=1                  // signal "handled"
        rum             1<<5                    // clear um.mfh
        mov             ar.lc=prevlc    }
{ .mib; .restore        sp
        mov             sp=prevsp
        mov             pr=prevpr,0x1ffff
        br.ret.sptk.many        b0      };;
.endp   bn_mul_mont_general#

___

# --------------------------------------------------------------------
# bn_mul_mont_8: worker for 2 <= num <= 8.
# Spills f16-f23 to the stack, loads ap/bp/np into registers while
# zero-padding them to 8 elements, and runs .Louter_8_ctop once per
# bp word (ar.lc = num-1) followed by one peeled final pass. It then
# subtracts np from the result, stores either tmp or tmp-np to rp
# (stopping after num words via the p5/p7/.../p15 early exits),
# refills f16-f23 and returns 1.
# --------------------------------------------------------------------
$code .= <<___;
a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
t0=r15;

ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;

.align  64
.skip   48              // aligns loop body
.local  bn_mul_mont_8#
.proc   bn_mul_mont_8#
bn_mul_mont_8:
        .prologue
{ .mmi; .save           ar.pfs,prevfs
        alloc           prevfs=ar.pfs,6,2,0,8
        .vframe         prevsp
        mov             prevsp=sp
        .save           ar.lc,prevlc
        mov             prevlc=ar.lc    }
{ .mmi; add             r17=-6*16,sp
        add             sp=-7*16,sp
        .save           pr,prevpr
        mov             prevpr=pr       };;

{ .mmi; .save.gf        0,0x10
        stf.spill       [sp]=f16,-16
        .save.gf        0,0x20
        stf.spill       [r17]=f17,32
        add             r16=-5*16,prevsp};;
{ .mmi; .save.gf        0,0x40
        stf.spill       [r16]=f18,32
        .save.gf        0,0x80
        stf.spill       [r17]=f19,32
        $ADDP           aptr=0,in1      };;
{ .mmi; .save.gf        0,0x100
        stf.spill       [r16]=f20,32
        .save.gf        0,0x200
        stf.spill       [r17]=f21,32
        $ADDP           r29=8,in1       };;
{ .mmi; .save.gf        0,0x400
        stf.spill       [r16]=f22
        .save.gf        0,0x800
        stf.spill       [r17]=f23
        $ADDP           rptr=0,in0      };;

        .body
        .rotf           bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
        .rotr           t[8]

// load input vectors padding them to 8 elements
{ .mmi; ldf8            ai0=[aptr],16           // ap[0]
        ldf8            ai1=[r29],16            // ap[1]
        $ADDP           bptr=0,in2      }
{ .mmi; $ADDP           r30=8,in2
        $ADDP           nptr=0,in3
        $ADDP           r31=8,in3       };;
{ .mmi; ldf8            bj[7]=[bptr],16         // bp[0]
        ldf8            bj[6]=[r30],16          // bp[1]
        cmp4.le         p4,p5=3,in5     }
{ .mmi; ldf8            ni0=[nptr],16           // np[0]
        ldf8            ni1=[r31],16            // np[1]
        cmp4.le         p6,p7=4,in5     };;

{ .mfi; (p4)ldf8        ai2=[aptr],16           // ap[2]
        (p5)fcvt.fxu    ai2=f0
        cmp4.le         p8,p9=5,in5     }
{ .mfi; (p6)ldf8        ai3=[r29],16            // ap[3]
        (p7)fcvt.fxu    ai3=f0
        cmp4.le         p10,p11=6,in5   }
{ .mfi; (p4)ldf8        bj[5]=[bptr],16         // bp[2]
        (p5)fcvt.fxu    bj[5]=f0
        cmp4.le         p12,p13=7,in5   }
{ .mfi; (p6)ldf8        bj[4]=[r30],16          // bp[3]
        (p7)fcvt.fxu    bj[4]=f0
        cmp4.le         p14,p15=8,in5   }
{ .mfi; (p4)ldf8        ni2=[nptr],16           // np[2]
        (p5)fcvt.fxu    ni2=f0
        addp4           r28=-1,in5      }
{ .mfi; (p6)ldf8        ni3=[r31],16            // np[3]
        (p7)fcvt.fxu    ni3=f0
        $ADDP           in4=0,in4       };;

{ .mfi; ldf8            n0=[in4]
        fcvt.fxu        tf[1]=f0
        nop.i           0               }

{ .mfi; (p8)ldf8        ai4=[aptr],16           // ap[4]
        (p9)fcvt.fxu    ai4=f0
        mov             t[0]=r0         }
{ .mfi; (p10)ldf8       ai5=[r29],16            // ap[5]
        (p11)fcvt.fxu   ai5=f0
        mov             t[1]=r0         }
{ .mfi; (p8)ldf8        bj[3]=[bptr],16         // bp[4]
        (p9)fcvt.fxu    bj[3]=f0
        mov             t[2]=r0         }
{ .mfi; (p10)ldf8       bj[2]=[r30],16          // bp[5]
        (p11)fcvt.fxu   bj[2]=f0
        mov             t[3]=r0         }
{ .mfi; (p8)ldf8        ni4=[nptr],16           // np[4]
        (p9)fcvt.fxu    ni4=f0
        mov             t[4]=r0         }
{ .mfi; (p10)ldf8       ni5=[r31],16            // np[5]
        (p11)fcvt.fxu   ni5=f0
        mov             t[5]=r0         };;

{ .mfi; (p12)ldf8       ai6=[aptr],16           // ap[6]
        (p13)fcvt.fxu   ai6=f0
        mov             t[6]=r0         }
{ .mfi; (p14)ldf8       ai7=[r29],16            // ap[7]
        (p15)fcvt.fxu   ai7=f0
        mov             t[7]=r0         }
{ .mfi; (p12)ldf8       bj[1]=[bptr],16         // bp[6]
        (p13)fcvt.fxu   bj[1]=f0
        mov             ar.lc=r28       }
{ .mfi; (p14)ldf8       bj[0]=[r30],16          // bp[7]
        (p15)fcvt.fxu   bj[0]=f0
        mov             ar.ec=1         }
{ .mfi; (p12)ldf8       ni6=[nptr],16           // np[6]
        (p13)fcvt.fxu   ni6=f0
        mov             pr.rot=1<<16    }
{ .mfb; (p14)ldf8       ni7=[r31],16            // np[7]
        (p15)fcvt.fxu   ni7=f0
        brp.loop.imp    .Louter_8_ctop,.Louter_8_cend-16
                                        };;

// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 0:
        (p16)   xma.hu          ahi[0]=ai0,bj[7],tf[1]  //      ap[0]*b[i]+t[0]
        (p40)   add             a3=a3,n3        }       //      (p17) a3+=n3
{ .mfi; (p42)   add             a3=a3,n3,1
        (p16)   xma.lu          alo[0]=ai0,bj[7],tf[1]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        a7=alo[8]               // 1:
        (p48)   add             t[6]=t[6],a3            //      (p17) t[6]+=a3
        (p50)   add             t[6]=t[6],a3,1  };;
{ .mfi; (p17)   getf.sig        a8=ahi[8]               // 2:
        (p17)   xma.hu          nhi[7]=ni6,mj[1],nhi[6] //      np[6]*m0
        (p40)   cmp.ltu         p43,p41=a3,n3   }
{ .mfi; (p42)   cmp.leu         p43,p41=a3,n3
        (p17)   xma.lu          nlo[7]=ni6,mj[1],nhi[6]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        n5=nlo[6]               // 3:
        (p48)   cmp.ltu         p51,p49=t[6],a3
        (p50)   cmp.leu         p51,p49=t[6],a3 };;
        .pred.rel               "mutex",p41,p43
        .pred.rel               "mutex",p49,p51
{ .mfi; (p16)   nop.m           0                       // 4:
        (p16)   xma.hu          ahi[1]=ai1,bj[7],ahi[0] //      ap[1]*b[i]
        (p41)   add             a4=a4,n4        }       //      (p17) a4+=n4
{ .mfi; (p43)   add             a4=a4,n4,1
        (p16)   xma.lu          alo[1]=ai1,bj[7],ahi[0]
        (p16)   nop.i           0               };;
{ .mfi; (p49)   add             t[5]=t[5],a4            // 5:   (p17) t[5]+=a4
        (p16)   xmpy.lu         mj[0]=alo[0],n0         //      (ap[0]*b[i]+t[0])*n0
        (p51)   add             t[5]=t[5],a4,1  };;
{ .mfi; (p16)   nop.m           0                       // 6:
        (p17)   xma.hu          nhi[8]=ni7,mj[1],nhi[7] //      np[7]*m0
        (p41)   cmp.ltu         p42,p40=a4,n4   }
{ .mfi; (p43)   cmp.leu         p42,p40=a4,n4
        (p17)   xma.lu          nlo[8]=ni7,mj[1],nhi[7]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        n6=nlo[7]               // 7:
        (p49)   cmp.ltu         p50,p48=t[5],a4
        (p51)   cmp.leu         p50,p48=t[5],a4 };;
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 8:
        (p16)   xma.hu          ahi[2]=ai2,bj[7],ahi[1] //      ap[2]*b[i]
        (p40)   add             a5=a5,n5        }       //      (p17) a5+=n5
{ .mfi; (p42)   add             a5=a5,n5,1
        (p16)   xma.lu          alo[2]=ai2,bj[7],ahi[1]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a1=alo[1]               // 9:
        (p48)   add             t[4]=t[4],a5            //      p(17) t[4]+=a5
        (p50)   add             t[4]=t[4],a5,1  };;
{ .mfi; (p16)   nop.m           0                       // 10:
        (p16)   xma.hu          nhi[0]=ni0,mj[0],alo[0] //      np[0]*m0
        (p40)   cmp.ltu         p43,p41=a5,n5   }
{ .mfi; (p42)   cmp.leu         p43,p41=a5,n5
        (p16)   xma.lu          nlo[0]=ni0,mj[0],alo[0]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        n7=nlo[8]               // 11:
        (p48)   cmp.ltu         p51,p49=t[4],a5
        (p50)   cmp.leu         p51,p49=t[4],a5 };;
        .pred.rel               "mutex",p41,p43
        .pred.rel               "mutex",p49,p51
{ .mfi; (p17)   getf.sig        n8=nhi[8]               // 12:
        (p16)   xma.hu          ahi[3]=ai3,bj[7],ahi[2] //      ap[3]*b[i]
        (p41)   add             a6=a6,n6        }       //      (p17) a6+=n6
{ .mfi; (p43)   add             a6=a6,n6,1
        (p16)   xma.lu          alo[3]=ai3,bj[7],ahi[2]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a2=alo[2]               // 13:
        (p49)   add             t[3]=t[3],a6            //      (p17) t[3]+=a6
        (p51)   add             t[3]=t[3],a6,1  };;
{ .mfi; (p16)   nop.m           0                       // 14:
        (p16)   xma.hu          nhi[1]=ni1,mj[0],nhi[0] //      np[1]*m0
        (p41)   cmp.ltu         p42,p40=a6,n6   }
{ .mfi; (p43)   cmp.leu         p42,p40=a6,n6
        (p16)   xma.lu          nlo[1]=ni1,mj[0],nhi[0]
        (p16)   nop.i           0               };;
{ .mii; (p16)   nop.m           0                       // 15:
        (p49)   cmp.ltu         p50,p48=t[3],a6
        (p51)   cmp.leu         p50,p48=t[3],a6 };;
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 16:
        (p16)   xma.hu          ahi[4]=ai4,bj[7],ahi[3] //      ap[4]*b[i]
        (p40)   add             a7=a7,n7        }       //      (p17) a7+=n7
{ .mfi; (p42)   add             a7=a7,n7,1
        (p16)   xma.lu          alo[4]=ai4,bj[7],ahi[3]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a3=alo[3]               // 17:
        (p48)   add             t[2]=t[2],a7            //      (p17) t[2]+=a7
        (p50)   add             t[2]=t[2],a7,1  };;
{ .mfi; (p16)   nop.m           0                       // 18:
        (p16)   xma.hu          nhi[2]=ni2,mj[0],nhi[1] //      np[2]*m0
        (p40)   cmp.ltu         p43,p41=a7,n7   }
{ .mfi; (p42)   cmp.leu         p43,p41=a7,n7
        (p16)   xma.lu          nlo[2]=ni2,mj[0],nhi[1]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        n1=nlo[1]               // 19:
        (p48)   cmp.ltu         p51,p49=t[2],a7
        (p50)   cmp.leu         p51,p49=t[2],a7 };;
        .pred.rel               "mutex",p41,p43
        .pred.rel               "mutex",p49,p51
{ .mfi; (p16)   nop.m           0                       // 20:
        (p16)   xma.hu          ahi[5]=ai5,bj[7],ahi[4] //      ap[5]*b[i]
        (p41)   add             a8=a8,n8        }       //      (p17) a8+=n8
{ .mfi; (p43)   add             a8=a8,n8,1
        (p16)   xma.lu          alo[5]=ai5,bj[7],ahi[4]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a4=alo[4]               // 21:
        (p49)   add             t[1]=t[1],a8            //      (p17) t[1]+=a8
        (p51)   add             t[1]=t[1],a8,1  };;
{ .mfi; (p16)   nop.m           0                       // 22:
        (p16)   xma.hu          nhi[3]=ni3,mj[0],nhi[2] //      np[3]*m0
        (p41)   cmp.ltu         p42,p40=a8,n8   }
{ .mfi; (p43)   cmp.leu         p42,p40=a8,n8
        (p16)   xma.lu          nlo[3]=ni3,mj[0],nhi[2]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        n2=nlo[2]               // 23:
        (p49)   cmp.ltu         p50,p48=t[1],a8
        (p51)   cmp.leu         p50,p48=t[1],a8 };;
{ .mfi; (p16)   nop.m           0                       // 24:
        (p16)   xma.hu          ahi[6]=ai6,bj[7],ahi[5] //      ap[6]*b[i]
        (p16)   add             a1=a1,n1        }       //      (p16) a1+=n1
{ .mfi; (p16)   nop.m           0
        (p16)   xma.lu          alo[6]=ai6,bj[7],ahi[5]
        (p17)   mov             t[0]=r0         };;
{ .mii; (p16)   getf.sig        a5=alo[5]               // 25:
        (p16)   add             t0=t[7],a1              //      (p16) t[7]+=a1
        (p42)   add             t[0]=t[0],r0,1  };;
{ .mfi; (p16)   setf.sig        tf[0]=t0                // 26:
        (p16)   xma.hu          nhi[4]=ni4,mj[0],nhi[3] //      np[4]*m0
        (p50)   add             t[0]=t[0],r0,1  }
{ .mfi; (p16)   cmp.ltu.unc     p42,p40=a1,n1
        (p16)   xma.lu          nlo[4]=ni4,mj[0],nhi[3]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        n3=nlo[3]               // 27:
        (p16)   cmp.ltu.unc     p50,p48=t0,a1
        (p16)   nop.i           0               };;
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 28:
        (p16)   xma.hu          ahi[7]=ai7,bj[7],ahi[6] //      ap[7]*b[i]
        (p40)   add             a2=a2,n2        }       //      (p16) a2+=n2
{ .mfi; (p42)   add             a2=a2,n2,1
        (p16)   xma.lu          alo[7]=ai7,bj[7],ahi[6]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a6=alo[6]               // 29:
        (p48)   add             t[6]=t[6],a2            //      (p16) t[6]+=a2
        (p50)   add             t[6]=t[6],a2,1  };;
{ .mfi; (p16)   nop.m           0                       // 30:
        (p16)   xma.hu          nhi[5]=ni5,mj[0],nhi[4] //      np[5]*m0
        (p40)   cmp.ltu         p41,p39=a2,n2   }
{ .mfi; (p42)   cmp.leu         p41,p39=a2,n2
        (p16)   xma.lu          nlo[5]=ni5,mj[0],nhi[4]
        (p16)   nop.i           0               };;
{ .mfi; (p16)   getf.sig        n4=nlo[4]               // 31:
        (p16)   nop.f           0
        (p48)   cmp.ltu         p49,p47=t[6],a2 }
{ .mfb; (p50)   cmp.leu         p49,p47=t[6],a2
        (p16)   nop.f           0
        br.ctop.sptk.many       .Louter_8_ctop  };;
.Louter_8_cend:

// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mmi; (p0)    getf.sig        n1=ni0                  // 0:
        (p40)   add             a3=a3,n3                //      (p17) a3+=n3
        (p42)   add             a3=a3,n3,1      };;
{ .mii; (p17)   getf.sig        a7=alo[8]               // 1:
        (p48)   add             t[6]=t[6],a3            //      (p17) t[6]+=a3
        (p50)   add             t[6]=t[6],a3,1  };;
{ .mfi; (p17)   getf.sig        a8=ahi[8]               // 2:
        (p17)   xma.hu          nhi[7]=ni6,mj[1],nhi[6] //      np[6]*m0
        (p40)   cmp.ltu         p43,p41=a3,n3   }
{ .mfi; (p42)   cmp.leu         p43,p41=a3,n3
        (p17)   xma.lu          nlo[7]=ni6,mj[1],nhi[6]
        (p0)    nop.i           0               };;
{ .mii; (p17)   getf.sig        n5=nlo[6]               // 3:
        (p48)   cmp.ltu         p51,p49=t[6],a3
        (p50)   cmp.leu         p51,p49=t[6],a3 };;
        .pred.rel               "mutex",p41,p43
        .pred.rel               "mutex",p49,p51
{ .mmi; (p0)    getf.sig        n2=ni1                  // 4:
        (p41)   add             a4=a4,n4                //      (p17) a4+=n4
        (p43)   add             a4=a4,n4,1      };;
{ .mfi; (p49)   add             t[5]=t[5],a4            // 5:   (p17) t[5]+=a4
        (p0)    nop.f           0
        (p51)   add             t[5]=t[5],a4,1  };;
{ .mfi; (p0)    getf.sig        n3=ni2                  // 6:
        (p17)   xma.hu          nhi[8]=ni7,mj[1],nhi[7] //      np[7]*m0
        (p41)   cmp.ltu         p42,p40=a4,n4   }
{ .mfi; (p43)   cmp.leu         p42,p40=a4,n4
        (p17)   xma.lu          nlo[8]=ni7,mj[1],nhi[7]
        (p0)    nop.i           0               };;
{ .mii; (p17)   getf.sig        n6=nlo[7]               // 7:
        (p49)   cmp.ltu         p50,p48=t[5],a4
        (p51)   cmp.leu         p50,p48=t[5],a4 };;
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mii; (p0)    getf.sig        n4=ni3                  // 8:
        (p40)   add             a5=a5,n5                //      (p17) a5+=n5
        (p42)   add             a5=a5,n5,1      };;
{ .mii; (p0)    nop.m           0                       // 9:
        (p48)   add             t[4]=t[4],a5            //      p(17) t[4]+=a5
        (p50)   add             t[4]=t[4],a5,1  };;
{ .mii; (p0)    nop.m           0                       // 10:
        (p40)   cmp.ltu         p43,p41=a5,n5
        (p42)   cmp.leu         p43,p41=a5,n5   };;
{ .mii; (p17)   getf.sig        n7=nlo[8]               // 11:
        (p48)   cmp.ltu         p51,p49=t[4],a5
        (p50)   cmp.leu         p51,p49=t[4],a5 };;
        .pred.rel               "mutex",p41,p43
        .pred.rel               "mutex",p49,p51
{ .mii; (p17)   getf.sig        n8=nhi[8]               // 12:
        (p41)   add             a6=a6,n6                //      (p17) a6+=n6
        (p43)   add             a6=a6,n6,1      };;
{ .mii; (p0)    getf.sig        n5=ni4                  // 13:
        (p49)   add             t[3]=t[3],a6            //      (p17) t[3]+=a6
        (p51)   add             t[3]=t[3],a6,1  };;
{ .mii; (p0)    nop.m           0                       // 14:
        (p41)   cmp.ltu         p42,p40=a6,n6
        (p43)   cmp.leu         p42,p40=a6,n6   };;
{ .mii; (p0)    getf.sig        n6=ni5                  // 15:
        (p49)   cmp.ltu         p50,p48=t[3],a6
        (p51)   cmp.leu         p50,p48=t[3],a6 };;
        .pred.rel               "mutex",p40,p42
        .pred.rel               "mutex",p48,p50
{ .mii; (p0)    nop.m           0                       // 16:
        (p40)   add             a7=a7,n7                //      (p17) a7+=n7
        (p42)   add             a7=a7,n7,1      };;
{ .mii; (p0)    nop.m           0                       // 17:
        (p48)   add             t[2]=t[2],a7            //      (p17) t[2]+=a7
        (p50)   add             t[2]=t[2],a7,1  };;
{ .mii; (p0)    nop.m           0                       // 18:
        (p40)   cmp.ltu         p43,p41=a7,n7
        (p42)   cmp.leu         p43,p41=a7,n7   };;
{ .mii; (p0)    getf.sig        n7=ni6                  // 19:
        (p48)   cmp.ltu         p51,p49=t[2],a7
        (p50)   cmp.leu         p51,p49=t[2],a7 };;
        .pred.rel               "mutex",p41,p43
        .pred.rel               "mutex",p49,p51
{ .mii; (p0)    nop.m           0                       // 20:
        (p41)   add             a8=a8,n8                //      (p17) a8+=n8
        (p43)   add             a8=a8,n8,1      };;
{ .mmi; (p0)    nop.m           0                       // 21:
        (p49)   add             t[1]=t[1],a8            //      (p17) t[1]+=a8
        (p51)   add             t[1]=t[1],a8,1  }
{ .mmi; (p17)   mov             t[0]=r0
        (p41)   cmp.ltu         p42,p40=a8,n8
        (p43)   cmp.leu         p42,p40=a8,n8   };;
{ .mmi; (p0)    getf.sig        n8=ni7                  // 22:
        (p49)   cmp.ltu         p50,p48=t[1],a8
        (p51)   cmp.leu         p50,p48=t[1],a8 }
{ .mmi; (p42)   add             t[0]=t[0],r0,1
        (p0)    add             r16=-7*16,prevsp
        (p0)    add             r17=-6*16,prevsp        };;

// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
//      t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add        t[0]=t[0],r0,1
        add             r18=-5*16,prevsp
        sub             n1=t0,n1        };;
{ .mmi; cmp.gtu         p34,p32=n1,t0;;
        .pred.rel       "mutex",p32,p34
        (p32)sub        n2=t[7],n2
        (p34)sub        n2=t[7],n2,1    };;
{ .mii; (p32)cmp.gtu    p35,p33=n2,t[7]
        (p34)cmp.geu    p35,p33=n2,t[7];;
        .pred.rel       "mutex",p33,p35
        (p33)sub        n3=t[6],n3      }
{ .mmi; (p35)sub        n3=t[6],n3,1;;
        (p33)cmp.gtu    p34,p32=n3,t[6]
        (p35)cmp.geu    p34,p32=n3,t[6] };;
        .pred.rel       "mutex",p32,p34
{ .mii; (p32)sub        n4=t[5],n4
        (p34)sub        n4=t[5],n4,1;;
        (p32)cmp.gtu    p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu    p35,p33=n4,t[5];;
        .pred.rel       "mutex",p33,p35
        (p33)sub        n5=t[4],n5
        (p35)sub        n5=t[4],n5,1    };;
{ .mii; (p33)cmp.gtu    p34,p32=n5,t[4]
        (p35)cmp.geu    p34,p32=n5,t[4];;
        .pred.rel       "mutex",p32,p34
        (p32)sub        n6=t[3],n6      }
{ .mmi; (p34)sub        n6=t[3],n6,1;;
        (p32)cmp.gtu    p35,p33=n6,t[3]
        (p34)cmp.geu    p35,p33=n6,t[3] };;
        .pred.rel       "mutex",p33,p35
{ .mii; (p33)sub        n7=t[2],n7
        (p35)sub        n7=t[2],n7,1;;
        (p33)cmp.gtu    p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu    p34,p32=n7,t[2];;
        .pred.rel       "mutex",p32,p34
        (p32)sub        n8=t[1],n8
        (p34)sub        n8=t[1],n8,1    };;
{ .mii; (p32)cmp.gtu    p35,p33=n8,t[1]
        (p34)cmp.geu    p35,p33=n8,t[1];;
        .pred.rel       "mutex",p33,p35
        (p33)sub        a8=t[0],r0      }
{ .mmi; (p35)sub        a8=t[0],r0,1;;
        (p33)cmp.gtu    p34,p32=a8,t[0]
        (p35)cmp.geu    p34,p32=a8,t[0] };;

// save the result, either tmp[num] or tmp[num]-np[num]
        .pred.rel       "mutex",p32,p34
{ .mmi; (p32)st8        [rptr]=n1,8
        (p34)st8        [rptr]=t0,8
        add             r19=-4*16,prevsp};;
{ .mmb; (p32)st8        [rptr]=n2,8
        (p34)st8        [rptr]=t[7],8
        (p5)br.cond.dpnt.few    .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n3,8
        (p34)st8        [rptr]=t[6],8
        (p7)br.cond.dpnt.few    .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n4,8
        (p34)st8        [rptr]=t[5],8
        (p9)br.cond.dpnt.few    .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n5,8
        (p34)st8        [rptr]=t[4],8
        (p11)br.cond.dpnt.few   .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n6,8
        (p34)st8        [rptr]=t[3],8
        (p13)br.cond.dpnt.few   .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n7,8
        (p34)st8        [rptr]=t[2],8
        (p15)br.cond.dpnt.few   .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n8,8
        (p34)st8        [rptr]=t[1],8
        nop.b           0               };;
.Ldone:                                         // epilogue
{ .mmi; ldf.fill        f16=[r16],64
        ldf.fill        f17=[r17],64
        nop.i           0               }
{ .mmi; ldf.fill        f18=[r18],64
        ldf.fill        f19=[r19],64
        mov             pr=prevpr,0x1ffff       };;
{ .mmi; ldf.fill        f20=[r16]
        ldf.fill        f21=[r17]
        mov             ar.lc=prevlc    }
{ .mmi; ldf.fill        f22=[r18]
        ldf.fill        f23=[r19]
        mov             ret0=1          }       // signal "handled"
{ .mib; rum             1<<5
        .restore        sp
        mov             sp=prevsp
        br.ret.sptk.many        b0      };;
.endp   bn_mul_mont_8#

.type   copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the module. As before, a missing (or false) first argument means
# "write to the inherited STDOUT". The three-argument open keeps a
# file name with leading/trailing whitespace or mode characters from
# being reinterpreted, and both open and close are checked so that a
# failed or truncated write cannot go unnoticed by the build.
my $output = shift;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";