#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths has to be even and vector
# addresses has to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
# Locate our own directory so opensslconf.h can be probed relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

$flavour = shift;	# e.g. "64" selects the PA-RISC 2.0W 64-bit ABI
$output  = shift;	# file the generated assembly is written to

# Redirect STDOUT to the requested output file. Use 3-arg open and
# check the result: the original 2-arg, unchecked form would silently
# discard I/O errors (and is open to mode injection via the filename).
if (defined $output && $output ne '') {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# ABI-dependent parameters: stack slot size, frame layout offsets and
# the push/pop mnemonics used by prologue/epilogue.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# In 32-bit builds BN_LONG may still be 64-bit (SIXTY_FOUR_BIT);
	# probe opensslconf.h for it. Absence of the file is tolerated.
	if (open CONF, '<', "${dir}../../opensslconf.h") {
		while(<CONF>) {
			if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
				$BN_SZ=8;
				$LEVEL="2.0";
				last;
			}
		}
		close CONF;
	}
}

$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#                [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

# Integer register assignments.
$tp="%r31";
$ti1="%r29";
$ti0="%r28";

$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";

$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# Floating-point register assignments (xmpyu operates on FP registers).
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";

$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop				; alignment
___
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0	; are vectors long enough?
	b		L\$abort
	ldi		0,%r28		; signal "unhandled"
	add,ev		%r0,$num,$num	; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0	; are ap and np 64-bit aligned?
	b		L\$abort
	nop
	nop				; alignment
	nop

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}	; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort	; are vectors long enough?
	ldi		0,%r28		; signal "unhandled"
	addl		$num,$num,$num	; I operate on 32-bit values

	fldws		4($n0),${fn0}	; only low part of n0
	fldws		4($bp),${fbi}	; bp[0] in flipped word order
___
$code.=<<___;
	fldds		0($ap),${fai}	; ap[0,1]
	fldds		0($np),${fni}	; np[0,1]

	sh2addl		$num,%r0,$arrsz
	ldi		31,$hi0
	ldo		36($arrsz),$hi1	; space for tp[num+1]
	andcm		$hi1,$hi0,$hi1	; align
	addl		$hi1,%sp,%sp
	$PUSH		$fp,-$SIZE_T(%sp)

	ldo		`$LOCALS+16`($fp),$xfer
	ldo		`$LOCALS+32+4`($fp),$tp

	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu		${fn0},${fab0}R,${fm0}

	addl		$arrsz,$ap,$ap	; point at the end
	addl		$arrsz,$np,$np
	subi		0,$arrsz,$idx	; j=0
	ldo		8($idx),$idx	; j++++

	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	fstds		${fab1},0($xfer)
	fstds		${fnm1},8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11	; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
$code.=<<___;				# PA-RISC 2.0 code-path
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	ldo		8($idx),$idx	; j++++
	addl		$ab0,$nm0,$nm0	; low part is discarded
	extrd,u		$nm0,31,32,$hi1

L\$1st
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	stw		$nm1,-4($tp)	; tp[j-1]
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$1st	; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	stw		$nm1,-4($tp)	; tp[j-1]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)	; tp[j-1]

	ldo		-1($num),$num	; i--
	subi		0,$arrsz,$idx	; j=0
___
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}	; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}	; bp[1] in flipped word order
___
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)	; tp[j-1]

	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2]
	flddx		$idx($np),${fni}	; np[2]
	ldo		8($idx),$idx	; j++++
	ldd		-16($xfer),$ab0	; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0	; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	extrd,u		$ab0,31,32,$ti0	; carry bit
	extrd,u		$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	addl		$ti0,$hi0,$hi0	; account carry bit
	fstds		${fnm1},8($xfer)
	addl		$ab0,$nm0,$nm0	; low part is discarded
	ldw		0($tp),$ti1	; tp[1]
	extrd,u		$nm0,31,32,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

L\$inner
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldw		4($tp),$ti0	; tp[j]
	stw		$nm1,-4($tp)	; tp[j-1]

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ti0,$ti0
	addl		$ti0,$ab0,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,31,32,$hi0
	extrd,u		$nm1,31,32,$hi1
	ldw		8($tp),$ti1	; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$inner	; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldw		4($tp),$ti0	; tp[j]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)	; tp[j-1]
	extrd,u		$ab0,31,32,$hi0
	ldw		8($tp),$ti1	; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)	; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx	; j=0
___
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}	; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0		; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0	; tp[j]

	addl		$hi1,$nm1,$nm1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)	; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0	; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)	; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0	; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0	; is rp 64-bit aligned?
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
	ldo		-4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)	; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0	; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
	ldo		-8($tp),$tp
___
$code.=<<___;
	and		$tp,$hi1,$ap
	andcm		$rp,$hi1,$bp
	or		$ap,$bp,$np

	sub		$rp,$arrsz,$rp	; rewind rp
	subi		0,$arrsz,$idx
	ldo		`$LOCALS+32`($fp),$tp
L\$copy
	ldd		$idx($np),$hi0
	std,ma		%r0,8($tp)
	addib,<>	8,$idx,.-8	; L\$copy
	std,ma		$hi0,8($rp)
___

if ($BN_SZ==4) {			# PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";

$code.=<<___;
	b		L\$done
	nop

	.ALIGN	8
L\$parisc11
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-12($xfer),$ablo
	ldw		-16($xfer),$hi0
	ldw		-4($xfer),$nmlo0
	ldw		-8($xfer),$nmhi0
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

	ldo		8($idx),$idx	; j++++
	add		$ablo,$nmlo0,$nmlo0	; discarded
	addc		%r0,$nmhi0,$hi1
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi
	nop

L\$1st_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		12($xfer),$nmlo1
	addc		%r0,$abhi,$hi0
	ldw		8($xfer),$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fab1},0($xfer)
	addc		%r0,$nmhi1,$nmhi1
	fstds		${fnm1},8($xfer)
	add		$hi1,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$hi1
	ldw		-16($xfer),$abhi

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	ldw		-4($xfer),$nmlo0
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-8($xfer),$nmhi0
	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$hi0
	fstds		${fab0},-16($xfer)
	add		$ablo,$nmlo0,$nmlo0
	fstds		${fnm0},-8($xfer)
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	stws,ma		$nmlo0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$1st_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	ldw		8($xfer),$nmhi1
	ldw		12($xfer),$nmlo1
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	add		$hi0,$ablo,$ablo
	fstds		${fab1},0($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-8($xfer),$nmhi0
	addc		%r0,$nmhi1,$hi1
	ldw		-4($xfer),$nmlo0

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)	; tp[j-1]

	ldo		-1($num),$num	; i--
	subi		0,$arrsz,$idx	; j=0

	fldws,ma	4($bp),${fbi}	; bp[1]
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$hi0
	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nmlo1,-4($tp)	; tp[j-1]

	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer_pa11
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
	ldw		-16($xfer),$abhi	; carry bit actually
	ldo		8($idx),$idx	; j++++
	ldw		-12($xfer),$ablo
	ldw		-8($xfer),$nmhi0
	ldw		-4($xfer),$nmlo0
	ldw		0($xfer),$hi0	; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	fstds		${fab1},0($xfer)
	addl		$abhi,$hi0,$hi0	; account carry bit
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo0,$nmlo0	; discarded
	ldw		0($tp),$ti1	; tp[1]
	addc		%r0,$nmhi0,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi

L\$inner_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0	; tp[j]
	addc		%r0,$abhi,$abhi
	ldw		12($xfer),$nmlo1
	add		$ti1,$ablo,$ablo
	ldw		8($xfer),$nmhi1
	addc		%r0,$abhi,$hi0
	fstds		${fab1},0($xfer)
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fnm1},8($xfer)
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	ldw		8($tp),$ti1	; tp[j]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-4($xfer),$nmlo0
	add		$hi0,$ablo,$ablo
	ldw		-8($xfer),$nmhi0
	addc		%r0,$abhi,$abhi
	stw		$nmlo1,-4($tp)	; tp[j-1]
	add		$ti0,$ablo,$ablo
	fstds		${fab0},-16($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm0},-8($xfer)
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	stws,ma		$nmlo0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$inner_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	ldw		12($xfer),$nmlo1
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldw		8($xfer),$nmhi1
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0	; tp[j]
	addc		%r0,$abhi,$abhi
	fstds		${fab1},0($xfer)
	add		$ti1,$ablo,$ablo
	fstds		${fnm1},8($xfer)
	addc		%r0,$abhi,$hi0
	ldw		-16($xfer),$abhi
	add		$ablo,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$nmhi1
	ldw		-8($xfer),$nmhi0
	add		$hi1,$nmlo1,$nmlo1
	ldw		-4($xfer),$nmlo0
	addc		%r0,$nmhi1,$hi1

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$abhi
	add		$ti0,$ablo,$ablo
	ldw		8($tp),$ti1	; tp[j]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)	; tp[j-1]

	addib,=		-1,$num,L\$outerdone_pa11; i--
	subi		0,$arrsz,$idx	; j=0

	fldws,ma	4($bp),${fbi}	; bp[i]
	flddx		$idx($ap),${fai}	; ap[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0	; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nmlo1,-4($tp)	; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer_pa11
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone_pa11
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldw		4($tp),$ti0	; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	stw		$nmlo1,-4($tp)	; tp[j-1]

	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32+4`($fp),$tp
	sub		%r0,%r0,%r0	; clear borrow
	ldw		-4($tp),$ti0
	addl		$tp,$arrsz,$tp
L\$sub_pa11
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub_pa11
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
	ldo		-4($tp),$tp
	and		$tp,$hi1,$ap
	andcm		$rp,$hi1,$bp
	or		$ap,$bp,$np

	sub		$rp,$arrsz,$rp	; rewind rp
	subi		0,$arrsz,$idx
	ldo		`$LOCALS+32`($fp),$tp
L\$copy_pa11
	ldwx		$idx($np),$hi0
	stws,ma		%r0,4($tp)
	addib,<>	4,$idx,L\$copy_pa11
	stws,ma		$hi0,4($rp)

	nop				; alignment
L\$done
___
}

$code.=<<___;
	ldi		1,%r28		; signal "handled"
	ldo		$FRAME($fp),%sp	; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...

my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
	my $len=32-$3;
	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
	my $len=32-$2;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
	my $cpos=63-$3;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $sub = sub {
  my ($mod,$args) = @_;
  my $orig = "sub$mod\t$args";

    # only the ",db" (decimal-correct borrow) variant needs encoding
    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
	$opcode|=(1<<10);	# e1
	$opcode|=(1<<8);	# e2
	$opcode|=(1<<5);	# d
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
    }
    else { "\t".$orig; }
};

# Dispatch a mnemonic to its encoder closure (if one exists above),
# otherwise emit the instruction verbatim.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $opcode = eval("\$$mnemonic");

    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

	print $_,"\n";
}
# STDOUT is a write handle redirected to the output file; buffered
# write errors surface at close, so the result must be checked.
close STDOUT or die "error closing STDOUT: $!";