#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.

# Usage: <script> <flavour> <output-file>
#
# $flavour selects the ABI: anything containing "64" generates the
# 64-bit (2.0W) variant, everything else the 32-bit one. $output names
# the file that receives the generated assembly.
$flavour = shift;
$output  = shift;

# Redirect STDOUT only when an output file was actually named, and fail
# loudly if it cannot be created. The 3-argument form keeps the file
# name from being interpreted as part of the open mode.
if (defined($output) && $output ne "") {
	open STDOUT, ">", $output or die "can't open $output: $!";
}

# Per-flavour ABI parameters: assembler level, size_t width, frame
# marker size, return-pointer save offset, the store/load mnemonics
# used by prologue and epilogue, and the highest callee-saved general
# register that the prologue stores.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$NREGS		=6;
} else {
	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$NREGS		=11;
}

$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
				#                 [+ argument transfer]

################# volatile registers
$Xi="%r26";	# argument block
$Htbl="%r25";
$inp="%r24";
$len="%r23";
$Hhh=$Htbl;	# variables
$Hll="%r22";
$Zhh="%r21";
$Zll="%r20";
$cnt="%r19";
$rem_4bit="%r28";
$rem="%r29";
$mask0xf0="%r31";

################# preserved registers
$Thh="%r1";
$Tll="%r2";
$nlo="%r3";
$nhi="%r4";
$byte="%r5";
if ($SIZE_T==4) {
	$Zhl="%r6";
	$Zlh="%r7";
	$Hhl="%r8";
	$Hlh="%r9";
	$Thl="%r10";
	$Tlh="%r11";
}
$rem2="%r6";	# used in PA-RISC 2.0 code

# ---------------------------------------------------------------------
# gcm_gmult_4bit: prologue. Expressions in `backticks` are evaluated by
# the output loop at the end of the script.
# ---------------------------------------------------------------------
$code.=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# The 32-bit code path uses %r7-%r11 as well, so they are saved too.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit build only: run-time selection between the PA-RISC 2.0 code
# that follows and the L$parisc1_gmult code further down.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___

# gcm_gmult_4bit: PA-RISC 2.0 code path.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv -1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___

# gcm_gmult_4bit: 32-bit-register code path (32-bit build only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit: epilogue.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# ---------------------------------------------------------------------
# gcm_ghash_4bit: prologue. ENTRY_GR uses $NREGS, exactly as in
# gcm_gmult_4bit, so that the unwind information agrees with the
# registers this prologue really stores (%r3-%r6 in the 64-bit
# flavour, %r3-%r11 in the 32-bit one).
# ---------------------------------------------------------------------
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit build only: same run-time code path selection as in gmult.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___

# gcm_ghash_4bit: PA-RISC 2.0 code path.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv -1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___

# gcm_ghash_4bit: 32-bit-register code path (32-bit build only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit: epilogue.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Function return, followed by the shared L$rem_4bit table (16 entries
# of two words each) that both functions index through $rem_4bit.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
	.ALIGN	64
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with
# .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
#
# Every encoder below is called as $encoder->($mod, $args), where $mod
# is the completer text that followed the mnemonic (possibly empty) and
# $args is the operand string. It returns one line of assembler text:
# a ".WORD" carrying the hand-computed opcode, with the original
# instruction kept as a trailing comment, when the operands match one
# of the recognised forms; otherwise the instruction is returned as is.
# Regex captures are copied into named variables straight away so that
# later pattern matches cannot clobber them.

# ldd: register-indexed form (format 4) and short-displacement form
# (format 5, with optional ",m..." / ",mb" completers).
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {		# format 4
	my ($index, $base, $target) = ($1, $2, $3);
	my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {		# format 5
	my ($disp, $base, $target) = ($1, $2, $3);
	my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
	$opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);		# encode offset
	$opcode |= (1<<5)  if ($mod =~ /^,m/);
	$opcode |= (1<<13) if ($mod =~ /^,mb/);
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# std: register, displacement(base) form only.
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {		# format 3 suffices
	my ($source, $disp, $base) = ($1, $2, $3);
	my $opcode = (0x1c<<26)|($base<<21)|($source<<16)
		    |(($disp&0x1FF8)<<1)|(($disp>>13)&1);
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# extrd: fixed-position form (format 15) and %sar form (format 12).
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {		# format 15
	my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
	my $opcode = (0x36<<26)|($source<<21)|($target<<16);
	my $len = 32-$width;
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);			# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);			# encode len
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {		# format 12
	my ($source, $width, $target) = ($1, $2, $3);
	my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
	my $len = 32-$width;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);			# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# shrpd: fixed shift amount (format 14) and %sar form (format 11).
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 14
	my ($src1, $src2, $shift, $target) = ($1, $2, $3, $4);
	my $opcode = (0x34<<26)|($src2<<21)|($src1<<16)|(1<<10)|$target;
	my $cpos = 63-$shift;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    if ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {		# format 11
	my ($src1, $src2, $target) = ($1, $2, $3);
	return sprintf "\t.WORD\t0x%08x\t; %s",
		(0x34<<26)|($src2<<21)|($src1<<16)|(1<<9)|$target, $orig;
    }
    return "\t".$orig;
};

# depd: fixed-position form only (format 16).
my $depd = sub {
    my ($mod, $args) = @_;
    my $orig = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {		# format 16
	my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
	my $opcode = (0x3c<<26)|($target<<21)|($source<<16);
	my $cpos = 63-$pos;
	my $len = 32-$width;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);			# encode len
	return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# Mnemonic -> encoder. An explicit table replaces looking the encoder
# up through a string eval of "\$$mnemonic".
my %encoder_for = (
    ldd   => $ldd,
    std   => $std,
    extrd => $extrd,
    shrpd => $shrpd,
    depd  => $depd,
);

# assemble($mnemonic, $mod, $args)
#
# Returns the assembler line for one instruction: the hand-encoded
# form when an encoder is registered for $mnemonic, otherwise the
# instruction re-emitted as "\t$mnemonic$mod\t$args".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return defined($encoder) ? $encoder->($mod, $args)
			     : "\t$mnemonic$mod\t$args";
}

# Emit the generated code line by line. `...` expressions are evaluated
# as Perl. For the 32-bit build every instruction is passed through
# assemble(), "cmpb,*" becomes "comb," and the first remaining ",*" is
# reduced to ",".
foreach my $line (split("\n", $code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;
    if ($SIZE_T==4) {
	$line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
	$line =~ s/cmpb,\*/comb,/;
	$line =~ s/,\*/,/;
    }
    print $line, "\n";
}

# Buffered write errors only surface at close, so do not ignore them.
close STDOUT or die "error closing STDOUT: $!";