#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 block procedure for PA-RISC.

# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by vendor compiler this
# implementation is almost 70% faster in 64-bit build, but delivers
# virtually same performance in 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects
# whether the code is executed on a PA-RISC 2.0 processor and switches
# to the 64-bit code path, delivering adequate performance even in a
# "blended" 32-bit build. Though 64-bit code is not any faster than
# code generated by vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.

# Usage: <script> <flavour> <output>
#   flavour - a string containing "64" selects the 64-bit (2.0W) ABI,
#             anything else selects the 32-bit ABI;
#   output  - name of the assembly file to write; a name containing
#             "512" selects SHA-512, anything else selects SHA-256.
#             When omitted, the code is written to the inherited STDOUT.
$flavour = shift;
$output  = shift;
if (defined($output) && length($output)) {
	# A failed open used to go unnoticed and the generated code ended up
	# on the inherited STDOUT; abort instead so that the build stops.
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# ABI-dependent parameters: frame layout and the instructions used to
# save/restore pointer-sized registers.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}

# Algorithm-dependent parameters: word size, rotate/shift amounts, number
# of rounds, low 10 bits of the last table word (used to detect the end
# of the table), and the load/store instructions for one hash word.
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$LAST10BITS=0x017;
	$LD="ldd";
	$LDM="ldd,ma";
	$ST="std";
} else {
	$func="sha256_block_data_order";
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$LAST10BITS=0x0f2;
	$LD="ldw";
	$LDM="ldwm";
	$ST="stw";
}

$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
				#                 [+ argument transfer]
$XOFF=16*$SZ+32;		# local variables
$FRAME+=$XOFF;
$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables

$ctx="%r26";	# zapped by $a0
$inp="%r25";	# zapped by $a1
$num="%r24";	# zapped by $t0

$a0 ="%r26";
$a1 ="%r25";
$t0 ="%r24";
$t1 ="%r29";
$Tbl="%r31";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");

@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);

# Append one round to $code. $i is the round index, the remaining
# arguments are the registers currently playing the roles of a..h.
# X[i%16] is added to h together with whatever is in $t1 on entry; the
# next table word is fetched into $t1 only when $i<15 (ROUND_16_xx does
# its own table load before calling in with $i>=16).
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
	_ror	$e,$Sigma1[0],$a0
	and	$f,$e,$t0
	_ror	$e,$Sigma1[1],$a1
	addl	$t1,$h,$h
	andcm	$g,$e,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
	or	$t0,$t1,$t1		; Ch(e,f,g)
	addl	@X[$i%16],$h,$h
	xor	$a0,$a1,$a1		; Sigma1(e)
	addl	$t1,$h,$h
	_ror	$a,$Sigma0[0],$a0
	addl	$a1,$h,$h

	_ror	$a,$Sigma0[1],$a1
	and	$a,$b,$t0
	and	$a,$c,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$t1,$t0,$t0
	and	$b,$c,$t1
	xor	$a0,$a1,$a1		; Sigma0(a)
	addl	$h,$d,$d
	xor	$t1,$t0,$t0		; Maj(a,b,c)
	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
	addl	$a1,$h,$h
	addl	$t0,$h,$h

___
}

# Append the message schedule update for X[i%16] followed by the round
# itself. For the 16th round of a pass the low 10 bits of the table word
# just loaded are compared with $LAST10BITS and, on a match, $Tbl is
# bumped by 1 so that its least significant bit signals end of table.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
	_ror	@X[($i+1)%16],$sigma0[0],$a0
	_ror	@X[($i+1)%16],$sigma0[1],$a1
	addl	@X[($i+9)%16],@X[$i],@X[$i]
	_ror	@X[($i+14)%16],$sigma1[0],$t0
	_ror	@X[($i+14)%16],$sigma1[1],$t1
	xor	$a1,$a0,$a0
	_shr	@X[($i+1)%16],$sigma0[2],$a1
	xor	$t1,$t0,$t0
	_shr	@X[($i+14)%16],$sigma1[2],$t1
	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
	$LDM	$SZ($Tbl),$t1
	addl	$a0,@X[$i],@X[$i]
	addl	$t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
	extru	$t1,31,10,$a1
	comiclr,<> $LAST10BITS,$a1,%r0
	ldo	1($Tbl),$Tbl		; signal end of $Tbl
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}

$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.ALIGN	64
L\$table
___
# Round constants, emitted as 32-bit words (two words per SHA-512 constant).
$code.=<<___ if ($SZ==8);
	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___;

	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	64
$func
	.PROC
	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)

	_shl	$num,`log(16*$SZ)/log(2)`,$num
	addl	$inp,$num,$num		; $num to point at the end of $inp

	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)

	blr	%r0,$Tbl
	ldi	3,$t1
L\$pic
	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
	ldo	L\$table-L\$pic($Tbl),$Tbl
___
# SHA-512 in a 32-bit build: run-time switch to the L$parisc1 code path.
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
	ldi	31,$t1
	mtctl	$t1,%cr11
	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
	b	L\$parisc1
	nop
___
$code.=<<___;
	$LD	`0*$SZ`($ctx),$A	; load context
	$LD	`1*$SZ`($ctx),$B
	$LD	`2*$SZ`($ctx),$C
	$LD	`3*$SZ`($ctx),$D
	$LD	`4*$SZ`($ctx),$E
	$LD	`5*$SZ`($ctx),$F
	$LD	`6*$SZ`($ctx),$G
	$LD	`7*$SZ`($ctx),$H

	extru	$inp,31,`log($SZ)/log(2)`,$t0
	sh3addl	$t0,%r0,$t0
	subi	`8*$SZ`,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop
	ldi	`$SZ-1`,$t0
	$LDM	$SZ($Tbl),$t1
	andcm	$inp,$t0,$t0		; align $inp
___
	for ($i=0;$i<15;$i++) {		# load input block
	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
$code.=<<___;
	cmpb,*=	$inp,$t0,L\$aligned
	$LD	`$SZ*15`($t0),@X[15]
	$LD	`$SZ*16`($t0),@X[16]
___
	for ($i=0;$i<16;$i++) {		# align data
	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
$code.=<<___;
L\$aligned
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___

# Rounds 0..15 are emitted once; rounds 16..31 form the body of the
# L$rounds loop. @V is rotated after every round so that register roles
# shift instead of register contents.
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
	nop

	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl

	$LD	`0*$SZ`($ctx),@X[0]	; load context
	$LD	`1*$SZ`($ctx),@X[1]
	$LD	`2*$SZ`($ctx),@X[2]
	$LD	`3*$SZ`($ctx),@X[3]
	$LD	`4*$SZ`($ctx),@X[4]
	$LD	`5*$SZ`($ctx),@X[5]
	addl	@X[0],$A,$A
	$LD	`6*$SZ`($ctx),@X[6]
	addl	@X[1],$B,$B
	$LD	`7*$SZ`($ctx),@X[7]
	ldo	`16*$SZ`($inp),$inp	; advance $inp

	$ST	$A,`0*$SZ`($ctx)	; save context
	addl	@X[2],$C,$C
	$ST	$B,`1*$SZ`($ctx)
	addl	@X[3],$D,$D
	$ST	$C,`2*$SZ`($ctx)
	addl	@X[4],$E,$E
	$ST	$D,`3*$SZ`($ctx)
	addl	@X[5],$F,$F
	$ST	$E,`4*$SZ`($ctx)
	addl	@X[6],$G,$G
	$ST	$F,`5*$SZ`($ctx)
	addl	@X[7],$H,$H
	$ST	$G,`6*$SZ`($ctx)
	$ST	$H,`7*$SZ`($ctx)

	cmpb,*<>,n $inp,$num,L\$oop
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
___
if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
{{
$code.=<<___;
	b	L\$done
	nop

	.ALIGN	64
L\$parisc1
___

# Every 64-bit working variable lives in a hi/lo pair of 32-bit registers;
# the message schedule is kept on the stack at -$XOFF(%sp).
@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";

@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx

# Append one SHA-512 round built from 32-bit instructions. Arguments are
# the round index, the 16 registers holding a..h as hi/lo pairs, and a
# flag that is set when the caller (ROUND_16_xx_pa1) has already loaded
# X[i+1]. With the flag set and $i==15 the end-of-table test is appended.
# @X is rotated by two on exit so that X[i+1] becomes the current word.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;

$code.=<<___ if (!$flag);
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
___
$code.=<<___;
	shd	$ehi,$elo,$Sigma1[0],$t0
	 add	$Xlo,$hlo,$hlo
	shd	$elo,$ehi,$Sigma1[0],$t1
	 addc	$Xhi,$hhi,$hhi		; h += X[i]
	shd	$ehi,$elo,$Sigma1[1],$t2
	 ldwm	8($Tbl),$Xhi
	shd	$elo,$ehi,$Sigma1[1],$t3
	 ldw	-4($Tbl),$Xlo		; load K[i]
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	 and	$flo,$elo,$a0
	 and	$fhi,$ehi,$a1
	shd	$ehi,$elo,$Sigma1[2],$t2
	 andcm	$glo,$elo,$a2
	shd	$elo,$ehi,$Sigma1[2],$t3
	 andcm	$ghi,$ehi,$a3
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma1(e)
	add	$Xlo,$hlo,$hlo
	 xor	$a2,$a0,$a0
	addc	$Xhi,$hhi,$hhi		; h += K[i]
	 xor	$a3,$a1,$a1		; Ch(e,f,g)

	 add	$t0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[0],$t0
	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
	shd	$alo,$ahi,$Sigma0[0],$t1
	 add	$a0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[1],$t2
	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
	shd	$alo,$ahi,$Sigma0[1],$t3

	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	shd	$ahi,$alo,$Sigma0[2],$t2
	 and	$alo,$blo,$a0
	shd	$alo,$ahi,$Sigma0[2],$t3
	 and	$ahi,$bhi,$a1
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma0(a)

	 and	$alo,$clo,$a2
	 and	$ahi,$chi,$a3
	 xor	$a2,$a0,$a0
	add	$hlo,$dlo,$dlo
	 xor	$a3,$a1,$a1
	addc	$hhi,$dhi,$dhi		; d += h
	 and	$blo,$clo,$a2
	add	$t0,$hlo,$hlo
	 and	$bhi,$chi,$a3
	addc	$t1,$hhi,$hhi		; h += Sigma0(a)
	 xor	$a2,$a0,$a0
	add	$a0,$hlo,$hlo
	 xor	$a3,$a1,$a1		; Maj(a,b,c)
	addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)

___
$code.=<<___ if ($i==15 && $flag);
	extru	$Xlo,31,10,$Xlo
	comiclr,= $LAST10BITS,$Xlo,%r0
	b	L\$rounds_pa1
	nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}

# Append the 32-bit message schedule update for X[i%16], store the result
# back to the stack and then emit the round itself with the flag set.
# The first argument is the round index, the rest is passed through to
# ROUND_00_15_pa1 unchanged.
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
	 add	$a0,$Xlo,$Xlo
	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
	 addc	$a1,$Xhi,$Xhi
	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
	xor	$t2,$t0,$t0
	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
	xor	$t3,$t1,$t1
	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
	xor	$t2,$t0,$t0
	 shd	$a3,$a2,$sigma1[0],$a0
	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
	 shd	$a2,$a3,$sigma1[0],$a1
	add	$t0,$Xlo,$Xlo
	 shd	$a3,$a2,$sigma1[1],$t2
	addc	$t1,$Xhi,$Xhi
	 shd	$a2,$a3,$sigma1[1],$t3
	xor	$t2,$a0,$a0
	shd	$a3,$a2,$sigma1[2],$t2
	xor	$t3,$a1,$a1
	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
	xor	$t2,$a0,$a0
	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
	add	$a0,$Xlo,$Xlo
	addc	$a1,$Xhi,$Xhi

	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
$code.=<<___;
	ldw	`0*4`($ctx),$Ahi		; load context
	ldw	`1*4`($ctx),$Alo
	ldw	`2*4`($ctx),$Bhi
	ldw	`3*4`($ctx),$Blo
	ldw	`4*4`($ctx),$Chi
	ldw	`5*4`($ctx),$Clo
	ldw	`6*4`($ctx),$Dhi
	ldw	`7*4`($ctx),$Dlo
	ldw	`8*4`($ctx),$Ehi
	ldw	`9*4`($ctx),$Elo
	ldw	`10*4`($ctx),$Fhi
	ldw	`11*4`($ctx),$Flo
	ldw	`12*4`($ctx),$Ghi
	ldw	`13*4`($ctx),$Glo
	ldw	`14*4`($ctx),$Hhi
	ldw	`15*4`($ctx),$Hlo

	extru	$inp,31,2,$t0
	sh3addl	$t0,%r0,$t0
	subi	32,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop_pa1
	extru	$inp,31,2,$a3
	comib,=	0,$a3,L\$aligned_pa1
	sub	$inp,$a3,$inp

	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	vshd	$X[0],$X[1],$X[0]
	vshd	$X[1],$t2,$X[1]
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	vshd	$t2,$t3,$t2
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
	vshd	$t3,$a0,$t3
___
{
# Unaligned input: 33 words are read (the last load is word 32) and each
# adjacent pair is funnel-shifted into one aligned word before storing.
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
	b	L\$collected_pa1
	stw	$t[0],`-$XOFF+$i*4`(%sp)

___
}
$code.=<<___;
L\$aligned_pa1
	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
{
# Aligned input: exactly 32 words are read and copied to the stack.
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}

# @V holds hi/lo pairs, hence the double rotation after every round.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }

$code.=<<___;
	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl

	ldw	`0*4`($ctx),$t1		; update context
	ldw	`1*4`($ctx),$t0
	ldw	`2*4`($ctx),$t3
	ldw	`3*4`($ctx),$t2
	ldw	`4*4`($ctx),$a1
	ldw	`5*4`($ctx),$a0
	ldw	`6*4`($ctx),$a3
	add	$t0,$Alo,$Alo
	ldw	`7*4`($ctx),$a2
	addc	$t1,$Ahi,$Ahi
	ldw	`8*4`($ctx),$t1
	add	$t2,$Blo,$Blo
	ldw	`9*4`($ctx),$t0
	addc	$t3,$Bhi,$Bhi
	ldw	`10*4`($ctx),$t3
	add	$a0,$Clo,$Clo
	ldw	`11*4`($ctx),$t2
	addc	$a1,$Chi,$Chi
	ldw	`12*4`($ctx),$a1
	add	$a2,$Dlo,$Dlo
	ldw	`13*4`($ctx),$a0
	addc	$a3,$Dhi,$Dhi
	ldw	`14*4`($ctx),$a3
	add	$t0,$Elo,$Elo
	ldw	`15*4`($ctx),$a2
	addc	$t1,$Ehi,$Ehi
	stw	$Ahi,`0*4`($ctx)
	add	$t2,$Flo,$Flo
	stw	$Alo,`1*4`($ctx)
	addc	$t3,$Fhi,$Fhi
	stw	$Bhi,`2*4`($ctx)
	add	$a0,$Glo,$Glo
	stw	$Blo,`3*4`($ctx)
	addc	$a1,$Ghi,$Ghi
	stw	$Chi,`4*4`($ctx)
	add	$a2,$Hlo,$Hlo
	stw	$Clo,`5*4`($ctx)
	addc	$a3,$Hhi,$Hhi
	stw	$Dhi,`6*4`($ctx)
	ldo	`16*$SZ`($inp),$inp	; advance $inp
	stw	$Dlo,`7*4`($ctx)
	stw	$Ehi,`8*4`($ctx)
	stw	$Elo,`9*4`($ctx)
	stw	$Fhi,`10*4`($ctx)
	stw	$Flo,`11*4`($ctx)
	stw	$Ghi,`12*4`($ctx)
	stw	$Glo,`13*4`($ctx)
	stw	$Hhi,`14*4`($ctx)
	comb,=	$inp,$num,L\$done
	stw	$Hlo,`15*4`($ctx)
	b	L\$oop_pa1
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
L\$done
___
}}
$code.=<<___;
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...

# Each encoder below takes the completer string ($mod) and the operand
# string ($args) of one instruction and returns the line to print: a
# .WORD directive carrying the hand-assembled opcode (with the original
# instruction as a trailing comment) when the operands match a supported
# format, or the instruction text unchanged otherwise.

my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
    {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
	$opcode|=(1<<3) if ($mod =~ /^,m/);
	$opcode|=(1<<2) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
	my $len=32-$3;
	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
	my $len=32-$2;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
	my $cpos=63-$3;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {	sprintf "\t.WORD\t0x%08x\t; %s",
		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
    }
    else { "\t".$orig; }
};

# Mnemonics that have to be hand-encoded, mapped to their encoders. An
# explicit table replaces the former string eval of the mnemonic.
my %encoder = (ldd => $ldd, std => $std, extrd => $extrd, shrpd => $shrpd);

# Return the output line for one instruction: the hand-encoded form when
# an encoder exists for $mnemonic, the re-joined instruction otherwise.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $opcode = $encoder{$mnemonic};

    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}

# Post-process the generated text line by line: evaluate `...` arithmetic,
# normalize shd rotations, expand the made up instructions and, in 32-bit
# builds, hand-encode PA-RISC 2.0 instructions.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
	# translate made up instructions: _ror, _shr, _align, _shl
	s/_ror(\s+)(%r[0-9]+),/
		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or

	s/_shr(\s+%r[0-9]+),([0-9]+),/
		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or

	s/_align(\s+%r[0-9]+,%r[0-9]+),/
		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or

	s/_shl(\s+%r[0-9]+),([0-9]+),/
		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);

	s/cmpb,\*/comb,/ if ($SIZE_T==4);

	print $_,"\n";
}

# Buffered write errors (e.g. disk full) surface here; fail the build
# rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";