#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 block procedure for PA-RISC.

# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by vendor compiler this
# implementation is almost 70% faster in 64-bit build, but delivers
# virtually same performance in 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, PA-RISC 1.1 processor. The implementation then detects if
# the code is executed on PA-RISC 2.0 processor and switches to 64-bit
# code path delivering adequate performance even in "blended" 32-bit
# build. Though 64-bit code is not any faster than code generated by
# vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.

     28 $flavour = shift;
     29 $output = shift;
     30 open STDOUT,">$output";
     31 
     32 if ($flavour =~ /64/) {
     33 	$LEVEL		="2.0W";
     34 	$SIZE_T		=8;
     35 	$FRAME_MARKER	=80;
     36 	$SAVED_RP	=16;
     37 	$PUSH		="std";
     38 	$PUSHMA		="std,ma";
     39 	$POP		="ldd";
     40 	$POPMB		="ldd,mb";
     41 } else {
     42 	$LEVEL		="1.0";
     43 	$SIZE_T		=4;
     44 	$FRAME_MARKER	=48;
     45 	$SAVED_RP	=20;
     46 	$PUSH		="stw";
     47 	$PUSHMA		="stwm";
     48 	$POP		="ldw";
     49 	$POPMB		="ldwm";
     50 }
     51 
     52 if ($output =~ /512/) {
     53 	$func="sha512_block_data_order";
     54 	$SZ=8;
     55 	@Sigma0=(28,34,39);
     56 	@Sigma1=(14,18,41);
     57 	@sigma0=(1,  8, 7);
     58 	@sigma1=(19,61, 6);
     59 	$rounds=80;
     60 	$LAST10BITS=0x017;
     61 	$LD="ldd";
     62 	$LDM="ldd,ma";
     63 	$ST="std";
     64 } else {
     65 	$func="sha256_block_data_order";
     66 	$SZ=4;
     67 	@Sigma0=( 2,13,22);
     68 	@Sigma1=( 6,11,25);
     69 	@sigma0=( 7,18, 3);
     70 	@sigma1=(17,19,10);
     71 	$rounds=64;
     72 	$LAST10BITS=0x0f2;
     73 	$LD="ldw";
     74 	$LDM="ldwm";
     75 	$ST="stw";
     76 }
     77 
     78 $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
     79 				#                 [+ argument transfer]
     80 $XOFF=16*$SZ+32;		# local variables
     81 $FRAME+=$XOFF;
     82 $XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
     83 
     84 $ctx="%r26";	# zapped by $a0
     85 $inp="%r25";	# zapped by $a1
     86 $num="%r24";	# zapped by $t0
     87 
     88 $a0 ="%r26";
     89 $a1 ="%r25";
     90 $t0 ="%r24";
     91 $t1 ="%r29";
     92 $Tbl="%r31";
     93 
     94 @V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
     95 
     96 @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
     97     "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
     98 
# ROUND_00_15(i, a,b,c,d,e,f,g,h) appends the code for one round to
# $code. The eight trailing arguments are the names of the registers
# currently acting as the working variables; callers rotate @V between
# invocations, so no register-to-register moves are generated.
#
# The generated sequence expects the round constant in $t1 (it is added
# to h first, before $t1 is reused as scratch) and, for $i<15, fetches
# the constant for the following round from the table with
# "$LDM $SZ($Tbl),$t1". The third rotation of each Sigma is derived from
# the second one, hence the differences Sigma[2]-Sigma[1]. _ror is a
# made-up mnemonic translated by the post-processing loop at the end of
# this file; the expressions in backticks are evaluated there as well.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
	_ror	$e,$Sigma1[0],$a0
	and	$f,$e,$t0
	_ror	$e,$Sigma1[1],$a1
	addl	$t1,$h,$h
	andcm	$g,$e,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
	or	$t0,$t1,$t1		; Ch(e,f,g)
	addl	@X[$i%16],$h,$h
	xor	$a0,$a1,$a1		; Sigma1(e)
	addl	$t1,$h,$h
	_ror	$a,$Sigma0[0],$a0
	addl	$a1,$h,$h

	_ror	$a,$Sigma0[1],$a1
	and	$a,$b,$t0
	and	$a,$c,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$t1,$t0,$t0
	and	$b,$c,$t1
	xor	$a0,$a1,$a1		; Sigma0(a)
	addl	$h,$d,$d
	xor	$t1,$t0,$t0		; Maj(a,b,c)
	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
	addl	$a1,$h,$h
	addl	$t0,$h,$h

___
}

# ROUND_16_xx(i, a,b,c,d,e,f,g,h) appends the message-schedule update
# for one round >=16 followed by the round itself.
#
# With $i reduced to 0..15 the generated code computes
#   X[i] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16])
# and loads the round constant into $t1 for the round that follows.
# In the last slot ($i==15) it additionally compares the low 10 bits of
# the constant just loaded with $LAST10BITS and bumps $Tbl by one byte
# as an "end of table" flag; NOTE(review): the ldo is presumably
# nullified by comiclr,<> for every constant but the last - confirm
# against the PA-RISC manual.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
	_ror	@X[($i+1)%16],$sigma0[0],$a0
	_ror	@X[($i+1)%16],$sigma0[1],$a1
	addl	@X[($i+9)%16],@X[$i],@X[$i]
	_ror	@X[($i+14)%16],$sigma1[0],$t0
	_ror	@X[($i+14)%16],$sigma1[1],$t1
	xor	$a1,$a0,$a0
	_shr	@X[($i+1)%16],$sigma0[2],$a1
	xor	$t1,$t0,$t0
	_shr	@X[($i+14)%16],$sigma1[2],$t1
	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
	$LDM	$SZ($Tbl),$t1
	addl	$a0,@X[$i],@X[$i]
	addl	$t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
	extru	$t1,31,10,$a1
	comiclr,<> $LAST10BITS,$a1,%r0
	ldo	1($Tbl),$Tbl		; signal end of $Tbl
___
# Passing $i+16 (>=16) keeps ROUND_00_15 from emitting its own constant
# load, while @X[$i%16] still selects the slot updated above.
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}

# Start of the generated module: assembler level, code subspace and the
# 64-byte aligned round-constant table L$table. For $SZ==8 every 64-bit
# constant is emitted as a pair of .WORDs, for $SZ==4 as a single .WORD.
# The low 10 bits of the final word of each table equal $LAST10BITS,
# which is what the generated code tests to find the end of the table.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.ALIGN	64
L\$table
___
$code.=<<___ if ($SZ==8);
	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
# Function entry. The generated prologue saves %r2 and %r3-%r18,
# turns $num from a block count into a pointer to the end of the input
# ($inp + $num*16*$SZ), spills the three arguments to the frame (their
# registers are reused as scratch) and computes the address of L$table
# relative to the current instruction address (blr/andcm/ldo).
$code.=<<___;

	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	64
$func
	.PROC
	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)

	_shl	$num,`log(16*$SZ)/log(2)`,$num
	addl	$inp,$num,$num		; $num to point at the end of $inp

	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)

	blr	%r0,$Tbl
	ldi	3,$t1
L\$pic
	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
	ldo	L\$table-L\$pic($Tbl),$Tbl
___
# SHA-512 in a 32-bit build only: run-time probe that diverts to the
# 32-bit register-pair code at L$parisc1 when the 64-bit instructions
# are not available. NOTE(review): relies on the hand-encoded extrd
# nullifying the branch on PA-RISC 2.0 - confirm against the manual.
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
	ldi	31,$t1
	mtctl	$t1,%cr11
	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
	b	L\$parisc1
	nop
___
# Main code path, working on whole $SZ-byte words: load the eight
# chaining values, program %sar (%cr11) with the bit offset derived
# from the low address bits of $inp, then process one block per L$oop
# iteration.
$code.=<<___;
	$LD	`0*$SZ`($ctx),$A	; load context
	$LD	`1*$SZ`($ctx),$B
	$LD	`2*$SZ`($ctx),$C
	$LD	`3*$SZ`($ctx),$D
	$LD	`4*$SZ`($ctx),$E
	$LD	`5*$SZ`($ctx),$F
	$LD	`6*$SZ`($ctx),$G
	$LD	`7*$SZ`($ctx),$H

	extru	$inp,31,`log($SZ)/log(2)`,$t0
	sh3addl	$t0,%r0,$t0
	subi	`8*$SZ`,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop
	ldi	`$SZ-1`,$t0
	$LDM	$SZ($Tbl),$t1
	andcm	$inp,$t0,$t0		; align $inp
___
	for ($i=0;$i<15;$i++) {		# load input block
	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
# Word 15 is loaded after the alignment test is issued; for unaligned
# input a 17th word is fetched into @X[16] and every X[i] is rebuilt
# from the pair X[i],X[i+1].
$code.=<<___;
	cmpb,*=	$inp,$t0,L\$aligned
	$LD	`$SZ*15`($t0),@X[15]
	$LD	`$SZ*16`($t0),@X[16]
___
	for ($i=0;$i<16;$i++) {		# align data
	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
$code.=<<___;
L\$aligned
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___

# Rounds 0..15 are emitted once; rounds 16..31 are emitted once after
# L$rounds and re-executed by the generated code until the end-of-table
# flag set by ROUND_16_xx shows up in $Tbl. @V is rotated after every
# round so that register roles shift instead of register contents.
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
# After the rounds: reload the spilled arguments, rewind $Tbl (the extra
# -1 removes the end-of-table flag), add the working variables to the
# context, advance $inp and loop while it differs from the end pointer.
$code.=<<___;
	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
	nop

	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl

	$LD	`0*$SZ`($ctx),@X[0]	; load context
	$LD	`1*$SZ`($ctx),@X[1]
	$LD	`2*$SZ`($ctx),@X[2]
	$LD	`3*$SZ`($ctx),@X[3]
	$LD	`4*$SZ`($ctx),@X[4]
	$LD	`5*$SZ`($ctx),@X[5]
	addl	@X[0],$A,$A
	$LD	`6*$SZ`($ctx),@X[6]
	addl	@X[1],$B,$B
	$LD	`7*$SZ`($ctx),@X[7]
	ldo	`16*$SZ`($inp),$inp	; advance $inp

	$ST	$A,`0*$SZ`($ctx)	; save context
	addl	@X[2],$C,$C
	$ST	$B,`1*$SZ`($ctx)
	addl	@X[3],$D,$D
	$ST	$C,`2*$SZ`($ctx)
	addl	@X[4],$E,$E
	$ST	$D,`3*$SZ`($ctx)
	addl	@X[5],$F,$F
	$ST	$E,`4*$SZ`($ctx)
	addl	@X[6],$G,$G
	$ST	$F,`5*$SZ`($ctx)
	addl	@X[7],$H,$H
	$ST	$G,`6*$SZ`($ctx)
	$ST	$H,`7*$SZ`($ctx)

	cmpb,*<>,n $inp,$num,L\$oop
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
___
# Everything up to the matching "}}" is generated only for SHA-512 in a
# 32-bit build: a second code path, entered at L$parisc1, that handles
# every 64-bit quantity as a hi/lo pair of 32-bit registers. The main
# path is first terminated with a branch to L$done.
if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
{{
$code.=<<___;
	b	L\$done
	nop

	.ALIGN	64
L\$parisc1
___

# Register roles are reassigned for this path: the working variables
# take %r1-%r16 as hi/lo pairs and the scratch registers move up.
@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) = 
   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";

# X[] itself is kept in the frame at -$XOFF(%sp); @X is only a window
# of two hi/lo pairs (current word, next word) rotated by the round
# generators.
@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx

# ROUND_00_15_pa1(i, ahi,alo, ..., hhi,hlo [, flag]) appends one round
# for the register-pair code path. 64-bit additions are emitted as
# add/addc pairs and 64-bit rotations as pairs of shd instructions
# (amounts above 31 are handled by the post-processing loop, which
# swaps the shd operands).
#
# The current message word is taken from the first pair of the @X
# window. Unless $flag is set the next word X[i+1] is pre-loaded from
# the frame into the second pair; ROUND_16_xx_pa1 passes $flag because
# it has loaded that word already. The round constant is read from the
# table into the registers that held X[i], after X[i] has been added.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;

$code.=<<___ if (!$flag);
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
___
$code.=<<___;
	shd	$ehi,$elo,$Sigma1[0],$t0
	 add	$Xlo,$hlo,$hlo
	shd	$elo,$ehi,$Sigma1[0],$t1
	 addc	$Xhi,$hhi,$hhi		; h += X[i]
	shd	$ehi,$elo,$Sigma1[1],$t2
	 ldwm	8($Tbl),$Xhi
	shd	$elo,$ehi,$Sigma1[1],$t3
	 ldw	-4($Tbl),$Xlo		; load K[i]
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	 and	$flo,$elo,$a0
	 and	$fhi,$ehi,$a1
	shd	$ehi,$elo,$Sigma1[2],$t2
	 andcm	$glo,$elo,$a2
	shd	$elo,$ehi,$Sigma1[2],$t3
	 andcm	$ghi,$ehi,$a3
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma1(e)
	add	$Xlo,$hlo,$hlo
	 xor	$a2,$a0,$a0
	addc	$Xhi,$hhi,$hhi		; h += K[i]
	 xor	$a3,$a1,$a1		; Ch(e,f,g)

	 add	$t0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[0],$t0
	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
	shd	$alo,$ahi,$Sigma0[0],$t1	
	 add	$a0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[1],$t2
	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
	shd	$alo,$ahi,$Sigma0[1],$t3

	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	shd	$ahi,$alo,$Sigma0[2],$t2
	and	$alo,$blo,$a0
	shd	$alo,$ahi,$Sigma0[2],$t3
	and	$ahi,$bhi,$a1
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma0(a)

	and	$alo,$clo,$a2
	and	$ahi,$chi,$a3
	xor	$a2,$a0,$a0
	 add	$hlo,$dlo,$dlo
	xor	$a3,$a1,$a1
	 addc	$hhi,$dhi,$dhi		; d += h
	and	$blo,$clo,$a2
	 add	$t0,$hlo,$hlo
	and	$bhi,$chi,$a3
	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
	xor	$a2,$a0,$a0
	 add	$a0,$hlo,$hlo
	xor	$a3,$a1,$a1		; Maj(a,b,c)
	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)

___
# Last slot of the repeated 16-round group: compare the low 10 bits of
# the constant just consumed with $LAST10BITS and branch back to
# L$rounds_pa1. NOTE(review): comiclr,= presumably nullifies that
# branch once the final constant has been seen - confirm.
$code.=<<___ if ($i==15 && $flag);
	extru	$Xlo,31,10,$Xlo
	comiclr,= $LAST10BITS,$Xlo,%r0
	b	L\$rounds_pa1
	nop
___
# Rotate the window by one pair: the pre-loaded X[i+1] becomes X[i].
push(@X,shift(@X)); push(@X,shift(@X));
}

# ROUND_16_xx_pa1(i, ahi,alo, ..., hhi,hlo) appends the message-schedule
# update for one round >=16 of the register-pair code path, stores the
# new X[i] back to its frame slot and then emits the round itself.
#
# X[i+1], X[i+9] and X[i+14] (indices mod 16) are loaded from the frame.
# The rotations use shd pairs; the plain right shift is an shd for the
# low word plus an extru for the high word. Note that both trailing asm
# comments in the template read "sigma0"; the second sequence uses the
# @sigma1 amounts on X[i+14], i.e. it computes sigma1.
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;		# leaves the 16 register names in @_
$i-=16;
$code.=<<___;
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
	 add	$a0,$Xlo,$Xlo
	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
	 addc	$a1,$Xhi,$Xhi
	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
	xor	$t2,$t0,$t0
	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
	xor	$t3,$t1,$t1
	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
	xor	$t2,$t0,$t0
	 shd	$a3,$a2,$sigma1[0],$a0
	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
	 shd	$a2,$a3,$sigma1[0],$a1
	add	$t0,$Xlo,$Xlo
	 shd	$a3,$a2,$sigma1[1],$t2
	addc	$t1,$Xhi,$Xhi
	 shd	$a2,$a3,$sigma1[1],$t3
	xor	$t2,$a0,$a0
	shd	$a3,$a2,$sigma1[2],$t2
	xor	$t3,$a1,$a1
	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
	xor	$t2,$a0,$a0
	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
	add	$a0,$Xlo,$Xlo
	addc	$a1,$Xhi,$Xhi

	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
# The trailing 1 is the $flag argument: X[i+1] is already loaded.
&ROUND_00_15_pa1($i,@_,1);
}
# Entry of the register-pair path: load the sixteen 32-bit halves of the
# context, program %sar (%cr11) from the low two address bits of $inp,
# then at L$oop_pa1 choose between the aligned and the unaligned copy of
# the input block into the X[] area of the frame. The unaligned copy
# starts here: $inp is rounded down and consecutive words are merged
# with vshd.
$code.=<<___;
	ldw	`0*4`($ctx),$Ahi		; load context
	ldw	`1*4`($ctx),$Alo
	ldw	`2*4`($ctx),$Bhi
	ldw	`3*4`($ctx),$Blo
	ldw	`4*4`($ctx),$Chi
	ldw	`5*4`($ctx),$Clo
	ldw	`6*4`($ctx),$Dhi
	ldw	`7*4`($ctx),$Dlo
	ldw	`8*4`($ctx),$Ehi
	ldw	`9*4`($ctx),$Elo
	ldw	`10*4`($ctx),$Fhi
	ldw	`11*4`($ctx),$Flo
	ldw	`12*4`($ctx),$Ghi
	ldw	`13*4`($ctx),$Glo
	ldw	`14*4`($ctx),$Hhi
	ldw	`15*4`($ctx),$Hlo

	extru	$inp,31,2,$t0
	sh3addl	$t0,%r0,$t0
	subi	32,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop_pa1
	extru	$inp,31,2,$a3
	comib,=	0,$a3,L\$aligned_pa1
	sub	$inp,$a3,$inp

	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	vshd	$X[0],$X[1],$X[0]
	vshd	$X[1],$t2,$X[1]
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	vshd	$t2,$t3,$t2
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
	vshd	$t3,$a0,$t3
___
# Remainder of the unaligned copy, generated as a rotating pipeline over
# eight scratch registers: every step stores a finished word, reuses the
# freed register for the word eight positions ahead and merges the next
# pair with vshd. The first loop runs through $i==24 on purpose, so that
# word 32 - one past the 32-word block - is fetched for the last merge.
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
# Nothing left to load: drain the pipeline.
for (;$i<(128/4-1);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
# The final store (word 31) is emitted right after the branch.
$code.=<<___;
	b	L\$collected_pa1
	stw	$t[0],`-$XOFF+$i*4`(%sp)

___
}
# Aligned input: the 32 words of the block are copied to the X[] area of
# the frame unchanged, using the same rotating set of scratch registers
# (loads run eight words ahead of the stores).
$code.=<<___;
L\$aligned_pa1
	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
# Words 24..31 are already in registers; only stores remain.
for (;$i<128/4;$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
# Both copy variants continue here.
$code.="L\$collected_pa1\n";
}

    599 for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
    600 $code.="L\$rounds_pa1\n";
    601 for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
    602 
# Tail of the register-pair path: reload the spilled arguments, rewind
# $Tbl by the size of the table, add the working variables to the
# context with add/addc pairs (loads, adds and stores are interleaved),
# advance $inp and either fall into L$done or loop back to L$oop_pa1
# with the updated $inp saved in the branch's following slot.
$code.=<<___;
	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl

	ldw	`0*4`($ctx),$t1		; update context
	ldw	`1*4`($ctx),$t0
	ldw	`2*4`($ctx),$t3
	ldw	`3*4`($ctx),$t2
	ldw	`4*4`($ctx),$a1
	ldw	`5*4`($ctx),$a0
	ldw	`6*4`($ctx),$a3
	add	$t0,$Alo,$Alo
	ldw	`7*4`($ctx),$a2
	addc	$t1,$Ahi,$Ahi
	ldw	`8*4`($ctx),$t1
	add	$t2,$Blo,$Blo
	ldw	`9*4`($ctx),$t0
	addc	$t3,$Bhi,$Bhi
	ldw	`10*4`($ctx),$t3
	add	$a0,$Clo,$Clo
	ldw	`11*4`($ctx),$t2
	addc	$a1,$Chi,$Chi
	ldw	`12*4`($ctx),$a1
	add	$a2,$Dlo,$Dlo
	ldw	`13*4`($ctx),$a0
	addc	$a3,$Dhi,$Dhi
	ldw	`14*4`($ctx),$a3
	add	$t0,$Elo,$Elo
	ldw	`15*4`($ctx),$a2
	addc	$t1,$Ehi,$Ehi
	stw	$Ahi,`0*4`($ctx)
	add	$t2,$Flo,$Flo
	stw	$Alo,`1*4`($ctx)
	addc	$t3,$Fhi,$Fhi
	stw	$Bhi,`2*4`($ctx)
	add	$a0,$Glo,$Glo
	stw	$Blo,`3*4`($ctx)
	addc	$a1,$Ghi,$Ghi
	stw	$Chi,`4*4`($ctx)
	add	$a2,$Hlo,$Hlo
	stw	$Clo,`5*4`($ctx)
	addc	$a3,$Hhi,$Hhi
	stw	$Dhi,`6*4`($ctx)
	ldo	`16*$SZ`($inp),$inp	; advance $inp
	stw	$Dlo,`7*4`($ctx)
	stw	$Ehi,`8*4`($ctx)
	stw	$Elo,`9*4`($ctx)
	stw	$Fhi,`10*4`($ctx)
	stw	$Flo,`11*4`($ctx)
	stw	$Ghi,`12*4`($ctx)
	stw	$Glo,`13*4`($ctx)
	stw	$Hhi,`14*4`($ctx)
	comb,=	$inp,$num,L\$done
	stw	$Hlo,`15*4`($ctx)
	b	L\$oop_pa1
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
L\$done
___
}}
# Common exit: restore %r2 and %r4-%r18 from the frame, return through
# %r2 and pop the frame with the base-modifying load of %r3 that follows
# the bv. The .STRINGZ line embeds an identification string whose digest
# size is computed from $SZ.
$code.=<<___;
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

    688 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
    689 # that it can be compiled with .LEVEL 1.0. It should be noted that I
    690 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
    691 # directive...
    692 
    693 my $ldd = sub {
    694   my ($mod,$args) = @_;
    695   my $orig = "ldd$mod\t$args";
    696 
    697     if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
    698     {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
    699 	$opcode|=(1<<3) if ($mod =~ /^,m/);
    700 	$opcode|=(1<<2) if ($mod =~ /^,mb/);
    701 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    702     }
    703     else { "\t".$orig; }
    704 };
    705 
    706 my $std = sub {
    707   my ($mod,$args) = @_;
    708   my $orig = "std$mod\t$args";
    709 
    710     if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    711     {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
    712 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    713     }
    714     else { "\t".$orig; }
    715 };
    716 
    717 my $extrd = sub {
    718   my ($mod,$args) = @_;
    719   my $orig = "extrd$mod\t$args";
    720 
    721     # I only have ",u" completer, it's implicitly encoded...
    722     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    723     {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
    724 	my $len=32-$3;
    725 	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
    726 	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    727 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    728     }
    729     elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    730     {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
    731 	my $len=32-$2;
    732 	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
    733 	$opcode |= (1<<13) if ($mod =~ /,\**=/);
    734 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    735     }
    736     else { "\t".$orig; }
    737 };
    738 
    739 my $shrpd = sub {
    740   my ($mod,$args) = @_;
    741   my $orig = "shrpd$mod\t$args";
    742 
    743     if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    744     {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
    745 	my $cpos=63-$3;
    746 	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    747 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    748     }
    749     elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    750     {	sprintf "\t.WORD\t0x%08x\t; %s",
    751 		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
    752     }
    753     else { "\t".$orig; }
    754 };
    755 
    756 sub assemble {
    757   my ($mnemonic,$mod,$args)=@_;
    758   my $opcode = eval("\$$mnemonic");
    759 
    760     ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
    761 }
    762 
    763 foreach (split("\n",$code)) {
    764 	s/\`([^\`]*)\`/eval $1/ge;
    765 
    766 	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
    767 		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
    768 		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
    769 	# translate made up instructons: _ror, _shr, _align, _shl
    770 	s/_ror(\s+)(%r[0-9]+),/
    771 		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or
    772 
    773 	s/_shr(\s+%r[0-9]+),([0-9]+),/
    774 		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
    775 		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or
    776 
    777 	s/_align(\s+%r[0-9]+,%r[0-9]+),/
    778 		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or
    779 
    780 	s/_shl(\s+%r[0-9]+),([0-9]+),/
    781 		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
    782 		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
    783 
    784 	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
    785 
    786 	s/cmpb,\*/comb,/ if ($SIZE_T==4);
    787 
    788 	print $_,"\n";
    789 }
    790 
    791 close STDOUT;
    792