#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
# Graphic Unit would make it possible to achieve higher instruction-
# level parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the keyword, and it means that this
# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
# not really novel, Sun has had a VIS-powered implementation for a
# while. Unlike Sun's implementation this one can process multiple
# unaligned input blocks, and as such works as a drop-in replacement
# for OpenSSL sha1_block_data_order. Performance improvement was
# measured to be 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi,
# but 12% on UltraSPARC-III. See below for discussion...
#
# The module does not present direct interest for OpenSSL, because
# it doesn't provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*)	"Pipe-lined" means that even if it takes several cycles to
#	complete, the next instruction using the same functional unit
#	[but not depending on the result of the current instruction]
#	can start execution without having to wait for the unit.
#	"Pairable" means that two [or more] independent instructions
#	can be issued at the very same time.

     38 $bits=32;
     39 for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
     40 if ($bits==64)	{ $bias=2047; $frame=192; }
     41 else		{ $bias=0;    $frame=112; }
     42 
     43 $output=shift;
     44 open STDOUT,">$output";
     45 
     46 $ctx="%i0";
     47 $inp="%i1";
     48 $len="%i2";
     49 $tmp0="%i3";
     50 $tmp1="%i4";
     51 $tmp2="%i5";
     52 $tmp3="%g5";
     53 
     54 $base="%g1";
     55 $align="%g4";
     56 $Xfer="%o5";
     57 $nXfer=$tmp3;
     58 $Xi="%o7";
     59 
     60 $A="%l0";
     61 $B="%l1";
     62 $C="%l2";
     63 $D="%l3";
     64 $E="%l4";
     65 @V=($A,$B,$C,$D,$E);
     66 
     67 $Actx="%o0";
     68 $Bctx="%o1";
     69 $Cctx="%o2";
     70 $Dctx="%o3";
     71 $Ectx="%o4";
     72 
     73 $fmul="%f32";
     74 $VK_00_19="%f34";
     75 $VK_20_39="%f36";
     76 $VK_40_59="%f38";
     77 $VK_60_79="%f40";
     78 @VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
     79 @X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
     80     "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");

# This is the reference 2x-parallelized VIS-powered Xupdate procedure.
# It covers even the K_NN_MM addition...
     84 sub Xupdate {
     85 my ($i)=@_;
     86 my $K=@VK[($i+16)/20];
     87 my $j=($i+16)%16;
     88 
     89 #	[ provided that GSR.alignaddr_offset is 5, $mul contains
     90 #	  0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
     91 #	  chosen registers... ]
     92 $code.=<<___;
     93 	fxors		@X[($j+13)%16],@X[$j],@X[$j]	!-1/-1/-1:X[0]^=X[13]
     94 	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
     95 	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
     96 	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
     97 	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
     98 	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
     99 	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
    100 	![fxors		%f15,%f2,%f2]
    101 	for		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
    102 	![fxors		%f0,%f3,%f3]			!10/17/12:X[0] dependency
    103 	fpadd32		$K,@X[$j],%f20
    104 	std		%f20,[$Xfer+`4*$j`]
    105 ___
    106 # The numbers delimited with slash are the earliest possible dispatch
    107 # cycles for given instruction assuming 1 cycle latency for simple VIS
    108 # instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
    109 # on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
    110 # 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
    111 # round. As [long as] FPU/VIS instructions are perfectly pairable with
    112 # IALU ones, the round timing is defined by the maximum between VIS
    113 # and IALU timings. The latter varies from round to round and averages
    114 # out at 6.25 ticks. This means that USI&II should operate at IALU
    115 # rate, while USIII&IV - at VIS rate. This explains why performance
    116 # improvement varies among processors. Well, given that pure IALU
    117 # sha1-sparcv9.pl module exhibits virtually uniform performance of
    118 # ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
    119 # lower limits. Real-life performance was measured to be 6.6 cycles
    120 # per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
    121 # half-round VIS timing, because there are 16 Xupdate-free rounds,
    122 # which "push down" average theoretical timing to 8 cycles...
    123 
    124 # (*)	SPARC64-V[II] was originally believed to have 2 cycles VIS
    125 #	latency. Well, it might have, but it doesn't have dedicated
    126 #	VIS-unit. Instead, VIS instructions are executed by other
    127 #	functional units, ones used here - by IALU. This doesn't
    128 #	improve effective ILP...
    129 }

# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and kind of modulo-scheduled with respect to X[n]^=X[n+13]
# and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
# plenty of room to amortize for read-after-write hazard, as well as
# to fetch and align input for the next spin. The VIS instructions are
# scheduled for latency of 2 cycles, because there are not enough IALU
# instructions to schedule for latency of 3, while scheduling for 1
# would give no gain on USI&II anyway.

    140 sub BODY_00_19 {
    141 my ($i,$a,$b,$c,$d,$e)=@_;
    142 my $j=$i&~1;
    143 my $k=($j+16+2)%16;	# ahead reference
    144 my $l=($j+16-2)%16;	# behind reference
    145 my $K=@VK[($j+16-2)/20];
    146 
    147 $j=($j+16)%16;
    148 
    149 $code.=<<___ if (!($i&1));
    150 	sll		$a,5,$tmp0			!! $i
    151 	and		$c,$b,$tmp3
    152 	ld		[$Xfer+`4*($i%16)`],$Xi
    153 	 fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
    154 	srl		$a,27,$tmp1
    155 	add		$tmp0,$e,$e
    156 	 fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
    157 	sll		$b,30,$tmp2
    158 	add		$tmp1,$e,$e
    159 	andn		$d,$b,$tmp1
    160 	add		$Xi,$e,$e
    161 	 fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
    162 	srl		$b,2,$b
    163 	or		$tmp1,$tmp3,$tmp1
    164 	or		$tmp2,$b,$b
    165 	add		$tmp1,$e,$e
    166 	 faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
    167 ___
    168 $code.=<<___ if ($i&1);
    169 	sll		$a,5,$tmp0			!! $i
    170 	and		$c,$b,$tmp3
    171 	ld		[$Xfer+`4*($i%16)`],$Xi
    172 	 fpadd32	@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
    173 	srl		$a,27,$tmp1
    174 	add		$tmp0,$e,$e
    175 	 fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
    176 	sll		$b,30,$tmp2
    177 	add		$tmp1,$e,$e
    178 	 fpadd32	$K,@X[$l],%f20			!
    179 	andn		$d,$b,$tmp1
    180 	add		$Xi,$e,$e
    181 	 fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
    182 	srl		$b,2,$b
    183 	or		$tmp1,$tmp3,$tmp1
    184 	 fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
    185 	or		$tmp2,$b,$b
    186 	add		$tmp1,$e,$e
    187 ___
    188 $code.=<<___ if ($i&1 && $i>=2);
    189 	 std		%f20,[$Xfer+`4*$l`]		!
    190 ___
    191 }
    192 
    193 sub BODY_20_39 {
    194 my ($i,$a,$b,$c,$d,$e)=@_;
    195 my $j=$i&~1;
    196 my $k=($j+16+2)%16;	# ahead reference
    197 my $l=($j+16-2)%16;	# behind reference
    198 my $K=@VK[($j+16-2)/20];
    199 
    200 $j=($j+16)%16;
    201 
    202 $code.=<<___ if (!($i&1) && $i<64);
    203 	sll		$a,5,$tmp0			!! $i
    204 	ld		[$Xfer+`4*($i%16)`],$Xi
    205 	 fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
    206 	srl		$a,27,$tmp1
    207 	add		$tmp0,$e,$e
    208 	 fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
    209 	xor		$c,$b,$tmp0
    210 	add		$tmp1,$e,$e
    211 	sll		$b,30,$tmp2
    212 	xor		$d,$tmp0,$tmp1
    213 	 fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
    214 	srl		$b,2,$b
    215 	add		$tmp1,$e,$e
    216 	or		$tmp2,$b,$b
    217 	add		$Xi,$e,$e
    218 	 faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
    219 ___
    220 $code.=<<___ if ($i&1 && $i<64);
    221 	sll		$a,5,$tmp0			!! $i
    222 	ld		[$Xfer+`4*($i%16)`],$Xi
    223 	 fpadd32	@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
    224 	srl		$a,27,$tmp1
    225 	add		$tmp0,$e,$e
    226 	 fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
    227 	xor		$c,$b,$tmp0
    228 	add		$tmp1,$e,$e
    229 	 fpadd32	$K,@X[$l],%f20			!
    230 	sll		$b,30,$tmp2
    231 	xor		$d,$tmp0,$tmp1
    232 	 fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
    233 	srl		$b,2,$b
    234 	add		$tmp1,$e,$e
    235 	 fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
    236 	or		$tmp2,$b,$b
    237 	add		$Xi,$e,$e
    238 	 std		%f20,[$Xfer+`4*$l`]		!
    239 ___
    240 $code.=<<___ if ($i==64);
    241 	sll		$a,5,$tmp0			!! $i
    242 	ld		[$Xfer+`4*($i%16)`],$Xi
    243 	 fpadd32	$K,@X[$l],%f20
    244 	srl		$a,27,$tmp1
    245 	add		$tmp0,$e,$e
    246 	xor		$c,$b,$tmp0
    247 	add		$tmp1,$e,$e
    248 	sll		$b,30,$tmp2
    249 	xor		$d,$tmp0,$tmp1
    250 	 std		%f20,[$Xfer+`4*$l`]
    251 	srl		$b,2,$b
    252 	add		$tmp1,$e,$e
    253 	or		$tmp2,$b,$b
    254 	add		$Xi,$e,$e
    255 ___
    256 $code.=<<___ if ($i>64);
    257 	sll		$a,5,$tmp0			!! $i
    258 	ld		[$Xfer+`4*($i%16)`],$Xi
    259 	srl		$a,27,$tmp1
    260 	add		$tmp0,$e,$e
    261 	xor		$c,$b,$tmp0
    262 	add		$tmp1,$e,$e
    263 	sll		$b,30,$tmp2
    264 	xor		$d,$tmp0,$tmp1
    265 	srl		$b,2,$b
    266 	add		$tmp1,$e,$e
    267 	or		$tmp2,$b,$b
    268 	add		$Xi,$e,$e
    269 ___
    270 }
    271 
    272 sub BODY_40_59 {
    273 my ($i,$a,$b,$c,$d,$e)=@_;
    274 my $j=$i&~1;
    275 my $k=($j+16+2)%16;	# ahead reference
    276 my $l=($j+16-2)%16;	# behind reference
    277 my $K=@VK[($j+16-2)/20];
    278 
    279 $j=($j+16)%16;
    280 
    281 $code.=<<___ if (!($i&1));
    282 	sll		$a,5,$tmp0			!! $i
    283 	ld		[$Xfer+`4*($i%16)`],$Xi
    284 	 fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
    285 	srl		$a,27,$tmp1
    286 	add		$tmp0,$e,$e
    287 	 fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
    288 	and		$c,$b,$tmp0
    289 	add		$tmp1,$e,$e
    290 	sll		$b,30,$tmp2
    291 	or		$c,$b,$tmp1
    292 	 fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
    293 	srl		$b,2,$b
    294 	and		$d,$tmp1,$tmp1
    295 	add		$Xi,$e,$e
    296 	or		$tmp1,$tmp0,$tmp1
    297 	 faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
    298 	or		$tmp2,$b,$b
    299 	add		$tmp1,$e,$e
    300 	 fpadd32	@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
    301 ___
    302 $code.=<<___ if ($i&1);
    303 	sll		$a,5,$tmp0			!! $i
    304 	ld		[$Xfer+`4*($i%16)`],$Xi
    305 	srl		$a,27,$tmp1
    306 	add		$tmp0,$e,$e
    307 	 fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
    308 	and		$c,$b,$tmp0
    309 	add		$tmp1,$e,$e
    310 	 fpadd32	$K,@X[$l],%f20			!
    311 	sll		$b,30,$tmp2
    312 	or		$c,$b,$tmp1
    313 	 fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
    314 	srl		$b,2,$b
    315 	and		$d,$tmp1,$tmp1
    316 	 fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
    317 	add		$Xi,$e,$e
    318 	or		$tmp1,$tmp0,$tmp1
    319 	or		$tmp2,$b,$b
    320 	add		$tmp1,$e,$e
    321 	 std		%f20,[$Xfer+`4*$l`]		!
    322 ___
    323 }

# If there is more data to process, then we pre-fetch the data for
# the next iteration in the last ten rounds...
    327 sub BODY_70_79 {
    328 my ($i,$a,$b,$c,$d,$e)=@_;
    329 my $j=$i&~1;
    330 my $m=($i%8)*2;
    331 
    332 $j=($j+16)%16;
    333 
    334 $code.=<<___ if ($i==70);
    335 	sll		$a,5,$tmp0			!! $i
    336 	ld		[$Xfer+`4*($i%16)`],$Xi
    337 	srl		$a,27,$tmp1
    338 	add		$tmp0,$e,$e
    339 	 ldd		[$inp+64],@X[0]
    340 	xor		$c,$b,$tmp0
    341 	add		$tmp1,$e,$e
    342 	sll		$b,30,$tmp2
    343 	xor		$d,$tmp0,$tmp1
    344 	srl		$b,2,$b
    345 	add		$tmp1,$e,$e
    346 	or		$tmp2,$b,$b
    347 	add		$Xi,$e,$e
    348 
    349 	and		$inp,-64,$nXfer
    350 	inc		64,$inp
    351 	and		$nXfer,255,$nXfer
    352 	alignaddr	%g0,$align,%g0
    353 	add		$base,$nXfer,$nXfer
    354 ___
    355 $code.=<<___ if ($i==71);
    356 	sll		$a,5,$tmp0			!! $i
    357 	ld		[$Xfer+`4*($i%16)`],$Xi
    358 	srl		$a,27,$tmp1
    359 	add		$tmp0,$e,$e
    360 	xor		$c,$b,$tmp0
    361 	add		$tmp1,$e,$e
    362 	sll		$b,30,$tmp2
    363 	xor		$d,$tmp0,$tmp1
    364 	srl		$b,2,$b
    365 	add		$tmp1,$e,$e
    366 	or		$tmp2,$b,$b
    367 	add		$Xi,$e,$e
    368 ___
    369 $code.=<<___ if ($i>=72);
    370 	 faligndata	@X[$m],@X[$m+2],@X[$m]
    371 	sll		$a,5,$tmp0			!! $i
    372 	ld		[$Xfer+`4*($i%16)`],$Xi
    373 	srl		$a,27,$tmp1
    374 	add		$tmp0,$e,$e
    375 	xor		$c,$b,$tmp0
    376 	add		$tmp1,$e,$e
    377 	 fpadd32	$VK_00_19,@X[$m],%f20
    378 	sll		$b,30,$tmp2
    379 	xor		$d,$tmp0,$tmp1
    380 	srl		$b,2,$b
    381 	add		$tmp1,$e,$e
    382 	or		$tmp2,$b,$b
    383 	add		$Xi,$e,$e
    384 ___
    385 $code.=<<___ if ($i<77);
    386 	 ldd		[$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
    387 ___
    388 $code.=<<___ if ($i==77);	# redundant if $inp was aligned
    389 	 add		$align,63,$tmp0
    390 	 and		$tmp0,-8,$tmp0
    391 	 ldd		[$inp+$tmp0],@X[16]
    392 ___
    393 $code.=<<___ if ($i>=72);
    394 	 std		%f20,[$nXfer+`4*$m`]
    395 ___
    396 }
    397 
    398 $code.=<<___;
    399 .section	".text",#alloc,#execinstr
    400 
    401 .align	64
    402 vis_const:
    403 .long	0x5a827999,0x5a827999	! K_00_19
    404 .long	0x6ed9eba1,0x6ed9eba1	! K_20_39
    405 .long	0x8f1bbcdc,0x8f1bbcdc	! K_40_59
    406 .long	0xca62c1d6,0xca62c1d6	! K_60_79
    407 .long	0x00000100,0x00000100
    408 .align	64
    409 .type	vis_const,#object
    410 .size	vis_const,(.-vis_const)
    411 
    412 .globl	sha1_block_data_order
    413 sha1_block_data_order:
    414 	save	%sp,-$frame,%sp
    415 	add	%fp,$bias-256,$base
    416 
    417 1:	call	.+8
    418 	add	%o7,vis_const-1b,$tmp0
    419 
    420 	ldd	[$tmp0+0],$VK_00_19
    421 	ldd	[$tmp0+8],$VK_20_39
    422 	ldd	[$tmp0+16],$VK_40_59
    423 	ldd	[$tmp0+24],$VK_60_79
    424 	ldd	[$tmp0+32],$fmul
    425 
    426 	ld	[$ctx+0],$Actx
    427 	and	$base,-256,$base
    428 	ld	[$ctx+4],$Bctx
    429 	sub	$base,$bias+$frame,%sp
    430 	ld	[$ctx+8],$Cctx
    431 	and	$inp,7,$align
    432 	ld	[$ctx+12],$Dctx
    433 	and	$inp,-8,$inp
    434 	ld	[$ctx+16],$Ectx
    435 
    436 	! X[16] is maintained in FP register bank
    437 	alignaddr	%g0,$align,%g0
    438 	ldd		[$inp+0],@X[0]
    439 	sub		$inp,-64,$Xfer
    440 	ldd		[$inp+8],@X[2]
    441 	and		$Xfer,-64,$Xfer
    442 	ldd		[$inp+16],@X[4]
    443 	and		$Xfer,255,$Xfer
    444 	ldd		[$inp+24],@X[6]
    445 	add		$base,$Xfer,$Xfer
    446 	ldd		[$inp+32],@X[8]
    447 	ldd		[$inp+40],@X[10]
    448 	ldd		[$inp+48],@X[12]
    449 	brz,pt		$align,.Laligned
    450 	ldd		[$inp+56],@X[14]
    451 
    452 	ldd		[$inp+64],@X[16]
    453 	faligndata	@X[0],@X[2],@X[0]
    454 	faligndata	@X[2],@X[4],@X[2]
    455 	faligndata	@X[4],@X[6],@X[4]
    456 	faligndata	@X[6],@X[8],@X[6]
    457 	faligndata	@X[8],@X[10],@X[8]
    458 	faligndata	@X[10],@X[12],@X[10]
    459 	faligndata	@X[12],@X[14],@X[12]
    460 	faligndata	@X[14],@X[16],@X[14]
    461 
    462 .Laligned:
    463 	mov		5,$tmp0
    464 	dec		1,$len
    465 	alignaddr	%g0,$tmp0,%g0
    466 	fpadd32		$VK_00_19,@X[0],%f16
    467 	fpadd32		$VK_00_19,@X[2],%f18
    468 	fpadd32		$VK_00_19,@X[4],%f20
    469 	fpadd32		$VK_00_19,@X[6],%f22
    470 	fpadd32		$VK_00_19,@X[8],%f24
    471 	fpadd32		$VK_00_19,@X[10],%f26
    472 	fpadd32		$VK_00_19,@X[12],%f28
    473 	fpadd32		$VK_00_19,@X[14],%f30
    474 	std		%f16,[$Xfer+0]
    475 	mov		$Actx,$A
    476 	std		%f18,[$Xfer+8]
    477 	mov		$Bctx,$B
    478 	std		%f20,[$Xfer+16]
    479 	mov		$Cctx,$C
    480 	std		%f22,[$Xfer+24]
    481 	mov		$Dctx,$D
    482 	std		%f24,[$Xfer+32]
    483 	mov		$Ectx,$E
    484 	std		%f26,[$Xfer+40]
    485 	fxors		@X[13],@X[0],@X[0]
    486 	std		%f28,[$Xfer+48]
    487 	ba		.Loop
    488 	std		%f30,[$Xfer+56]
    489 .align	32
    490 .Loop:
    491 ___
    492 for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    493 for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    494 for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    495 for (;$i<70;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    496 $code.=<<___;
    497 	tst		$len
    498 	bz,pn		`$bits==32?"%icc":"%xcc"`,.Ltail
    499 	nop
    500 ___
    501 for (;$i<80;$i++)	{ &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
    502 $code.=<<___;
    503 	add		$A,$Actx,$Actx
    504 	add		$B,$Bctx,$Bctx
    505 	add		$C,$Cctx,$Cctx
    506 	add		$D,$Dctx,$Dctx
    507 	add		$E,$Ectx,$Ectx
    508 	mov		5,$tmp0
    509 	fxors		@X[13],@X[0],@X[0]
    510 	mov		$Actx,$A
    511 	mov		$Bctx,$B
    512 	mov		$Cctx,$C
    513 	mov		$Dctx,$D
    514 	mov		$Ectx,$E
    515 	alignaddr	%g0,$tmp0,%g0	
    516 	dec		1,$len
    517 	ba		.Loop
    518 	mov		$nXfer,$Xfer
    519 
    520 .align	32
    521 .Ltail:
    522 ___
    523 for($i=70;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    524 $code.=<<___;
    525 	add	$A,$Actx,$Actx
    526 	add	$B,$Bctx,$Bctx
    527 	add	$C,$Cctx,$Cctx
    528 	add	$D,$Dctx,$Dctx
    529 	add	$E,$Ectx,$Ectx
    530 
    531 	st	$Actx,[$ctx+0]
    532 	st	$Bctx,[$ctx+4]
    533 	st	$Cctx,[$ctx+8]
    534 	st	$Dctx,[$ctx+12]
    535 	st	$Ectx,[$ctx+16]
    536 
    537 	ret
    538 	restore
    539 .type	sha1_block_data_order,#function
    540 .size	sha1_block_data_order,(.-sha1_block_data_order)
    541 .asciz	"SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
    542 .align	4
    543 ___

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option to produce a
# "universal" binary and let the programmer detect at run-time whether
# the current CPU is VIS capable.
    550 sub unvis {
    551 my ($mnemonic,$rs1,$rs2,$rd)=@_;
    552 my ($ref,$opf);
    553 my %visopf = (	"fmul8ulx16"	=> 0x037,
    554 		"faligndata"	=> 0x048,
    555 		"fpadd32"	=> 0x052,
    556 		"fxor"		=> 0x06c,
    557 		"fxors"		=> 0x06d	);
    558 
    559     $ref = "$mnemonic\t$rs1,$rs2,$rd";
    560 
    561     if ($opf=$visopf{$mnemonic}) {
    562 	foreach ($rs1,$rs2,$rd) {
    563 	    return $ref if (!/%f([0-9]{1,2})/);
    564 	    $_=$1;
    565 	    if ($1>=32) {
    566 		return $ref if ($1&1);
    567 		# re-encode for upper double register addressing
    568 		$_=($1|$1>>5)&31;
    569 	    }
    570 	}
    571 
    572 	return	sprintf ".word\t0x%08x !%s",
    573 			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
    574 			$ref;
    575     } else {
    576 	return $ref;
    577     }
    578 }
    579 sub unalignaddr {
    580 my ($mnemonic,$rs1,$rs2,$rd)=@_;
    581 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    582 my $ref="$mnemonic\t$rs1,$rs2,$rd";
    583 
    584     foreach ($rs1,$rs2,$rd) {
    585 	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
    586 	else			{ return $ref; }
    587     }
    588     return  sprintf ".word\t0x%08x !%s",
    589 		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
    590 		    $ref;
    591 }
    592 
    593 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    594 $code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
    595 		&unvis($1,$2,$3,$4)
    596 	  /gem;
    597 $code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
    598 		&unalignaddr($1,$2,$3,$4)
    599 	  /gem;
    600 print $code;
    601 close STDOUT;
    602