Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # SHA256 performance improvement over compiler generated code varies
     11 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
     12 # build]. Just like in SHA1 module I aim to ensure scalability on
     13 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
     14 
     15 # SHA512 on pre-T1 UltraSPARC.
     16 #
     17 # Performance is >75% better than 64-bit code generated by Sun C and
     18 # over 2x than 32-bit code. X[16] resides on stack, but access to it
     19 # is scheduled for L2 latency and staged through 32 least significant
     20 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
      21 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
     22 # good [optimal coefficient is 50%].
     23 #
     24 # SHA512 on UltraSPARC T1.
     25 #
     26 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
     27 # because 64-bit code generator has the advantage of using 64-bit
     28 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
     29 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
     30 # code by 60%, not to mention that it doesn't suffer from severe decay
     31 # when running 4 times physical cores threads and that it leaves gcc
     32 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
     33 # performance is only 10% better, but overall throughput for maximum
     34 # amount of threads for given CPU exceeds corresponding one of SHA256
     35 # by 30% [again, optimal coefficient is 50%].
     36 #
     37 # (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
     38 #	in-order, i.e. load instruction has to complete prior next
     39 #	instruction in given thread is executed, even if the latter is
     40 #	not dependent on load result! This means that on T1 two 32-bit
     41 #	loads are always slower than one 64-bit load. Once again this
     42 #	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
     43 #	2x32-bit loads can be as fast as 1x64-bit ones.
     44 
     45 $bits=32;
     46 for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
     47 if ($bits==64)	{ $bias=2047; $frame=192; }
     48 else		{ $bias=0;    $frame=112; }
     49 
     50 $output=shift;
     51 open STDOUT,">$output";
     52 
# Select the per-algorithm parameter set: the script emits SHA-512 when
# the output file name contains "512", SHA-256 otherwise.  The Sigma/sigma
# arrays hold the FIPS 180-4 rotate/shift amounts; $lastK is the low 12
# bits of the final round constant, used by the emitted code as the
# loop-termination sentinel (see the "and $tmp2,0xfff" test below).
if ($output =~ /512/) {
	$label="512";
	$SZ=8;			# bytes per digest word
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;		# low 12 bits of K[79] (0x6c44198c4a475817)
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16] lives on the stack

	# Working variables a..h; X[] is stack resident (see $locals).
	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;		# low 12 bits of K[63] (0xc67178f2)
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	# Sixteen 32-bit message words packed two per 64-bit register.
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";		# round temporary T1
$tmp0="%g3";		# scratch
$tmp1="%g4";		# scratch
$tmp2="%g5";		# scratch; also ends up holding the last K[] loaded

$ctx="%i0";		# 1st arg: hash state H[0..7]
$inp="%i1";		# 2nd arg: input pointer
$len="%i2";		# 3rd arg: number of blocks (converted to end pointer)
$Ktbl="%i3";		# pointer into the K constant table
$tmp31="%i4";		# input misalignment in bits ((inp & (align-1)) << 3)
$tmp32="%i5";		# scratch / complement of $tmp31
    118 
########### SHA256
# $Xload->($i,@V): emit code that makes X[$i] available during rounds
# 0..15.  The sixteen 32-bit words are packed two per 64-bit register
# in @X[0..7]; even $i uses the high half, odd $i the low half.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    # Round 0 additionally emits the bulk load of the whole 64-byte
    # block with 64-bit loads, plus a realignment sequence that is
    # branched over when the input pointer was 8-byte aligned
    # ($tmp31 == 0).
    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
# Funnel-shift each adjacent register pair to undo the misalignment;
# the word loaded past the block ($T1) supplies the final low bits.
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    # T1 = h + X[$i]: odd $i adds @X[$i/2] directly (X[$i] is in its
    # low 32 bits), even $i shifts the high half down first.
    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);
    159 
########### SHA512
# $Xload->($i,@V): emit code that makes the 64-bit word X[$i] available
# during rounds 0..15.  X[] is stack resident; each word is staged
# through 32-bit halves in %l0-%l7 to remain 32-/64-bit ABI neutral
# (see the header comment).
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# @pair: the two %l registers holding the halves of X[$i], plus the
# first half of X[$i+1], needed for the misalignment correction.
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

# Round 0: preload the first eight 32-bit halves of the block.
$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
# Rounds 0..14: assemble the (possibly misaligned) 64-bit word from its
# halves, prefetch the next pair of halves while doing so, fold h+X[$i]
# into T1, and spill X[$i] to its stack slot for the schedule update.
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
# Round 12: a misaligned input needs one extra word beyond the block;
# the annulled load is skipped when the input was aligned.
$code.=<<___ if ($i==12);
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
# Round 15: same assembly step, interleaved with reloading from the
# stack the X[] halves the first Xupdate round will need.
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
    210 
########### common
# Emit one compression round:
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[$i] + X[$i]
#   h  = Sigma0(a) + Maj(a,b,c) + T1;  d += T1
# Rounds 0..15 fetch X[$i] via $Xload (which also folds h+X[$i] into
# $T1); rounds >=16 arrive with X[$i] already in $T1 from Xupdate.
# Rotates are synthesized as SRL/SLL pairs ORed via xor chains; the
# K[$i] load is left in $tmp2 so the caller can test it as the loop
# sentinel.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
    263 
########### SHA256
# Emit one round $i>=16 including the message-schedule update
#   X[$i%16] += sigma0(X[$i+1]) + sigma1(X[$i+14]) + X[$i+9]
# on 32-bit words packed two per 64-bit register, then fall through to
# the common round body.  The updated X word is accumulated into $T1
# and merged back into the proper half of its packed register.
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    # Extract X[$i+1] into $xi (low half directly for odd rounds,
    # high half shifted down into $tmp32 for even ones).
    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    # Extract X[$i+14] the same way (opposite halves).
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    # Finish the update and write the new X[$i] back into the low
    # (odd $i) or high (even $i) half of its packed register.
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    # Fall through to the common round body with X[$i] in $T1.
    &BODY_00_15(@_);
} if ($SZ==4);
    332 
    333 ########### SHA512
    334 $BODY_16_XX = sub {
    335 my $i=@_[0];
    336 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
    337 
    338 $code.=<<___;
    339 	sllx	%l2,32,$tmp0		!! Xupdate($i)
    340 	or	%l3,$tmp0,$tmp0
    341 
    342 	srlx	$tmp0,@sigma0[0],$T1
    343 	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
    344 	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
    345 	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
    346 	srlx	$tmp0,@sigma0[1],$tmp0
    347 	xor	$tmp1,$T1,$T1
    348 	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
    349 	xor	$tmp0,$T1,$T1
    350 	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
    351 	xor	$tmp1,$T1,$T1
    352 	sllx	%l6,32,$tmp2
    353 	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
    354 	or	%l7,$tmp2,$tmp2
    355 
    356 	srlx	$tmp2,@sigma1[0],$tmp1
    357 	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
    358 	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
    359 	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
    360 	srlx	$tmp2,@sigma1[1],$tmp2
    361 	xor	$tmp0,$tmp1,$tmp1
    362 	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
    363 	xor	$tmp2,$tmp1,$tmp1
    364 	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
    365 	xor	$tmp0,$tmp1,$tmp1
    366 	sllx	%l4,32,$tmp0
    367 	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
    368 	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
    369 	or	%l5,$tmp0,$tmp0
    370 	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
    371 
    372 	sllx	%l0,32,$tmp2
    373 	add	$tmp1,$T1,$T1
    374 	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
    375 	or	%l1,$tmp2,$tmp2
    376 	add	$tmp0,$T1,$T1		! +=X[$i+9]
    377 	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
    378 	add	$tmp2,$T1,$T1		! +=X[$i]
    379 	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
    380 ___
    381     &BODY_00_15(@_);
    382 } if ($SZ==8);
    383 
# Emit the round-constant table K${label} into .text (accessed
# PC-relatively below) and, for 64-bit builds, the .register
# directives the V9 ABI requires before %g2/%g3 may be used as
# scratch.
$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
# SHA-256: 64 32-bit constants; SHA-512: 80 64-bit constants emitted
# as high/low .long pairs.  Values per FIPS 180-4.
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
# Emit the sha${label}_block_data_order(ctx,inp,num_blocks) routine.
# Prologue: carve the frame, record input misalignment in bits in
# $tmp31, round $inp down to the alignment boundary, and convert the
# block count into an end pointer in $len.
$code.=<<___;
.size	K${label},.-K${label}
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
# SHA512 also keeps the complement shift count in $tmp32 for the
# misalignment correction in Xload.
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
# PIC-safe address of the K table, then load the eight state words.
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Rounds 0..15 are fully unrolled; rounds 16..$rounds run as a loop of
# sixteen unrolled round bodies.  unshift(@V,pop(@V)) rotates the a..h
# register assignment between rounds instead of moving data.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Loop until the K[] value just loaded into $tmp2 is the last constant
# (its low 12 bits equal $lastK); $Ktbl advances in the delay slot.
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
# SHA256 epilogue: add the working variables into the context with
# plain 32-bit loads/stores.
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# SHA512 epilogue: reassemble each 64-bit state word from two 32-bit
# loads (keeping the code 32-/64-bit ABI neutral), add, store back.
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# Advance the input pointer, loop over remaining blocks, return.
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
    591 
    592 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    593 print $code;
    594 close STDOUT;
    595