      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. Rights for redistribution and usage in source and binary
      6 # forms are granted according to the OpenSSL license.
      7 # ====================================================================
      8 #
      9 # sha256/512_block procedure for x86_64.
     10 #
      11 # 40% improvement over compiler-generated code on Opteron. On EM64T
      12 # sha256 was observed to run >80% faster and sha512 >40% faster. No
      13 # magical tricks, just a straight implementation... I really wonder why
      14 # gcc [even armed with inline assembler] fails to generate code this fast.
      15 # The one genuinely neat thing about this module is that the very same
      16 # instruction sequence is used for both SHA-256 and SHA-512. In the
      17 # former case the instructions operate on 32-bit operands, in the
      18 # latter on 64-bit ones. All I had to do was get one flavor right;
      19 # the other one passed the tests right away:-)
     20 #
     21 # sha256_block runs in ~1005 cycles on Opteron, which gives you
     22 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
     23 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
     24 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
      25 # Well, if you compare it to the IA-64 implementation, which keeps
      26 # X[16] in the register bank[!], sustains close to 4 instructions per
      27 # CPU clock cycle and runs in 1003 cycles, then 1275 is a very good
      28 # result for the 3-way issue Opteron pipeline with X[16] kept in
      29 # memory. So *if* there is a way to improve it, *then* the only way
      30 # would be to offload the X[16] updates to the SSE unit, but that
      31 # would require a "deeper" loop unroll, which in turn would naturally
      32 # cause a size blow-up, not to mention increased complexity! And once
      33 # again, that is only *if* it's actually possible to noticeably improve
      34 # the overall ILP, instruction-level parallelism, on the CPU in question.
     35 #
      36 # Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
      37 # performance ratio of 1.5 between the 64- and 32-bit flavors [see above],
      38 # [currently available] EM64T CPUs are apparently far from it. On the
      39 # contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
      40 # 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
      41 # are apparently not single hardware operations, but implemented in microcode.
     42 #
     43 # May 2012.
     44 #
      45 # An optimization pass including one of Pavel Semjanov's ideas, the
      46 # alternative Maj, resulted in a >=5% improvement on most CPUs, +20% for
      47 # SHA256 and unfortunately -2% for SHA512 on P4 [which nobody should
      48 # care about that much].
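#
# For the record, the "alternative Maj" rests on the boolean identity
# Maj(a,b,c) = Ch(a^b,c,b) = b ^ ((a^b) & (b^c)), which lets the b^c
# term computed in one round be reused as the a^b term of the next one
# [see the "a^b, b^c in next round" comments in the round bodies below].
# A plain-Perl sketch of both forms, illustrative only:
#
#	sub Maj     { my ($a,$b,$c)=@_; ($a&$b)^($a&$c)^($b&$c); }
#	sub Maj_alt { my ($a,$b,$c)=@_; $b^(($a^$b)&($b^$c));    }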
     49 #
     50 # June 2012.
     51 #
      52 # Add SIMD code paths, see below for improvement coefficients. An SSSE3
      53 # code path was not attempted for SHA512, because the estimated
      54 # improvement, noticeably less than 9%, is not high enough to justify
      55 # the effort, at least not on pre-AVX processors. [The obvious exception
      56 # is VIA Nano, but it has a dedicated SHA512 instruction that is faster
      57 # and should be used instead.] For reference, the corresponding estimated
      58 # upper limit for improvement with SSSE3 SHA256 is 28%. The fact that
      59 # higher coefficients are observed on VIA Nano and Bulldozer has more
      60 # to do with the specifics of their architectures [which is a topic for
      61 # a separate discussion].
     62 #
     63 # November 2012.
     64 #
      65 # Add AVX2 code path. Two consecutive input blocks are loaded into
      66 # 256-bit %ymm registers, with data from the first block in the least
      67 # significant 128-bit halves and data from the second in the most
      68 # significant halves. The data is then processed with the same SIMD
      69 # instruction sequence as for AVX, but with %ymm registers as operands.
      70 # A side effect is an increased stack frame, 448 additional bytes for
      71 # SHA256 and 1152 for SHA512, plus a 1.2KB code size increase.
     72 #
     73 # March 2014.
     74 #
     75 # Add support for Intel SHA Extensions.
     76 
     77 ######################################################################
     78 # Current performance in cycles per processed byte (less is better):
     79 #
     80 #		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
     81 #
     82 # AMD K8	14.9	-	    -		    9.57    -
     83 # P4		17.3	-	    -		    30.8    -
     84 # Core 2	15.6	13.8(+13%)  -		    9.97    -
     85 # Westmere	14.8	12.3(+19%)  -		    9.58    -
     86 # Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
     87 # Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
     88 # Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
     89 # Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
     90 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
     91 # Atom		23.0	18.9(+22%)  -		    14.7    -
     92 #
      93 # (*)	whichever is best applicable;
      94 # (**)	the switch from ror to shrd accounts for a fair share of the improvement;
      95 # (***)	execution time is fully determined by the remaining integer-only
      96 #	part, body_00_15; reducing the number of SIMD instructions
      97 #	below a certain limit makes no difference/sense; to conserve
      98 #	space the SHA256 XOP code path is therefore omitted;
     99 
    100 $flavour = shift;
    101 $output  = shift;
    102 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
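
# Typical invocation, a sketch only [the script and output names are
# examples; any x86_64-xlate.pl flavour such as elf, macosx, mingw64,
# nasm or masm works, and whether SHA-256 or SHA-512 code is emitted is
# decided by the *output* file name, see the /512/ test below]:
#
#	perl sha512-x86_64.pl elf  sha512-x86_64.S
#	perl sha512-x86_64.pl nasm sha256-x86_64.asm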
    103 
    104 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    105 
    106 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    107 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    108 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    109 die "can't locate x86_64-xlate.pl";
    110 
    111 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
    112 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    113 	$avx = ($1>=2.19) + ($1>=2.22);
    114 }
    115 
    116 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    117 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    118 	$avx = ($1>=2.09) + ($1>=2.10);
    119 }
    120 
    121 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    122 	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    123 	$avx = ($1>=10) + ($1>=11);
    124 }
    125 
    126 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
    127 	$avx = ($2>=3.0) + ($2>3.0);
    128 }
    129 
    130 $shaext=0;	### set to zero if compiling for 1.0.1
    131 $avx=1		if (!$shaext && $avx);
    132 
    133 open OUT,"| \"$^X\" $xlate $flavour";
    134 *STDOUT=*OUT;
    135 
    136 if ($output =~ /512/) {
    137 	$func="sha512_block_data_order";
    138 	$TABLE="K512";
    139 	$SZ=8;
    140 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
    141 					"%r8", "%r9", "%r10","%r11");
    142 	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
    143 	@Sigma0=(28,34,39);
    144 	@Sigma1=(14,18,41);
    145 	@sigma0=(1,  8, 7);
    146 	@sigma1=(19,61, 6);
    147 	$rounds=80;
    148 } else {
    149 	$func="sha256_block_data_order";
    150 	$TABLE="K256";
    151 	$SZ=4;
    152 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
    153 					"%r8d","%r9d","%r10d","%r11d");
    154 	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
    155 	@Sigma0=( 2,13,22);
    156 	@Sigma1=( 6,11,25);
    157 	@sigma0=( 7,18, 3);
    158 	@sigma1=(17,19,10);
    159 	$rounds=64;
    160 }
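
# The @Sigma*/@sigma* arrays above are the FIPS 180-4 rotate/shift
# amounts: @Sigma0 and @Sigma1 hold three rotate counts each, @sigma0
# and @sigma1 hold two rotate counts followed by one shift count, i.e.
# for SHA-256
#
#	Sigma0(x) = ROTR^2(x)  ^ ROTR^13(x) ^ ROTR^22(x)
#	Sigma1(x) = ROTR^6(x)  ^ ROTR^11(x) ^ ROTR^25(x)
#	sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#	sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
#
# and likewise (28,34,39), (14,18,41), (1,8,7), (19,61,6) for SHA-512.
# A plain-Perl sketch of the 32-bit Sigma0, illustrative only:
#
#	sub ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
#	sub Sigma0 { my $x=shift;   ror32($x,2)^ror32($x,13)^ror32($x,22) }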
    161 
    162 $ctx="%rdi";	# 1st arg, zapped by $a3
    163 $inp="%rsi";	# 2nd arg
    164 $Tbl="%rbp";
    165 
    166 $_ctx="16*$SZ+0*8(%rsp)";
    167 $_inp="16*$SZ+1*8(%rsp)";
    168 $_end="16*$SZ+2*8(%rsp)";
    169 $_rsp="16*$SZ+3*8(%rsp)";
    170 $framesz="16*$SZ+4*8";
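
# Stack frame of the integer-only code path, offsets from the 64-byte
# aligned %rsp: 16*$SZ bytes of X[0..15] working storage at the bottom,
# followed by the four 8-byte slots addressed by $_ctx/$_inp/$_end/$_rsp
# above (saved ctx and inp arguments, end-of-input pointer and the
# caller's %rsp).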
    171 
    172 
    173 sub ROUND_00_15()
    174 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    175   my $STRIDE=$SZ;
    176      $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
    177 
    178 $code.=<<___;
    179 	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
    180 	mov	$f,$a2
    181 
    182 	xor	$e,$a0
    183 	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
    184 	xor	$g,$a2			# f^g
    185 
    186 	mov	$T1,`$SZ*($i&0xf)`(%rsp)
    187 	xor	$a,$a1
    188 	and	$e,$a2			# (f^g)&e
    189 
    190 	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
    191 	add	$h,$T1			# T1+=h
    192 	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
    193 
    194 	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
    195 	xor	$e,$a0
    196 	add	$a2,$T1			# T1+=Ch(e,f,g)
    197 
    198 	mov	$a,$a2
    199 	add	($Tbl),$T1		# T1+=K[round]
    200 	xor	$a,$a1
    201 
    202 	xor	$b,$a2			# a^b, b^c in next round
    203 	ror	\$$Sigma1[0],$a0	# Sigma1(e)
    204 	mov	$b,$h
    205 
    206 	and	$a2,$a3
    207 	ror	\$$Sigma0[0],$a1	# Sigma0(a)
    208 	add	$a0,$T1			# T1+=Sigma1(e)
    209 
    210 	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
    211 	add	$T1,$d			# d+=T1
    212 	add	$T1,$h			# h+=T1
    213 
    214 	lea	$STRIDE($Tbl),$Tbl	# round++
    215 ___
    216 $code.=<<___ if ($i<15);
    217 	add	$a1,$h			# h+=Sigma0(a)
    218 ___
    219 	($a2,$a3) = ($a3,$a2);
    220 }
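
# For reference, ROUND_00_15 above emits one round of the FIPS 180-4
# compression function,
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with the caller renaming registers instead of rotating the state,
# Maj computed through the Ch identity noted at the top of the file,
# and "h += Sigma0(a)" deferred to the next round (modulo-scheduled)
# from round 15 onwards. Below is a plain-Perl sketch of one SHA-256
# round; it is never called by this script and is illustrative only.

sub ref_sha256_round_sketch {
    my ($i,$K,$W,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    my $ror = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
    my $S1  = $ror->($e,6)^$ror->($e,11)^$ror->($e,25);	# Sigma1(e)
    my $S0  = $ror->($a,2)^$ror->($a,13)^$ror->($a,22);	# Sigma0(a)
    my $ch  = ($e&$f)^(~$e&$g);				# Ch(e,f,g)
    my $maj = $b^(($a^$b)&($b^$c));			# alternative Maj(a,b,c)
    my $T1  = ($h+$S1+$ch+$K->[$i]+$W->[$i]) & 0xffffffff;
    return (($T1+$S0+$maj)&0xffffffff,			# new a
	    $a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);	# new b..h
}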
    221 
    222 sub ROUND_16_XX()
    223 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    224 
    225 $code.=<<___;
    226 	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
    227 	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
    228 
    229 	mov	$a0,$T1
    230 	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
    231 	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
    232 	mov	$a2,$a1
    233 	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
    234 
    235 	xor	$T1,$a0
    236 	shr	\$$sigma0[2],$T1
    237 	ror	\$$sigma0[0],$a0
    238 	xor	$a1,$a2
    239 	shr	\$$sigma1[2],$a1
    240 
    241 	ror	\$$sigma1[0],$a2
    242 	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
    243 	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
    244 	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
    245 
    246 	add	`$SZ*($i&0xf)`(%rsp),$T1
    247 	mov	$e,$a0
    248 	add	$a2,$T1
    249 	mov	$a,$a1
    250 ___
    251 	&ROUND_00_15(@_);
    252 }
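
# ROUND_16_XX performs the message schedule expansion in place on the
# 16-entry window kept on the stack:
#
#	W[i&15] += sigma1(W[(i+14)&15]) + W[(i+9)&15] + sigma0(W[(i+1)&15])
#
# which is the FIPS 180-4 recurrence
# W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] (mod 2^w).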
    253 
    254 $code=<<___;
    255 .text
    256 
    257 .extern	OPENSSL_ia32cap_P
    258 .globl	$func
    259 .type	$func,\@function,3
    260 .align	16
    261 $func:
    262 ___
    263 $code.=<<___ if ($SZ==4 || $avx);
    264 	lea	OPENSSL_ia32cap_P(%rip),%r11
    265 	mov	0(%r11),%r9d
    266 	mov	4(%r11),%r10d
    267 	mov	8(%r11),%r11d
    268 ___
    269 $code.=<<___ if ($SZ==4 && $shaext);
    270 	test	\$`1<<29`,%r11d		# check for SHA
    271 	jnz	_shaext_shortcut
    272 ___
    273 $code.=<<___ if ($avx && $SZ==8);
    274 	test	\$`1<<11`,%r10d		# check for XOP
    275 	jnz	.Lxop_shortcut
    276 ___
    277 $code.=<<___ if ($avx>1);
    278 	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
    279 	cmp	\$`1<<8|1<<5|1<<3`,%r11d
    280 	je	.Lavx2_shortcut
    281 ___
    282 $code.=<<___ if ($avx);
    283 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    284 	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
    285 	or	%r9d,%r10d
    286 	cmp	\$`1<<28|1<<9|1<<30`,%r10d
    287 	je	.Lavx_shortcut
    288 ___
    289 $code.=<<___ if ($SZ==4);
    290 	test	\$`1<<9`,%r10d
    291 	jnz	.Lssse3_shortcut
    292 ___
    293 $code.=<<___;
    294 	push	%rbx
    295 	push	%rbp
    296 	push	%r12
    297 	push	%r13
    298 	push	%r14
    299 	push	%r15
    300 	mov	%rsp,%r11		# copy %rsp
    301 	shl	\$4,%rdx		# num*16
    302 	sub	\$$framesz,%rsp
    303 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    304 	and	\$-64,%rsp		# align stack frame
    305 	mov	$ctx,$_ctx		# save ctx, 1st arg
     306 	mov	$inp,$_inp		# save inp, 2nd arg
    307 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    308 	mov	%r11,$_rsp		# save copy of %rsp
    309 .Lprologue:
    310 
    311 	mov	$SZ*0($ctx),$A
    312 	mov	$SZ*1($ctx),$B
    313 	mov	$SZ*2($ctx),$C
    314 	mov	$SZ*3($ctx),$D
    315 	mov	$SZ*4($ctx),$E
    316 	mov	$SZ*5($ctx),$F
    317 	mov	$SZ*6($ctx),$G
    318 	mov	$SZ*7($ctx),$H
    319 	jmp	.Lloop
    320 
    321 .align	16
    322 .Lloop:
    323 	mov	$B,$a3
    324 	lea	$TABLE(%rip),$Tbl
    325 	xor	$C,$a3			# magic
    326 ___
    327 	for($i=0;$i<16;$i++) {
    328 		$code.="	mov	$SZ*$i($inp),$T1\n";
    329 		$code.="	mov	@ROT[4],$a0\n";
    330 		$code.="	mov	@ROT[0],$a1\n";
    331 		$code.="	bswap	$T1\n";
    332 		&ROUND_00_15($i,@ROT);
    333 		unshift(@ROT,pop(@ROT));
    334 	}
    335 $code.=<<___;
    336 	jmp	.Lrounds_16_xx
    337 .align	16
    338 .Lrounds_16_xx:
    339 ___
    340 	for(;$i<32;$i++) {
    341 		&ROUND_16_XX($i,@ROT);
    342 		unshift(@ROT,pop(@ROT));
    343 	}
    344 
    345 $code.=<<___;
    346 	cmpb	\$0,`$SZ-1`($Tbl)
    347 	jnz	.Lrounds_16_xx
    348 
    349 	mov	$_ctx,$ctx
    350 	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
    351 	lea	16*$SZ($inp),$inp
    352 
    353 	add	$SZ*0($ctx),$A
    354 	add	$SZ*1($ctx),$B
    355 	add	$SZ*2($ctx),$C
    356 	add	$SZ*3($ctx),$D
    357 	add	$SZ*4($ctx),$E
    358 	add	$SZ*5($ctx),$F
    359 	add	$SZ*6($ctx),$G
    360 	add	$SZ*7($ctx),$H
    361 
    362 	cmp	$_end,$inp
    363 
    364 	mov	$A,$SZ*0($ctx)
    365 	mov	$B,$SZ*1($ctx)
    366 	mov	$C,$SZ*2($ctx)
    367 	mov	$D,$SZ*3($ctx)
    368 	mov	$E,$SZ*4($ctx)
    369 	mov	$F,$SZ*5($ctx)
    370 	mov	$G,$SZ*6($ctx)
    371 	mov	$H,$SZ*7($ctx)
    372 	jb	.Lloop
    373 
    374 	mov	$_rsp,%rsi
    375 	mov	(%rsi),%r15
    376 	mov	8(%rsi),%r14
    377 	mov	16(%rsi),%r13
    378 	mov	24(%rsi),%r12
    379 	mov	32(%rsi),%rbp
    380 	mov	40(%rsi),%rbx
    381 	lea	48(%rsi),%rsp
    382 .Lepilogue:
    383 	ret
    384 .size	$func,.-$func
    385 ___
    386 
    387 if ($SZ==4) {
    388 $code.=<<___;
    389 .align	64
    390 .type	$TABLE,\@object
    391 $TABLE:
    392 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    393 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    394 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    395 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    396 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    397 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    398 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    399 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    400 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    401 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    402 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    403 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    404 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    405 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    406 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    407 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    408 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    409 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    410 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    411 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    412 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    413 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    414 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    415 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    416 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    417 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    418 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    419 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    420 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    421 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    422 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    423 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    424 
    425 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    426 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    427 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    428 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    429 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    430 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    431 	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    432 ___
    433 } else {
    434 $code.=<<___;
    435 .align	64
    436 .type	$TABLE,\@object
    437 $TABLE:
    438 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    439 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    440 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    441 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    442 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    443 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    444 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    445 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    446 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    447 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    448 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    449 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    450 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    451 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    452 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    453 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    454 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    455 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    456 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    457 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    458 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    459 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    460 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    461 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    462 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    463 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    464 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    465 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    466 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    467 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    468 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    469 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    470 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    471 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    472 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    473 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    474 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    475 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    476 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    477 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    478 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    479 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    480 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    481 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    482 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    483 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    484 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    485 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    486 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    487 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    488 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    489 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    490 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    491 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    492 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    493 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    494 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    495 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    496 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    497 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    498 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    499 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    500 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    501 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    502 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    503 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    504 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    505 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    506 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    507 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    508 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    509 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    510 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    511 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    512 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    513 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    514 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    515 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    516 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    517 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    518 
    519 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    520 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    521 	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    522 ___
    523 }
    524 
    525 ######################################################################
    526 # SIMD code paths
    527 #
    528 if ($SZ==4 && $shaext) {{{
    529 ######################################################################
    530 # Intel SHA Extensions implementation of SHA256 update function.
    531 #
    532 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
    533 
    534 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
    535 my @MSG=map("%xmm$_",(3..6));
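
# sha256rnds2 takes the SHA-256 state split across two XMM registers as
# {A,B,E,F} and {C,D,G,H}, with the pre-added K[i]+W[i] message dwords
# staged in the implicit %xmm0 operand ($Wi below). The
# pshufd/palignr/punpcklqdq sequence at the top of the function repacks
# the {D,C,B,A}/{H,G,F,E} words loaded from the context into that form,
# and the epilogue undoes the repacking before storing the state back.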
    536 
    537 $code.=<<___;
    538 .type	sha256_block_data_order_shaext,\@function,3
    539 .align	64
    540 sha256_block_data_order_shaext:
    541 _shaext_shortcut:
    542 ___
    543 $code.=<<___ if ($win64);
    544 	lea	`-8-5*16`(%rsp),%rsp
    545 	movaps	%xmm6,-8-5*16(%rax)
    546 	movaps	%xmm7,-8-4*16(%rax)
    547 	movaps	%xmm8,-8-3*16(%rax)
    548 	movaps	%xmm9,-8-2*16(%rax)
    549 	movaps	%xmm10,-8-1*16(%rax)
    550 .Lprologue_shaext:
    551 ___
    552 $code.=<<___;
    553 	lea		K256+0x80(%rip),$Tbl
    554 	movdqu		($ctx),$ABEF		# DCBA
    555 	movdqu		16($ctx),$CDGH		# HGFE
    556 	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
    557 
    558 	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
    559 	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
    560 	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
    561 	movdqa		$TMP,$BSWAP		# offload
    562 	palignr		\$8,$CDGH,$ABEF		# ABEF
    563 	punpcklqdq	$Wi,$CDGH		# CDGH
    564 	jmp		.Loop_shaext
    565 
    566 .align	16
    567 .Loop_shaext:
    568 	movdqu		($inp),@MSG[0]
    569 	movdqu		0x10($inp),@MSG[1]
    570 	movdqu		0x20($inp),@MSG[2]
    571 	pshufb		$TMP,@MSG[0]
    572 	movdqu		0x30($inp),@MSG[3]
    573 
    574 	movdqa		0*32-0x80($Tbl),$Wi
    575 	paddd		@MSG[0],$Wi
    576 	pshufb		$TMP,@MSG[1]
    577 	movdqa		$CDGH,$CDGH_SAVE	# offload
    578 	sha256rnds2	$ABEF,$CDGH		# 0-3
    579 	pshufd		\$0x0e,$Wi,$Wi
    580 	nop
    581 	movdqa		$ABEF,$ABEF_SAVE	# offload
    582 	sha256rnds2	$CDGH,$ABEF
    583 
    584 	movdqa		1*32-0x80($Tbl),$Wi
    585 	paddd		@MSG[1],$Wi
    586 	pshufb		$TMP,@MSG[2]
    587 	sha256rnds2	$ABEF,$CDGH		# 4-7
    588 	pshufd		\$0x0e,$Wi,$Wi
    589 	lea		0x40($inp),$inp
    590 	sha256msg1	@MSG[1],@MSG[0]
    591 	sha256rnds2	$CDGH,$ABEF
    592 
    593 	movdqa		2*32-0x80($Tbl),$Wi
    594 	paddd		@MSG[2],$Wi
    595 	pshufb		$TMP,@MSG[3]
    596 	sha256rnds2	$ABEF,$CDGH		# 8-11
    597 	pshufd		\$0x0e,$Wi,$Wi
    598 	movdqa		@MSG[3],$TMP
    599 	palignr		\$4,@MSG[2],$TMP
    600 	nop
    601 	paddd		$TMP,@MSG[0]
    602 	sha256msg1	@MSG[2],@MSG[1]
    603 	sha256rnds2	$CDGH,$ABEF
    604 
    605 	movdqa		3*32-0x80($Tbl),$Wi
    606 	paddd		@MSG[3],$Wi
    607 	sha256msg2	@MSG[3],@MSG[0]
    608 	sha256rnds2	$ABEF,$CDGH		# 12-15
    609 	pshufd		\$0x0e,$Wi,$Wi
    610 	movdqa		@MSG[0],$TMP
    611 	palignr		\$4,@MSG[3],$TMP
    612 	nop
    613 	paddd		$TMP,@MSG[1]
    614 	sha256msg1	@MSG[3],@MSG[2]
    615 	sha256rnds2	$CDGH,$ABEF
    616 ___
    617 for($i=4;$i<16-3;$i++) {
    618 $code.=<<___;
    619 	movdqa		$i*32-0x80($Tbl),$Wi
    620 	paddd		@MSG[0],$Wi
    621 	sha256msg2	@MSG[0],@MSG[1]
    622 	sha256rnds2	$ABEF,$CDGH		# 16-19...
    623 	pshufd		\$0x0e,$Wi,$Wi
    624 	movdqa		@MSG[1],$TMP
    625 	palignr		\$4,@MSG[0],$TMP
    626 	nop
    627 	paddd		$TMP,@MSG[2]
    628 	sha256msg1	@MSG[0],@MSG[3]
    629 	sha256rnds2	$CDGH,$ABEF
    630 ___
    631 	push(@MSG,shift(@MSG));
    632 }
    633 $code.=<<___;
    634 	movdqa		13*32-0x80($Tbl),$Wi
    635 	paddd		@MSG[0],$Wi
    636 	sha256msg2	@MSG[0],@MSG[1]
    637 	sha256rnds2	$ABEF,$CDGH		# 52-55
    638 	pshufd		\$0x0e,$Wi,$Wi
    639 	movdqa		@MSG[1],$TMP
    640 	palignr		\$4,@MSG[0],$TMP
    641 	sha256rnds2	$CDGH,$ABEF
    642 	paddd		$TMP,@MSG[2]
    643 
    644 	movdqa		14*32-0x80($Tbl),$Wi
    645 	paddd		@MSG[1],$Wi
    646 	sha256rnds2	$ABEF,$CDGH		# 56-59
    647 	pshufd		\$0x0e,$Wi,$Wi
    648 	sha256msg2	@MSG[1],@MSG[2]
    649 	movdqa		$BSWAP,$TMP
    650 	sha256rnds2	$CDGH,$ABEF
    651 
    652 	movdqa		15*32-0x80($Tbl),$Wi
    653 	paddd		@MSG[2],$Wi
    654 	nop
    655 	sha256rnds2	$ABEF,$CDGH		# 60-63
    656 	pshufd		\$0x0e,$Wi,$Wi
    657 	dec		$num
    658 	nop
    659 	sha256rnds2	$CDGH,$ABEF
    660 
    661 	paddd		$CDGH_SAVE,$CDGH
    662 	paddd		$ABEF_SAVE,$ABEF
    663 	jnz		.Loop_shaext
    664 
    665 	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
    666 	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
    667 	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
    668 	punpckhqdq	$CDGH,$ABEF		# DCBA
    669 	palignr		\$8,$TMP,$CDGH		# HGFE
    670 
    671 	movdqu	$ABEF,($ctx)
    672 	movdqu	$CDGH,16($ctx)
    673 ___
    674 $code.=<<___ if ($win64);
    675 	movaps	-8-5*16(%rax),%xmm6
    676 	movaps	-8-4*16(%rax),%xmm7
    677 	movaps	-8-3*16(%rax),%xmm8
    678 	movaps	-8-2*16(%rax),%xmm9
    679 	movaps	-8-1*16(%rax),%xmm10
    680 	mov	%rax,%rsp
    681 .Lepilogue_shaext:
    682 ___
    683 $code.=<<___;
    684 	ret
    685 .size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
    686 ___
    687 }}}
    688 {{{
    689 
    690 my $a4=$T1;
    691 my ($a,$b,$c,$d,$e,$f,$g,$h);
    692 
    693 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    694 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    695   my $arg = pop;
    696     $arg = "\$$arg" if ($arg*1 eq $arg);
    697     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    698 }
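
# The AUTOLOAD thunk above turns any otherwise-undefined &mnemonic(...)
# call into a line of assembly appended to $code, so the round bodies
# below can be written as lists of Perl statements and interleaved
# instruction by instruction with the SIMD code.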
    699 
    700 sub body_00_15 () {
    701 	(
    702 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
    703 
    704 	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
    705 	'&mov	($a,$a1)',
    706 	'&mov	($a4,$f)',
    707 
    708 	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
    709 	'&xor	($a0,$e)',
    710 	'&xor	($a4,$g)',			# f^g
    711 
    712 	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
    713 	'&xor	($a1,$a)',
    714 	'&and	($a4,$e)',			# (f^g)&e
    715 
    716 	'&xor	($a0,$e)',
    717 	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
    718 	'&mov	($a2,$a)',
    719 
    720 	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
    721 	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
    722 	'&xor	($a2,$b)',			# a^b, b^c in next round
    723 
    724 	'&add	($h,$a4)',			# h+=Ch(e,f,g)
    725 	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
    726 	'&and	($a3,$a2)',			# (b^c)&(a^b)
    727 
    728 	'&xor	($a1,$a)',
    729 	'&add	($h,$a0)',			# h+=Sigma1(e)
    730 	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
    731 
    732 	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
    733 	'&add	($d,$h)',			# d+=h
    734 	'&add	($h,$a3)',			# h+=Maj(a,b,c)
    735 
    736 	'&mov	($a0,$d)',
    737 	'&add	($a1,$h);'.			# h+=Sigma0(a)
    738 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
    739 	);
    740 }
    741 
    742 ######################################################################
    743 # SSSE3 code path
    744 #
    745 if ($SZ==4) {	# SHA256 only
    746 my @X = map("%xmm$_",(0..3));
    747 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
    748 
    749 $code.=<<___;
    750 .type	${func}_ssse3,\@function,3
    751 .align	64
    752 ${func}_ssse3:
    753 .Lssse3_shortcut:
    754 	push	%rbx
    755 	push	%rbp
    756 	push	%r12
    757 	push	%r13
    758 	push	%r14
    759 	push	%r15
    760 	mov	%rsp,%r11		# copy %rsp
    761 	shl	\$4,%rdx		# num*16
    762 	sub	\$`$framesz+$win64*16*4`,%rsp
    763 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    764 	and	\$-64,%rsp		# align stack frame
    765 	mov	$ctx,$_ctx		# save ctx, 1st arg
     766 	mov	$inp,$_inp		# save inp, 2nd arg
    767 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    768 	mov	%r11,$_rsp		# save copy of %rsp
    769 ___
    770 $code.=<<___ if ($win64);
    771 	movaps	%xmm6,16*$SZ+32(%rsp)
    772 	movaps	%xmm7,16*$SZ+48(%rsp)
    773 	movaps	%xmm8,16*$SZ+64(%rsp)
    774 	movaps	%xmm9,16*$SZ+80(%rsp)
    775 ___
    776 $code.=<<___;
    777 .Lprologue_ssse3:
    778 
    779 	mov	$SZ*0($ctx),$A
    780 	mov	$SZ*1($ctx),$B
    781 	mov	$SZ*2($ctx),$C
    782 	mov	$SZ*3($ctx),$D
    783 	mov	$SZ*4($ctx),$E
    784 	mov	$SZ*5($ctx),$F
    785 	mov	$SZ*6($ctx),$G
    786 	mov	$SZ*7($ctx),$H
    787 ___
    788 
    789 $code.=<<___;
    790 	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
    791 	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
    792 	jmp	.Lloop_ssse3
    793 .align	16
    794 .Lloop_ssse3:
    795 	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
    796 	movdqu	0x00($inp),@X[0]
    797 	movdqu	0x10($inp),@X[1]
    798 	movdqu	0x20($inp),@X[2]
    799 	pshufb	$t3,@X[0]
    800 	movdqu	0x30($inp),@X[3]
    801 	lea	$TABLE(%rip),$Tbl
    802 	pshufb	$t3,@X[1]
    803 	movdqa	0x00($Tbl),$t0
    804 	movdqa	0x20($Tbl),$t1
    805 	pshufb	$t3,@X[2]
    806 	paddd	@X[0],$t0
    807 	movdqa	0x40($Tbl),$t2
    808 	pshufb	$t3,@X[3]
    809 	movdqa	0x60($Tbl),$t3
    810 	paddd	@X[1],$t1
    811 	paddd	@X[2],$t2
    812 	paddd	@X[3],$t3
    813 	movdqa	$t0,0x00(%rsp)
    814 	mov	$A,$a1
    815 	movdqa	$t1,0x10(%rsp)
    816 	mov	$B,$a3
    817 	movdqa	$t2,0x20(%rsp)
    818 	xor	$C,$a3			# magic
    819 	movdqa	$t3,0x30(%rsp)
    820 	mov	$E,$a0
    821 	jmp	.Lssse3_00_47
    822 
    823 .align	16
    824 .Lssse3_00_47:
    825 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
    826 ___
    827 sub Xupdate_256_SSSE3 () {
    828 	(
    829 	'&movdqa	($t0,@X[1]);',
    830 	'&movdqa	($t3,@X[3])',
    831 	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
    832 	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
    833 	'&movdqa	($t1,$t0)',
    834 	'&movdqa	($t2,$t0);',
    835 	'&psrld		($t0,$sigma0[2])',
    836 	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
    837 	'&psrld		($t2,$sigma0[0])',
    838 	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
    839 	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
    840 	'&pxor		($t0,$t2)',
    841 	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
    842 	'&pxor		($t0,$t1)',
    843 	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
    844 	'&pxor		($t0,$t2);',
    845 	 '&movdqa	($t2,$t3)',
    846 	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
    847 	 '&psrld	($t3,$sigma1[2])',
    848 	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
    849 	 '&psrlq	($t2,$sigma1[0])',
    850 	 '&pxor		($t3,$t2);',
    851 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    852 	 '&pxor		($t3,$t2)',
    853 	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
    854 	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
    855 	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
    856 	 '&movdqa	($t2,$t3);',
    857 	 '&psrld	($t3,$sigma1[2])',
    858 	 '&psrlq	($t2,$sigma1[0])',
    859 	 '&pxor		($t3,$t2);',
    860 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    861 	 '&pxor		($t3,$t2);',
    862 	'&movdqa	($t2,16*2*$j."($Tbl)")',
    863 	 '&pshufb	($t3,$t5)',
    864 	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
    865 	);
    866 }
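
# Note that sigma1() is applied in two passes of two lanes each:
# X[16..17] depend on the freshly computed X[14..15], so the second
# pshufd/psrlq/pxor group above (and its pshufd/psrldq substitute for
# pshufb in the interleaved version below) cannot start until the first
# X[0..1] update is done.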
    867 
    868 sub SSSE3_256_00_47 () {
    869 my $j = shift;
    870 my $body = shift;
    871 my @X = @_;
    872 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
    873 
    874     if (0) {
    875 	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
    876 	    eval;
    877 	    eval(shift(@insns));
    878 	    eval(shift(@insns));
    879 	    eval(shift(@insns));
    880 	}
    881     } else {			# squeeze extra 4% on Westmere and 19% on Atom
    882 	  eval(shift(@insns));	#@
    883 	&movdqa		($t0,@X[1]);
    884 	  eval(shift(@insns));
    885 	  eval(shift(@insns));
    886 	&movdqa		($t3,@X[3]);
    887 	  eval(shift(@insns));	#@
    888 	  eval(shift(@insns));
    889 	  eval(shift(@insns));
    890 	  eval(shift(@insns));	#@
    891 	  eval(shift(@insns));
    892 	&palignr	($t0,@X[0],$SZ);	# X[1..4]
    893 	  eval(shift(@insns));
    894 	  eval(shift(@insns));
    895 	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
    896 	  eval(shift(@insns));
    897 	  eval(shift(@insns));
    898 	  eval(shift(@insns));
    899 	  eval(shift(@insns));	#@
    900 	&movdqa		($t1,$t0);
    901 	  eval(shift(@insns));
    902 	  eval(shift(@insns));
    903 	&movdqa		($t2,$t0);
    904 	  eval(shift(@insns));	#@
    905 	  eval(shift(@insns));
    906 	&psrld		($t0,$sigma0[2]);
    907 	  eval(shift(@insns));
    908 	  eval(shift(@insns));
    909 	  eval(shift(@insns));
    910 	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
    911 	  eval(shift(@insns));	#@
    912 	  eval(shift(@insns));
    913 	&psrld		($t2,$sigma0[0]);
    914 	  eval(shift(@insns));
    915 	  eval(shift(@insns));
     916 	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
    917 	  eval(shift(@insns));
    918 	  eval(shift(@insns));	#@
    919 	&pslld		($t1,8*$SZ-$sigma0[1]);
    920 	  eval(shift(@insns));
    921 	  eval(shift(@insns));
    922 	&pxor		($t0,$t2);
    923 	  eval(shift(@insns));	#@
    924 	  eval(shift(@insns));
    925 	  eval(shift(@insns));
    926 	  eval(shift(@insns));	#@
    927 	&psrld		($t2,$sigma0[1]-$sigma0[0]);
    928 	  eval(shift(@insns));
    929 	&pxor		($t0,$t1);
    930 	  eval(shift(@insns));
    931 	  eval(shift(@insns));
    932 	&pslld		($t1,$sigma0[1]-$sigma0[0]);
    933 	  eval(shift(@insns));
    934 	  eval(shift(@insns));
    935 	&pxor		($t0,$t2);
    936 	  eval(shift(@insns));
    937 	  eval(shift(@insns));	#@
    938 	 &movdqa	($t2,$t3);
    939 	  eval(shift(@insns));
    940 	  eval(shift(@insns));
    941 	&pxor		($t0,$t1);		# sigma0(X[1..4])
    942 	  eval(shift(@insns));	#@
    943 	  eval(shift(@insns));
    944 	  eval(shift(@insns));
    945 	 &psrld		($t3,$sigma1[2]);
    946 	  eval(shift(@insns));
    947 	  eval(shift(@insns));
    948 	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
    949 	  eval(shift(@insns));	#@
    950 	  eval(shift(@insns));
    951 	 &psrlq		($t2,$sigma1[0]);
    952 	  eval(shift(@insns));
    953 	  eval(shift(@insns));
    954 	  eval(shift(@insns));
    955 	 &pxor		($t3,$t2);
    956 	  eval(shift(@insns));	#@
    957 	  eval(shift(@insns));
    958 	  eval(shift(@insns));
    959 	  eval(shift(@insns));	#@
    960 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
    961 	  eval(shift(@insns));
    962 	  eval(shift(@insns));
    963 	 &pxor		($t3,$t2);
    964 	  eval(shift(@insns));	#@
    965 	  eval(shift(@insns));
    966 	  eval(shift(@insns));
    967 	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
    968 	 &pshufd	($t3,$t3,0b10000000);
    969 	  eval(shift(@insns));
    970 	  eval(shift(@insns));
    971 	  eval(shift(@insns));
    972 	 &psrldq	($t3,8);
    973 	  eval(shift(@insns));
    974 	  eval(shift(@insns));	#@
    975 	  eval(shift(@insns));
    976 	  eval(shift(@insns));
    977 	  eval(shift(@insns));	#@
    978 	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
    979 	  eval(shift(@insns));
    980 	  eval(shift(@insns));
    981 	  eval(shift(@insns));
    982 	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
    983 	  eval(shift(@insns));
    984 	  eval(shift(@insns));	#@
    985 	  eval(shift(@insns));
    986 	 &movdqa	($t2,$t3);
    987 	  eval(shift(@insns));
    988 	  eval(shift(@insns));
    989 	 &psrld		($t3,$sigma1[2]);
    990 	  eval(shift(@insns));
    991 	  eval(shift(@insns));	#@
    992 	 &psrlq		($t2,$sigma1[0]);
    993 	  eval(shift(@insns));
    994 	  eval(shift(@insns));
    995 	 &pxor		($t3,$t2);
    996 	  eval(shift(@insns));	#@
    997 	  eval(shift(@insns));
    998 	  eval(shift(@insns));
    999 	  eval(shift(@insns));	#@
   1000 	  eval(shift(@insns));
   1001 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
   1002 	  eval(shift(@insns));
   1003 	  eval(shift(@insns));
   1004 	  eval(shift(@insns));
   1005 	 &pxor		($t3,$t2);
   1006 	  eval(shift(@insns));
   1007 	  eval(shift(@insns));
   1008 	  eval(shift(@insns));	#@
   1009 	 #&pshufb	($t3,$t5);
   1010 	 &pshufd	($t3,$t3,0b00001000);
   1011 	  eval(shift(@insns));
   1012 	  eval(shift(@insns));
   1013 	&movdqa		($t2,16*2*$j."($Tbl)");
   1014 	  eval(shift(@insns));	#@
   1015 	  eval(shift(@insns));
   1016 	 &pslldq	($t3,8);
   1017 	  eval(shift(@insns));
   1018 	  eval(shift(@insns));
   1019 	  eval(shift(@insns));
   1020 	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
   1021 	  eval(shift(@insns));	#@
   1022 	  eval(shift(@insns));
   1023 	  eval(shift(@insns));
   1024     }
   1025 	&paddd		($t2,@X[0]);
   1026 	  foreach (@insns) { eval; }		# remaining instructions
   1027 	&movdqa		(16*$j."(%rsp)",$t2);
   1028 }
   1029 
   1030     for ($i=0,$j=0; $j<4; $j++) {
   1031 	&SSSE3_256_00_47($j,\&body_00_15,@X);
   1032 	push(@X,shift(@X));			# rotate(@X)
   1033     }
   1034 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1035 	&jne	(".Lssse3_00_47");
   1036 
   1037     for ($i=0; $i<16; ) {
   1038 	foreach(body_00_15()) { eval; }
   1039     }
   1040 $code.=<<___;
   1041 	mov	$_ctx,$ctx
   1042 	mov	$a1,$A
   1043 
   1044 	add	$SZ*0($ctx),$A
   1045 	lea	16*$SZ($inp),$inp
   1046 	add	$SZ*1($ctx),$B
   1047 	add	$SZ*2($ctx),$C
   1048 	add	$SZ*3($ctx),$D
   1049 	add	$SZ*4($ctx),$E
   1050 	add	$SZ*5($ctx),$F
   1051 	add	$SZ*6($ctx),$G
   1052 	add	$SZ*7($ctx),$H
   1053 
   1054 	cmp	$_end,$inp
   1055 
   1056 	mov	$A,$SZ*0($ctx)
   1057 	mov	$B,$SZ*1($ctx)
   1058 	mov	$C,$SZ*2($ctx)
   1059 	mov	$D,$SZ*3($ctx)
   1060 	mov	$E,$SZ*4($ctx)
   1061 	mov	$F,$SZ*5($ctx)
   1062 	mov	$G,$SZ*6($ctx)
   1063 	mov	$H,$SZ*7($ctx)
   1064 	jb	.Lloop_ssse3
   1065 
   1066 	mov	$_rsp,%rsi
   1067 ___
   1068 $code.=<<___ if ($win64);
   1069 	movaps	16*$SZ+32(%rsp),%xmm6
   1070 	movaps	16*$SZ+48(%rsp),%xmm7
   1071 	movaps	16*$SZ+64(%rsp),%xmm8
   1072 	movaps	16*$SZ+80(%rsp),%xmm9
   1073 ___
   1074 $code.=<<___;
   1075 	mov	(%rsi),%r15
   1076 	mov	8(%rsi),%r14
   1077 	mov	16(%rsi),%r13
   1078 	mov	24(%rsi),%r12
   1079 	mov	32(%rsi),%rbp
   1080 	mov	40(%rsi),%rbx
   1081 	lea	48(%rsi),%rsp
   1082 .Lepilogue_ssse3:
   1083 	ret
   1084 .size	${func}_ssse3,.-${func}_ssse3
   1085 ___
   1086 }
   1087 
   1088 if ($avx) {{
   1089 ######################################################################
   1090 # XOP code path
   1091 #
   1092 if ($SZ==8) {	# SHA512 only
   1093 $code.=<<___;
   1094 .type	${func}_xop,\@function,3
   1095 .align	64
   1096 ${func}_xop:
   1097 .Lxop_shortcut:
   1098 	push	%rbx
   1099 	push	%rbp
   1100 	push	%r12
   1101 	push	%r13
   1102 	push	%r14
   1103 	push	%r15
   1104 	mov	%rsp,%r11		# copy %rsp
   1105 	shl	\$4,%rdx		# num*16
   1106 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1107 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1108 	and	\$-64,%rsp		# align stack frame
   1109 	mov	$ctx,$_ctx		# save ctx, 1st arg
    1110 	mov	$inp,$_inp		# save inp, 2nd arg
   1111 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1112 	mov	%r11,$_rsp		# save copy of %rsp
   1113 ___
   1114 $code.=<<___ if ($win64);
   1115 	movaps	%xmm6,16*$SZ+32(%rsp)
   1116 	movaps	%xmm7,16*$SZ+48(%rsp)
   1117 	movaps	%xmm8,16*$SZ+64(%rsp)
   1118 	movaps	%xmm9,16*$SZ+80(%rsp)
   1119 ___
   1120 $code.=<<___ if ($win64 && $SZ>4);
   1121 	movaps	%xmm10,16*$SZ+96(%rsp)
   1122 	movaps	%xmm11,16*$SZ+112(%rsp)
   1123 ___
   1124 $code.=<<___;
   1125 .Lprologue_xop:
   1126 
   1127 	vzeroupper
   1128 	mov	$SZ*0($ctx),$A
   1129 	mov	$SZ*1($ctx),$B
   1130 	mov	$SZ*2($ctx),$C
   1131 	mov	$SZ*3($ctx),$D
   1132 	mov	$SZ*4($ctx),$E
   1133 	mov	$SZ*5($ctx),$F
   1134 	mov	$SZ*6($ctx),$G
   1135 	mov	$SZ*7($ctx),$H
   1136 	jmp	.Lloop_xop
   1137 ___
   1138 					if ($SZ==4) {	# SHA256
   1139     my @X = map("%xmm$_",(0..3));
   1140     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
   1141 
   1142 $code.=<<___;
   1143 .align	16
   1144 .Lloop_xop:
   1145 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1146 	vmovdqu	0x00($inp),@X[0]
   1147 	vmovdqu	0x10($inp),@X[1]
   1148 	vmovdqu	0x20($inp),@X[2]
   1149 	vmovdqu	0x30($inp),@X[3]
   1150 	vpshufb	$t3,@X[0],@X[0]
   1151 	lea	$TABLE(%rip),$Tbl
   1152 	vpshufb	$t3,@X[1],@X[1]
   1153 	vpshufb	$t3,@X[2],@X[2]
   1154 	vpaddd	0x00($Tbl),@X[0],$t0
   1155 	vpshufb	$t3,@X[3],@X[3]
   1156 	vpaddd	0x20($Tbl),@X[1],$t1
   1157 	vpaddd	0x40($Tbl),@X[2],$t2
   1158 	vpaddd	0x60($Tbl),@X[3],$t3
   1159 	vmovdqa	$t0,0x00(%rsp)
   1160 	mov	$A,$a1
   1161 	vmovdqa	$t1,0x10(%rsp)
   1162 	mov	$B,$a3
   1163 	vmovdqa	$t2,0x20(%rsp)
   1164 	xor	$C,$a3			# magic
   1165 	vmovdqa	$t3,0x30(%rsp)
   1166 	mov	$E,$a0
   1167 	jmp	.Lxop_00_47
   1168 
   1169 .align	16
   1170 .Lxop_00_47:
   1171 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1172 ___
   1173 sub XOP_256_00_47 () {
   1174 my $j = shift;
   1175 my $body = shift;
   1176 my @X = @_;
   1177 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1178 
   1179 	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
   1180 	  eval(shift(@insns));
   1181 	  eval(shift(@insns));
   1182 	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
   1183 	  eval(shift(@insns));
   1184 	  eval(shift(@insns));
   1185 	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
   1186 	  eval(shift(@insns));
   1187 	  eval(shift(@insns));
   1188 	&vpsrld		($t0,$t0,$sigma0[2]);
   1189 	  eval(shift(@insns));
   1190 	  eval(shift(@insns));
   1191 	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
   1192 	  eval(shift(@insns));
   1193 	  eval(shift(@insns));
   1194 	  eval(shift(@insns));
   1195 	  eval(shift(@insns));
   1196 	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
   1197 	  eval(shift(@insns));
   1198 	  eval(shift(@insns));
   1199 	&vpxor		($t0,$t0,$t1);
   1200 	  eval(shift(@insns));
   1201 	  eval(shift(@insns));
   1202 	  eval(shift(@insns));
   1203 	  eval(shift(@insns));
   1204 	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
   1205 	  eval(shift(@insns));
   1206 	  eval(shift(@insns));
   1207 	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
   1208 	  eval(shift(@insns));
   1209 	  eval(shift(@insns));
   1210 	 &vpsrld	($t2,@X[3],$sigma1[2]);
   1211 	  eval(shift(@insns));
   1212 	  eval(shift(@insns));
   1213 	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
   1214 	  eval(shift(@insns));
   1215 	  eval(shift(@insns));
   1216 	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1217 	  eval(shift(@insns));
   1218 	  eval(shift(@insns));
   1219 	 &vpxor		($t3,$t3,$t2);
   1220 	  eval(shift(@insns));
   1221 	  eval(shift(@insns));
   1222 	  eval(shift(@insns));
   1223 	  eval(shift(@insns));
   1224 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
   1225 	  eval(shift(@insns));
   1226 	  eval(shift(@insns));
   1227 	  eval(shift(@insns));
   1228 	  eval(shift(@insns));
   1229 	&vpsrldq	($t3,$t3,8);
   1230 	  eval(shift(@insns));
   1231 	  eval(shift(@insns));
   1232 	  eval(shift(@insns));
   1233 	  eval(shift(@insns));
   1234 	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
   1235 	  eval(shift(@insns));
   1236 	  eval(shift(@insns));
   1237 	  eval(shift(@insns));
   1238 	  eval(shift(@insns));
   1239 	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
   1240 	  eval(shift(@insns));
   1241 	  eval(shift(@insns));
   1242 	 &vpsrld	($t2,@X[0],$sigma1[2]);
   1243 	  eval(shift(@insns));
   1244 	  eval(shift(@insns));
   1245 	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1246 	  eval(shift(@insns));
   1247 	  eval(shift(@insns));
   1248 	 &vpxor		($t3,$t3,$t2);
   1249 	  eval(shift(@insns));
   1250 	  eval(shift(@insns));
   1251 	  eval(shift(@insns));
   1252 	  eval(shift(@insns));
   1253 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
   1254 	  eval(shift(@insns));
   1255 	  eval(shift(@insns));
   1256 	  eval(shift(@insns));
   1257 	  eval(shift(@insns));
   1258 	&vpslldq	($t3,$t3,8);		# 22 instructions
   1259 	  eval(shift(@insns));
   1260 	  eval(shift(@insns));
   1261 	  eval(shift(@insns));
   1262 	  eval(shift(@insns));
   1263 	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
   1264 	  eval(shift(@insns));
   1265 	  eval(shift(@insns));
   1266 	  eval(shift(@insns));
   1267 	  eval(shift(@insns));
   1268 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1269 	  foreach (@insns) { eval; }		# remaining instructions
   1270 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1271 }
   1272 
   1273     for ($i=0,$j=0; $j<4; $j++) {
   1274 	&XOP_256_00_47($j,\&body_00_15,@X);
   1275 	push(@X,shift(@X));			# rotate(@X)
   1276     }
   1277 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1278 	&jne	(".Lxop_00_47");
   1279 
   1280     for ($i=0; $i<16; ) {
   1281 	foreach(body_00_15()) { eval; }
   1282     }
   1283 
   1284 					} else {	# SHA512
   1285     my @X = map("%xmm$_",(0..7));
   1286     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1287 
   1288 $code.=<<___;
   1289 .align	16
   1290 .Lloop_xop:
   1291 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1292 	vmovdqu	0x00($inp),@X[0]
   1293 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1294 	vmovdqu	0x10($inp),@X[1]
   1295 	vmovdqu	0x20($inp),@X[2]
   1296 	vpshufb	$t3,@X[0],@X[0]
   1297 	vmovdqu	0x30($inp),@X[3]
   1298 	vpshufb	$t3,@X[1],@X[1]
   1299 	vmovdqu	0x40($inp),@X[4]
   1300 	vpshufb	$t3,@X[2],@X[2]
   1301 	vmovdqu	0x50($inp),@X[5]
   1302 	vpshufb	$t3,@X[3],@X[3]
   1303 	vmovdqu	0x60($inp),@X[6]
   1304 	vpshufb	$t3,@X[4],@X[4]
   1305 	vmovdqu	0x70($inp),@X[7]
   1306 	vpshufb	$t3,@X[5],@X[5]
   1307 	vpaddq	-0x80($Tbl),@X[0],$t0
   1308 	vpshufb	$t3,@X[6],@X[6]
   1309 	vpaddq	-0x60($Tbl),@X[1],$t1
   1310 	vpshufb	$t3,@X[7],@X[7]
   1311 	vpaddq	-0x40($Tbl),@X[2],$t2
   1312 	vpaddq	-0x20($Tbl),@X[3],$t3
   1313 	vmovdqa	$t0,0x00(%rsp)
   1314 	vpaddq	0x00($Tbl),@X[4],$t0
   1315 	vmovdqa	$t1,0x10(%rsp)
   1316 	vpaddq	0x20($Tbl),@X[5],$t1
   1317 	vmovdqa	$t2,0x20(%rsp)
   1318 	vpaddq	0x40($Tbl),@X[6],$t2
   1319 	vmovdqa	$t3,0x30(%rsp)
   1320 	vpaddq	0x60($Tbl),@X[7],$t3
   1321 	vmovdqa	$t0,0x40(%rsp)
   1322 	mov	$A,$a1
   1323 	vmovdqa	$t1,0x50(%rsp)
   1324 	mov	$B,$a3
   1325 	vmovdqa	$t2,0x60(%rsp)
   1326 	xor	$C,$a3			# magic
   1327 	vmovdqa	$t3,0x70(%rsp)
   1328 	mov	$E,$a0
   1329 	jmp	.Lxop_00_47
   1330 
   1331 .align	16
   1332 .Lxop_00_47:
   1333 	add	\$`16*2*$SZ`,$Tbl
   1334 ___
   1335 sub XOP_512_00_47 () {
   1336 my $j = shift;
   1337 my $body = shift;
   1338 my @X = @_;
   1339 my @insns = (&$body,&$body);			# 52 instructions
   1340 
   1341 	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
   1342 	  eval(shift(@insns));
   1343 	  eval(shift(@insns));
   1344 	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
   1345 	  eval(shift(@insns));
   1346 	  eval(shift(@insns));
   1347 	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
   1348 	  eval(shift(@insns));
   1349 	  eval(shift(@insns));
   1350 	&vpsrlq		($t0,$t0,$sigma0[2]);
   1351 	  eval(shift(@insns));
   1352 	  eval(shift(@insns));
   1353 	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
   1354 	  eval(shift(@insns));
   1355 	  eval(shift(@insns));
   1356 	  eval(shift(@insns));
   1357 	  eval(shift(@insns));
   1358 	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
   1359 	  eval(shift(@insns));
   1360 	  eval(shift(@insns));
   1361 	&vpxor		($t0,$t0,$t1);
   1362 	  eval(shift(@insns));
   1363 	  eval(shift(@insns));
   1364 	  eval(shift(@insns));
   1365 	  eval(shift(@insns));
   1366 	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
   1367 	  eval(shift(@insns));
   1368 	  eval(shift(@insns));
   1369 	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
   1370 	  eval(shift(@insns));
   1371 	  eval(shift(@insns));
   1372 	 &vpsrlq	($t2,@X[7],$sigma1[2]);
   1373 	  eval(shift(@insns));
   1374 	  eval(shift(@insns));
   1375 	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
   1376 	  eval(shift(@insns));
   1377 	  eval(shift(@insns));
   1378 	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1379 	  eval(shift(@insns));
   1380 	  eval(shift(@insns));
   1381 	 &vpxor		($t3,$t3,$t2);
   1382 	  eval(shift(@insns));
   1383 	  eval(shift(@insns));
   1384 	  eval(shift(@insns));
   1385 	  eval(shift(@insns));
   1386 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
   1387 	  eval(shift(@insns));
   1388 	  eval(shift(@insns));
   1389 	  eval(shift(@insns));
   1390 	  eval(shift(@insns));
   1391 	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
   1392 	  eval(shift(@insns));
   1393 	  eval(shift(@insns));
   1394 	  eval(shift(@insns));
   1395 	  eval(shift(@insns));
   1396 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1397 	  foreach (@insns) { eval; }		# remaining instructions
   1398 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1399 }
   1400 
   1401     for ($i=0,$j=0; $j<8; $j++) {
   1402 	&XOP_512_00_47($j,\&body_00_15,@X);
   1403 	push(@X,shift(@X));			# rotate(@X)
   1404     }
   1405 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1406 	&jne	(".Lxop_00_47");
   1407 
   1408     for ($i=0; $i<16; ) {
   1409 	foreach(body_00_15()) { eval; }
   1410     }
   1411 }
   1412 $code.=<<___;
   1413 	mov	$_ctx,$ctx
   1414 	mov	$a1,$A
   1415 
   1416 	add	$SZ*0($ctx),$A
   1417 	lea	16*$SZ($inp),$inp
   1418 	add	$SZ*1($ctx),$B
   1419 	add	$SZ*2($ctx),$C
   1420 	add	$SZ*3($ctx),$D
   1421 	add	$SZ*4($ctx),$E
   1422 	add	$SZ*5($ctx),$F
   1423 	add	$SZ*6($ctx),$G
   1424 	add	$SZ*7($ctx),$H
   1425 
   1426 	cmp	$_end,$inp
   1427 
   1428 	mov	$A,$SZ*0($ctx)
   1429 	mov	$B,$SZ*1($ctx)
   1430 	mov	$C,$SZ*2($ctx)
   1431 	mov	$D,$SZ*3($ctx)
   1432 	mov	$E,$SZ*4($ctx)
   1433 	mov	$F,$SZ*5($ctx)
   1434 	mov	$G,$SZ*6($ctx)
   1435 	mov	$H,$SZ*7($ctx)
   1436 	jb	.Lloop_xop
   1437 
   1438 	mov	$_rsp,%rsi
   1439 	vzeroupper
   1440 ___
   1441 $code.=<<___ if ($win64);
   1442 	movaps	16*$SZ+32(%rsp),%xmm6
   1443 	movaps	16*$SZ+48(%rsp),%xmm7
   1444 	movaps	16*$SZ+64(%rsp),%xmm8
   1445 	movaps	16*$SZ+80(%rsp),%xmm9
   1446 ___
   1447 $code.=<<___ if ($win64 && $SZ>4);
   1448 	movaps	16*$SZ+96(%rsp),%xmm10
   1449 	movaps	16*$SZ+112(%rsp),%xmm11
   1450 ___
   1451 $code.=<<___;
   1452 	mov	(%rsi),%r15
   1453 	mov	8(%rsi),%r14
   1454 	mov	16(%rsi),%r13
   1455 	mov	24(%rsi),%r12
   1456 	mov	32(%rsi),%rbp
   1457 	mov	40(%rsi),%rbx
   1458 	lea	48(%rsi),%rsp
   1459 .Lepilogue_xop:
   1460 	ret
   1461 .size	${func}_xop,.-${func}_xop
   1462 ___
   1463 }
   1464 ######################################################################
   1465 # AVX+shrd code path
   1466 #
   1467 local *ror = sub { &shrd(@_[0],@_) };
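
# shrd with both operands set to the same register shifts the
# concatenation reg:reg right, i.e. it is exactly a rotate right of reg;
# per the (**) footnote in the performance table above, this switch from
# ror to shrd accounts for a fair share of the AVX speed-up on Sandy
# Bridge.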
   1468 
   1469 $code.=<<___;
   1470 .type	${func}_avx,\@function,3
   1471 .align	64
   1472 ${func}_avx:
   1473 .Lavx_shortcut:
   1474 	push	%rbx
   1475 	push	%rbp
   1476 	push	%r12
   1477 	push	%r13
   1478 	push	%r14
   1479 	push	%r15
   1480 	mov	%rsp,%r11		# copy %rsp
   1481 	shl	\$4,%rdx		# num*16
   1482 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1483 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1484 	and	\$-64,%rsp		# align stack frame
   1485 	mov	$ctx,$_ctx		# save ctx, 1st arg
    1486 	mov	$inp,$_inp		# save inp, 2nd arg
   1487 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1488 	mov	%r11,$_rsp		# save copy of %rsp
   1489 ___
   1490 $code.=<<___ if ($win64);
   1491 	movaps	%xmm6,16*$SZ+32(%rsp)
   1492 	movaps	%xmm7,16*$SZ+48(%rsp)
   1493 	movaps	%xmm8,16*$SZ+64(%rsp)
   1494 	movaps	%xmm9,16*$SZ+80(%rsp)
   1495 ___
   1496 $code.=<<___ if ($win64 && $SZ>4);
   1497 	movaps	%xmm10,16*$SZ+96(%rsp)
   1498 	movaps	%xmm11,16*$SZ+112(%rsp)
   1499 ___
   1500 $code.=<<___;
   1501 .Lprologue_avx:
   1502 
   1503 	vzeroupper
   1504 	mov	$SZ*0($ctx),$A
   1505 	mov	$SZ*1($ctx),$B
   1506 	mov	$SZ*2($ctx),$C
   1507 	mov	$SZ*3($ctx),$D
   1508 	mov	$SZ*4($ctx),$E
   1509 	mov	$SZ*5($ctx),$F
   1510 	mov	$SZ*6($ctx),$G
   1511 	mov	$SZ*7($ctx),$H
   1512 ___
   1513 					if ($SZ==4) {	# SHA256
   1514     my @X = map("%xmm$_",(0..3));
   1515     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
   1516 
   1517 $code.=<<___;
   1518 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1519 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1520 	jmp	.Lloop_avx
   1521 .align	16
   1522 .Lloop_avx:
   1523 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1524 	vmovdqu	0x00($inp),@X[0]
   1525 	vmovdqu	0x10($inp),@X[1]
   1526 	vmovdqu	0x20($inp),@X[2]
   1527 	vmovdqu	0x30($inp),@X[3]
   1528 	vpshufb	$t3,@X[0],@X[0]
   1529 	lea	$TABLE(%rip),$Tbl
   1530 	vpshufb	$t3,@X[1],@X[1]
   1531 	vpshufb	$t3,@X[2],@X[2]
   1532 	vpaddd	0x00($Tbl),@X[0],$t0
   1533 	vpshufb	$t3,@X[3],@X[3]
   1534 	vpaddd	0x20($Tbl),@X[1],$t1
   1535 	vpaddd	0x40($Tbl),@X[2],$t2
   1536 	vpaddd	0x60($Tbl),@X[3],$t3
   1537 	vmovdqa	$t0,0x00(%rsp)
   1538 	mov	$A,$a1
   1539 	vmovdqa	$t1,0x10(%rsp)
   1540 	mov	$B,$a3
   1541 	vmovdqa	$t2,0x20(%rsp)
   1542 	xor	$C,$a3			# magic
   1543 	vmovdqa	$t3,0x30(%rsp)
   1544 	mov	$E,$a0
   1545 	jmp	.Lavx_00_47
   1546 
   1547 .align	16
   1548 .Lavx_00_47:
   1549 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1550 ___
   1551 sub Xupdate_256_AVX () {
   1552 	(
   1553 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
   1554 	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
   1555 	'&vpsrld	($t2,$t0,$sigma0[0]);',
   1556 	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
   1557 	'&vpsrld	($t3,$t0,$sigma0[2])',
   1558 	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
   1559 	'&vpxor		($t0,$t3,$t2)',
   1560 	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
   1561 	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1562 	'&vpxor		($t0,$t0,$t1)',
   1563 	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1564 	'&vpxor		($t0,$t0,$t2)',
   1565 	 '&vpsrld	($t2,$t3,$sigma1[2]);',
   1566 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
   1567 	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
   1568 	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
   1569 	 '&vpxor	($t2,$t2,$t3);',
   1570 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1571 	 '&vpxor	($t2,$t2,$t3)',
   1572 	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
   1573 	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
   1574 	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
   1575 	 '&vpsrld	($t2,$t3,$sigma1[2])',
   1576 	 '&vpsrlq	($t3,$t3,$sigma1[0])',
   1577 	 '&vpxor	($t2,$t2,$t3);',
   1578 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1579 	 '&vpxor	($t2,$t2,$t3)',
   1580 	 '&vpshufb	($t2,$t2,$t5)',
   1581 	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
   1582 	);
   1583 }
   1584 
   1585 sub AVX_256_00_47 () {
   1586 my $j = shift;
   1587 my $body = shift;
   1588 my @X = @_;
   1589 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1590 
   1591 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1592 	    eval;
   1593 	    eval(shift(@insns));
   1594 	    eval(shift(@insns));
   1595 	    eval(shift(@insns));
   1596 	}
   1597 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1598 	  foreach (@insns) { eval; }		# remaining instructions
   1599 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1600 }
   1601 
   1602     for ($i=0,$j=0; $j<4; $j++) {
   1603 	&AVX_256_00_47($j,\&body_00_15,@X);
   1604 	push(@X,shift(@X));			# rotate(@X)
   1605     }
   1606 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1607 	&jne	(".Lavx_00_47");
   1608 
   1609     for ($i=0; $i<16; ) {
   1610 	foreach(body_00_15()) { eval; }
   1611     }
   1612 
   1613 					} else {	# SHA512
   1614     my @X = map("%xmm$_",(0..7));
   1615     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1616 
   1617 $code.=<<___;
   1618 	jmp	.Lloop_avx
   1619 .align	16
   1620 .Lloop_avx:
   1621 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1622 	vmovdqu	0x00($inp),@X[0]
   1623 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1624 	vmovdqu	0x10($inp),@X[1]
   1625 	vmovdqu	0x20($inp),@X[2]
   1626 	vpshufb	$t3,@X[0],@X[0]
   1627 	vmovdqu	0x30($inp),@X[3]
   1628 	vpshufb	$t3,@X[1],@X[1]
   1629 	vmovdqu	0x40($inp),@X[4]
   1630 	vpshufb	$t3,@X[2],@X[2]
   1631 	vmovdqu	0x50($inp),@X[5]
   1632 	vpshufb	$t3,@X[3],@X[3]
   1633 	vmovdqu	0x60($inp),@X[6]
   1634 	vpshufb	$t3,@X[4],@X[4]
   1635 	vmovdqu	0x70($inp),@X[7]
   1636 	vpshufb	$t3,@X[5],@X[5]
   1637 	vpaddq	-0x80($Tbl),@X[0],$t0
   1638 	vpshufb	$t3,@X[6],@X[6]
   1639 	vpaddq	-0x60($Tbl),@X[1],$t1
   1640 	vpshufb	$t3,@X[7],@X[7]
   1641 	vpaddq	-0x40($Tbl),@X[2],$t2
   1642 	vpaddq	-0x20($Tbl),@X[3],$t3
   1643 	vmovdqa	$t0,0x00(%rsp)
   1644 	vpaddq	0x00($Tbl),@X[4],$t0
   1645 	vmovdqa	$t1,0x10(%rsp)
   1646 	vpaddq	0x20($Tbl),@X[5],$t1
   1647 	vmovdqa	$t2,0x20(%rsp)
   1648 	vpaddq	0x40($Tbl),@X[6],$t2
   1649 	vmovdqa	$t3,0x30(%rsp)
   1650 	vpaddq	0x60($Tbl),@X[7],$t3
   1651 	vmovdqa	$t0,0x40(%rsp)
   1652 	mov	$A,$a1
   1653 	vmovdqa	$t1,0x50(%rsp)
   1654 	mov	$B,$a3
   1655 	vmovdqa	$t2,0x60(%rsp)
   1656 	xor	$C,$a3			# magic
   1657 	vmovdqa	$t3,0x70(%rsp)
   1658 	mov	$E,$a0
   1659 	jmp	.Lavx_00_47
   1660 
   1661 .align	16
   1662 .Lavx_00_47:
   1663 	add	\$`16*2*$SZ`,$Tbl
   1664 ___
   1665 sub Xupdate_512_AVX () {
   1666 	(
   1667 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
   1668 	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
   1669 	'&vpsrlq	($t2,$t0,$sigma0[0])',
   1670 	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
   1671 	'&vpsrlq	($t3,$t0,$sigma0[2])',
   1672 	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
   1673 	 '&vpxor	($t0,$t3,$t2)',
   1674 	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1675 	 '&vpxor	($t0,$t0,$t1)',
   1676 	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1677 	 '&vpxor	($t0,$t0,$t2)',
   1678 	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
   1679 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
   1680 	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
   1681 	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
   1682 	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
   1683 	 '&vpxor	($t3,$t3,$t2)',
   1684 	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
   1685 	 '&vpxor	($t3,$t3,$t1)',
   1686 	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
   1687 	 '&vpxor	($t3,$t3,$t2)',
   1688 	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
   1689 	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
   1690 	);
   1691 }
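        # Same schedule for SHA-512, but only two 64-bit elements per
        # pass,
        #
        #	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
        #
        # with sigma0(x)=ROTR^1(x)^ROTR^8(x)^SHR^7(x) and
        # sigma1(x)=ROTR^19(x)^ROTR^61(x)^SHR^6(x), rotations again
        # being composed from shifts and xors.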
   1692 
   1693 sub AVX_512_00_47 () {
   1694 my $j = shift;
   1695 my $body = shift;
   1696 my @X = @_;
   1697 my @insns = (&$body,&$body);			# 52 instructions
   1698 
   1699 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   1700 	    eval;
   1701 	    eval(shift(@insns));
   1702 	    eval(shift(@insns));
   1703 	}
   1704 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1705 	  foreach (@insns) { eval; }		# remaining instructions
   1706 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1707 }
   1708 
   1709     for ($i=0,$j=0; $j<8; $j++) {
   1710 	&AVX_512_00_47($j,\&body_00_15,@X);
   1711 	push(@X,shift(@X));			# rotate(@X)
   1712     }
   1713 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1714 	&jne	(".Lavx_00_47");
   1715 
   1716     for ($i=0; $i<16; ) {
   1717 	foreach(body_00_15()) { eval; }
   1718     }
   1719 }
   1720 $code.=<<___;
   1721 	mov	$_ctx,$ctx
   1722 	mov	$a1,$A
   1723 
   1724 	add	$SZ*0($ctx),$A
   1725 	lea	16*$SZ($inp),$inp
   1726 	add	$SZ*1($ctx),$B
   1727 	add	$SZ*2($ctx),$C
   1728 	add	$SZ*3($ctx),$D
   1729 	add	$SZ*4($ctx),$E
   1730 	add	$SZ*5($ctx),$F
   1731 	add	$SZ*6($ctx),$G
   1732 	add	$SZ*7($ctx),$H
   1733 
   1734 	cmp	$_end,$inp
   1735 
   1736 	mov	$A,$SZ*0($ctx)
   1737 	mov	$B,$SZ*1($ctx)
   1738 	mov	$C,$SZ*2($ctx)
   1739 	mov	$D,$SZ*3($ctx)
   1740 	mov	$E,$SZ*4($ctx)
   1741 	mov	$F,$SZ*5($ctx)
   1742 	mov	$G,$SZ*6($ctx)
   1743 	mov	$H,$SZ*7($ctx)
   1744 	jb	.Lloop_avx
   1745 
   1746 	mov	$_rsp,%rsi
   1747 	vzeroupper
   1748 ___
   1749 $code.=<<___ if ($win64);
   1750 	movaps	16*$SZ+32(%rsp),%xmm6
   1751 	movaps	16*$SZ+48(%rsp),%xmm7
   1752 	movaps	16*$SZ+64(%rsp),%xmm8
   1753 	movaps	16*$SZ+80(%rsp),%xmm9
   1754 ___
   1755 $code.=<<___ if ($win64 && $SZ>4);
   1756 	movaps	16*$SZ+96(%rsp),%xmm10
   1757 	movaps	16*$SZ+112(%rsp),%xmm11
   1758 ___
   1759 $code.=<<___;
   1760 	mov	(%rsi),%r15
   1761 	mov	8(%rsi),%r14
   1762 	mov	16(%rsi),%r13
   1763 	mov	24(%rsi),%r12
   1764 	mov	32(%rsi),%rbp
   1765 	mov	40(%rsi),%rbx
   1766 	lea	48(%rsi),%rsp
   1767 .Lepilogue_avx:
   1768 	ret
   1769 .size	${func}_avx,.-${func}_avx
   1770 ___
   1771 
   1772 if ($avx>1) {{
   1773 ######################################################################
   1774 # AVX2+BMI code path
   1775 #
   1776 my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp 
   1777 my $PUSH8=8*2*$SZ;
   1778 use integer;
   1779 
   1780 sub bodyx_00_15 () {
   1781 	# on entry $a1 must be zero, $a3 must hold $b^$c and $a4 a copy of $f
   1782 	(
   1783 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
   1784 
   1785 	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
   1786 	'&and	($a4,$e)',		# f&e
   1787 	'&rorx	($a0,$e,$Sigma1[2])',
   1788 	'&rorx	($a2,$e,$Sigma1[1])',
   1789 
   1790 	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
   1791 	'&lea	($h,"($h,$a4)")',
   1792 	'&andn	($a4,$e,$g)',		# ~e&g
   1793 	'&xor	($a0,$a2)',
   1794 
   1795 	'&rorx	($a1,$e,$Sigma1[0])',
   1796 	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
   1797 	'&xor	($a0,$a1)',		# Sigma1(e)
   1798 	'&mov	($a2,$a)',
   1799 
   1800 	'&rorx	($a4,$a,$Sigma0[2])',
   1801 	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
   1802 	'&xor	($a2,$b)',		# a^b, b^c in next round
   1803 	'&rorx	($a1,$a,$Sigma0[1])',
   1804 
   1805 	'&rorx	($a0,$a,$Sigma0[0])',
   1806 	'&lea	($d,"($d,$h)")',	# d+=h
   1807 	'&and	($a3,$a2)',		# (b^c)&(a^b)
   1808 	'&xor	($a1,$a4)',
   1809 
   1810 	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
   1811 	'&xor	($a1,$a0)',		# Sigma0(a)
   1812 	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
   1813 	'&mov	($a4,$e)',		# copy of f in future
   1814 
   1815 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
   1816 	);
   1817 	# and after the final round the pending $a1 still has to be added: $a+=$a1
   1818 }
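        # Two identities keep this BMI2 round body short: Ch(e,f,g)=
        # (e&f)^(~e&g) is accumulated with additions (the two terms never
        # share a set bit, so lea can fold them straight into h), and
        # Maj(a,b,c) is computed as Ch(a^b,c,b) so that b^c carries over
        # from round to round.  Sigma0(a) is likewise deferred in $a1 and
        # folded in at the start of the next round, hence the trailing
        # "add $a1,$A" after the last round below.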
   1819 
   1820 $code.=<<___;
   1821 .type	${func}_avx2,\@function,3
   1822 .align	64
   1823 ${func}_avx2:
   1824 .Lavx2_shortcut:
   1825 	push	%rbx
   1826 	push	%rbp
   1827 	push	%r12
   1828 	push	%r13
   1829 	push	%r14
   1830 	push	%r15
   1831 	mov	%rsp,%r11		# copy %rsp
   1832 	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
   1833 	shl	\$4,%rdx		# num*16
   1834 	and	\$-256*$SZ,%rsp		# align stack frame
   1835 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1836 	add	\$`2*$SZ*($rounds-8)`,%rsp
   1837 	mov	$ctx,$_ctx		# save ctx, 1st arg
   1838 	mov	$inp,$_inp		# save inp, 2nd arg
   1839 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1840 	mov	%r11,$_rsp		# save copy of %rsp
   1841 ___
   1842 $code.=<<___ if ($win64);
   1843 	movaps	%xmm6,16*$SZ+32(%rsp)
   1844 	movaps	%xmm7,16*$SZ+48(%rsp)
   1845 	movaps	%xmm8,16*$SZ+64(%rsp)
   1846 	movaps	%xmm9,16*$SZ+80(%rsp)
   1847 ___
   1848 $code.=<<___ if ($win64 && $SZ>4);
   1849 	movaps	%xmm10,16*$SZ+96(%rsp)
   1850 	movaps	%xmm11,16*$SZ+112(%rsp)
   1851 ___
   1852 $code.=<<___;
   1853 .Lprologue_avx2:
   1854 
   1855 	vzeroupper
   1856 	sub	\$-16*$SZ,$inp		# inp++, size optimization
   1857 	mov	$SZ*0($ctx),$A
   1858 	mov	$inp,%r12		# borrow $T1
   1859 	mov	$SZ*1($ctx),$B
   1860 	cmp	%rdx,$inp		# $_end
   1861 	mov	$SZ*2($ctx),$C
   1862 	cmove	%rsp,%r12		# next block or random data
   1863 	mov	$SZ*3($ctx),$D
   1864 	mov	$SZ*4($ctx),$E
   1865 	mov	$SZ*5($ctx),$F
   1866 	mov	$SZ*6($ctx),$G
   1867 	mov	$SZ*7($ctx),$H
   1868 ___
   1869 					if ($SZ==4) {	# SHA256
   1870     my @X = map("%ymm$_",(0..3));
   1871     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
   1872 
   1873 $code.=<<___;
   1874 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1875 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1876 	jmp	.Loop_avx2
   1877 .align	16
   1878 .Loop_avx2:
   1879 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1880 	vmovdqu	-16*$SZ+0($inp),%xmm0
   1881 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1882 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1883 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1884 	#mov		$inp,$_inp	# offload $inp
   1885 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1886 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1887 	vpshufb		$t3,@X[0],@X[0]
   1888 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1889 	vpshufb		$t3,@X[1],@X[1]
   1890 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1891 
   1892 	lea	$TABLE(%rip),$Tbl
   1893 	vpshufb	$t3,@X[2],@X[2]
   1894 	vpaddd	0x00($Tbl),@X[0],$t0
   1895 	vpshufb	$t3,@X[3],@X[3]
   1896 	vpaddd	0x20($Tbl),@X[1],$t1
   1897 	vpaddd	0x40($Tbl),@X[2],$t2
   1898 	vpaddd	0x60($Tbl),@X[3],$t3
   1899 	vmovdqa	$t0,0x00(%rsp)
   1900 	xor	$a1,$a1
   1901 	vmovdqa	$t1,0x20(%rsp)
   1902 	lea	-$PUSH8(%rsp),%rsp
   1903 	mov	$B,$a3
   1904 	vmovdqa	$t2,0x00(%rsp)
   1905 	xor	$C,$a3			# magic
   1906 	vmovdqa	$t3,0x20(%rsp)
   1907 	mov	$F,$a4
   1908 	sub	\$-16*2*$SZ,$Tbl	# size optimization
   1909 	jmp	.Lavx2_00_47
   1910 
   1911 .align	16
   1912 .Lavx2_00_47:
   1913 ___
   1914 
   1915 sub AVX2_256_00_47 () {
   1916 my $j = shift;
   1917 my $body = shift;
   1918 my @X = @_;
   1919 my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
   1920 my $base = "+2*$PUSH8(%rsp)";
   1921 
   1922 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
   1923 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1924 	    eval;
   1925 	    eval(shift(@insns));
   1926 	    eval(shift(@insns));
   1927 	    eval(shift(@insns));
   1928 	}
   1929 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1930 	  foreach (@insns) { eval; }		# remaining instructions
   1931 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   1932 }
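        # The AVX2 flavour digests two input blocks at once: the low
        # 128-bit lanes of @X hold block n and the high lanes block n+1
        # (inserted from %r12 with vinserti128), so every 32-byte W+K
        # entry stored on the stack covers both blocks.  The scalar
        # rounds interleaved here consume the low-lane halves; the high
        # halves are replayed afterwards by the .Lower_avx2 loop.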
   1933 
   1934     for ($i=0,$j=0; $j<4; $j++) {
   1935 	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
   1936 	push(@X,shift(@X));			# rotate(@X)
   1937     }
   1938 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   1939 	&cmpb	(($SZ-1)."($Tbl)",0);
   1940 	&jne	(".Lavx2_00_47");
   1941 
   1942     for ($i=0; $i<16; ) {
   1943 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   1944 	foreach(bodyx_00_15()) { eval; }
   1945     }
   1946 					} else {	# SHA512
   1947     my @X = map("%ymm$_",(0..7));
   1948     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
   1949 
   1950 $code.=<<___;
   1951 	jmp	.Loop_avx2
   1952 .align	16
   1953 .Loop_avx2:
   1954 	vmovdqu	-16*$SZ($inp),%xmm0
   1955 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1956 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1957 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1958 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1959 	vmovdqu	-16*$SZ+64($inp),%xmm4
   1960 	vmovdqu	-16*$SZ+80($inp),%xmm5
   1961 	vmovdqu	-16*$SZ+96($inp),%xmm6
   1962 	vmovdqu	-16*$SZ+112($inp),%xmm7
   1963 	#mov	$inp,$_inp	# offload $inp
   1964 	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
   1965 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1966 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1967 	 vpshufb	$t2,@X[0],@X[0]
   1968 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1969 	 vpshufb	$t2,@X[1],@X[1]
   1970 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1971 	 vpshufb	$t2,@X[2],@X[2]
   1972 	vinserti128	\$1,64(%r12),@X[4],@X[4]
   1973 	 vpshufb	$t2,@X[3],@X[3]
   1974 	vinserti128	\$1,80(%r12),@X[5],@X[5]
   1975 	 vpshufb	$t2,@X[4],@X[4]
   1976 	vinserti128	\$1,96(%r12),@X[6],@X[6]
   1977 	 vpshufb	$t2,@X[5],@X[5]
   1978 	vinserti128	\$1,112(%r12),@X[7],@X[7]
   1979 
   1980 	vpaddq	-0x80($Tbl),@X[0],$t0
   1981 	vpshufb	$t2,@X[6],@X[6]
   1982 	vpaddq	-0x60($Tbl),@X[1],$t1
   1983 	vpshufb	$t2,@X[7],@X[7]
   1984 	vpaddq	-0x40($Tbl),@X[2],$t2
   1985 	vpaddq	-0x20($Tbl),@X[3],$t3
   1986 	vmovdqa	$t0,0x00(%rsp)
   1987 	vpaddq	0x00($Tbl),@X[4],$t0
   1988 	vmovdqa	$t1,0x20(%rsp)
   1989 	vpaddq	0x20($Tbl),@X[5],$t1
   1990 	vmovdqa	$t2,0x40(%rsp)
   1991 	vpaddq	0x40($Tbl),@X[6],$t2
   1992 	vmovdqa	$t3,0x60(%rsp)
   1993 	lea	-$PUSH8(%rsp),%rsp
   1994 	vpaddq	0x60($Tbl),@X[7],$t3
   1995 	vmovdqa	$t0,0x00(%rsp)
   1996 	xor	$a1,$a1
   1997 	vmovdqa	$t1,0x20(%rsp)
   1998 	mov	$B,$a3
   1999 	vmovdqa	$t2,0x40(%rsp)
   2000 	xor	$C,$a3			# magic
   2001 	vmovdqa	$t3,0x60(%rsp)
   2002 	mov	$F,$a4
   2003 	add	\$16*2*$SZ,$Tbl
   2004 	jmp	.Lavx2_00_47
   2005 
   2006 .align	16
   2007 .Lavx2_00_47:
   2008 ___
   2009 
   2010 sub AVX2_512_00_47 () {
   2011 my $j = shift;
   2012 my $body = shift;
   2013 my @X = @_;
   2014 my @insns = (&$body,&$body);			# 48 instructions
   2015 my $base = "+2*$PUSH8(%rsp)";
   2016 
   2017 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
   2018 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   2019 	    eval;
   2020 	    if ($_ !~ /\;$/) {
   2021 		eval(shift(@insns));
   2022 		eval(shift(@insns));
   2023 		eval(shift(@insns));
   2024 	    }
   2025 	}
   2026 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   2027 	  foreach (@insns) { eval; }		# remaining instructions
   2028 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   2029 }
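        # Schedule instructions whose string ends in ';' are not followed
        # by interleaved scalar code here, which keeps the 23 vector
        # instructions roughly in step with the 48 scalar ones; whatever
        # scalar instructions are left over run after the vpaddq.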
   2030 
   2031     for ($i=0,$j=0; $j<8; $j++) {
   2032 	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
   2033 	push(@X,shift(@X));			# rotate(@X)
   2034     }
   2035 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   2036 	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
   2037 	&jne	(".Lavx2_00_47");
   2038 
   2039     for ($i=0; $i<16; ) {
   2040 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   2041 	foreach(bodyx_00_15()) { eval; }
   2042     }
   2043 }
   2044 $code.=<<___;
   2045 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   2046 	add	$a1,$A
   2047 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   2048 	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
   2049 
   2050 	add	$SZ*0($ctx),$A
   2051 	add	$SZ*1($ctx),$B
   2052 	add	$SZ*2($ctx),$C
   2053 	add	$SZ*3($ctx),$D
   2054 	add	$SZ*4($ctx),$E
   2055 	add	$SZ*5($ctx),$F
   2056 	add	$SZ*6($ctx),$G
   2057 	add	$SZ*7($ctx),$H
   2058 
   2059 	mov	$A,$SZ*0($ctx)
   2060 	mov	$B,$SZ*1($ctx)
   2061 	mov	$C,$SZ*2($ctx)
   2062 	mov	$D,$SZ*3($ctx)
   2063 	mov	$E,$SZ*4($ctx)
   2064 	mov	$F,$SZ*5($ctx)
   2065 	mov	$G,$SZ*6($ctx)
   2066 	mov	$H,$SZ*7($ctx)
   2067 
   2068 	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
   2069 	je	.Ldone_avx2
   2070 
   2071 	xor	$a1,$a1
   2072 	mov	$B,$a3
   2073 	xor	$C,$a3			# magic
   2074 	mov	$F,$a4
   2075 	jmp	.Lower_avx2
   2076 .align	16
   2077 .Lower_avx2:
   2078 ___
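        # .Lower_avx2 replays the rounds for the second block of the
        # pair: $base points 16 bytes into each stashed 32-byte W+K
        # entry, i.e. at the high lane filled by vinserti128 above.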
   2079     for ($i=0; $i<8; ) {
   2080 	my $base="+16($Tbl)";
   2081 	foreach(bodyx_00_15()) { eval; }
   2082     }
   2083 $code.=<<___;
   2084 	lea	-$PUSH8($Tbl),$Tbl
   2085 	cmp	%rsp,$Tbl
   2086 	jae	.Lower_avx2
   2087 
   2088 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   2089 	add	$a1,$A
   2090 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   2091 	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
   2092 
   2093 	add	$SZ*0($ctx),$A
   2094 	add	$SZ*1($ctx),$B
   2095 	add	$SZ*2($ctx),$C
   2096 	add	$SZ*3($ctx),$D
   2097 	add	$SZ*4($ctx),$E
   2098 	add	$SZ*5($ctx),$F
   2099 	lea	`2*16*$SZ`($inp),$inp	# inp+=2
   2100 	add	$SZ*6($ctx),$G
   2101 	mov	$inp,%r12
   2102 	add	$SZ*7($ctx),$H
   2103 	cmp	$_end,$inp
   2104 
   2105 	mov	$A,$SZ*0($ctx)
   2106 	cmove	%rsp,%r12		# next block or stale data
   2107 	mov	$B,$SZ*1($ctx)
   2108 	mov	$C,$SZ*2($ctx)
   2109 	mov	$D,$SZ*3($ctx)
   2110 	mov	$E,$SZ*4($ctx)
   2111 	mov	$F,$SZ*5($ctx)
   2112 	mov	$G,$SZ*6($ctx)
   2113 	mov	$H,$SZ*7($ctx)
   2114 
   2115 	jbe	.Loop_avx2
   2116 	lea	(%rsp),$Tbl
   2117 
   2118 .Ldone_avx2:
   2119 	lea	($Tbl),%rsp
   2120 	mov	$_rsp,%rsi
   2121 	vzeroupper
   2122 ___
   2123 $code.=<<___ if ($win64);
   2124 	movaps	16*$SZ+32(%rsp),%xmm6
   2125 	movaps	16*$SZ+48(%rsp),%xmm7
   2126 	movaps	16*$SZ+64(%rsp),%xmm8
   2127 	movaps	16*$SZ+80(%rsp),%xmm9
   2128 ___
   2129 $code.=<<___ if ($win64 && $SZ>4);
   2130 	movaps	16*$SZ+96(%rsp),%xmm10
   2131 	movaps	16*$SZ+112(%rsp),%xmm11
   2132 ___
   2133 $code.=<<___;
   2134 	mov	(%rsi),%r15
   2135 	mov	8(%rsi),%r14
   2136 	mov	16(%rsi),%r13
   2137 	mov	24(%rsi),%r12
   2138 	mov	32(%rsi),%rbp
   2139 	mov	40(%rsi),%rbx
   2140 	lea	48(%rsi),%rsp
   2141 .Lepilogue_avx2:
   2142 	ret
   2143 .size	${func}_avx2,.-${func}_avx2
   2144 ___
   2145 }}
   2146 }}}}}
   2147 
   2148 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2149 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   2150 if ($win64) {
   2151 $rec="%rcx";
   2152 $frame="%rdx";
   2153 $context="%r8";
   2154 $disp="%r9";
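        # se_handler lets RtlVirtualUnwind step over the hand-rolled
        # stack frames above: between the prologue and epilogue labels
        # recorded in HandlerData it pulls the original %rsp back out of
        # the $_rsp slot, restores %rbx/%rbp/%r12-%r15 into the CONTEXT,
        # and for the SIMD paths also copies the saved %xmm6+ registers
        # back, so Windows exceptions can unwind through these functions.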
   2155 
   2156 $code.=<<___;
   2157 .extern	__imp_RtlVirtualUnwind
   2158 .type	se_handler,\@abi-omnipotent
   2159 .align	16
   2160 se_handler:
   2161 	push	%rsi
   2162 	push	%rdi
   2163 	push	%rbx
   2164 	push	%rbp
   2165 	push	%r12
   2166 	push	%r13
   2167 	push	%r14
   2168 	push	%r15
   2169 	pushfq
   2170 	sub	\$64,%rsp
   2171 
   2172 	mov	120($context),%rax	# pull context->Rax
   2173 	mov	248($context),%rbx	# pull context->Rip
   2174 
   2175 	mov	8($disp),%rsi		# disp->ImageBase
   2176 	mov	56($disp),%r11		# disp->HandlerData
   2177 
   2178 	mov	0(%r11),%r10d		# HandlerData[0]
   2179 	lea	(%rsi,%r10),%r10	# prologue label
   2180 	cmp	%r10,%rbx		# context->Rip<prologue label
   2181 	jb	.Lin_prologue
   2182 
   2183 	mov	152($context),%rax	# pull context->Rsp
   2184 
   2185 	mov	4(%r11),%r10d		# HandlerData[1]
   2186 	lea	(%rsi,%r10),%r10	# epilogue label
   2187 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2188 	jae	.Lin_prologue
   2189 ___
   2190 $code.=<<___ if ($avx>1);
   2191 	lea	.Lavx2_shortcut(%rip),%r10
   2192 	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
   2193 	jb	.Lnot_in_avx2
   2194 
   2195 	and	\$-256*$SZ,%rax
   2196 	add	\$`2*$SZ*($rounds-8)`,%rax
   2197 .Lnot_in_avx2:
   2198 ___
   2199 $code.=<<___;
   2200 	mov	%rax,%rsi		# put aside Rsp
   2201 	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
   2202 	lea	48(%rax),%rax
   2203 
   2204 	mov	-8(%rax),%rbx
   2205 	mov	-16(%rax),%rbp
   2206 	mov	-24(%rax),%r12
   2207 	mov	-32(%rax),%r13
   2208 	mov	-40(%rax),%r14
   2209 	mov	-48(%rax),%r15
   2210 	mov	%rbx,144($context)	# restore context->Rbx
   2211 	mov	%rbp,160($context)	# restore context->Rbp
   2212 	mov	%r12,216($context)	# restore context->R12
   2213 	mov	%r13,224($context)	# restore context->R13
   2214 	mov	%r14,232($context)	# restore context->R14
   2215 	mov	%r15,240($context)	# restore context->R15
   2216 
   2217 	lea	.Lepilogue(%rip),%r10
   2218 	cmp	%r10,%rbx
   2219 	jb	.Lin_prologue		# non-AVX code
   2220 
   2221 	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
   2222 	lea	512($context),%rdi	# &context.Xmm6
   2223 	mov	\$`$SZ==4?8:12`,%ecx
   2224 	.long	0xa548f3fc		# cld; rep movsq
   2225 
   2226 .Lin_prologue:
   2227 	mov	8(%rax),%rdi
   2228 	mov	16(%rax),%rsi
   2229 	mov	%rax,152($context)	# restore context->Rsp
   2230 	mov	%rsi,168($context)	# restore context->Rsi
   2231 	mov	%rdi,176($context)	# restore context->Rdi
   2232 
   2233 	mov	40($disp),%rdi		# disp->ContextRecord
   2234 	mov	$context,%rsi		# context
   2235 	mov	\$154,%ecx		# sizeof(CONTEXT)/8, qword count for rep movsq
   2236 	.long	0xa548f3fc		# cld; rep movsq
   2237 
   2238 	mov	$disp,%rsi
   2239 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2240 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2241 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2242 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2243 	mov	40(%rsi),%r10		# disp->ContextRecord
   2244 	lea	56(%rsi),%r11		# &disp->HandlerData
   2245 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2246 	mov	%r10,32(%rsp)		# arg5
   2247 	mov	%r11,40(%rsp)		# arg6
   2248 	mov	%r12,48(%rsp)		# arg7
   2249 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2250 	call	*__imp_RtlVirtualUnwind(%rip)
   2251 
   2252 	mov	\$1,%eax		# ExceptionContinueSearch
   2253 	add	\$64,%rsp
   2254 	popfq
   2255 	pop	%r15
   2256 	pop	%r14
   2257 	pop	%r13
   2258 	pop	%r12
   2259 	pop	%rbp
   2260 	pop	%rbx
   2261 	pop	%rdi
   2262 	pop	%rsi
   2263 	ret
   2264 .size	se_handler,.-se_handler
   2265 ___
   2266 $code.=<<___ if ($SZ == 4 && $shaext);
   2267 .type	shaext_handler,\@abi-omnipotent
   2268 .align	16
   2269 shaext_handler:
   2270 	push	%rsi
   2271 	push	%rdi
   2272 	push	%rbx
   2273 	push	%rbp
   2274 	push	%r12
   2275 	push	%r13
   2276 	push	%r14
   2277 	push	%r15
   2278 	pushfq
   2279 	sub	\$64,%rsp
   2280 
   2281 	mov	120($context),%rax	# pull context->Rax
   2282 	mov	248($context),%rbx	# pull context->Rip
   2283 
   2284 	lea	.Lprologue_shaext(%rip),%r10
   2285 	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
   2286 	jb	.Lin_prologue
   2287 
   2288 	lea	.Lepilogue_shaext(%rip),%r10
   2289 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
   2290 	jae	.Lin_prologue
   2291 
   2292 	lea	-8-5*16(%rax),%rsi
   2293 	lea	512($context),%rdi	# &context.Xmm6
   2294 	mov	\$10,%ecx
   2295 	.long	0xa548f3fc		# cld; rep movsq
   2296 
   2297 	jmp	.Lin_prologue
   2298 .size	shaext_handler,.-shaext_handler
   2299 ___
   2300 $code.=<<___;
   2301 .section	.pdata
   2302 .align	4
   2303 	.rva	.LSEH_begin_$func
   2304 	.rva	.LSEH_end_$func
   2305 	.rva	.LSEH_info_$func
   2306 ___
   2307 $code.=<<___ if ($SZ==4 && $shaext);
   2308 	.rva	.LSEH_begin_${func}_shaext
   2309 	.rva	.LSEH_end_${func}_shaext
   2310 	.rva	.LSEH_info_${func}_shaext
   2311 ___
   2312 $code.=<<___ if ($SZ==4);
   2313 	.rva	.LSEH_begin_${func}_ssse3
   2314 	.rva	.LSEH_end_${func}_ssse3
   2315 	.rva	.LSEH_info_${func}_ssse3
   2316 ___
   2317 $code.=<<___ if ($avx && $SZ==8);
   2318 	.rva	.LSEH_begin_${func}_xop
   2319 	.rva	.LSEH_end_${func}_xop
   2320 	.rva	.LSEH_info_${func}_xop
   2321 ___
   2322 $code.=<<___ if ($avx);
   2323 	.rva	.LSEH_begin_${func}_avx
   2324 	.rva	.LSEH_end_${func}_avx
   2325 	.rva	.LSEH_info_${func}_avx
   2326 ___
   2327 $code.=<<___ if ($avx>1);
   2328 	.rva	.LSEH_begin_${func}_avx2
   2329 	.rva	.LSEH_end_${func}_avx2
   2330 	.rva	.LSEH_info_${func}_avx2
   2331 ___
   2332 $code.=<<___;
   2333 .section	.xdata
   2334 .align	8
   2335 .LSEH_info_$func:
   2336 	.byte	9,0,0,0
   2337 	.rva	se_handler
   2338 	.rva	.Lprologue,.Lepilogue			# HandlerData[]
   2339 ___
   2340 $code.=<<___ if ($SZ==4 && $shaext);
   2341 .LSEH_info_${func}_shaext:
   2342 	.byte	9,0,0,0
   2343 	.rva	shaext_handler
   2344 ___
   2345 $code.=<<___ if ($SZ==4);
   2346 .LSEH_info_${func}_ssse3:
   2347 	.byte	9,0,0,0
   2348 	.rva	se_handler
   2349 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   2350 ___
   2351 $code.=<<___ if ($avx && $SZ==8);
   2352 .LSEH_info_${func}_xop:
   2353 	.byte	9,0,0,0
   2354 	.rva	se_handler
   2355 	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
   2356 ___
   2357 $code.=<<___ if ($avx);
   2358 .LSEH_info_${func}_avx:
   2359 	.byte	9,0,0,0
   2360 	.rva	se_handler
   2361 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   2362 ___
   2363 $code.=<<___ if ($avx>1);
   2364 .LSEH_info_${func}_avx2:
   2365 	.byte	9,0,0,0
   2366 	.rva	se_handler
   2367 	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
   2368 ___
   2369 }
   2370 
   2371 sub sha256op38 {
   2372     my $instr = shift;
   2373     my %opcodelet = (
   2374 		"sha256rnds2" => 0xcb,
   2375 		"sha256msg1"  => 0xcc,
   2376 		"sha256msg2"  => 0xcd	);
   2377 
   2378     if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
   2379       my @opcode=(0x0f,0x38);
   2380 	push @opcode,$opcodelet{$instr};
   2381 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2382 	return ".byte\t".join(',',@opcode);
   2383     } else {
   2384 	return $instr."\t".$_[0];
   2385     }
   2386 }
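        # sha256op38() lets the module assemble with tools that predate
        # the SHA extensions by hand-encoding the three SHA-256 mnemonics
        # as "0F 38 opcode ModR/M" byte strings; e.g. a (hypothetical)
        # "sha256msg1 %xmm4,%xmm3" would come out as
        # ".byte 0x0f,0x38,0xcc,0xdc".  Anything that doesn't match the
        # two-xmm pattern is passed through untouched.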
   2387 
   2388 foreach (split("\n",$code)) {
   2389 	s/\`([^\`]*)\`/eval $1/geo;
   2390 
   2391 	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
   2392 
   2393 	print $_,"\n";
   2394 }
   2395 close STDOUT;
   2396