#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder why
# gcc [being armed with an inline assembler] fails to generate code this
# fast. The only thing which is cool about this module is that the very
# same instruction sequence is used for both SHA-256 and SHA-512. In the
# former case the instructions operate on 32-bit operands, in the latter
# on 64-bit ones. All I had to do was get one flavor right; the other
# one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], sustains close to 4 instructions per
# CPU clock cycle and runs in 1003 cycles, then 1275 is a very good
# result for the 3-way issue Opteron pipeline with X[16] maintained in
# memory. So *if* there is a way to improve it, *then* the only way
# would be to try to offload the X[16] updates to the SSE unit, but
# that would require a "deeper" loop unroll, which in turn would
# naturally cause size blow-up, not to mention increased complexity!
# And once again, only *if* it's actually possible to noticeably
# improve overall instruction-level parallelism (ILP) on a given CPU
# implementation in this case.
#
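# For reference, the arithmetic above as a tiny Perl sketch (illustrative
# only, not used by this script):
#
#	sub mbps_per_ghz { my ($block_bytes,$cycles)=@_; $block_bytes*1000/$cycles }
#	# mbps_per_ghz(64,1005)  ~ 63.7		(SHA-256 on Opteron)
#	# mbps_per_ghz(128,1275) ~ 100		(SHA-512 on Opteron)
#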
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are apparently not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in a >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for the improvement of SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with the specifics of their architectures [which is a topic for
# a separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant halves. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm registers as operands.
# The side effects are an increased stack frame, 448 additional bytes
# for SHA256 and 1152 for SHA512, and a 1.2KB code size increase.
#
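# (In the AVX2 path below this shows up as vmovdqu loads of block n into
# the low %xmm halves followed by vinserti128 of block n+1 into the upper
# 128-bit lanes; see .Loop_avx2.)
#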
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the number of SIMD instructions
#	below a certain limit makes no difference/sense; to conserve
#	space the SHA256 XOP code path is therefore omitted;
#
# Modified from upstream OpenSSL to remove the XOP code.

    114 $flavour = shift;
    115 $output  = shift;
    116 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
    117 
    118 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    119 
    120 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    121 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    122 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    123 die "can't locate x86_64-xlate.pl";
    124 
    125 # In upstream, this is controlled by shelling out to the compiler to check
    126 # versions, but BoringSSL is intended to be used with pre-generated perlasm
    127 # output, so this isn't useful anyway.
    128 #
    129 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
    130 # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
    131 # did not tie them together until after $shaext was added.
    132 $avx = 1;
    133 
    134 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
    135 # been tested.
    136 $shaext=0;	### set to zero if compiling for 1.0.1
    137 $avx=1		if (!$shaext && $avx);
    138 
    139 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    140 *STDOUT=*OUT;
    141 
    142 if ($output =~ /512/) {
    143 	$func="sha512_block_data_order";
    144 	$TABLE="K512";
    145 	$SZ=8;
    146 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
    147 					"%r8", "%r9", "%r10","%r11");
    148 	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
    149 	@Sigma0=(28,34,39);
    150 	@Sigma1=(14,18,41);
    151 	@sigma0=(1,  8, 7);
    152 	@sigma1=(19,61, 6);
    153 	$rounds=80;
    154 } else {
    155 	$func="sha256_block_data_order";
    156 	$TABLE="K256";
    157 	$SZ=4;
    158 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
    159 					"%r8d","%r9d","%r10d","%r11d");
    160 	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
    161 	@Sigma0=( 2,13,22);
    162 	@Sigma1=( 6,11,25);
    163 	@sigma0=( 7,18, 3);
    164 	@sigma1=(17,19,10);
    165 	$rounds=64;
    166 }
    167 
    168 $ctx="%rdi";	# 1st arg, zapped by $a3
    169 $inp="%rsi";	# 2nd arg
    170 $Tbl="%rbp";
    171 
    172 $_ctx="16*$SZ+0*8(%rsp)";
    173 $_inp="16*$SZ+1*8(%rsp)";
    174 $_end="16*$SZ+2*8(%rsp)";
    175 $_rsp="`16*$SZ+3*8`(%rsp)";
    176 $framesz="16*$SZ+4*8";
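
# The scalar code keeps the 16-entry message schedule X[0..15] as a ring
# buffer in the first 16*$SZ bytes of the stack frame; the saved ctx, inp,
# end pointer and original %rsp follow at the offsets defined above.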
    177 
    178 
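# ROUND_00_15 emits one scalar round: T1 = h + Sigma1(e) + Ch(e,f,g) + K[i]
# + X[i] (X[i] arrives in $T1 and is stored into the ring buffer), then
# d += T1 and h = T1 + Maj(a,b,c).  Maj uses the "alternative Maj" trick
# mentioned above: Maj(a,b,c) = b ^ ((a^b)&(b^c)) = Ch(a^b,c,b), which lets
# b^c be carried over from the previous round in $a3.  The final
# h += Sigma0(a) is modulo-scheduled: it is emitted here only for $i<15 and
# is otherwise folded into the following round (or the loop epilogue).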
    179 sub ROUND_00_15()
    180 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    181   my $STRIDE=$SZ;
    182      $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
    183 
    184 $code.=<<___;
    185 	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
    186 	mov	$f,$a2
    187 
    188 	xor	$e,$a0
    189 	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
    190 	xor	$g,$a2			# f^g
    191 
    192 	mov	$T1,`$SZ*($i&0xf)`(%rsp)
    193 	xor	$a,$a1
    194 	and	$e,$a2			# (f^g)&e
    195 
    196 	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
    197 	add	$h,$T1			# T1+=h
    198 	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
    199 
    200 	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
    201 	xor	$e,$a0
    202 	add	$a2,$T1			# T1+=Ch(e,f,g)
    203 
    204 	mov	$a,$a2
    205 	add	($Tbl),$T1		# T1+=K[round]
    206 	xor	$a,$a1
    207 
    208 	xor	$b,$a2			# a^b, b^c in next round
    209 	ror	\$$Sigma1[0],$a0	# Sigma1(e)
    210 	mov	$b,$h
    211 
    212 	and	$a2,$a3
    213 	ror	\$$Sigma0[0],$a1	# Sigma0(a)
    214 	add	$a0,$T1			# T1+=Sigma1(e)
    215 
    216 	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
    217 	add	$T1,$d			# d+=T1
    218 	add	$T1,$h			# h+=T1
    219 
    220 	lea	$STRIDE($Tbl),$Tbl	# round++
    221 ___
    222 $code.=<<___ if ($i<15);
    223 	add	$a1,$h			# h+=Sigma0(a)
    224 ___
    225 	($a2,$a3) = ($a3,$a2);
    226 }
    227 
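# ROUND_16_XX prepends the message-schedule update to the round above:
# X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15]),
# i.e. the usual W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# expressed on the 16-entry ring buffer.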
    228 sub ROUND_16_XX()
    229 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    230 
    231 $code.=<<___;
    232 	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
    233 	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
    234 
    235 	mov	$a0,$T1
    236 	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
    237 	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
    238 	mov	$a2,$a1
    239 	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
    240 
    241 	xor	$T1,$a0
    242 	shr	\$$sigma0[2],$T1
    243 	ror	\$$sigma0[0],$a0
    244 	xor	$a1,$a2
    245 	shr	\$$sigma1[2],$a1
    246 
    247 	ror	\$$sigma1[0],$a2
    248 	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
    249 	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
    250 	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
    251 
    252 	add	`$SZ*($i&0xf)`(%rsp),$T1
    253 	mov	$e,$a0
    254 	add	$a2,$T1
    255 	mov	$a,$a1
    256 ___
    257 	&ROUND_00_15(@_);
    258 }
    259 
    260 $code=<<___;
    261 .text
    262 
    263 .extern	OPENSSL_ia32cap_P
    264 .globl	$func
    265 .type	$func,\@function,3
    266 .align	16
    267 $func:
    268 .cfi_startproc
    269 ___
    270 $code.=<<___ if ($SZ==4 || $avx);
    271 	leaq	OPENSSL_ia32cap_P(%rip),%r11
    272 	mov	0(%r11),%r9d
    273 	mov	4(%r11),%r10d
    274 	mov	8(%r11),%r11d
    275 ___
    276 $code.=<<___ if ($SZ==4 && $shaext);
    277 	test	\$`1<<29`,%r11d		# check for SHA
    278 	jnz	_shaext_shortcut
    279 ___
    280     # XOP codepath removed.
    281 ___
    282 $code.=<<___ if ($avx>1);
    283 	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
    284 	cmp	\$`1<<8|1<<5|1<<3`,%r11d
    285 	je	.Lavx2_shortcut
    286 ___
    287 $code.=<<___ if ($avx);
    288 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    289 	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
    290 	or	%r9d,%r10d
    291 	cmp	\$`1<<28|1<<9|1<<30`,%r10d
    292 	je	.Lavx_shortcut
    293 ___
    294 $code.=<<___ if ($SZ==4);
    295 	test	\$`1<<9`,%r10d
    296 	jnz	.Lssse3_shortcut
    297 ___
    298 $code.=<<___;
    299 	mov	%rsp,%rax		# copy %rsp
    300 .cfi_def_cfa_register	%rax
    301 	push	%rbx
    302 .cfi_push	%rbx
    303 	push	%rbp
    304 .cfi_push	%rbp
    305 	push	%r12
    306 .cfi_push	%r12
    307 	push	%r13
    308 .cfi_push	%r13
    309 	push	%r14
    310 .cfi_push	%r14
    311 	push	%r15
    312 .cfi_push	%r15
    313 	shl	\$4,%rdx		# num*16
    314 	sub	\$$framesz,%rsp
    315 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    316 	and	\$-64,%rsp		# align stack frame
    317 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
    319 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    320 	mov	%rax,$_rsp		# save copy of %rsp
    321 .cfi_cfa_expression	$_rsp,deref,+8
    322 .Lprologue:
    323 
    324 	mov	$SZ*0($ctx),$A
    325 	mov	$SZ*1($ctx),$B
    326 	mov	$SZ*2($ctx),$C
    327 	mov	$SZ*3($ctx),$D
    328 	mov	$SZ*4($ctx),$E
    329 	mov	$SZ*5($ctx),$F
    330 	mov	$SZ*6($ctx),$G
    331 	mov	$SZ*7($ctx),$H
    332 	jmp	.Lloop
    333 
    334 .align	16
    335 .Lloop:
    336 	mov	$B,$a3
    337 	lea	$TABLE(%rip),$Tbl
    338 	xor	$C,$a3			# magic
    339 ___
    340 	for($i=0;$i<16;$i++) {
    341 		$code.="	mov	$SZ*$i($inp),$T1\n";
    342 		$code.="	mov	@ROT[4],$a0\n";
    343 		$code.="	mov	@ROT[0],$a1\n";
    344 		$code.="	bswap	$T1\n";
    345 		&ROUND_00_15($i,@ROT);
    346 		unshift(@ROT,pop(@ROT));
    347 	}
    348 $code.=<<___;
    349 	jmp	.Lrounds_16_xx
    350 .align	16
    351 .Lrounds_16_xx:
    352 ___
    353 	for(;$i<32;$i++) {
    354 		&ROUND_16_XX($i,@ROT);
    355 		unshift(@ROT,pop(@ROT));
    356 	}
    357 
    358 $code.=<<___;
    359 	cmpb	\$0,`$SZ-1`($Tbl)
    360 	jnz	.Lrounds_16_xx
    361 
    362 	mov	$_ctx,$ctx
    363 	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
    364 	lea	16*$SZ($inp),$inp
    365 
    366 	add	$SZ*0($ctx),$A
    367 	add	$SZ*1($ctx),$B
    368 	add	$SZ*2($ctx),$C
    369 	add	$SZ*3($ctx),$D
    370 	add	$SZ*4($ctx),$E
    371 	add	$SZ*5($ctx),$F
    372 	add	$SZ*6($ctx),$G
    373 	add	$SZ*7($ctx),$H
    374 
    375 	cmp	$_end,$inp
    376 
    377 	mov	$A,$SZ*0($ctx)
    378 	mov	$B,$SZ*1($ctx)
    379 	mov	$C,$SZ*2($ctx)
    380 	mov	$D,$SZ*3($ctx)
    381 	mov	$E,$SZ*4($ctx)
    382 	mov	$F,$SZ*5($ctx)
    383 	mov	$G,$SZ*6($ctx)
    384 	mov	$H,$SZ*7($ctx)
    385 	jb	.Lloop
    386 
    387 	mov	$_rsp,%rsi
    388 .cfi_def_cfa	%rsi,8
    389 	mov	-48(%rsi),%r15
    390 .cfi_restore	%r15
    391 	mov	-40(%rsi),%r14
    392 .cfi_restore	%r14
    393 	mov	-32(%rsi),%r13
    394 .cfi_restore	%r13
    395 	mov	-24(%rsi),%r12
    396 .cfi_restore	%r12
    397 	mov	-16(%rsi),%rbp
    398 .cfi_restore	%rbp
    399 	mov	-8(%rsi),%rbx
    400 .cfi_restore	%rbx
    401 	lea	(%rsi),%rsp
    402 .cfi_def_cfa_register	%rsp
    403 .Lepilogue:
    404 	ret
    405 .cfi_endproc
    406 .size	$func,.-$func
    407 ___
    408 
    409 if ($SZ==4) {
    410 $code.=<<___;
    411 .align	64
    412 .type	$TABLE,\@object
    413 $TABLE:
    414 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    415 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    416 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    417 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    418 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    419 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    420 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    421 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    422 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    423 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    424 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    425 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    426 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    427 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    428 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    429 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    430 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    431 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    432 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    433 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    434 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    435 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    436 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    437 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    438 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    439 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    440 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    441 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    442 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    443 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    444 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    445 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    446 
    447 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    448 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    449 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    450 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    451 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    452 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    453 	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    454 ___
    455 } else {
    456 $code.=<<___;
    457 .align	64
    458 .type	$TABLE,\@object
    459 $TABLE:
    460 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    461 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    462 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    463 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    464 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    465 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    466 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    467 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    468 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    469 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    470 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    471 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    472 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    473 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    474 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    475 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    476 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    477 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    478 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    479 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    480 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    481 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    482 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    483 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    484 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    485 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    486 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    487 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    488 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    489 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    490 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    491 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    492 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    493 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    494 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    495 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    496 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    497 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    498 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    499 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    500 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    501 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    502 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    503 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    504 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    505 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    506 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    507 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    508 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    509 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    510 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    511 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    512 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    513 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    514 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    515 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    516 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    517 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    518 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    519 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    520 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    521 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    522 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    523 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    524 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    525 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    526 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    527 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    528 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    529 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    530 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    531 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    532 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    533 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    534 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    535 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    536 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    537 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    538 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    539 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    540 
    541 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    542 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    543 	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    544 ___
    545 }
    546 
    547 ######################################################################
    548 # SIMD code paths
    549 #
    550 if ($SZ==4 && $shaext) {{{
    551 ######################################################################
    552 # Intel SHA Extensions implementation of SHA256 update function.
    553 #
    554 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
    555 
    556 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
    557 my @MSG=map("%xmm$_",(3..6));
    558 
    559 $code.=<<___;
    560 .type	sha256_block_data_order_shaext,\@function,3
    561 .align	64
    562 sha256_block_data_order_shaext:
    563 _shaext_shortcut:
    564 ___
    565 $code.=<<___ if ($win64);
    566 	lea	`-8-5*16`(%rsp),%rsp
    567 	movaps	%xmm6,-8-5*16(%rax)
    568 	movaps	%xmm7,-8-4*16(%rax)
    569 	movaps	%xmm8,-8-3*16(%rax)
    570 	movaps	%xmm9,-8-2*16(%rax)
    571 	movaps	%xmm10,-8-1*16(%rax)
    572 .Lprologue_shaext:
    573 ___
    574 $code.=<<___;
    575 	lea		K256+0x80(%rip),$Tbl
    576 	movdqu		($ctx),$ABEF		# DCBA
    577 	movdqu		16($ctx),$CDGH		# HGFE
    578 	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
    579 
    580 	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
    581 	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
    582 	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
    583 	movdqa		$TMP,$BSWAP		# offload
    584 	palignr		\$8,$CDGH,$ABEF		# ABEF
    585 	punpcklqdq	$Wi,$CDGH		# CDGH
    586 	jmp		.Loop_shaext
    587 
    588 .align	16
    589 .Loop_shaext:
    590 	movdqu		($inp),@MSG[0]
    591 	movdqu		0x10($inp),@MSG[1]
    592 	movdqu		0x20($inp),@MSG[2]
    593 	pshufb		$TMP,@MSG[0]
    594 	movdqu		0x30($inp),@MSG[3]
    595 
    596 	movdqa		0*32-0x80($Tbl),$Wi
    597 	paddd		@MSG[0],$Wi
    598 	pshufb		$TMP,@MSG[1]
    599 	movdqa		$CDGH,$CDGH_SAVE	# offload
    600 	sha256rnds2	$ABEF,$CDGH		# 0-3
    601 	pshufd		\$0x0e,$Wi,$Wi
    602 	nop
    603 	movdqa		$ABEF,$ABEF_SAVE	# offload
    604 	sha256rnds2	$CDGH,$ABEF
    605 
    606 	movdqa		1*32-0x80($Tbl),$Wi
    607 	paddd		@MSG[1],$Wi
    608 	pshufb		$TMP,@MSG[2]
    609 	sha256rnds2	$ABEF,$CDGH		# 4-7
    610 	pshufd		\$0x0e,$Wi,$Wi
    611 	lea		0x40($inp),$inp
    612 	sha256msg1	@MSG[1],@MSG[0]
    613 	sha256rnds2	$CDGH,$ABEF
    614 
    615 	movdqa		2*32-0x80($Tbl),$Wi
    616 	paddd		@MSG[2],$Wi
    617 	pshufb		$TMP,@MSG[3]
    618 	sha256rnds2	$ABEF,$CDGH		# 8-11
    619 	pshufd		\$0x0e,$Wi,$Wi
    620 	movdqa		@MSG[3],$TMP
    621 	palignr		\$4,@MSG[2],$TMP
    622 	nop
    623 	paddd		$TMP,@MSG[0]
    624 	sha256msg1	@MSG[2],@MSG[1]
    625 	sha256rnds2	$CDGH,$ABEF
    626 
    627 	movdqa		3*32-0x80($Tbl),$Wi
    628 	paddd		@MSG[3],$Wi
    629 	sha256msg2	@MSG[3],@MSG[0]
    630 	sha256rnds2	$ABEF,$CDGH		# 12-15
    631 	pshufd		\$0x0e,$Wi,$Wi
    632 	movdqa		@MSG[0],$TMP
    633 	palignr		\$4,@MSG[3],$TMP
    634 	nop
    635 	paddd		$TMP,@MSG[1]
    636 	sha256msg1	@MSG[3],@MSG[2]
    637 	sha256rnds2	$CDGH,$ABEF
    638 ___
    639 for($i=4;$i<16-3;$i++) {
    640 $code.=<<___;
    641 	movdqa		$i*32-0x80($Tbl),$Wi
    642 	paddd		@MSG[0],$Wi
    643 	sha256msg2	@MSG[0],@MSG[1]
    644 	sha256rnds2	$ABEF,$CDGH		# 16-19...
    645 	pshufd		\$0x0e,$Wi,$Wi
    646 	movdqa		@MSG[1],$TMP
    647 	palignr		\$4,@MSG[0],$TMP
    648 	nop
    649 	paddd		$TMP,@MSG[2]
    650 	sha256msg1	@MSG[0],@MSG[3]
    651 	sha256rnds2	$CDGH,$ABEF
    652 ___
    653 	push(@MSG,shift(@MSG));
    654 }
    655 $code.=<<___;
    656 	movdqa		13*32-0x80($Tbl),$Wi
    657 	paddd		@MSG[0],$Wi
    658 	sha256msg2	@MSG[0],@MSG[1]
    659 	sha256rnds2	$ABEF,$CDGH		# 52-55
    660 	pshufd		\$0x0e,$Wi,$Wi
    661 	movdqa		@MSG[1],$TMP
    662 	palignr		\$4,@MSG[0],$TMP
    663 	sha256rnds2	$CDGH,$ABEF
    664 	paddd		$TMP,@MSG[2]
    665 
    666 	movdqa		14*32-0x80($Tbl),$Wi
    667 	paddd		@MSG[1],$Wi
    668 	sha256rnds2	$ABEF,$CDGH		# 56-59
    669 	pshufd		\$0x0e,$Wi,$Wi
    670 	sha256msg2	@MSG[1],@MSG[2]
    671 	movdqa		$BSWAP,$TMP
    672 	sha256rnds2	$CDGH,$ABEF
    673 
    674 	movdqa		15*32-0x80($Tbl),$Wi
    675 	paddd		@MSG[2],$Wi
    676 	nop
    677 	sha256rnds2	$ABEF,$CDGH		# 60-63
    678 	pshufd		\$0x0e,$Wi,$Wi
    679 	dec		$num
    680 	nop
    681 	sha256rnds2	$CDGH,$ABEF
    682 
    683 	paddd		$CDGH_SAVE,$CDGH
    684 	paddd		$ABEF_SAVE,$ABEF
    685 	jnz		.Loop_shaext
    686 
    687 	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
    688 	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
    689 	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
    690 	punpckhqdq	$CDGH,$ABEF		# DCBA
    691 	palignr		\$8,$TMP,$CDGH		# HGFE
    692 
    693 	movdqu	$ABEF,($ctx)
    694 	movdqu	$CDGH,16($ctx)
    695 ___
    696 $code.=<<___ if ($win64);
    697 	movaps	-8-5*16(%rax),%xmm6
    698 	movaps	-8-4*16(%rax),%xmm7
    699 	movaps	-8-3*16(%rax),%xmm8
    700 	movaps	-8-2*16(%rax),%xmm9
    701 	movaps	-8-1*16(%rax),%xmm10
    702 	mov	%rax,%rsp
    703 .Lepilogue_shaext:
    704 ___
    705 $code.=<<___;
    706 	ret
    707 .size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
    708 ___
    709 }}}
    710 {{{
    711 
    712 my $a4=$T1;
    713 my ($a,$b,$c,$d,$e,$f,$g,$h);
    714 
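# The AUTOLOAD thunk below turns any otherwise-undefined &opcode(...) call
# into a line of assembly appended to $code: the last argument becomes the
# first (AT&T-order) operand and gets a '$' prefix when it is numeric, so
# e.g. &ror($a0,14) appends "ror \$14,%r13" in the SHA-512 flavour.  This is
# what allows the round bodies below to be written as lists of instruction
# strings and interleaved with the SIMD code via eval.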
    715 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    716 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    717   my $arg = pop;
    718     $arg = "\$$arg" if ($arg*1 eq $arg);
    719     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    720 }
    721 
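# body_00_15 is the same scalar round as ROUND_00_15 above, but returned as
# a list of '&insn(...)' strings so the SIMD code paths can eval one scalar
# instruction at a time in between their vector instructions (software
# pipelining of the message schedule against the round function).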
    722 sub body_00_15 () {
    723 	(
    724 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
    725 
    726 	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
    727 	'&mov	($a,$a1)',
    728 	'&mov	($a4,$f)',
    729 
    730 	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
    731 	'&xor	($a0,$e)',
    732 	'&xor	($a4,$g)',			# f^g
    733 
    734 	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
    735 	'&xor	($a1,$a)',
    736 	'&and	($a4,$e)',			# (f^g)&e
    737 
    738 	'&xor	($a0,$e)',
    739 	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
    740 	'&mov	($a2,$a)',
    741 
    742 	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
    743 	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
    744 	'&xor	($a2,$b)',			# a^b, b^c in next round
    745 
    746 	'&add	($h,$a4)',			# h+=Ch(e,f,g)
    747 	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
    748 	'&and	($a3,$a2)',			# (b^c)&(a^b)
    749 
    750 	'&xor	($a1,$a)',
    751 	'&add	($h,$a0)',			# h+=Sigma1(e)
    752 	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
    753 
    754 	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
    755 	'&add	($d,$h)',			# d+=h
    756 	'&add	($h,$a3)',			# h+=Maj(a,b,c)
    757 
    758 	'&mov	($a0,$d)',
    759 	'&add	($a1,$h);'.			# h+=Sigma0(a)
    760 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
    761 	);
    762 }
    763 
    764 ######################################################################
    765 # SSSE3 code path
    766 #
    767 if ($SZ==4) {	# SHA256 only
    768 my @X = map("%xmm$_",(0..3));
    769 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
    770 
    771 $code.=<<___;
    772 .type	${func}_ssse3,\@function,3
    773 .align	64
    774 ${func}_ssse3:
    775 .cfi_startproc
    776 .Lssse3_shortcut:
    777 	mov	%rsp,%rax		# copy %rsp
    778 .cfi_def_cfa_register	%rax
    779 	push	%rbx
    780 .cfi_push	%rbx
    781 	push	%rbp
    782 .cfi_push	%rbp
    783 	push	%r12
    784 .cfi_push	%r12
    785 	push	%r13
    786 .cfi_push	%r13
    787 	push	%r14
    788 .cfi_push	%r14
    789 	push	%r15
    790 .cfi_push	%r15
    791 	shl	\$4,%rdx		# num*16
    792 	sub	\$`$framesz+$win64*16*4`,%rsp
    793 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    794 	and	\$-64,%rsp		# align stack frame
    795 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
    797 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    798 	mov	%rax,$_rsp		# save copy of %rsp
    799 .cfi_cfa_expression	$_rsp,deref,+8
    800 ___
    801 $code.=<<___ if ($win64);
    802 	movaps	%xmm6,16*$SZ+32(%rsp)
    803 	movaps	%xmm7,16*$SZ+48(%rsp)
    804 	movaps	%xmm8,16*$SZ+64(%rsp)
    805 	movaps	%xmm9,16*$SZ+80(%rsp)
    806 ___
    807 $code.=<<___;
    808 .Lprologue_ssse3:
    809 
    810 	mov	$SZ*0($ctx),$A
    811 	mov	$SZ*1($ctx),$B
    812 	mov	$SZ*2($ctx),$C
    813 	mov	$SZ*3($ctx),$D
    814 	mov	$SZ*4($ctx),$E
    815 	mov	$SZ*5($ctx),$F
    816 	mov	$SZ*6($ctx),$G
    817 	mov	$SZ*7($ctx),$H
    818 ___
    819 
    820 $code.=<<___;
    821 	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
    822 	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
    823 	jmp	.Lloop_ssse3
    824 .align	16
    825 .Lloop_ssse3:
    826 	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
    827 	movdqu	0x00($inp),@X[0]
    828 	movdqu	0x10($inp),@X[1]
    829 	movdqu	0x20($inp),@X[2]
    830 	pshufb	$t3,@X[0]
    831 	movdqu	0x30($inp),@X[3]
    832 	lea	$TABLE(%rip),$Tbl
    833 	pshufb	$t3,@X[1]
    834 	movdqa	0x00($Tbl),$t0
    835 	movdqa	0x20($Tbl),$t1
    836 	pshufb	$t3,@X[2]
    837 	paddd	@X[0],$t0
    838 	movdqa	0x40($Tbl),$t2
    839 	pshufb	$t3,@X[3]
    840 	movdqa	0x60($Tbl),$t3
    841 	paddd	@X[1],$t1
    842 	paddd	@X[2],$t2
    843 	paddd	@X[3],$t3
    844 	movdqa	$t0,0x00(%rsp)
    845 	mov	$A,$a1
    846 	movdqa	$t1,0x10(%rsp)
    847 	mov	$B,$a3
    848 	movdqa	$t2,0x20(%rsp)
    849 	xor	$C,$a3			# magic
    850 	movdqa	$t3,0x30(%rsp)
    851 	mov	$E,$a0
    852 	jmp	.Lssse3_00_47
    853 
    854 .align	16
    855 .Lssse3_00_47:
    856 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
    857 ___
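# Xupdate_256_SSSE3 computes four new schedule words at a time:
# X[0..3] += sigma0(X[1..4]) + X[9..12] + sigma1(X[14..17]), with the
# sigma1 term done in two 2-lane halves because its last two inputs,
# X[16..17], are themselves part of the four words being computed.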
    858 sub Xupdate_256_SSSE3 () {
    859 	(
    860 	'&movdqa	($t0,@X[1]);',
    861 	'&movdqa	($t3,@X[3])',
    862 	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
    863 	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
    864 	'&movdqa	($t1,$t0)',
    865 	'&movdqa	($t2,$t0);',
    866 	'&psrld		($t0,$sigma0[2])',
    867 	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
    868 	'&psrld		($t2,$sigma0[0])',
    869 	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
    870 	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
    871 	'&pxor		($t0,$t2)',
    872 	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
    873 	'&pxor		($t0,$t1)',
    874 	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
    875 	'&pxor		($t0,$t2);',
    876 	 '&movdqa	($t2,$t3)',
    877 	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
    878 	 '&psrld	($t3,$sigma1[2])',
    879 	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
    880 	 '&psrlq	($t2,$sigma1[0])',
    881 	 '&pxor		($t3,$t2);',
    882 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    883 	 '&pxor		($t3,$t2)',
    884 	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
    885 	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
    886 	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
    887 	 '&movdqa	($t2,$t3);',
    888 	 '&psrld	($t3,$sigma1[2])',
    889 	 '&psrlq	($t2,$sigma1[0])',
    890 	 '&pxor		($t3,$t2);',
    891 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    892 	 '&pxor		($t3,$t2);',
    893 	'&movdqa	($t2,16*2*$j."($Tbl)")',
    894 	 '&pshufb	($t3,$t5)',
    895 	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
    896 	);
    897 }
    898 
    899 sub SSSE3_256_00_47 () {
    900 my $j = shift;
    901 my $body = shift;
    902 my @X = @_;
    903 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
    904 
    905     if (0) {
    906 	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
    907 	    eval;
    908 	    eval(shift(@insns));
    909 	    eval(shift(@insns));
    910 	    eval(shift(@insns));
    911 	}
    912     } else {			# squeeze extra 4% on Westmere and 19% on Atom
    913 	  eval(shift(@insns));	#@
    914 	&movdqa		($t0,@X[1]);
    915 	  eval(shift(@insns));
    916 	  eval(shift(@insns));
    917 	&movdqa		($t3,@X[3]);
    918 	  eval(shift(@insns));	#@
    919 	  eval(shift(@insns));
    920 	  eval(shift(@insns));
    921 	  eval(shift(@insns));	#@
    922 	  eval(shift(@insns));
    923 	&palignr	($t0,@X[0],$SZ);	# X[1..4]
    924 	  eval(shift(@insns));
    925 	  eval(shift(@insns));
    926 	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
    927 	  eval(shift(@insns));
    928 	  eval(shift(@insns));
    929 	  eval(shift(@insns));
    930 	  eval(shift(@insns));	#@
    931 	&movdqa		($t1,$t0);
    932 	  eval(shift(@insns));
    933 	  eval(shift(@insns));
    934 	&movdqa		($t2,$t0);
    935 	  eval(shift(@insns));	#@
    936 	  eval(shift(@insns));
    937 	&psrld		($t0,$sigma0[2]);
    938 	  eval(shift(@insns));
    939 	  eval(shift(@insns));
    940 	  eval(shift(@insns));
    941 	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
    942 	  eval(shift(@insns));	#@
    943 	  eval(shift(@insns));
    944 	&psrld		($t2,$sigma0[0]);
    945 	  eval(shift(@insns));
    946 	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
    948 	  eval(shift(@insns));
    949 	  eval(shift(@insns));	#@
    950 	&pslld		($t1,8*$SZ-$sigma0[1]);
    951 	  eval(shift(@insns));
    952 	  eval(shift(@insns));
    953 	&pxor		($t0,$t2);
    954 	  eval(shift(@insns));	#@
    955 	  eval(shift(@insns));
    956 	  eval(shift(@insns));
    957 	  eval(shift(@insns));	#@
    958 	&psrld		($t2,$sigma0[1]-$sigma0[0]);
    959 	  eval(shift(@insns));
    960 	&pxor		($t0,$t1);
    961 	  eval(shift(@insns));
    962 	  eval(shift(@insns));
    963 	&pslld		($t1,$sigma0[1]-$sigma0[0]);
    964 	  eval(shift(@insns));
    965 	  eval(shift(@insns));
    966 	&pxor		($t0,$t2);
    967 	  eval(shift(@insns));
    968 	  eval(shift(@insns));	#@
    969 	 &movdqa	($t2,$t3);
    970 	  eval(shift(@insns));
    971 	  eval(shift(@insns));
    972 	&pxor		($t0,$t1);		# sigma0(X[1..4])
    973 	  eval(shift(@insns));	#@
    974 	  eval(shift(@insns));
    975 	  eval(shift(@insns));
    976 	 &psrld		($t3,$sigma1[2]);
    977 	  eval(shift(@insns));
    978 	  eval(shift(@insns));
    979 	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
    980 	  eval(shift(@insns));	#@
    981 	  eval(shift(@insns));
    982 	 &psrlq		($t2,$sigma1[0]);
    983 	  eval(shift(@insns));
    984 	  eval(shift(@insns));
    985 	  eval(shift(@insns));
    986 	 &pxor		($t3,$t2);
    987 	  eval(shift(@insns));	#@
    988 	  eval(shift(@insns));
    989 	  eval(shift(@insns));
    990 	  eval(shift(@insns));	#@
    991 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
    992 	  eval(shift(@insns));
    993 	  eval(shift(@insns));
    994 	 &pxor		($t3,$t2);
    995 	  eval(shift(@insns));	#@
    996 	  eval(shift(@insns));
    997 	  eval(shift(@insns));
    998 	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
    999 	 &pshufd	($t3,$t3,0b10000000);
   1000 	  eval(shift(@insns));
   1001 	  eval(shift(@insns));
   1002 	  eval(shift(@insns));
   1003 	 &psrldq	($t3,8);
   1004 	  eval(shift(@insns));
   1005 	  eval(shift(@insns));	#@
   1006 	  eval(shift(@insns));
   1007 	  eval(shift(@insns));
   1008 	  eval(shift(@insns));	#@
   1009 	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
   1010 	  eval(shift(@insns));
   1011 	  eval(shift(@insns));
   1012 	  eval(shift(@insns));
   1013 	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
   1014 	  eval(shift(@insns));
   1015 	  eval(shift(@insns));	#@
   1016 	  eval(shift(@insns));
   1017 	 &movdqa	($t2,$t3);
   1018 	  eval(shift(@insns));
   1019 	  eval(shift(@insns));
   1020 	 &psrld		($t3,$sigma1[2]);
   1021 	  eval(shift(@insns));
   1022 	  eval(shift(@insns));	#@
   1023 	 &psrlq		($t2,$sigma1[0]);
   1024 	  eval(shift(@insns));
   1025 	  eval(shift(@insns));
   1026 	 &pxor		($t3,$t2);
   1027 	  eval(shift(@insns));	#@
   1028 	  eval(shift(@insns));
   1029 	  eval(shift(@insns));
   1030 	  eval(shift(@insns));	#@
   1031 	  eval(shift(@insns));
   1032 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
   1033 	  eval(shift(@insns));
   1034 	  eval(shift(@insns));
   1035 	  eval(shift(@insns));
   1036 	 &pxor		($t3,$t2);
   1037 	  eval(shift(@insns));
   1038 	  eval(shift(@insns));
   1039 	  eval(shift(@insns));	#@
   1040 	 #&pshufb	($t3,$t5);
   1041 	 &pshufd	($t3,$t3,0b00001000);
   1042 	  eval(shift(@insns));
   1043 	  eval(shift(@insns));
   1044 	&movdqa		($t2,16*2*$j."($Tbl)");
   1045 	  eval(shift(@insns));	#@
   1046 	  eval(shift(@insns));
   1047 	 &pslldq	($t3,8);
   1048 	  eval(shift(@insns));
   1049 	  eval(shift(@insns));
   1050 	  eval(shift(@insns));
   1051 	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
   1052 	  eval(shift(@insns));	#@
   1053 	  eval(shift(@insns));
   1054 	  eval(shift(@insns));
   1055     }
   1056 	&paddd		($t2,@X[0]);
   1057 	  foreach (@insns) { eval; }		# remaining instructions
   1058 	&movdqa		(16*$j."(%rsp)",$t2);
   1059 }
   1060 
   1061     for ($i=0,$j=0; $j<4; $j++) {
   1062 	&SSSE3_256_00_47($j,\&body_00_15,@X);
   1063 	push(@X,shift(@X));			# rotate(@X)
   1064     }
   1065 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1066 	&jne	(".Lssse3_00_47");
   1067 
   1068     for ($i=0; $i<16; ) {
   1069 	foreach(body_00_15()) { eval; }
   1070     }
   1071 $code.=<<___;
   1072 	mov	$_ctx,$ctx
   1073 	mov	$a1,$A
   1074 
   1075 	add	$SZ*0($ctx),$A
   1076 	lea	16*$SZ($inp),$inp
   1077 	add	$SZ*1($ctx),$B
   1078 	add	$SZ*2($ctx),$C
   1079 	add	$SZ*3($ctx),$D
   1080 	add	$SZ*4($ctx),$E
   1081 	add	$SZ*5($ctx),$F
   1082 	add	$SZ*6($ctx),$G
   1083 	add	$SZ*7($ctx),$H
   1084 
   1085 	cmp	$_end,$inp
   1086 
   1087 	mov	$A,$SZ*0($ctx)
   1088 	mov	$B,$SZ*1($ctx)
   1089 	mov	$C,$SZ*2($ctx)
   1090 	mov	$D,$SZ*3($ctx)
   1091 	mov	$E,$SZ*4($ctx)
   1092 	mov	$F,$SZ*5($ctx)
   1093 	mov	$G,$SZ*6($ctx)
   1094 	mov	$H,$SZ*7($ctx)
   1095 	jb	.Lloop_ssse3
   1096 
   1097 	mov	$_rsp,%rsi
   1098 .cfi_def_cfa	%rsi,8
   1099 ___
   1100 $code.=<<___ if ($win64);
   1101 	movaps	16*$SZ+32(%rsp),%xmm6
   1102 	movaps	16*$SZ+48(%rsp),%xmm7
   1103 	movaps	16*$SZ+64(%rsp),%xmm8
   1104 	movaps	16*$SZ+80(%rsp),%xmm9
   1105 ___
   1106 $code.=<<___;
   1107 	mov	-48(%rsi),%r15
   1108 .cfi_restore	%r15
   1109 	mov	-40(%rsi),%r14
   1110 .cfi_restore	%r14
   1111 	mov	-32(%rsi),%r13
   1112 .cfi_restore	%r13
   1113 	mov	-24(%rsi),%r12
   1114 .cfi_restore	%r12
   1115 	mov	-16(%rsi),%rbp
   1116 .cfi_restore	%rbp
   1117 	mov	-8(%rsi),%rbx
   1118 .cfi_restore	%rbx
   1119 	lea	(%rsi),%rsp
   1120 .cfi_def_cfa_register	%rsp
   1121 .Lepilogue_ssse3:
   1122 	ret
   1123 .cfi_endproc
   1124 .size	${func}_ssse3,.-${func}_ssse3
   1125 ___
   1126 }
   1127 
   1128 if ($avx) {{
   1129 ######################################################################
   1130 # AVX+shrd code path
   1131 #
   1132 local *ror = sub { &shrd(@_[0],@_) };
   1133 
   1134 $code.=<<___;
   1135 .type	${func}_avx,\@function,3
   1136 .align	64
   1137 ${func}_avx:
   1138 .cfi_startproc
   1139 .Lavx_shortcut:
   1140 	mov	%rsp,%rax		# copy %rsp
   1141 .cfi_def_cfa_register	%rax
   1142 	push	%rbx
   1143 .cfi_push	%rbx
   1144 	push	%rbp
   1145 .cfi_push	%rbp
   1146 	push	%r12
   1147 .cfi_push	%r12
   1148 	push	%r13
   1149 .cfi_push	%r13
   1150 	push	%r14
   1151 .cfi_push	%r14
   1152 	push	%r15
   1153 .cfi_push	%r15
   1154 	shl	\$4,%rdx		# num*16
   1155 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1156 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1157 	and	\$-64,%rsp		# align stack frame
   1158 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
   1160 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1161 	mov	%rax,$_rsp		# save copy of %rsp
   1162 .cfi_cfa_expression	$_rsp,deref,+8
   1163 ___
   1164 $code.=<<___ if ($win64);
   1165 	movaps	%xmm6,16*$SZ+32(%rsp)
   1166 	movaps	%xmm7,16*$SZ+48(%rsp)
   1167 	movaps	%xmm8,16*$SZ+64(%rsp)
   1168 	movaps	%xmm9,16*$SZ+80(%rsp)
   1169 ___
   1170 $code.=<<___ if ($win64 && $SZ>4);
   1171 	movaps	%xmm10,16*$SZ+96(%rsp)
   1172 	movaps	%xmm11,16*$SZ+112(%rsp)
   1173 ___
   1174 $code.=<<___;
   1175 .Lprologue_avx:
   1176 
   1177 	vzeroupper
   1178 	mov	$SZ*0($ctx),$A
   1179 	mov	$SZ*1($ctx),$B
   1180 	mov	$SZ*2($ctx),$C
   1181 	mov	$SZ*3($ctx),$D
   1182 	mov	$SZ*4($ctx),$E
   1183 	mov	$SZ*5($ctx),$F
   1184 	mov	$SZ*6($ctx),$G
   1185 	mov	$SZ*7($ctx),$H
   1186 ___
   1187 					if ($SZ==4) {	# SHA256
   1188     my @X = map("%xmm$_",(0..3));
   1189     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
   1190 
   1191 $code.=<<___;
   1192 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1193 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1194 	jmp	.Lloop_avx
   1195 .align	16
   1196 .Lloop_avx:
   1197 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1198 	vmovdqu	0x00($inp),@X[0]
   1199 	vmovdqu	0x10($inp),@X[1]
   1200 	vmovdqu	0x20($inp),@X[2]
   1201 	vmovdqu	0x30($inp),@X[3]
   1202 	vpshufb	$t3,@X[0],@X[0]
   1203 	lea	$TABLE(%rip),$Tbl
   1204 	vpshufb	$t3,@X[1],@X[1]
   1205 	vpshufb	$t3,@X[2],@X[2]
   1206 	vpaddd	0x00($Tbl),@X[0],$t0
   1207 	vpshufb	$t3,@X[3],@X[3]
   1208 	vpaddd	0x20($Tbl),@X[1],$t1
   1209 	vpaddd	0x40($Tbl),@X[2],$t2
   1210 	vpaddd	0x60($Tbl),@X[3],$t3
   1211 	vmovdqa	$t0,0x00(%rsp)
   1212 	mov	$A,$a1
   1213 	vmovdqa	$t1,0x10(%rsp)
   1214 	mov	$B,$a3
   1215 	vmovdqa	$t2,0x20(%rsp)
   1216 	xor	$C,$a3			# magic
   1217 	vmovdqa	$t3,0x30(%rsp)
   1218 	mov	$E,$a0
   1219 	jmp	.Lavx_00_47
   1220 
   1221 .align	16
   1222 .Lavx_00_47:
   1223 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1224 ___
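# Xupdate_256_AVX is the same schedule update as Xupdate_256_SSSE3 above,
# rewritten with three-operand AVX instructions (no register copies needed)
# and with the two sigma1 halves merged via vpshufb against the masks kept
# in $t4/$t5.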
   1225 sub Xupdate_256_AVX () {
   1226 	(
   1227 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
   1228 	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
   1229 	'&vpsrld	($t2,$t0,$sigma0[0]);',
   1230 	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
   1231 	'&vpsrld	($t3,$t0,$sigma0[2])',
   1232 	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
   1233 	'&vpxor		($t0,$t3,$t2)',
   1234 	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
   1235 	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1236 	'&vpxor		($t0,$t0,$t1)',
   1237 	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1238 	'&vpxor		($t0,$t0,$t2)',
   1239 	 '&vpsrld	($t2,$t3,$sigma1[2]);',
   1240 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
   1241 	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
   1242 	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
   1243 	 '&vpxor	($t2,$t2,$t3);',
   1244 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1245 	 '&vpxor	($t2,$t2,$t3)',
   1246 	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
   1247 	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
   1248 	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
   1249 	 '&vpsrld	($t2,$t3,$sigma1[2])',
   1250 	 '&vpsrlq	($t3,$t3,$sigma1[0])',
   1251 	 '&vpxor	($t2,$t2,$t3);',
   1252 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1253 	 '&vpxor	($t2,$t2,$t3)',
   1254 	 '&vpshufb	($t2,$t2,$t5)',
   1255 	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
   1256 	);
   1257 }
   1258 
   1259 sub AVX_256_00_47 () {
   1260 my $j = shift;
   1261 my $body = shift;
   1262 my @X = @_;
   1263 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1264 
   1265 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1266 	    eval;
   1267 	    eval(shift(@insns));
   1268 	    eval(shift(@insns));
   1269 	    eval(shift(@insns));
   1270 	}
   1271 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1272 	  foreach (@insns) { eval; }		# remaining instructions
   1273 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1274 }
   1275 
   1276     for ($i=0,$j=0; $j<4; $j++) {
   1277 	&AVX_256_00_47($j,\&body_00_15,@X);
   1278 	push(@X,shift(@X));			# rotate(@X)
   1279     }
   1280 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1281 	&jne	(".Lavx_00_47");
   1282 
   1283     for ($i=0; $i<16; ) {
   1284 	foreach(body_00_15()) { eval; }
   1285     }
   1286 
   1287 					} else {	# SHA512
   1288     my @X = map("%xmm$_",(0..7));
   1289     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1290 
   1291 $code.=<<___;
   1292 	jmp	.Lloop_avx
   1293 .align	16
   1294 .Lloop_avx:
   1295 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1296 	vmovdqu	0x00($inp),@X[0]
   1297 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1298 	vmovdqu	0x10($inp),@X[1]
   1299 	vmovdqu	0x20($inp),@X[2]
   1300 	vpshufb	$t3,@X[0],@X[0]
   1301 	vmovdqu	0x30($inp),@X[3]
   1302 	vpshufb	$t3,@X[1],@X[1]
   1303 	vmovdqu	0x40($inp),@X[4]
   1304 	vpshufb	$t3,@X[2],@X[2]
   1305 	vmovdqu	0x50($inp),@X[5]
   1306 	vpshufb	$t3,@X[3],@X[3]
   1307 	vmovdqu	0x60($inp),@X[6]
   1308 	vpshufb	$t3,@X[4],@X[4]
   1309 	vmovdqu	0x70($inp),@X[7]
   1310 	vpshufb	$t3,@X[5],@X[5]
   1311 	vpaddq	-0x80($Tbl),@X[0],$t0
   1312 	vpshufb	$t3,@X[6],@X[6]
   1313 	vpaddq	-0x60($Tbl),@X[1],$t1
   1314 	vpshufb	$t3,@X[7],@X[7]
   1315 	vpaddq	-0x40($Tbl),@X[2],$t2
   1316 	vpaddq	-0x20($Tbl),@X[3],$t3
   1317 	vmovdqa	$t0,0x00(%rsp)
   1318 	vpaddq	0x00($Tbl),@X[4],$t0
   1319 	vmovdqa	$t1,0x10(%rsp)
   1320 	vpaddq	0x20($Tbl),@X[5],$t1
   1321 	vmovdqa	$t2,0x20(%rsp)
   1322 	vpaddq	0x40($Tbl),@X[6],$t2
   1323 	vmovdqa	$t3,0x30(%rsp)
   1324 	vpaddq	0x60($Tbl),@X[7],$t3
   1325 	vmovdqa	$t0,0x40(%rsp)
   1326 	mov	$A,$a1
   1327 	vmovdqa	$t1,0x50(%rsp)
   1328 	mov	$B,$a3
   1329 	vmovdqa	$t2,0x60(%rsp)
   1330 	xor	$C,$a3			# magic
   1331 	vmovdqa	$t3,0x70(%rsp)
   1332 	mov	$E,$a0
   1333 	jmp	.Lavx_00_47
   1334 
   1335 .align	16
   1336 .Lavx_00_47:
   1337 	add	\$`16*2*$SZ`,$Tbl
   1338 ___
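# Xupdate_512_AVX produces two schedule words per step:
# X[0..1] += sigma0(X[1..2]) + X[9..10] + sigma1(X[14..15]), where X[14..15]
# lives in @X[7].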
   1339 sub Xupdate_512_AVX () {
   1340 	(
   1341 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
   1342 	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
   1343 	'&vpsrlq	($t2,$t0,$sigma0[0])',
   1344 	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
   1345 	'&vpsrlq	($t3,$t0,$sigma0[2])',
   1346 	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
   1347 	 '&vpxor	($t0,$t3,$t2)',
   1348 	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1349 	 '&vpxor	($t0,$t0,$t1)',
   1350 	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1351 	 '&vpxor	($t0,$t0,$t2)',
   1352 	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
   1353 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
   1354 	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
   1355 	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
   1356 	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
   1357 	 '&vpxor	($t3,$t3,$t2)',
   1358 	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
   1359 	 '&vpxor	($t3,$t3,$t1)',
   1360 	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
   1361 	 '&vpxor	($t3,$t3,$t2)',
   1362 	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
   1363 	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
   1364 	);
   1365 }
   1366 
   1367 sub AVX_512_00_47 () {
   1368 my $j = shift;
   1369 my $body = shift;
   1370 my @X = @_;
   1371 my @insns = (&$body,&$body);			# 52 instructions
   1372 
   1373 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   1374 	    eval;
   1375 	    eval(shift(@insns));
   1376 	    eval(shift(@insns));
   1377 	}
   1378 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1379 	  foreach (@insns) { eval; }		# remaining instructions
   1380 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1381 }
   1382 
   1383     for ($i=0,$j=0; $j<8; $j++) {
   1384 	&AVX_512_00_47($j,\&body_00_15,@X);
   1385 	push(@X,shift(@X));			# rotate(@X)
   1386     }
   1387 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1388 	&jne	(".Lavx_00_47");
   1389 
   1390     for ($i=0; $i<16; ) {
   1391 	foreach(body_00_15()) { eval; }
   1392     }
   1393 }
   1394 $code.=<<___;
   1395 	mov	$_ctx,$ctx
   1396 	mov	$a1,$A
   1397 
   1398 	add	$SZ*0($ctx),$A
   1399 	lea	16*$SZ($inp),$inp
   1400 	add	$SZ*1($ctx),$B
   1401 	add	$SZ*2($ctx),$C
   1402 	add	$SZ*3($ctx),$D
   1403 	add	$SZ*4($ctx),$E
   1404 	add	$SZ*5($ctx),$F
   1405 	add	$SZ*6($ctx),$G
   1406 	add	$SZ*7($ctx),$H
   1407 
   1408 	cmp	$_end,$inp
   1409 
   1410 	mov	$A,$SZ*0($ctx)
   1411 	mov	$B,$SZ*1($ctx)
   1412 	mov	$C,$SZ*2($ctx)
   1413 	mov	$D,$SZ*3($ctx)
   1414 	mov	$E,$SZ*4($ctx)
   1415 	mov	$F,$SZ*5($ctx)
   1416 	mov	$G,$SZ*6($ctx)
   1417 	mov	$H,$SZ*7($ctx)
   1418 	jb	.Lloop_avx
   1419 
   1420 	mov	$_rsp,%rsi
   1421 .cfi_def_cfa	%rsi,8
   1422 	vzeroupper
   1423 ___
   1424 $code.=<<___ if ($win64);
   1425 	movaps	16*$SZ+32(%rsp),%xmm6
   1426 	movaps	16*$SZ+48(%rsp),%xmm7
   1427 	movaps	16*$SZ+64(%rsp),%xmm8
   1428 	movaps	16*$SZ+80(%rsp),%xmm9
   1429 ___
   1430 $code.=<<___ if ($win64 && $SZ>4);
   1431 	movaps	16*$SZ+96(%rsp),%xmm10
   1432 	movaps	16*$SZ+112(%rsp),%xmm11
   1433 ___
   1434 $code.=<<___;
   1435 	mov	-48(%rsi),%r15
   1436 .cfi_restore	%r15
   1437 	mov	-40(%rsi),%r14
   1438 .cfi_restore	%r14
   1439 	mov	-32(%rsi),%r13
   1440 .cfi_restore	%r13
   1441 	mov	-24(%rsi),%r12
   1442 .cfi_restore	%r12
   1443 	mov	-16(%rsi),%rbp
   1444 .cfi_restore	%rbp
   1445 	mov	-8(%rsi),%rbx
   1446 .cfi_restore	%rbx
   1447 	lea	(%rsi),%rsp
   1448 .cfi_def_cfa_register	%rsp
   1449 .Lepilogue_avx:
   1450 	ret
   1451 .cfi_endproc
   1452 .size	${func}_avx,.-${func}_avx
   1453 ___
   1454 
   1455 if ($avx>1) {{
   1456 ######################################################################
   1457 # AVX2+BMI code path
   1458 #
   1459 my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
   1460 my $PUSH8=8*2*$SZ;
   1461 use integer;
   1462 
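# bodyx_00_15 is the BMI/AVX2 variant of body_00_15: rorx provides
# non-destructive rotates, andn yields ~e&g directly, and Ch(e,f,g) is
# accumulated as (e&f)+(~e&g), which is equivalent because the two terms
# have no bits in common.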
   1463 sub bodyx_00_15 () {
   1464 	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
   1465 	(
   1466 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
   1467 
   1468 	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
   1469 	'&and	($a4,$e)',		# f&e
   1470 	'&rorx	($a0,$e,$Sigma1[2])',
   1471 	'&rorx	($a2,$e,$Sigma1[1])',
   1472 
   1473 	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
   1474 	'&lea	($h,"($h,$a4)")',
   1475 	'&andn	($a4,$e,$g)',		# ~e&g
   1476 	'&xor	($a0,$a2)',
   1477 
   1478 	'&rorx	($a1,$e,$Sigma1[0])',
   1479 	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
   1480 	'&xor	($a0,$a1)',		# Sigma1(e)
   1481 	'&mov	($a2,$a)',
   1482 
   1483 	'&rorx	($a4,$a,$Sigma0[2])',
   1484 	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
   1485 	'&xor	($a2,$b)',		# a^b, b^c in next round
   1486 	'&rorx	($a1,$a,$Sigma0[1])',
   1487 
   1488 	'&rorx	($a0,$a,$Sigma0[0])',
   1489 	'&lea	($d,"($d,$h)")',	# d+=h
   1490 	'&and	($a3,$a2)',		# (b^c)&(a^b)
   1491 	'&xor	($a1,$a4)',
   1492 
   1493 	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
   1494 	'&xor	($a1,$a0)',		# Sigma0(a)
   1495 	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
   1496 	'&mov	($a4,$e)',		# copy of f in future
   1497 
   1498 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
   1499 	);
   1500 	# and at the finish one has to $a+=$a1
   1501 }
   1502 
   1503 $code.=<<___;
   1504 .type	${func}_avx2,\@function,3
   1505 .align	64
   1506 ${func}_avx2:
   1507 .cfi_startproc
   1508 .Lavx2_shortcut:
   1509 	mov	%rsp,%rax		# copy %rsp
   1510 .cfi_def_cfa_register	%rax
   1511 	push	%rbx
   1512 .cfi_push	%rbx
   1513 	push	%rbp
   1514 .cfi_push	%rbp
   1515 	push	%r12
   1516 .cfi_push	%r12
   1517 	push	%r13
   1518 .cfi_push	%r13
   1519 	push	%r14
   1520 .cfi_push	%r14
   1521 	push	%r15
   1522 .cfi_push	%r15
   1523 	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
   1524 	shl	\$4,%rdx		# num*16
   1525 	and	\$-256*$SZ,%rsp		# align stack frame
   1526 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1527 	add	\$`2*$SZ*($rounds-8)`,%rsp
   1528 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
   1530 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1531 	mov	%rax,$_rsp		# save copy of %rsp
   1532 .cfi_cfa_expression	$_rsp,deref,+8
   1533 ___
   1534 $code.=<<___ if ($win64);
   1535 	movaps	%xmm6,16*$SZ+32(%rsp)
   1536 	movaps	%xmm7,16*$SZ+48(%rsp)
   1537 	movaps	%xmm8,16*$SZ+64(%rsp)
   1538 	movaps	%xmm9,16*$SZ+80(%rsp)
   1539 ___
   1540 $code.=<<___ if ($win64 && $SZ>4);
   1541 	movaps	%xmm10,16*$SZ+96(%rsp)
   1542 	movaps	%xmm11,16*$SZ+112(%rsp)
   1543 ___
   1544 $code.=<<___;
   1545 .Lprologue_avx2:
   1546 
   1547 	vzeroupper
   1548 	sub	\$-16*$SZ,$inp		# inp++, size optimization
   1549 	mov	$SZ*0($ctx),$A
   1550 	mov	$inp,%r12		# borrow $T1
   1551 	mov	$SZ*1($ctx),$B
   1552 	cmp	%rdx,$inp		# $_end
   1553 	mov	$SZ*2($ctx),$C
   1554 	cmove	%rsp,%r12		# next block or random data
   1555 	mov	$SZ*3($ctx),$D
   1556 	mov	$SZ*4($ctx),$E
   1557 	mov	$SZ*5($ctx),$F
   1558 	mov	$SZ*6($ctx),$G
   1559 	mov	$SZ*7($ctx),$H
   1560 ___
   1561 					if ($SZ==4) {	# SHA256
   1562     my @X = map("%ymm$_",(0..3));
   1563     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
   1564 
   1565 $code.=<<___;
   1566 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1567 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1568 	jmp	.Loop_avx2
   1569 .align	16
   1570 .Loop_avx2:
   1571 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1572 	vmovdqu	-16*$SZ+0($inp),%xmm0
   1573 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1574 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1575 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1576 	#mov		$inp,$_inp	# offload $inp
   1577 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1578 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1579 	vpshufb		$t3,@X[0],@X[0]
   1580 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1581 	vpshufb		$t3,@X[1],@X[1]
   1582 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1583 
   1584 	lea	$TABLE(%rip),$Tbl
   1585 	vpshufb	$t3,@X[2],@X[2]
   1586 	vpaddd	0x00($Tbl),@X[0],$t0
   1587 	vpshufb	$t3,@X[3],@X[3]
   1588 	vpaddd	0x20($Tbl),@X[1],$t1
   1589 	vpaddd	0x40($Tbl),@X[2],$t2
   1590 	vpaddd	0x60($Tbl),@X[3],$t3
   1591 	vmovdqa	$t0,0x00(%rsp)
   1592 	xor	$a1,$a1
   1593 	vmovdqa	$t1,0x20(%rsp)
   1594 	lea	-$PUSH8(%rsp),%rsp
   1595 	mov	$B,$a3
   1596 	vmovdqa	$t2,0x00(%rsp)
   1597 	xor	$C,$a3			# magic
   1598 	vmovdqa	$t3,0x20(%rsp)
   1599 	mov	$F,$a4
   1600 	sub	\$-16*2*$SZ,$Tbl	# size optimization
   1601 	jmp	.Lavx2_00_47
   1602 
   1603 .align	16
   1604 .Lavx2_00_47:
   1605 ___
   1606 
   1607 sub AVX2_256_00_47 () {
   1608 my $j = shift;
   1609 my $body = shift;
   1610 my @X = @_;
   1611 my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
   1612 my $base = "+2*$PUSH8(%rsp)";
   1613 
   1614 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
   1615 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1616 	    eval;
   1617 	    eval(shift(@insns));
   1618 	    eval(shift(@insns));
   1619 	    eval(shift(@insns));
   1620 	}
   1621 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1622 	  foreach (@insns) { eval; }		# remaining instructions
   1623 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   1624 }
   1625 
   1626     for ($i=0,$j=0; $j<4; $j++) {
   1627 	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
   1628 	push(@X,shift(@X));			# rotate(@X)
   1629     }
   1630 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   1631 	&cmpb	(($SZ-1)."($Tbl)",0);
   1632 	&jne	(".Lavx2_00_47");
   1633 
   1634     for ($i=0; $i<16; ) {
   1635 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   1636 	foreach(bodyx_00_15()) { eval; }
   1637     }
   1638 					} else {	# SHA512
   1639     my @X = map("%ymm$_",(0..7));
   1640     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
   1641 
   1642 $code.=<<___;
   1643 	jmp	.Loop_avx2
   1644 .align	16
   1645 .Loop_avx2:
   1646 	vmovdqu	-16*$SZ($inp),%xmm0
   1647 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1648 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1649 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1650 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1651 	vmovdqu	-16*$SZ+64($inp),%xmm4
   1652 	vmovdqu	-16*$SZ+80($inp),%xmm5
   1653 	vmovdqu	-16*$SZ+96($inp),%xmm6
   1654 	vmovdqu	-16*$SZ+112($inp),%xmm7
   1655 	#mov	$inp,$_inp	# offload $inp
   1656 	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
   1657 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1658 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1659 	 vpshufb	$t2,@X[0],@X[0]
   1660 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1661 	 vpshufb	$t2,@X[1],@X[1]
   1662 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1663 	 vpshufb	$t2,@X[2],@X[2]
   1664 	vinserti128	\$1,64(%r12),@X[4],@X[4]
   1665 	 vpshufb	$t2,@X[3],@X[3]
   1666 	vinserti128	\$1,80(%r12),@X[5],@X[5]
   1667 	 vpshufb	$t2,@X[4],@X[4]
   1668 	vinserti128	\$1,96(%r12),@X[6],@X[6]
   1669 	 vpshufb	$t2,@X[5],@X[5]
   1670 	vinserti128	\$1,112(%r12),@X[7],@X[7]
   1671 
   1672 	vpaddq	-0x80($Tbl),@X[0],$t0
   1673 	vpshufb	$t2,@X[6],@X[6]
   1674 	vpaddq	-0x60($Tbl),@X[1],$t1
   1675 	vpshufb	$t2,@X[7],@X[7]
   1676 	vpaddq	-0x40($Tbl),@X[2],$t2
   1677 	vpaddq	-0x20($Tbl),@X[3],$t3
   1678 	vmovdqa	$t0,0x00(%rsp)
   1679 	vpaddq	0x00($Tbl),@X[4],$t0
   1680 	vmovdqa	$t1,0x20(%rsp)
   1681 	vpaddq	0x20($Tbl),@X[5],$t1
   1682 	vmovdqa	$t2,0x40(%rsp)
   1683 	vpaddq	0x40($Tbl),@X[6],$t2
   1684 	vmovdqa	$t3,0x60(%rsp)
   1685 	lea	-$PUSH8(%rsp),%rsp
   1686 	vpaddq	0x60($Tbl),@X[7],$t3
   1687 	vmovdqa	$t0,0x00(%rsp)
   1688 	xor	$a1,$a1
   1689 	vmovdqa	$t1,0x20(%rsp)
   1690 	mov	$B,$a3
   1691 	vmovdqa	$t2,0x40(%rsp)
   1692 	xor	$C,$a3			# magic
   1693 	vmovdqa	$t3,0x60(%rsp)
   1694 	mov	$F,$a4
   1695 	add	\$16*2*$SZ,$Tbl
   1696 	jmp	.Lavx2_00_47
   1697 
   1698 .align	16
   1699 .Lavx2_00_47:
   1700 ___
   1701 
   1702 sub AVX2_512_00_47 {
   1703 my $j = shift;
   1704 my $body = shift;
   1705 my @X = @_;
   1706 my @insns = (&$body,&$body);			# 48 instructions
   1707 my $base = "+2*$PUSH8(%rsp)";
   1708 
   1709 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
   1710 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   1711 	    eval;
   1712 	    if ($_ !~ /\;$/) {
   1713 		eval(shift(@insns));
   1714 		eval(shift(@insns));
   1715 		eval(shift(@insns));
   1716 	    }
   1717 	}
   1718 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1719 	  foreach (@insns) { eval; }		# remaining instructions
   1720 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   1721 }
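        # The SHA-512 flavour follows the same scheme, but interleaves only
        # two round bodies (48 scalar instructions) with each 23-instruction
        # Xupdate_512_AVX() group, uses vpaddq instead of vpaddd, and biases
        # the $Tbl offsets by -0x80 (see the "size optimization" lea above).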
   1722 
   1723     for ($i=0,$j=0; $j<8; $j++) {
   1724 	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
   1725 	push(@X,shift(@X));			# rotate(@X)
   1726     }
   1727 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   1728 	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
   1729 	&jne	(".Lavx2_00_47");
   1730 
   1731     for ($i=0; $i<16; ) {
   1732 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   1733 	foreach(bodyx_00_15()) { eval; }
   1734     }
   1735 }
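        # Common tail of one two-block iteration: reload $ctx, fold the
        # accumulated $a1 into $A, add the previous hash state (feed-forward),
        # store the updated digest, then either finish (.Ldone_avx2) or run
        # the second block's rounds from the stashed X+K values (.Lower_avx2).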
   1736 $code.=<<___;
   1737 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   1738 	add	$a1,$A
   1739 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   1740 	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
   1741 
   1742 	add	$SZ*0($ctx),$A
   1743 	add	$SZ*1($ctx),$B
   1744 	add	$SZ*2($ctx),$C
   1745 	add	$SZ*3($ctx),$D
   1746 	add	$SZ*4($ctx),$E
   1747 	add	$SZ*5($ctx),$F
   1748 	add	$SZ*6($ctx),$G
   1749 	add	$SZ*7($ctx),$H
   1750 
   1751 	mov	$A,$SZ*0($ctx)
   1752 	mov	$B,$SZ*1($ctx)
   1753 	mov	$C,$SZ*2($ctx)
   1754 	mov	$D,$SZ*3($ctx)
   1755 	mov	$E,$SZ*4($ctx)
   1756 	mov	$F,$SZ*5($ctx)
   1757 	mov	$G,$SZ*6($ctx)
   1758 	mov	$H,$SZ*7($ctx)
   1759 
   1760 	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
   1761 	je	.Ldone_avx2
   1762 
   1763 	xor	$a1,$a1
   1764 	mov	$B,$a3
   1765 	xor	$C,$a3			# magic
   1766 	mov	$F,$a4
   1767 	jmp	.Lower_avx2
   1768 .align	16
   1769 .Lower_avx2:
   1770 ___
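        # .Lower_avx2 processes the second block of the pair: bodyx_00_15 is
        # replayed against the X+K values already stored on the stack, reading
        # the upper 128-bit lane of each 32-byte row ("+16($Tbl)") and walking
        # $Tbl down by $PUSH8 every eight rounds until it reaches %rsp.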
   1771     for ($i=0; $i<8; ) {
   1772 	my $base="+16($Tbl)";
   1773 	foreach(bodyx_00_15()) { eval; }
   1774     }
   1775 $code.=<<___;
   1776 	lea	-$PUSH8($Tbl),$Tbl
   1777 	cmp	%rsp,$Tbl
   1778 	jae	.Lower_avx2
   1779 
   1780 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   1781 	add	$a1,$A
   1782 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   1783 	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
   1784 
   1785 	add	$SZ*0($ctx),$A
   1786 	add	$SZ*1($ctx),$B
   1787 	add	$SZ*2($ctx),$C
   1788 	add	$SZ*3($ctx),$D
   1789 	add	$SZ*4($ctx),$E
   1790 	add	$SZ*5($ctx),$F
   1791 	lea	`2*16*$SZ`($inp),$inp	# inp+=2
   1792 	add	$SZ*6($ctx),$G
   1793 	mov	$inp,%r12
   1794 	add	$SZ*7($ctx),$H
   1795 	cmp	$_end,$inp
   1796 
   1797 	mov	$A,$SZ*0($ctx)
   1798 	cmove	%rsp,%r12		# next block or stale data
   1799 	mov	$B,$SZ*1($ctx)
   1800 	mov	$C,$SZ*2($ctx)
   1801 	mov	$D,$SZ*3($ctx)
   1802 	mov	$E,$SZ*4($ctx)
   1803 	mov	$F,$SZ*5($ctx)
   1804 	mov	$G,$SZ*6($ctx)
   1805 	mov	$H,$SZ*7($ctx)
   1806 
   1807 	jbe	.Loop_avx2
   1808 	lea	(%rsp),$Tbl
   1809 
   1810 .Ldone_avx2:
   1811 	lea	($Tbl),%rsp
   1812 	mov	$_rsp,%rsi
   1813 .cfi_def_cfa	%rsi,8
   1814 	vzeroupper
   1815 ___
   1816 $code.=<<___ if ($win64);
   1817 	movaps	16*$SZ+32(%rsp),%xmm6
   1818 	movaps	16*$SZ+48(%rsp),%xmm7
   1819 	movaps	16*$SZ+64(%rsp),%xmm8
   1820 	movaps	16*$SZ+80(%rsp),%xmm9
   1821 ___
   1822 $code.=<<___ if ($win64 && $SZ>4);
   1823 	movaps	16*$SZ+96(%rsp),%xmm10
   1824 	movaps	16*$SZ+112(%rsp),%xmm11
   1825 ___
   1826 $code.=<<___;
   1827 	mov	-48(%rsi),%r15
   1828 .cfi_restore	%r15
   1829 	mov	-40(%rsi),%r14
   1830 .cfi_restore	%r14
   1831 	mov	-32(%rsi),%r13
   1832 .cfi_restore	%r13
   1833 	mov	-24(%rsi),%r12
   1834 .cfi_restore	%r12
   1835 	mov	-16(%rsi),%rbp
   1836 .cfi_restore	%rbp
   1837 	mov	-8(%rsi),%rbx
   1838 .cfi_restore	%rbx
   1839 	lea	(%rsi),%rsp
   1840 .cfi_def_cfa_register	%rsp
   1841 .Lepilogue_avx2:
   1842 	ret
   1843 .cfi_endproc
   1844 .size	${func}_avx2,.-${func}_avx2
   1845 ___
   1846 }}
   1847 }}}}}
   1848 
   1849 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1850 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
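        # se_handler is wired up through the .pdata/.xdata entries emitted
        # below; its four arguments arrive in %rcx, %rdx, %r8 and %r9 per the
        # Win64 calling convention. It recovers the caller's %rsp from the
        # $_rsp save slot, restores the non-volatile GP registers (and, for
        # the SIMD paths past .Lepilogue, xmm6 and up) into *context, and
        # leaves the remaining unwind work to RtlVirtualUnwind.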
   1851 if ($win64) {
   1852 $rec="%rcx";
   1853 $frame="%rdx";
   1854 $context="%r8";
   1855 $disp="%r9";
   1856 
   1857 $code.=<<___;
   1858 .extern	__imp_RtlVirtualUnwind
   1859 .type	se_handler,\@abi-omnipotent
   1860 .align	16
   1861 se_handler:
   1862 	push	%rsi
   1863 	push	%rdi
   1864 	push	%rbx
   1865 	push	%rbp
   1866 	push	%r12
   1867 	push	%r13
   1868 	push	%r14
   1869 	push	%r15
   1870 	pushfq
   1871 	sub	\$64,%rsp
   1872 
   1873 	mov	120($context),%rax	# pull context->Rax
   1874 	mov	248($context),%rbx	# pull context->Rip
   1875 
   1876 	mov	8($disp),%rsi		# disp->ImageBase
   1877 	mov	56($disp),%r11		# disp->HandlerData
   1878 
   1879 	mov	0(%r11),%r10d		# HandlerData[0]
   1880 	lea	(%rsi,%r10),%r10	# prologue label
   1881 	cmp	%r10,%rbx		# context->Rip<prologue label
   1882 	jb	.Lin_prologue
   1883 
   1884 	mov	152($context),%rax	# pull context->Rsp
   1885 
   1886 	mov	4(%r11),%r10d		# HandlerData[1]
   1887 	lea	(%rsi,%r10),%r10	# epilogue label
   1888 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1889 	jae	.Lin_prologue
   1890 ___
   1891 $code.=<<___ if ($avx>1);
   1892 	lea	.Lavx2_shortcut(%rip),%r10
   1893 	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
   1894 	jb	.Lnot_in_avx2
   1895 
   1896 	and	\$-256*$SZ,%rax
   1897 	add	\$`2*$SZ*($rounds-8)`,%rax
   1898 .Lnot_in_avx2:
   1899 ___
   1900 $code.=<<___;
   1901 	mov	%rax,%rsi		# put aside Rsp
   1902 	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
   1903 
   1904 	mov	-8(%rax),%rbx
   1905 	mov	-16(%rax),%rbp
   1906 	mov	-24(%rax),%r12
   1907 	mov	-32(%rax),%r13
   1908 	mov	-40(%rax),%r14
   1909 	mov	-48(%rax),%r15
   1910 	mov	%rbx,144($context)	# restore context->Rbx
   1911 	mov	%rbp,160($context)	# restore context->Rbp
   1912 	mov	%r12,216($context)	# restore context->R12
   1913 	mov	%r13,224($context)	# restore context->R13
   1914 	mov	%r14,232($context)	# restore context->R14
   1915 	mov	%r15,240($context)	# restore context->R15
   1916 
   1917 	lea	.Lepilogue(%rip),%r10
   1918 	cmp	%r10,%rbx
   1919 	jb	.Lin_prologue		# non-AVX code
   1920 
   1921 	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6 (and up) save area
   1922 	lea	512($context),%rdi	# &context.Xmm6
   1923 	mov	\$`$SZ==4?8:12`,%ecx
   1924 	.long	0xa548f3fc		# cld; rep movsq
   1925 
   1926 .Lin_prologue:
   1927 	mov	8(%rax),%rdi
   1928 	mov	16(%rax),%rsi
   1929 	mov	%rax,152($context)	# restore context->Rsp
   1930 	mov	%rsi,168($context)	# restore context->Rsi
   1931 	mov	%rdi,176($context)	# restore context->Rdi
   1932 
   1933 	mov	40($disp),%rdi		# disp->ContextRecord
   1934 	mov	$context,%rsi		# context
   1935 	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
   1936 	.long	0xa548f3fc		# cld; rep movsq
   1937 
   1938 	mov	$disp,%rsi
   1939 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1940 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1941 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1942 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1943 	mov	40(%rsi),%r10		# disp->ContextRecord
   1944 	lea	56(%rsi),%r11		# &disp->HandlerData
   1945 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1946 	mov	%r10,32(%rsp)		# arg5
   1947 	mov	%r11,40(%rsp)		# arg6
   1948 	mov	%r12,48(%rsp)		# arg7
   1949 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1950 	call	*__imp_RtlVirtualUnwind(%rip)
   1951 
   1952 	mov	\$1,%eax		# ExceptionContinueSearch
   1953 	add	\$64,%rsp
   1954 	popfq
   1955 	pop	%r15
   1956 	pop	%r14
   1957 	pop	%r13
   1958 	pop	%r12
   1959 	pop	%rbp
   1960 	pop	%rbx
   1961 	pop	%rdi
   1962 	pop	%rsi
   1963 	ret
   1964 .size	se_handler,.-se_handler
   1965 ___
   1966 
   1967 $code.=<<___ if ($SZ==4 && $shaext);
   1968 .type	shaext_handler,\@abi-omnipotent
   1969 .align	16
   1970 shaext_handler:
   1971 	push	%rsi
   1972 	push	%rdi
   1973 	push	%rbx
   1974 	push	%rbp
   1975 	push	%r12
   1976 	push	%r13
   1977 	push	%r14
   1978 	push	%r15
   1979 	pushfq
   1980 	sub	\$64,%rsp
   1981 
   1982 	mov	120($context),%rax	# pull context->Rax
   1983 	mov	248($context),%rbx	# pull context->Rip
   1984 
   1985 	lea	.Lprologue_shaext(%rip),%r10
   1986 	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
   1987 	jb	.Lin_prologue
   1988 
   1989 	lea	.Lepilogue_shaext(%rip),%r10
   1990 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
   1991 	jae	.Lin_prologue
   1992 
   1993 	lea	-8-5*16(%rax),%rsi
   1994 	lea	512($context),%rdi	# &context.Xmm6
   1995 	mov	\$10,%ecx
   1996 	.long	0xa548f3fc		# cld; rep movsq
   1997 
   1998 	jmp	.Lin_prologue
   1999 .size	shaext_handler,.-shaext_handler
   2000 ___
   2001 
   2002 $code.=<<___;
   2003 .section	.pdata
   2004 .align	4
   2005 	.rva	.LSEH_begin_$func
   2006 	.rva	.LSEH_end_$func
   2007 	.rva	.LSEH_info_$func
   2008 ___
   2009 $code.=<<___ if ($SZ==4 && $shaext);
   2010 	.rva	.LSEH_begin_${func}_shaext
   2011 	.rva	.LSEH_end_${func}_shaext
   2012 	.rva	.LSEH_info_${func}_shaext
   2013 ___
   2014 $code.=<<___ if ($SZ==4);
   2015 	.rva	.LSEH_begin_${func}_ssse3
   2016 	.rva	.LSEH_end_${func}_ssse3
   2017 	.rva	.LSEH_info_${func}_ssse3
   2018 ___
   2019 $code.=<<___ if ($avx);
   2020 	.rva	.LSEH_begin_${func}_avx
   2021 	.rva	.LSEH_end_${func}_avx
   2022 	.rva	.LSEH_info_${func}_avx
   2023 ___
   2024 $code.=<<___ if ($avx>1);
   2025 	.rva	.LSEH_begin_${func}_avx2
   2026 	.rva	.LSEH_end_${func}_avx2
   2027 	.rva	.LSEH_info_${func}_avx2
   2028 ___
   2029 $code.=<<___;
   2030 .section	.xdata
   2031 .align	8
   2032 .LSEH_info_$func:
   2033 	.byte	9,0,0,0
   2034 	.rva	se_handler
   2035 	.rva	.Lprologue,.Lepilogue			# HandlerData[]
   2036 ___
   2037 $code.=<<___ if ($SZ==4 && $shaext);
   2038 .LSEH_info_${func}_shaext:
   2039 	.byte	9,0,0,0
   2040 	.rva	shaext_handler
   2041 ___
   2042 $code.=<<___ if ($SZ==4);
   2043 .LSEH_info_${func}_ssse3:
   2044 	.byte	9,0,0,0
   2045 	.rva	se_handler
   2046 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   2047 ___
   2048 $code.=<<___ if ($avx);
   2049 .LSEH_info_${func}_avx:
   2050 	.byte	9,0,0,0
   2051 	.rva	se_handler
   2052 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   2053 ___
   2054 $code.=<<___ if ($avx>1);
   2055 .LSEH_info_${func}_avx2:
   2056 	.byte	9,0,0,0
   2057 	.rva	se_handler
   2058 	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
   2059 ___
   2060 }
   2061 
   2062 sub sha256op38 {
   2063     my $instr = shift;
   2064     my %opcodelet = (
   2065 		"sha256rnds2" => 0xcb,
   2066   		"sha256msg1"  => 0xcc,
   2067 		"sha256msg2"  => 0xcd	);
   2068 
   2069     if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
   2070 	my @opcode=(0x0f,0x38);
   2071 	push @opcode,$opcodelet{$instr};
   2072 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2073 	return ".byte\t".join(',',@opcode);
   2074     } else {
   2075 	return $instr."\t".$_[0];
   2076     }
   2077 }
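        # sha256op38() hand-assembles the SHA extension instructions so the
        # module still builds with assemblers that predate them. For example,
        # the substitution pass below would turn "sha256rnds2 %xmm4,%xmm1"
        # into ".byte 0x0f,0x38,0xcb,0xcc" (opcode 0xcb, ModR/M = 0xc0|4|1<<3).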
   2078 
   2079 foreach (split("\n",$code)) {
   2080 	s/\`([^\`]*)\`/eval $1/geo;
   2081 
   2082 	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
   2083 
   2084 	print $_,"\n";
   2085 }
   2086 close STDOUT;
   2087