      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. Rights for redistribution and usage in source and binary
      6 # forms are granted according to the OpenSSL license.
      7 # ====================================================================
      8 #
      9 # sha256/512_block procedure for x86_64.
     10 #
     11 # 40% improvement over compiler-generated code on Opteron. On EM64T
     12 # sha256 was observed to run >80% faster and sha512 - >40%. No magical
     13 # tricks, just straight implementation... I really wonder why gcc
     14 # [being armed with inline assembler] fails to generate equally fast
     15 # code. The only cool thing about this module is that it is the very
     16 # same instruction sequence that is used for both SHA-256 and SHA-512.
     17 # In the former case the instructions operate on 32-bit operands, in
     18 # the latter on 64-bit ones. All I had to do was get one flavor right,
     19 # the other one passed the test right away:-)
     20 #
     21 # sha256_block runs in ~1005 cycles on Opteron, which gives you
     22 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
     23 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
     24 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
     25 # Well, if you compare it to the IA-64 implementation, which maintains
     26 # X[16] in the register bank[!], tends to 4 instructions per CPU clock
     27 # cycle and runs in 1003 cycles, 1275 is a very good result for the
     28 # 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
     29 # there is a way to improve it, *then* the only way would be to try to
     30 # offload X[16] updates to the SSE unit, but that would require "deeper"
     31 # loop unrolling, which in turn would naturally cause size blow-up, not
     32 # to mention increased complexity! And once again, that is only *if*
     33 # it's actually possible to noticeably improve overall ILP, instruction-
     34 # level parallelism, on the CPU implementation in question.
     35 #
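# For reference, the cycles-per-block figures above convert to throughput
# as block_size*clock/cycles; e.g. at a hypothetical 3GHz clock the 1275
# cycles of sha512_block amount to 128*3000/1275 ~= 301MB/s, i.e. the
# ~100MBps per GHz quoted above. A throw-away one-liner for the per-GHz
# figure:
#
#	perl -e 'printf "%.1f MBps/GHz\n", 128*1000/1275'
#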
     36 # Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
     37 # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
     38 # [currently available] EM64T CPUs apparently are far from it. On the
     39 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
     40 # sha256_block:-( This is presumably because 64-bit shifts/rotates
     41 # apparently are not atomic instructions, but implemented in microcode.
     42 #
     43 # May 2012.
     44 #
     45 # Optimization including one of Pavel Semjanov's ideas, an alternative
     46 # Maj, resulted in >=5% improvement on most CPUs, +20% for SHA256 and,
     47 # unfortunately, -2% for SHA512 on P4 [which nobody should care about
     48 # that much].
     49 #
     50 # June 2012.
     51 #
     52 # Add SIMD code paths, see below for improvement coefficients. SSSE3
     53 # code path was not attempted for SHA512, because the estimated
     54 # improvement, noticeably less than 9%, is not high enough to justify
     55 # the effort, at least not on pre-AVX processors. [The obvious exclusion
     56 # is VIA Nano, but it has a SHA512 instruction that is faster and
     57 # should be used instead.] For reference, the corresponding estimated
     58 # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
     59 # higher coefficients are observed on VIA Nano and Bulldozer has more
     60 # to do with specifics of their architecture [which is a topic for
     61 # separate discussion].
     62 #
     63 # November 2012.
     64 #
     65 # Add AVX2 code path. Two consecutive input blocks are loaded into
     66 # 256-bit %ymm registers, with data from the first block in the least
     67 # significant 128-bit halves and data from the second in the most
     68 # significant ones. The data is then processed with the same SIMD
     69 # instruction sequence as for AVX, but with %ymm registers as operands.
     70 # A side effect is an increased stack frame, 448 additional bytes in
     71 # SHA256 and 1152 in SHA512, as well as a 1.2KB code size increase.
     72 #
     73 # March 2014.
     74 #
     75 # Add support for Intel SHA Extensions.
     76 
     77 ######################################################################
     78 # Current performance in cycles per processed byte (less is better):
     79 #
     80 #		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
     81 #
     82 # AMD K8	14.9	-	    -		    9.57    -
     83 # P4		17.3	-	    -		    30.8    -
     84 # Core 2	15.6	13.8(+13%)  -		    9.97    -
     85 # Westmere	14.8	12.3(+19%)  -		    9.58    -
     86 # Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
     87 # Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
     88 # Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
     89 # Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
     90 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
     91 # Atom		23.0	18.9(+22%)  -		    14.7    -
     92 # Silvermont	27.4	20.6(+33%)  -               17.5    -
     93 #
     94 # (*)	whichever best applies;
     95 # (**)	switch from ror to shrd accounts for a fair share of improvement;
     96 # (***)	execution time is fully determined by the remaining integer-only
     97 #	part, body_00_15; reducing the amount of SIMD instructions
     98 #	below a certain limit makes no difference/sense; to conserve
     99 #	space the SHA256 XOP code path is therefore omitted;
    100 
    101 $flavour = shift;
    102 $output  = shift;
    103 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
    104 
    105 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    106 
    107 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    108 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    109 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    110 die "can't locate x86_64-xlate.pl";
    111 
    112 # In upstream, this is controlled by shelling out to the compiler to check
    113 # versions, but BoringSSL is intended to be used with pre-generated perlasm
    114 # output, so this isn't useful anyway.
    115 #
    116 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
    117 # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
    118 # did not tie them together until after $shaext was added.
    119 $avx = 1;
    120 
    121 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
    122 # been tested.
    123 $shaext=0;	### set to zero if compiling for 1.0.1
    124 $avx=1		if (!$shaext && $avx);
    125 
    126 open OUT,"| \"$^X\" $xlate $flavour";
    127 *STDOUT=*OUT;
    128 
    129 if ($output =~ /512/) {
    130 	$func="sha512_block_data_order";
    131 	$TABLE="K512";
    132 	$SZ=8;
    133 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
    134 					"%r8", "%r9", "%r10","%r11");
    135 	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
    136 	@Sigma0=(28,34,39);
    137 	@Sigma1=(14,18,41);
    138 	@sigma0=(1,  8, 7);
    139 	@sigma1=(19,61, 6);
    140 	$rounds=80;
    141 } else {
    142 	$func="sha256_block_data_order";
    143 	$TABLE="K256";
    144 	$SZ=4;
    145 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
    146 					"%r8d","%r9d","%r10d","%r11d");
    147 	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
    148 	@Sigma0=( 2,13,22);
    149 	@Sigma1=( 6,11,25);
    150 	@sigma0=( 7,18, 3);
    151 	@sigma1=(17,19,10);
    152 	$rounds=64;
    153 }
    154 
    155 $ctx="%rdi";	# 1st arg, zapped by $a3
    156 $inp="%rsi";	# 2nd arg
    157 $Tbl="%rbp";
    158 
    159 $_ctx="16*$SZ+0*8(%rsp)";
    160 $_inp="16*$SZ+1*8(%rsp)";
    161 $_end="16*$SZ+2*8(%rsp)";
    162 $_rsp="16*$SZ+3*8(%rsp)";
    163 $framesz="16*$SZ+4*8";
    164 
    165 
    166 sub ROUND_00_15()
    167 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    168   my $STRIDE=$SZ;
    169      $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
    170 
    171 $code.=<<___;
    172 	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
    173 	mov	$f,$a2
    174 
    175 	xor	$e,$a0
    176 	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
    177 	xor	$g,$a2			# f^g
    178 
    179 	mov	$T1,`$SZ*($i&0xf)`(%rsp)
    180 	xor	$a,$a1
    181 	and	$e,$a2			# (f^g)&e
    182 
    183 	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
    184 	add	$h,$T1			# T1+=h
    185 	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
    186 
    187 	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
    188 	xor	$e,$a0
    189 	add	$a2,$T1			# T1+=Ch(e,f,g)
    190 
    191 	mov	$a,$a2
    192 	add	($Tbl),$T1		# T1+=K[round]
    193 	xor	$a,$a1
    194 
    195 	xor	$b,$a2			# a^b, b^c in next round
    196 	ror	\$$Sigma1[0],$a0	# Sigma1(e)
    197 	mov	$b,$h
    198 
    199 	and	$a2,$a3
    200 	ror	\$$Sigma0[0],$a1	# Sigma0(a)
    201 	add	$a0,$T1			# T1+=Sigma1(e)
    202 
    203 	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
    204 	add	$T1,$d			# d+=T1
    205 	add	$T1,$h			# h+=T1
    206 
    207 	lea	$STRIDE($Tbl),$Tbl	# round++
    208 ___
    209 $code.=<<___ if ($i<15);
    210 	add	$a1,$h			# h+=Sigma0(a)
    211 ___
    212 	($a2,$a3) = ($a3,$a2);
    213 }
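# For readers cross-checking the instruction scheduling above: one round of
# SHA-2 computes T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i], after which
# d += T1 and h = T1 + Sigma0(a) + Maj(a,b,c); the assembly merely
# interleaves these steps and uses the "alternative Maj" Ch(a^b,c,b)
# mentioned in the header. Below is a hypothetical straight-line Perl
# rendition of the SHA-256 flavor; it is illustrative only and is never
# called by this script.
sub ROUND_REFERENCE_SHA256 {
  my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$w) = @_;
  my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; };
  my $Sigma1 = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);
  my $Sigma0 = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);
  my $Ch     = ($e & $f) ^ (~$e & $g);			# Ch(e,f,g)
  my $Maj    = ($a & $b) ^ ($a & $c) ^ ($b & $c);	# Maj(a,b,c)
  my $T1     = ($h + $Sigma1 + $Ch + $k + $w) & 0xffffffff;
  return (($T1 + $Sigma0 + $Maj) & 0xffffffff,		# new h
	  ($d + $T1) & 0xffffffff);			# new d
}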
    214 
    215 sub ROUND_16_XX()
    216 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    217 
    218 $code.=<<___;
    219 	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
    220 	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
    221 
    222 	mov	$a0,$T1
    223 	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
    224 	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
    225 	mov	$a2,$a1
    226 	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
    227 
    228 	xor	$T1,$a0
    229 	shr	\$$sigma0[2],$T1
    230 	ror	\$$sigma0[0],$a0
    231 	xor	$a1,$a2
    232 	shr	\$$sigma1[2],$a1
    233 
    234 	ror	\$$sigma1[0],$a2
    235 	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
    236 	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
    237 	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
    238 
    239 	add	`$SZ*($i&0xf)`(%rsp),$T1
    240 	mov	$e,$a0
    241 	add	$a2,$T1
    242 	mov	$a,$a1
    243 ___
    244 	&ROUND_00_15(@_);
    245 }
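# The schedule rounds above implement the standard SHA-2 message expansion
# over the 16-entry rolling window kept on the stack (sketch):
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# with sigma0/sigma1 being the rotate/rotate/shift triplets held in the
# @sigma0/@sigma1 arrays, e.g. (7,18,3) and (17,19,10) for SHA-256.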
    246 
    247 $code=<<___;
    248 .text
    249 
    250 .extern	OPENSSL_ia32cap_P
    251 .globl	$func
    252 .type	$func,\@function,3
    253 .align	16
    254 $func:
    255 ___
    256 $code.=<<___ if ($SZ==4 || $avx);
    257 	lea	OPENSSL_ia32cap_P(%rip),%r11
    258 	mov	0(%r11),%r9d
    259 	mov	4(%r11),%r10d
    260 	mov	8(%r11),%r11d
    261 ___
    262 $code.=<<___ if ($SZ==4 && $shaext);
    263 	test	\$`1<<29`,%r11d		# check for SHA
    264 	jnz	_shaext_shortcut
    265 ___
    266 $code.=<<___ if ($avx && $SZ==8);
    267 	test	\$`1<<11`,%r10d		# check for XOP
    268 	jnz	.Lxop_shortcut
    269 ___
    270 $code.=<<___ if ($avx>1);
    271 	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
    272 	cmp	\$`1<<8|1<<5|1<<3`,%r11d
    273 	je	.Lavx2_shortcut
    274 ___
    275 $code.=<<___ if ($avx);
    276 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    277 	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
    278 	or	%r9d,%r10d
    279 	cmp	\$`1<<28|1<<9|1<<30`,%r10d
    280 	je	.Lavx_shortcut
    281 ___
    282 $code.=<<___ if ($SZ==4);
    283 	test	\$`1<<9`,%r10d
    284 	jnz	.Lssse3_shortcut
    285 ___
    286 $code.=<<___;
    287 	push	%rbx
    288 	push	%rbp
    289 	push	%r12
    290 	push	%r13
    291 	push	%r14
    292 	push	%r15
    293 	mov	%rsp,%r11		# copy %rsp
    294 	shl	\$4,%rdx		# num*16
    295 	sub	\$$framesz,%rsp
    296 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    297 	and	\$-64,%rsp		# align stack frame
    298 	mov	$ctx,$_ctx		# save ctx, 1st arg
    299 	mov	$inp,$_inp		# save inp, 2nd arg
    300 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    301 	mov	%r11,$_rsp		# save copy of %rsp
    302 .Lprologue:
    303 
    304 	mov	$SZ*0($ctx),$A
    305 	mov	$SZ*1($ctx),$B
    306 	mov	$SZ*2($ctx),$C
    307 	mov	$SZ*3($ctx),$D
    308 	mov	$SZ*4($ctx),$E
    309 	mov	$SZ*5($ctx),$F
    310 	mov	$SZ*6($ctx),$G
    311 	mov	$SZ*7($ctx),$H
    312 	jmp	.Lloop
    313 
    314 .align	16
    315 .Lloop:
    316 	mov	$B,$a3
    317 	lea	$TABLE(%rip),$Tbl
    318 	xor	$C,$a3			# magic
    319 ___
    320 	for($i=0;$i<16;$i++) {
    321 		$code.="	mov	$SZ*$i($inp),$T1\n";
    322 		$code.="	mov	@ROT[4],$a0\n";
    323 		$code.="	mov	@ROT[0],$a1\n";
    324 		$code.="	bswap	$T1\n";
    325 		&ROUND_00_15($i,@ROT);
    326 		unshift(@ROT,pop(@ROT));
    327 	}
    328 $code.=<<___;
    329 	jmp	.Lrounds_16_xx
    330 .align	16
    331 .Lrounds_16_xx:
    332 ___
    333 	for(;$i<32;$i++) {
    334 		&ROUND_16_XX($i,@ROT);
    335 		unshift(@ROT,pop(@ROT));
    336 	}
    337 
    338 $code.=<<___;
    339 	cmpb	\$0,`$SZ-1`($Tbl)
    340 	jnz	.Lrounds_16_xx
    341 
    342 	mov	$_ctx,$ctx
    343 	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
    344 	lea	16*$SZ($inp),$inp
    345 
    346 	add	$SZ*0($ctx),$A
    347 	add	$SZ*1($ctx),$B
    348 	add	$SZ*2($ctx),$C
    349 	add	$SZ*3($ctx),$D
    350 	add	$SZ*4($ctx),$E
    351 	add	$SZ*5($ctx),$F
    352 	add	$SZ*6($ctx),$G
    353 	add	$SZ*7($ctx),$H
    354 
    355 	cmp	$_end,$inp
    356 
    357 	mov	$A,$SZ*0($ctx)
    358 	mov	$B,$SZ*1($ctx)
    359 	mov	$C,$SZ*2($ctx)
    360 	mov	$D,$SZ*3($ctx)
    361 	mov	$E,$SZ*4($ctx)
    362 	mov	$F,$SZ*5($ctx)
    363 	mov	$G,$SZ*6($ctx)
    364 	mov	$H,$SZ*7($ctx)
    365 	jb	.Lloop
    366 
    367 	mov	$_rsp,%rsi
    368 	mov	(%rsi),%r15
    369 	mov	8(%rsi),%r14
    370 	mov	16(%rsi),%r13
    371 	mov	24(%rsi),%r12
    372 	mov	32(%rsi),%rbp
    373 	mov	40(%rsi),%rbx
    374 	lea	48(%rsi),%rsp
    375 .Lepilogue:
    376 	ret
    377 .size	$func,.-$func
    378 ___
    379 
    380 if ($SZ==4) {
    381 $code.=<<___;
    382 .align	64
    383 .type	$TABLE,\@object
    384 $TABLE:
    385 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    386 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    387 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    388 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    389 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    390 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    391 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    392 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    393 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    394 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    395 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    396 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    397 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    398 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    399 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    400 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    401 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    402 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    403 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    404 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    405 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    406 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    407 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    408 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    409 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    410 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    411 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    412 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    413 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    414 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    415 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    416 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    417 
    418 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    419 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    420 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    421 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    422 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    423 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    424 	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    425 ___
    426 } else {
    427 $code.=<<___;
    428 .align	64
    429 .type	$TABLE,\@object
    430 $TABLE:
    431 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    432 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    433 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    434 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    435 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    436 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    437 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    438 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    439 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    440 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    441 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    442 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    443 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    444 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    445 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    446 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    447 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    448 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    449 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    450 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    451 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    452 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    453 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    454 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    455 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    456 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    457 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    458 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    459 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    460 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    461 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    462 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    463 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    464 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    465 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    466 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    467 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    468 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    469 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    470 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    471 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    472 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    473 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    474 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    475 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    476 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    477 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    478 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    479 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    480 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    481 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    482 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    483 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    484 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    485 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    486 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    487 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    488 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    489 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    490 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    491 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    492 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    493 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    494 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    495 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    496 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    497 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    498 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    499 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    500 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    501 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    502 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    503 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    504 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    505 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    506 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    507 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    508 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    509 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    510 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    511 
    512 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    513 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    514 	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    515 ___
    516 }
    517 
    518 ######################################################################
    519 # SIMD code paths
    520 #
    521 if ($SZ==4 && $shaext) {{{
    522 ######################################################################
    523 # Intel SHA Extensions implementation of SHA256 update function.
    524 #
    525 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
    526 
    527 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
    528 my @MSG=map("%xmm$_",(3..6));
    529 
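# A note on the register naming below: the SHA extension instructions keep
# the eight state words as two halves, ABEF and CDGH, and each sha256rnds2
# advances the state by two rounds, taking the pre-added message words and
# round constants implicitly from %xmm0 ($Wi). Hence the pairs of
# sha256rnds2 below, each pair covering four rounds, and the
# pshufd/palignr/punpcklqdq shuffles converting between the DCBA/HGFE
# layout in memory and the ABEF/CDGH working layout.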
    530 $code.=<<___;
    531 .type	sha256_block_data_order_shaext,\@function,3
    532 .align	64
    533 sha256_block_data_order_shaext:
    534 _shaext_shortcut:
    535 ___
    536 $code.=<<___ if ($win64);
    537 	lea	`-8-5*16`(%rsp),%rsp
    538 	movaps	%xmm6,-8-5*16(%rax)
    539 	movaps	%xmm7,-8-4*16(%rax)
    540 	movaps	%xmm8,-8-3*16(%rax)
    541 	movaps	%xmm9,-8-2*16(%rax)
    542 	movaps	%xmm10,-8-1*16(%rax)
    543 .Lprologue_shaext:
    544 ___
    545 $code.=<<___;
    546 	lea		K256+0x80(%rip),$Tbl
    547 	movdqu		($ctx),$ABEF		# DCBA
    548 	movdqu		16($ctx),$CDGH		# HGFE
    549 	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
    550 
    551 	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
    552 	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
    553 	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
    554 	movdqa		$TMP,$BSWAP		# offload
    555 	palignr		\$8,$CDGH,$ABEF		# ABEF
    556 	punpcklqdq	$Wi,$CDGH		# CDGH
    557 	jmp		.Loop_shaext
    558 
    559 .align	16
    560 .Loop_shaext:
    561 	movdqu		($inp),@MSG[0]
    562 	movdqu		0x10($inp),@MSG[1]
    563 	movdqu		0x20($inp),@MSG[2]
    564 	pshufb		$TMP,@MSG[0]
    565 	movdqu		0x30($inp),@MSG[3]
    566 
    567 	movdqa		0*32-0x80($Tbl),$Wi
    568 	paddd		@MSG[0],$Wi
    569 	pshufb		$TMP,@MSG[1]
    570 	movdqa		$CDGH,$CDGH_SAVE	# offload
    571 	sha256rnds2	$ABEF,$CDGH		# 0-3
    572 	pshufd		\$0x0e,$Wi,$Wi
    573 	nop
    574 	movdqa		$ABEF,$ABEF_SAVE	# offload
    575 	sha256rnds2	$CDGH,$ABEF
    576 
    577 	movdqa		1*32-0x80($Tbl),$Wi
    578 	paddd		@MSG[1],$Wi
    579 	pshufb		$TMP,@MSG[2]
    580 	sha256rnds2	$ABEF,$CDGH		# 4-7
    581 	pshufd		\$0x0e,$Wi,$Wi
    582 	lea		0x40($inp),$inp
    583 	sha256msg1	@MSG[1],@MSG[0]
    584 	sha256rnds2	$CDGH,$ABEF
    585 
    586 	movdqa		2*32-0x80($Tbl),$Wi
    587 	paddd		@MSG[2],$Wi
    588 	pshufb		$TMP,@MSG[3]
    589 	sha256rnds2	$ABEF,$CDGH		# 8-11
    590 	pshufd		\$0x0e,$Wi,$Wi
    591 	movdqa		@MSG[3],$TMP
    592 	palignr		\$4,@MSG[2],$TMP
    593 	nop
    594 	paddd		$TMP,@MSG[0]
    595 	sha256msg1	@MSG[2],@MSG[1]
    596 	sha256rnds2	$CDGH,$ABEF
    597 
    598 	movdqa		3*32-0x80($Tbl),$Wi
    599 	paddd		@MSG[3],$Wi
    600 	sha256msg2	@MSG[3],@MSG[0]
    601 	sha256rnds2	$ABEF,$CDGH		# 12-15
    602 	pshufd		\$0x0e,$Wi,$Wi
    603 	movdqa		@MSG[0],$TMP
    604 	palignr		\$4,@MSG[3],$TMP
    605 	nop
    606 	paddd		$TMP,@MSG[1]
    607 	sha256msg1	@MSG[3],@MSG[2]
    608 	sha256rnds2	$CDGH,$ABEF
    609 ___
    610 for($i=4;$i<16-3;$i++) {
    611 $code.=<<___;
    612 	movdqa		$i*32-0x80($Tbl),$Wi
    613 	paddd		@MSG[0],$Wi
    614 	sha256msg2	@MSG[0],@MSG[1]
    615 	sha256rnds2	$ABEF,$CDGH		# 16-19...
    616 	pshufd		\$0x0e,$Wi,$Wi
    617 	movdqa		@MSG[1],$TMP
    618 	palignr		\$4,@MSG[0],$TMP
    619 	nop
    620 	paddd		$TMP,@MSG[2]
    621 	sha256msg1	@MSG[0],@MSG[3]
    622 	sha256rnds2	$CDGH,$ABEF
    623 ___
    624 	push(@MSG,shift(@MSG));
    625 }
    626 $code.=<<___;
    627 	movdqa		13*32-0x80($Tbl),$Wi
    628 	paddd		@MSG[0],$Wi
    629 	sha256msg2	@MSG[0],@MSG[1]
    630 	sha256rnds2	$ABEF,$CDGH		# 52-55
    631 	pshufd		\$0x0e,$Wi,$Wi
    632 	movdqa		@MSG[1],$TMP
    633 	palignr		\$4,@MSG[0],$TMP
    634 	sha256rnds2	$CDGH,$ABEF
    635 	paddd		$TMP,@MSG[2]
    636 
    637 	movdqa		14*32-0x80($Tbl),$Wi
    638 	paddd		@MSG[1],$Wi
    639 	sha256rnds2	$ABEF,$CDGH		# 56-59
    640 	pshufd		\$0x0e,$Wi,$Wi
    641 	sha256msg2	@MSG[1],@MSG[2]
    642 	movdqa		$BSWAP,$TMP
    643 	sha256rnds2	$CDGH,$ABEF
    644 
    645 	movdqa		15*32-0x80($Tbl),$Wi
    646 	paddd		@MSG[2],$Wi
    647 	nop
    648 	sha256rnds2	$ABEF,$CDGH		# 60-63
    649 	pshufd		\$0x0e,$Wi,$Wi
    650 	dec		$num
    651 	nop
    652 	sha256rnds2	$CDGH,$ABEF
    653 
    654 	paddd		$CDGH_SAVE,$CDGH
    655 	paddd		$ABEF_SAVE,$ABEF
    656 	jnz		.Loop_shaext
    657 
    658 	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
    659 	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
    660 	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
    661 	punpckhqdq	$CDGH,$ABEF		# DCBA
    662 	palignr		\$8,$TMP,$CDGH		# HGFE
    663 
    664 	movdqu	$ABEF,($ctx)
    665 	movdqu	$CDGH,16($ctx)
    666 ___
    667 $code.=<<___ if ($win64);
    668 	movaps	-8-5*16(%rax),%xmm6
    669 	movaps	-8-4*16(%rax),%xmm7
    670 	movaps	-8-3*16(%rax),%xmm8
    671 	movaps	-8-2*16(%rax),%xmm9
    672 	movaps	-8-1*16(%rax),%xmm10
    673 	mov	%rax,%rsp
    674 .Lepilogue_shaext:
    675 ___
    676 $code.=<<___;
    677 	ret
    678 .size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
    679 ___
    680 }}}
    681 {{{
    682 
    683 my $a4=$T1;
    684 my ($a,$b,$c,$d,$e,$f,$g,$h);
    685 
    686 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    687 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    688   my $arg = pop;
    689     $arg = "\$$arg" if ($arg*1 eq $arg);
    690     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    691 }
    692 
    693 sub body_00_15 () {
    694 	(
    695 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
    696 
    697 	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
    698 	'&mov	($a,$a1)',
    699 	'&mov	($a4,$f)',
    700 
    701 	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
    702 	'&xor	($a0,$e)',
    703 	'&xor	($a4,$g)',			# f^g
    704 
    705 	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
    706 	'&xor	($a1,$a)',
    707 	'&and	($a4,$e)',			# (f^g)&e
    708 
    709 	'&xor	($a0,$e)',
    710 	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
    711 	'&mov	($a2,$a)',
    712 
    713 	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
    714 	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
    715 	'&xor	($a2,$b)',			# a^b, b^c in next round
    716 
    717 	'&add	($h,$a4)',			# h+=Ch(e,f,g)
    718 	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
    719 	'&and	($a3,$a2)',			# (b^c)&(a^b)
    720 
    721 	'&xor	($a1,$a)',
    722 	'&add	($h,$a0)',			# h+=Sigma1(e)
    723 	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
    724 
    725 	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
    726 	'&add	($d,$h)',			# d+=h
    727 	'&add	($h,$a3)',			# h+=Maj(a,b,c)
    728 
    729 	'&mov	($a0,$d)',
    730 	'&add	($a1,$h);'.			# h+=Sigma0(a)
    731 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
    732 	);
    733 }
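# body_00_15() produces the same round as ROUND_00_15 above, only as a list
# of instruction strings. The SIMD code paths below eval() a few of these
# strings at a time, interleaving the integer-only round body with the
# vector message-schedule update so that both execution domains stay busy.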
    734 
    735 ######################################################################
    736 # SSSE3 code path
    737 #
    738 if ($SZ==4) {	# SHA256 only
    739 my @X = map("%xmm$_",(0..3));
    740 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
    741 
    742 $code.=<<___;
    743 .type	${func}_ssse3,\@function,3
    744 .align	64
    745 ${func}_ssse3:
    746 .Lssse3_shortcut:
    747 	push	%rbx
    748 	push	%rbp
    749 	push	%r12
    750 	push	%r13
    751 	push	%r14
    752 	push	%r15
    753 	mov	%rsp,%r11		# copy %rsp
    754 	shl	\$4,%rdx		# num*16
    755 	sub	\$`$framesz+$win64*16*4`,%rsp
    756 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    757 	and	\$-64,%rsp		# align stack frame
    758 	mov	$ctx,$_ctx		# save ctx, 1st arg
    759 	mov	$inp,$_inp		# save inp, 2nd arg
    760 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    761 	mov	%r11,$_rsp		# save copy of %rsp
    762 ___
    763 $code.=<<___ if ($win64);
    764 	movaps	%xmm6,16*$SZ+32(%rsp)
    765 	movaps	%xmm7,16*$SZ+48(%rsp)
    766 	movaps	%xmm8,16*$SZ+64(%rsp)
    767 	movaps	%xmm9,16*$SZ+80(%rsp)
    768 ___
    769 $code.=<<___;
    770 .Lprologue_ssse3:
    771 
    772 	mov	$SZ*0($ctx),$A
    773 	mov	$SZ*1($ctx),$B
    774 	mov	$SZ*2($ctx),$C
    775 	mov	$SZ*3($ctx),$D
    776 	mov	$SZ*4($ctx),$E
    777 	mov	$SZ*5($ctx),$F
    778 	mov	$SZ*6($ctx),$G
    779 	mov	$SZ*7($ctx),$H
    780 ___
    781 
    782 $code.=<<___;
    783 	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
    784 	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
    785 	jmp	.Lloop_ssse3
    786 .align	16
    787 .Lloop_ssse3:
    788 	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
    789 	movdqu	0x00($inp),@X[0]
    790 	movdqu	0x10($inp),@X[1]
    791 	movdqu	0x20($inp),@X[2]
    792 	pshufb	$t3,@X[0]
    793 	movdqu	0x30($inp),@X[3]
    794 	lea	$TABLE(%rip),$Tbl
    795 	pshufb	$t3,@X[1]
    796 	movdqa	0x00($Tbl),$t0
    797 	movdqa	0x20($Tbl),$t1
    798 	pshufb	$t3,@X[2]
    799 	paddd	@X[0],$t0
    800 	movdqa	0x40($Tbl),$t2
    801 	pshufb	$t3,@X[3]
    802 	movdqa	0x60($Tbl),$t3
    803 	paddd	@X[1],$t1
    804 	paddd	@X[2],$t2
    805 	paddd	@X[3],$t3
    806 	movdqa	$t0,0x00(%rsp)
    807 	mov	$A,$a1
    808 	movdqa	$t1,0x10(%rsp)
    809 	mov	$B,$a3
    810 	movdqa	$t2,0x20(%rsp)
    811 	xor	$C,$a3			# magic
    812 	movdqa	$t3,0x30(%rsp)
    813 	mov	$E,$a0
    814 	jmp	.Lssse3_00_47
    815 
    816 .align	16
    817 .Lssse3_00_47:
    818 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
    819 ___
    820 sub Xupdate_256_SSSE3 () {
    821 	(
    822 	'&movdqa	($t0,@X[1]);',
    823 	'&movdqa	($t3,@X[3])',
    824 	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
    825 	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
    826 	'&movdqa	($t1,$t0)',
    827 	'&movdqa	($t2,$t0);',
    828 	'&psrld		($t0,$sigma0[2])',
    829 	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
    830 	'&psrld		($t2,$sigma0[0])',
    831 	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
    832 	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
    833 	'&pxor		($t0,$t2)',
    834 	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
    835 	'&pxor		($t0,$t1)',
    836 	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
    837 	'&pxor		($t0,$t2);',
    838 	 '&movdqa	($t2,$t3)',
    839 	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
    840 	 '&psrld	($t3,$sigma1[2])',
    841 	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
    842 	 '&psrlq	($t2,$sigma1[0])',
    843 	 '&pxor		($t3,$t2);',
    844 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    845 	 '&pxor		($t3,$t2)',
    846 	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
    847 	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
    848 	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
    849 	 '&movdqa	($t2,$t3);',
    850 	 '&psrld	($t3,$sigma1[2])',
    851 	 '&psrlq	($t2,$sigma1[0])',
    852 	 '&pxor		($t3,$t2);',
    853 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    854 	 '&pxor		($t3,$t2);',
    855 	'&movdqa	($t2,16*2*$j."($Tbl)")',
    856 	 '&pshufb	($t3,$t5)',
    857 	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
    858 	);
    859 }
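# Xupdate_256_SSSE3() is the vectorized form of the recurrence used in
# ROUND_16_XX: sigma0() is applied to four consecutive schedule words at
# once (X[1..4]), while sigma1() only ever has two inputs ready, so it is
# applied twice, first to X[14..15] and then to the freshly computed
# X[16..17], as the trailing comments indicate.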
    860 
    861 sub SSSE3_256_00_47 () {
    862 my $j = shift;
    863 my $body = shift;
    864 my @X = @_;
    865 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
    866 
    867     if (0) {
    868 	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
    869 	    eval;
    870 	    eval(shift(@insns));
    871 	    eval(shift(@insns));
    872 	    eval(shift(@insns));
    873 	}
    874     } else {			# squeeze extra 4% on Westmere and 19% on Atom
    875 	  eval(shift(@insns));	#@
    876 	&movdqa		($t0,@X[1]);
    877 	  eval(shift(@insns));
    878 	  eval(shift(@insns));
    879 	&movdqa		($t3,@X[3]);
    880 	  eval(shift(@insns));	#@
    881 	  eval(shift(@insns));
    882 	  eval(shift(@insns));
    883 	  eval(shift(@insns));	#@
    884 	  eval(shift(@insns));
    885 	&palignr	($t0,@X[0],$SZ);	# X[1..4]
    886 	  eval(shift(@insns));
    887 	  eval(shift(@insns));
    888 	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
    889 	  eval(shift(@insns));
    890 	  eval(shift(@insns));
    891 	  eval(shift(@insns));
    892 	  eval(shift(@insns));	#@
    893 	&movdqa		($t1,$t0);
    894 	  eval(shift(@insns));
    895 	  eval(shift(@insns));
    896 	&movdqa		($t2,$t0);
    897 	  eval(shift(@insns));	#@
    898 	  eval(shift(@insns));
    899 	&psrld		($t0,$sigma0[2]);
    900 	  eval(shift(@insns));
    901 	  eval(shift(@insns));
    902 	  eval(shift(@insns));
    903 	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
    904 	  eval(shift(@insns));	#@
    905 	  eval(shift(@insns));
    906 	&psrld		($t2,$sigma0[0]);
    907 	  eval(shift(@insns));
    908 	  eval(shift(@insns));
    909 	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
    910 	  eval(shift(@insns));
    911 	  eval(shift(@insns));	#@
    912 	&pslld		($t1,8*$SZ-$sigma0[1]);
    913 	  eval(shift(@insns));
    914 	  eval(shift(@insns));
    915 	&pxor		($t0,$t2);
    916 	  eval(shift(@insns));	#@
    917 	  eval(shift(@insns));
    918 	  eval(shift(@insns));
    919 	  eval(shift(@insns));	#@
    920 	&psrld		($t2,$sigma0[1]-$sigma0[0]);
    921 	  eval(shift(@insns));
    922 	&pxor		($t0,$t1);
    923 	  eval(shift(@insns));
    924 	  eval(shift(@insns));
    925 	&pslld		($t1,$sigma0[1]-$sigma0[0]);
    926 	  eval(shift(@insns));
    927 	  eval(shift(@insns));
    928 	&pxor		($t0,$t2);
    929 	  eval(shift(@insns));
    930 	  eval(shift(@insns));	#@
    931 	 &movdqa	($t2,$t3);
    932 	  eval(shift(@insns));
    933 	  eval(shift(@insns));
    934 	&pxor		($t0,$t1);		# sigma0(X[1..4])
    935 	  eval(shift(@insns));	#@
    936 	  eval(shift(@insns));
    937 	  eval(shift(@insns));
    938 	 &psrld		($t3,$sigma1[2]);
    939 	  eval(shift(@insns));
    940 	  eval(shift(@insns));
    941 	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
    942 	  eval(shift(@insns));	#@
    943 	  eval(shift(@insns));
    944 	 &psrlq		($t2,$sigma1[0]);
    945 	  eval(shift(@insns));
    946 	  eval(shift(@insns));
    947 	  eval(shift(@insns));
    948 	 &pxor		($t3,$t2);
    949 	  eval(shift(@insns));	#@
    950 	  eval(shift(@insns));
    951 	  eval(shift(@insns));
    952 	  eval(shift(@insns));	#@
    953 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
    954 	  eval(shift(@insns));
    955 	  eval(shift(@insns));
    956 	 &pxor		($t3,$t2);
    957 	  eval(shift(@insns));	#@
    958 	  eval(shift(@insns));
    959 	  eval(shift(@insns));
    960 	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
    961 	 &pshufd	($t3,$t3,0b10000000);
    962 	  eval(shift(@insns));
    963 	  eval(shift(@insns));
    964 	  eval(shift(@insns));
    965 	 &psrldq	($t3,8);
    966 	  eval(shift(@insns));
    967 	  eval(shift(@insns));	#@
    968 	  eval(shift(@insns));
    969 	  eval(shift(@insns));
    970 	  eval(shift(@insns));	#@
    971 	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
    972 	  eval(shift(@insns));
    973 	  eval(shift(@insns));
    974 	  eval(shift(@insns));
    975 	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
    976 	  eval(shift(@insns));
    977 	  eval(shift(@insns));	#@
    978 	  eval(shift(@insns));
    979 	 &movdqa	($t2,$t3);
    980 	  eval(shift(@insns));
    981 	  eval(shift(@insns));
    982 	 &psrld		($t3,$sigma1[2]);
    983 	  eval(shift(@insns));
    984 	  eval(shift(@insns));	#@
    985 	 &psrlq		($t2,$sigma1[0]);
    986 	  eval(shift(@insns));
    987 	  eval(shift(@insns));
    988 	 &pxor		($t3,$t2);
    989 	  eval(shift(@insns));	#@
    990 	  eval(shift(@insns));
    991 	  eval(shift(@insns));
    992 	  eval(shift(@insns));	#@
    993 	  eval(shift(@insns));
    994 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
    995 	  eval(shift(@insns));
    996 	  eval(shift(@insns));
    997 	  eval(shift(@insns));
    998 	 &pxor		($t3,$t2);
    999 	  eval(shift(@insns));
   1000 	  eval(shift(@insns));
   1001 	  eval(shift(@insns));	#@
   1002 	 #&pshufb	($t3,$t5);
   1003 	 &pshufd	($t3,$t3,0b00001000);
   1004 	  eval(shift(@insns));
   1005 	  eval(shift(@insns));
   1006 	&movdqa		($t2,16*2*$j."($Tbl)");
   1007 	  eval(shift(@insns));	#@
   1008 	  eval(shift(@insns));
   1009 	 &pslldq	($t3,8);
   1010 	  eval(shift(@insns));
   1011 	  eval(shift(@insns));
   1012 	  eval(shift(@insns));
   1013 	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
   1014 	  eval(shift(@insns));	#@
   1015 	  eval(shift(@insns));
   1016 	  eval(shift(@insns));
   1017     }
   1018 	&paddd		($t2,@X[0]);
   1019 	  foreach (@insns) { eval; }		# remaining instructions
   1020 	&movdqa		(16*$j."(%rsp)",$t2);
   1021 }
   1022 
   1023     for ($i=0,$j=0; $j<4; $j++) {
   1024 	&SSSE3_256_00_47($j,\&body_00_15,@X);
   1025 	push(@X,shift(@X));			# rotate(@X)
   1026     }
   1027 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1028 	&jne	(".Lssse3_00_47");
   1029 
   1030     for ($i=0; $i<16; ) {
   1031 	foreach(body_00_15()) { eval; }
   1032     }
   1033 $code.=<<___;
   1034 	mov	$_ctx,$ctx
   1035 	mov	$a1,$A
   1036 
   1037 	add	$SZ*0($ctx),$A
   1038 	lea	16*$SZ($inp),$inp
   1039 	add	$SZ*1($ctx),$B
   1040 	add	$SZ*2($ctx),$C
   1041 	add	$SZ*3($ctx),$D
   1042 	add	$SZ*4($ctx),$E
   1043 	add	$SZ*5($ctx),$F
   1044 	add	$SZ*6($ctx),$G
   1045 	add	$SZ*7($ctx),$H
   1046 
   1047 	cmp	$_end,$inp
   1048 
   1049 	mov	$A,$SZ*0($ctx)
   1050 	mov	$B,$SZ*1($ctx)
   1051 	mov	$C,$SZ*2($ctx)
   1052 	mov	$D,$SZ*3($ctx)
   1053 	mov	$E,$SZ*4($ctx)
   1054 	mov	$F,$SZ*5($ctx)
   1055 	mov	$G,$SZ*6($ctx)
   1056 	mov	$H,$SZ*7($ctx)
   1057 	jb	.Lloop_ssse3
   1058 
   1059 	mov	$_rsp,%rsi
   1060 ___
   1061 $code.=<<___ if ($win64);
   1062 	movaps	16*$SZ+32(%rsp),%xmm6
   1063 	movaps	16*$SZ+48(%rsp),%xmm7
   1064 	movaps	16*$SZ+64(%rsp),%xmm8
   1065 	movaps	16*$SZ+80(%rsp),%xmm9
   1066 ___
   1067 $code.=<<___;
   1068 	mov	(%rsi),%r15
   1069 	mov	8(%rsi),%r14
   1070 	mov	16(%rsi),%r13
   1071 	mov	24(%rsi),%r12
   1072 	mov	32(%rsi),%rbp
   1073 	mov	40(%rsi),%rbx
   1074 	lea	48(%rsi),%rsp
   1075 .Lepilogue_ssse3:
   1076 	ret
   1077 .size	${func}_ssse3,.-${func}_ssse3
   1078 ___
   1079 }
   1080 
   1081 if ($avx) {{
   1082 ######################################################################
   1083 # XOP code path
   1084 #
   1085 if ($SZ==8) {	# SHA512 only
   1086 $code.=<<___;
   1087 .type	${func}_xop,\@function,3
   1088 .align	64
   1089 ${func}_xop:
   1090 .Lxop_shortcut:
   1091 	push	%rbx
   1092 	push	%rbp
   1093 	push	%r12
   1094 	push	%r13
   1095 	push	%r14
   1096 	push	%r15
   1097 	mov	%rsp,%r11		# copy %rsp
   1098 	shl	\$4,%rdx		# num*16
   1099 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1100 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1101 	and	\$-64,%rsp		# align stack frame
   1102 	mov	$ctx,$_ctx		# save ctx, 1st arg
   1103 	mov	$inp,$_inp		# save inp, 2nd arg
   1104 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1105 	mov	%r11,$_rsp		# save copy of %rsp
   1106 ___
   1107 $code.=<<___ if ($win64);
   1108 	movaps	%xmm6,16*$SZ+32(%rsp)
   1109 	movaps	%xmm7,16*$SZ+48(%rsp)
   1110 	movaps	%xmm8,16*$SZ+64(%rsp)
   1111 	movaps	%xmm9,16*$SZ+80(%rsp)
   1112 ___
   1113 $code.=<<___ if ($win64 && $SZ>4);
   1114 	movaps	%xmm10,16*$SZ+96(%rsp)
   1115 	movaps	%xmm11,16*$SZ+112(%rsp)
   1116 ___
   1117 $code.=<<___;
   1118 .Lprologue_xop:
   1119 
   1120 	vzeroupper
   1121 	mov	$SZ*0($ctx),$A
   1122 	mov	$SZ*1($ctx),$B
   1123 	mov	$SZ*2($ctx),$C
   1124 	mov	$SZ*3($ctx),$D
   1125 	mov	$SZ*4($ctx),$E
   1126 	mov	$SZ*5($ctx),$F
   1127 	mov	$SZ*6($ctx),$G
   1128 	mov	$SZ*7($ctx),$H
   1129 	jmp	.Lloop_xop
   1130 ___
   1131 					if ($SZ==4) {	# SHA256
   1132     my @X = map("%xmm$_",(0..3));
   1133     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
   1134 
   1135 $code.=<<___;
   1136 .align	16
   1137 .Lloop_xop:
   1138 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1139 	vmovdqu	0x00($inp),@X[0]
   1140 	vmovdqu	0x10($inp),@X[1]
   1141 	vmovdqu	0x20($inp),@X[2]
   1142 	vmovdqu	0x30($inp),@X[3]
   1143 	vpshufb	$t3,@X[0],@X[0]
   1144 	lea	$TABLE(%rip),$Tbl
   1145 	vpshufb	$t3,@X[1],@X[1]
   1146 	vpshufb	$t3,@X[2],@X[2]
   1147 	vpaddd	0x00($Tbl),@X[0],$t0
   1148 	vpshufb	$t3,@X[3],@X[3]
   1149 	vpaddd	0x20($Tbl),@X[1],$t1
   1150 	vpaddd	0x40($Tbl),@X[2],$t2
   1151 	vpaddd	0x60($Tbl),@X[3],$t3
   1152 	vmovdqa	$t0,0x00(%rsp)
   1153 	mov	$A,$a1
   1154 	vmovdqa	$t1,0x10(%rsp)
   1155 	mov	$B,$a3
   1156 	vmovdqa	$t2,0x20(%rsp)
   1157 	xor	$C,$a3			# magic
   1158 	vmovdqa	$t3,0x30(%rsp)
   1159 	mov	$E,$a0
   1160 	jmp	.Lxop_00_47
   1161 
   1162 .align	16
   1163 .Lxop_00_47:
   1164 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1165 ___
   1166 sub XOP_256_00_47 () {
   1167 my $j = shift;
   1168 my $body = shift;
   1169 my @X = @_;
   1170 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1171 
   1172 	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
   1173 	  eval(shift(@insns));
   1174 	  eval(shift(@insns));
   1175 	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
   1176 	  eval(shift(@insns));
   1177 	  eval(shift(@insns));
   1178 	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
   1179 	  eval(shift(@insns));
   1180 	  eval(shift(@insns));
   1181 	&vpsrld		($t0,$t0,$sigma0[2]);
   1182 	  eval(shift(@insns));
   1183 	  eval(shift(@insns));
   1184 	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
   1185 	  eval(shift(@insns));
   1186 	  eval(shift(@insns));
   1187 	  eval(shift(@insns));
   1188 	  eval(shift(@insns));
   1189 	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
   1190 	  eval(shift(@insns));
   1191 	  eval(shift(@insns));
   1192 	&vpxor		($t0,$t0,$t1);
   1193 	  eval(shift(@insns));
   1194 	  eval(shift(@insns));
   1195 	  eval(shift(@insns));
   1196 	  eval(shift(@insns));
   1197 	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
   1198 	  eval(shift(@insns));
   1199 	  eval(shift(@insns));
   1200 	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
   1201 	  eval(shift(@insns));
   1202 	  eval(shift(@insns));
   1203 	 &vpsrld	($t2,@X[3],$sigma1[2]);
   1204 	  eval(shift(@insns));
   1205 	  eval(shift(@insns));
   1206 	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
   1207 	  eval(shift(@insns));
   1208 	  eval(shift(@insns));
   1209 	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1210 	  eval(shift(@insns));
   1211 	  eval(shift(@insns));
   1212 	 &vpxor		($t3,$t3,$t2);
   1213 	  eval(shift(@insns));
   1214 	  eval(shift(@insns));
   1215 	  eval(shift(@insns));
   1216 	  eval(shift(@insns));
   1217 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
   1218 	  eval(shift(@insns));
   1219 	  eval(shift(@insns));
   1220 	  eval(shift(@insns));
   1221 	  eval(shift(@insns));
   1222 	&vpsrldq	($t3,$t3,8);
   1223 	  eval(shift(@insns));
   1224 	  eval(shift(@insns));
   1225 	  eval(shift(@insns));
   1226 	  eval(shift(@insns));
   1227 	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
   1228 	  eval(shift(@insns));
   1229 	  eval(shift(@insns));
   1230 	  eval(shift(@insns));
   1231 	  eval(shift(@insns));
   1232 	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
   1233 	  eval(shift(@insns));
   1234 	  eval(shift(@insns));
   1235 	 &vpsrld	($t2,@X[0],$sigma1[2]);
   1236 	  eval(shift(@insns));
   1237 	  eval(shift(@insns));
   1238 	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1239 	  eval(shift(@insns));
   1240 	  eval(shift(@insns));
   1241 	 &vpxor		($t3,$t3,$t2);
   1242 	  eval(shift(@insns));
   1243 	  eval(shift(@insns));
   1244 	  eval(shift(@insns));
   1245 	  eval(shift(@insns));
   1246 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
   1247 	  eval(shift(@insns));
   1248 	  eval(shift(@insns));
   1249 	  eval(shift(@insns));
   1250 	  eval(shift(@insns));
   1251 	&vpslldq	($t3,$t3,8);		# 22 instructions
   1252 	  eval(shift(@insns));
   1253 	  eval(shift(@insns));
   1254 	  eval(shift(@insns));
   1255 	  eval(shift(@insns));
   1256 	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
   1257 	  eval(shift(@insns));
   1258 	  eval(shift(@insns));
   1259 	  eval(shift(@insns));
   1260 	  eval(shift(@insns));
   1261 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1262 	  foreach (@insns) { eval; }		# remaining instructions
   1263 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1264 }
   1265 
   1266     for ($i=0,$j=0; $j<4; $j++) {
   1267 	&XOP_256_00_47($j,\&body_00_15,@X);
   1268 	push(@X,shift(@X));			# rotate(@X)
   1269     }
   1270 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1271 	&jne	(".Lxop_00_47");
   1272 
   1273     for ($i=0; $i<16; ) {
   1274 	foreach(body_00_15()) { eval; }
   1275     }
   1276 
   1277 					} else {	# SHA512
   1278     my @X = map("%xmm$_",(0..7));
   1279     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1280 
   1281 $code.=<<___;
   1282 .align	16
   1283 .Lloop_xop:
   1284 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1285 	vmovdqu	0x00($inp),@X[0]
   1286 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1287 	vmovdqu	0x10($inp),@X[1]
   1288 	vmovdqu	0x20($inp),@X[2]
   1289 	vpshufb	$t3,@X[0],@X[0]
   1290 	vmovdqu	0x30($inp),@X[3]
   1291 	vpshufb	$t3,@X[1],@X[1]
   1292 	vmovdqu	0x40($inp),@X[4]
   1293 	vpshufb	$t3,@X[2],@X[2]
   1294 	vmovdqu	0x50($inp),@X[5]
   1295 	vpshufb	$t3,@X[3],@X[3]
   1296 	vmovdqu	0x60($inp),@X[6]
   1297 	vpshufb	$t3,@X[4],@X[4]
   1298 	vmovdqu	0x70($inp),@X[7]
   1299 	vpshufb	$t3,@X[5],@X[5]
   1300 	vpaddq	-0x80($Tbl),@X[0],$t0
   1301 	vpshufb	$t3,@X[6],@X[6]
   1302 	vpaddq	-0x60($Tbl),@X[1],$t1
   1303 	vpshufb	$t3,@X[7],@X[7]
   1304 	vpaddq	-0x40($Tbl),@X[2],$t2
   1305 	vpaddq	-0x20($Tbl),@X[3],$t3
   1306 	vmovdqa	$t0,0x00(%rsp)
   1307 	vpaddq	0x00($Tbl),@X[4],$t0
   1308 	vmovdqa	$t1,0x10(%rsp)
   1309 	vpaddq	0x20($Tbl),@X[5],$t1
   1310 	vmovdqa	$t2,0x20(%rsp)
   1311 	vpaddq	0x40($Tbl),@X[6],$t2
   1312 	vmovdqa	$t3,0x30(%rsp)
   1313 	vpaddq	0x60($Tbl),@X[7],$t3
   1314 	vmovdqa	$t0,0x40(%rsp)
   1315 	mov	$A,$a1
   1316 	vmovdqa	$t1,0x50(%rsp)
   1317 	mov	$B,$a3
   1318 	vmovdqa	$t2,0x60(%rsp)
   1319 	xor	$C,$a3			# magic
   1320 	vmovdqa	$t3,0x70(%rsp)
   1321 	mov	$E,$a0
   1322 	jmp	.Lxop_00_47
   1323 
   1324 .align	16
   1325 .Lxop_00_47:
   1326 	add	\$`16*2*$SZ`,$Tbl
   1327 ___
   1328 sub XOP_512_00_47 () {
   1329 my $j = shift;
   1330 my $body = shift;
   1331 my @X = @_;
   1332 my @insns = (&$body,&$body);			# 52 instructions
   1333 
   1334 	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
   1335 	  eval(shift(@insns));
   1336 	  eval(shift(@insns));
   1337 	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
   1338 	  eval(shift(@insns));
   1339 	  eval(shift(@insns));
   1340 	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
   1341 	  eval(shift(@insns));
   1342 	  eval(shift(@insns));
   1343 	&vpsrlq		($t0,$t0,$sigma0[2]);
   1344 	  eval(shift(@insns));
   1345 	  eval(shift(@insns));
   1346 	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
   1347 	  eval(shift(@insns));
   1348 	  eval(shift(@insns));
   1349 	  eval(shift(@insns));
   1350 	  eval(shift(@insns));
   1351 	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
   1352 	  eval(shift(@insns));
   1353 	  eval(shift(@insns));
   1354 	&vpxor		($t0,$t0,$t1);
   1355 	  eval(shift(@insns));
   1356 	  eval(shift(@insns));
   1357 	  eval(shift(@insns));
   1358 	  eval(shift(@insns));
   1359 	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
   1360 	  eval(shift(@insns));
   1361 	  eval(shift(@insns));
   1362 	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
   1363 	  eval(shift(@insns));
   1364 	  eval(shift(@insns));
   1365 	 &vpsrlq	($t2,@X[7],$sigma1[2]);
   1366 	  eval(shift(@insns));
   1367 	  eval(shift(@insns));
   1368 	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
   1369 	  eval(shift(@insns));
   1370 	  eval(shift(@insns));
   1371 	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1372 	  eval(shift(@insns));
   1373 	  eval(shift(@insns));
   1374 	 &vpxor		($t3,$t3,$t2);
   1375 	  eval(shift(@insns));
   1376 	  eval(shift(@insns));
   1377 	  eval(shift(@insns));
   1378 	  eval(shift(@insns));
   1379 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
   1380 	  eval(shift(@insns));
   1381 	  eval(shift(@insns));
   1382 	  eval(shift(@insns));
   1383 	  eval(shift(@insns));
   1384 	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
   1385 	  eval(shift(@insns));
   1386 	  eval(shift(@insns));
   1387 	  eval(shift(@insns));
   1388 	  eval(shift(@insns));
   1389 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1390 	  foreach (@insns) { eval; }		# remaining instructions
   1391 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1392 }
   1393 
   1394     for ($i=0,$j=0; $j<8; $j++) {
   1395 	&XOP_512_00_47($j,\&body_00_15,@X);
   1396 	push(@X,shift(@X));			# rotate(@X)
   1397     }
   1398 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1399 	&jne	(".Lxop_00_47");
   1400 
   1401     for ($i=0; $i<16; ) {
   1402 	foreach(body_00_15()) { eval; }
   1403     }
   1404 }
   1405 $code.=<<___;
   1406 	mov	$_ctx,$ctx
   1407 	mov	$a1,$A
   1408 
   1409 	add	$SZ*0($ctx),$A
   1410 	lea	16*$SZ($inp),$inp
   1411 	add	$SZ*1($ctx),$B
   1412 	add	$SZ*2($ctx),$C
   1413 	add	$SZ*3($ctx),$D
   1414 	add	$SZ*4($ctx),$E
   1415 	add	$SZ*5($ctx),$F
   1416 	add	$SZ*6($ctx),$G
   1417 	add	$SZ*7($ctx),$H
   1418 
   1419 	cmp	$_end,$inp
   1420 
   1421 	mov	$A,$SZ*0($ctx)
   1422 	mov	$B,$SZ*1($ctx)
   1423 	mov	$C,$SZ*2($ctx)
   1424 	mov	$D,$SZ*3($ctx)
   1425 	mov	$E,$SZ*4($ctx)
   1426 	mov	$F,$SZ*5($ctx)
   1427 	mov	$G,$SZ*6($ctx)
   1428 	mov	$H,$SZ*7($ctx)
   1429 	jb	.Lloop_xop
   1430 
   1431 	mov	$_rsp,%rsi
   1432 	vzeroupper
   1433 ___
   1434 $code.=<<___ if ($win64);
   1435 	movaps	16*$SZ+32(%rsp),%xmm6
   1436 	movaps	16*$SZ+48(%rsp),%xmm7
   1437 	movaps	16*$SZ+64(%rsp),%xmm8
   1438 	movaps	16*$SZ+80(%rsp),%xmm9
   1439 ___
   1440 $code.=<<___ if ($win64 && $SZ>4);
   1441 	movaps	16*$SZ+96(%rsp),%xmm10
   1442 	movaps	16*$SZ+112(%rsp),%xmm11
   1443 ___
   1444 $code.=<<___;
   1445 	mov	(%rsi),%r15
   1446 	mov	8(%rsi),%r14
   1447 	mov	16(%rsi),%r13
   1448 	mov	24(%rsi),%r12
   1449 	mov	32(%rsi),%rbp
   1450 	mov	40(%rsi),%rbx
   1451 	lea	48(%rsi),%rsp
   1452 .Lepilogue_xop:
   1453 	ret
   1454 .size	${func}_xop,.-${func}_xop
   1455 ___
   1456 }
   1457 ######################################################################
   1458 # AVX+shrd code path
   1459 #
   1460 local *ror = sub { &shrd(@_[0],@_) };
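# (shrd with identical source and destination registers is simply a rotate
# right by the immediate count; see footnote (**) in the header for why the
# AVX path prefers it over ror.)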
   1461 
   1462 $code.=<<___;
   1463 .type	${func}_avx,\@function,3
   1464 .align	64
   1465 ${func}_avx:
   1466 .Lavx_shortcut:
   1467 	push	%rbx
   1468 	push	%rbp
   1469 	push	%r12
   1470 	push	%r13
   1471 	push	%r14
   1472 	push	%r15
   1473 	mov	%rsp,%r11		# copy %rsp
   1474 	shl	\$4,%rdx		# num*16
   1475 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1476 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1477 	and	\$-64,%rsp		# align stack frame
   1478 	mov	$ctx,$_ctx		# save ctx, 1st arg
   1479 	mov	$inp,$_inp		# save inp, 2nd arg
   1480 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1481 	mov	%r11,$_rsp		# save copy of %rsp
   1482 ___
   1483 $code.=<<___ if ($win64);
   1484 	movaps	%xmm6,16*$SZ+32(%rsp)
   1485 	movaps	%xmm7,16*$SZ+48(%rsp)
   1486 	movaps	%xmm8,16*$SZ+64(%rsp)
   1487 	movaps	%xmm9,16*$SZ+80(%rsp)
   1488 ___
   1489 $code.=<<___ if ($win64 && $SZ>4);
   1490 	movaps	%xmm10,16*$SZ+96(%rsp)
   1491 	movaps	%xmm11,16*$SZ+112(%rsp)
   1492 ___
   1493 $code.=<<___;
   1494 .Lprologue_avx:
   1495 
   1496 	vzeroupper
   1497 	mov	$SZ*0($ctx),$A
   1498 	mov	$SZ*1($ctx),$B
   1499 	mov	$SZ*2($ctx),$C
   1500 	mov	$SZ*3($ctx),$D
   1501 	mov	$SZ*4($ctx),$E
   1502 	mov	$SZ*5($ctx),$F
   1503 	mov	$SZ*6($ctx),$G
   1504 	mov	$SZ*7($ctx),$H
   1505 ___
   1506 					if ($SZ==4) {	# SHA256
   1507     my @X = map("%xmm$_",(0..3));
   1508     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
   1509 
   1510 $code.=<<___;
   1511 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1512 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1513 	jmp	.Lloop_avx
   1514 .align	16
   1515 .Lloop_avx:
   1516 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1517 	vmovdqu	0x00($inp),@X[0]
   1518 	vmovdqu	0x10($inp),@X[1]
   1519 	vmovdqu	0x20($inp),@X[2]
   1520 	vmovdqu	0x30($inp),@X[3]
   1521 	vpshufb	$t3,@X[0],@X[0]
   1522 	lea	$TABLE(%rip),$Tbl
   1523 	vpshufb	$t3,@X[1],@X[1]
   1524 	vpshufb	$t3,@X[2],@X[2]
   1525 	vpaddd	0x00($Tbl),@X[0],$t0
   1526 	vpshufb	$t3,@X[3],@X[3]
   1527 	vpaddd	0x20($Tbl),@X[1],$t1
   1528 	vpaddd	0x40($Tbl),@X[2],$t2
   1529 	vpaddd	0x60($Tbl),@X[3],$t3
   1530 	vmovdqa	$t0,0x00(%rsp)
   1531 	mov	$A,$a1
   1532 	vmovdqa	$t1,0x10(%rsp)
   1533 	mov	$B,$a3
   1534 	vmovdqa	$t2,0x20(%rsp)
   1535 	xor	$C,$a3			# magic
   1536 	vmovdqa	$t3,0x30(%rsp)
   1537 	mov	$E,$a0
   1538 	jmp	.Lavx_00_47
   1539 
   1540 .align	16
   1541 .Lavx_00_47:
   1542 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1543 ___
   1544 sub Xupdate_256_AVX () {
   1545 	(
   1546 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
   1547 	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
   1548 	'&vpsrld	($t2,$t0,$sigma0[0]);',
   1549 	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
   1550 	'&vpsrld	($t3,$t0,$sigma0[2])',
   1551 	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
   1552 	'&vpxor		($t0,$t3,$t2)',
   1553 	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
   1554 	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1555 	'&vpxor		($t0,$t0,$t1)',
   1556 	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1557 	'&vpxor		($t0,$t0,$t2)',
   1558 	 '&vpsrld	($t2,$t3,$sigma1[2]);',
   1559 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
   1560 	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
   1561 	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
   1562 	 '&vpxor	($t2,$t2,$t3);',
   1563 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1564 	 '&vpxor	($t2,$t2,$t3)',
   1565 	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
   1566 	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
   1567 	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
   1568 	 '&vpsrld	($t2,$t3,$sigma1[2])',
   1569 	 '&vpsrlq	($t3,$t3,$sigma1[0])',
   1570 	 '&vpxor	($t2,$t2,$t3);',
   1571 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1572 	 '&vpxor	($t2,$t2,$t3)',
   1573 	 '&vpshufb	($t2,$t2,$t5)',
   1574 	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
   1575 	);
   1576 }
   1577 
   1578 sub AVX_256_00_47 () {
   1579 my $j = shift;
   1580 my $body = shift;
   1581 my @X = @_;
   1582 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1583 
   1584 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1585 	    eval;
   1586 	    eval(shift(@insns));
   1587 	    eval(shift(@insns));
   1588 	    eval(shift(@insns));
   1589 	}
   1590 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1591 	  foreach (@insns) { eval; }		# remaining instructions
   1592 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1593 }
   1594 
   1595     for ($i=0,$j=0; $j<4; $j++) {
   1596 	&AVX_256_00_47($j,\&body_00_15,@X);
   1597 	push(@X,shift(@X));			# rotate(@X)
   1598     }
   1599 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1600 	&jne	(".Lavx_00_47");
   1601 
   1602     for ($i=0; $i<16; ) {
   1603 	foreach(body_00_15()) { eval; }
   1604     }
   1605 
   1606 					} else {	# SHA512
   1607     my @X = map("%xmm$_",(0..7));
   1608     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1609 
   1610 $code.=<<___;
   1611 	jmp	.Lloop_avx
   1612 .align	16
   1613 .Lloop_avx:
   1614 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1615 	vmovdqu	0x00($inp),@X[0]
   1616 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1617 	vmovdqu	0x10($inp),@X[1]
   1618 	vmovdqu	0x20($inp),@X[2]
   1619 	vpshufb	$t3,@X[0],@X[0]
   1620 	vmovdqu	0x30($inp),@X[3]
   1621 	vpshufb	$t3,@X[1],@X[1]
   1622 	vmovdqu	0x40($inp),@X[4]
   1623 	vpshufb	$t3,@X[2],@X[2]
   1624 	vmovdqu	0x50($inp),@X[5]
   1625 	vpshufb	$t3,@X[3],@X[3]
   1626 	vmovdqu	0x60($inp),@X[6]
   1627 	vpshufb	$t3,@X[4],@X[4]
   1628 	vmovdqu	0x70($inp),@X[7]
   1629 	vpshufb	$t3,@X[5],@X[5]
   1630 	vpaddq	-0x80($Tbl),@X[0],$t0
   1631 	vpshufb	$t3,@X[6],@X[6]
   1632 	vpaddq	-0x60($Tbl),@X[1],$t1
   1633 	vpshufb	$t3,@X[7],@X[7]
   1634 	vpaddq	-0x40($Tbl),@X[2],$t2
   1635 	vpaddq	-0x20($Tbl),@X[3],$t3
   1636 	vmovdqa	$t0,0x00(%rsp)
   1637 	vpaddq	0x00($Tbl),@X[4],$t0
   1638 	vmovdqa	$t1,0x10(%rsp)
   1639 	vpaddq	0x20($Tbl),@X[5],$t1
   1640 	vmovdqa	$t2,0x20(%rsp)
   1641 	vpaddq	0x40($Tbl),@X[6],$t2
   1642 	vmovdqa	$t3,0x30(%rsp)
   1643 	vpaddq	0x60($Tbl),@X[7],$t3
   1644 	vmovdqa	$t0,0x40(%rsp)
   1645 	mov	$A,$a1
   1646 	vmovdqa	$t1,0x50(%rsp)
   1647 	mov	$B,$a3
   1648 	vmovdqa	$t2,0x60(%rsp)
   1649 	xor	$C,$a3			# magic
   1650 	vmovdqa	$t3,0x70(%rsp)
   1651 	mov	$E,$a0
   1652 	jmp	.Lavx_00_47
   1653 
   1654 .align	16
   1655 .Lavx_00_47:
   1656 	add	\$`16*2*$SZ`,$Tbl
   1657 ___
   1658 sub Xupdate_512_AVX () {
   1659 	(
   1660 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
   1661 	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
   1662 	'&vpsrlq	($t2,$t0,$sigma0[0])',
   1663 	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
   1664 	'&vpsrlq	($t3,$t0,$sigma0[2])',
   1665 	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
   1666 	 '&vpxor	($t0,$t3,$t2)',
   1667 	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1668 	 '&vpxor	($t0,$t0,$t1)',
   1669 	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1670 	 '&vpxor	($t0,$t0,$t2)',
   1671 	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
   1672 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
   1673 	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
   1674 	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
   1675 	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
   1676 	 '&vpxor	($t3,$t3,$t2)',
   1677 	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
   1678 	 '&vpxor	($t3,$t3,$t1)',
   1679 	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
   1680 	 '&vpxor	($t3,$t3,$t2)',
   1681 	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
   1682 	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
   1683 	);
   1684 }
   1685 
   1686 sub AVX_512_00_47 () {
   1687 my $j = shift;
   1688 my $body = shift;
   1689 my @X = @_;
   1690 my @insns = (&$body,&$body);			# 52 instructions
   1691 
   1692 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   1693 	    eval;
   1694 	    eval(shift(@insns));
   1695 	    eval(shift(@insns));
   1696 	}
   1697 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1698 	  foreach (@insns) { eval; }		# remaining instructions
   1699 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1700 }
   1701 
   1702     for ($i=0,$j=0; $j<8; $j++) {
   1703 	&AVX_512_00_47($j,\&body_00_15,@X);
   1704 	push(@X,shift(@X));			# rotate(@X)
   1705     }
   1706 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1707 	&jne	(".Lavx_00_47");
   1708 
   1709     for ($i=0; $i<16; ) {
   1710 	foreach(body_00_15()) { eval; }
   1711     }
   1712 }
   1713 $code.=<<___;
   1714 	mov	$_ctx,$ctx
   1715 	mov	$a1,$A
   1716 
   1717 	add	$SZ*0($ctx),$A
   1718 	lea	16*$SZ($inp),$inp
   1719 	add	$SZ*1($ctx),$B
   1720 	add	$SZ*2($ctx),$C
   1721 	add	$SZ*3($ctx),$D
   1722 	add	$SZ*4($ctx),$E
   1723 	add	$SZ*5($ctx),$F
   1724 	add	$SZ*6($ctx),$G
   1725 	add	$SZ*7($ctx),$H
   1726 
   1727 	cmp	$_end,$inp
   1728 
   1729 	mov	$A,$SZ*0($ctx)
   1730 	mov	$B,$SZ*1($ctx)
   1731 	mov	$C,$SZ*2($ctx)
   1732 	mov	$D,$SZ*3($ctx)
   1733 	mov	$E,$SZ*4($ctx)
   1734 	mov	$F,$SZ*5($ctx)
   1735 	mov	$G,$SZ*6($ctx)
   1736 	mov	$H,$SZ*7($ctx)
   1737 	jb	.Lloop_avx
   1738 
   1739 	mov	$_rsp,%rsi
   1740 	vzeroupper
   1741 ___
   1742 $code.=<<___ if ($win64);
   1743 	movaps	16*$SZ+32(%rsp),%xmm6
   1744 	movaps	16*$SZ+48(%rsp),%xmm7
   1745 	movaps	16*$SZ+64(%rsp),%xmm8
   1746 	movaps	16*$SZ+80(%rsp),%xmm9
   1747 ___
   1748 $code.=<<___ if ($win64 && $SZ>4);
   1749 	movaps	16*$SZ+96(%rsp),%xmm10
   1750 	movaps	16*$SZ+112(%rsp),%xmm11
   1751 ___
   1752 $code.=<<___;
   1753 	mov	(%rsi),%r15
   1754 	mov	8(%rsi),%r14
   1755 	mov	16(%rsi),%r13
   1756 	mov	24(%rsi),%r12
   1757 	mov	32(%rsi),%rbp
   1758 	mov	40(%rsi),%rbx
   1759 	lea	48(%rsi),%rsp
   1760 .Lepilogue_avx:
   1761 	ret
   1762 .size	${func}_avx,.-${func}_avx
   1763 ___
   1764 
   1765 if ($avx>1) {{
   1766 ######################################################################
   1767 # AVX2+BMI code path
   1768 #
   1769 my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp 
   1770 my $PUSH8=8*2*$SZ;
   1771 use integer;
   1772 
   1773 sub bodyx_00_15 () {
	# on entry $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
   1775 	(
   1776 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
   1777 
   1778 	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
   1779 	'&and	($a4,$e)',		# f&e
   1780 	'&rorx	($a0,$e,$Sigma1[2])',
   1781 	'&rorx	($a2,$e,$Sigma1[1])',
   1782 
   1783 	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
   1784 	'&lea	($h,"($h,$a4)")',
   1785 	'&andn	($a4,$e,$g)',		# ~e&g
   1786 	'&xor	($a0,$a2)',
   1787 
   1788 	'&rorx	($a1,$e,$Sigma1[0])',
   1789 	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
   1790 	'&xor	($a0,$a1)',		# Sigma1(e)
   1791 	'&mov	($a2,$a)',
   1792 
   1793 	'&rorx	($a4,$a,$Sigma0[2])',
   1794 	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
   1795 	'&xor	($a2,$b)',		# a^b, b^c in next round
   1796 	'&rorx	($a1,$a,$Sigma0[1])',
   1797 
   1798 	'&rorx	($a0,$a,$Sigma0[0])',
   1799 	'&lea	($d,"($d,$h)")',	# d+=h
   1800 	'&and	($a3,$a2)',		# (b^c)&(a^b)
   1801 	'&xor	($a1,$a4)',
   1802 
   1803 	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
   1804 	'&xor	($a1,$a0)',		# Sigma0(a)
   1805 	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
   1806 	'&mov	($a4,$e)',		# copy of f in future
   1807 
   1808 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
   1809 	);
	# and at the end one still has to do $a+=$a1
   1811 }
   1812 
   1813 $code.=<<___;
   1814 .type	${func}_avx2,\@function,3
   1815 .align	64
   1816 ${func}_avx2:
   1817 .Lavx2_shortcut:
   1818 	push	%rbx
   1819 	push	%rbp
   1820 	push	%r12
   1821 	push	%r13
   1822 	push	%r14
   1823 	push	%r15
   1824 	mov	%rsp,%r11		# copy %rsp
   1825 	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
   1826 	shl	\$4,%rdx		# num*16
   1827 	and	\$-256*$SZ,%rsp		# align stack frame
   1828 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1829 	add	\$`2*$SZ*($rounds-8)`,%rsp
   1830 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
   1832 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1833 	mov	%r11,$_rsp		# save copy of %rsp
   1834 ___
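# Stack layout, best-effort reading of the prologue above: %rsp is
# dropped by room for the X+K spill area, the four saved pointers and
# [on Win64] the xmm save area, aligned down to a 256*$SZ boundary,
# then raised by 2*$SZ*($rounds-8).  The body below repeatedly does
# "lea -$PUSH8(%rsp),%rsp", consuming the frame in $PUSH8-sized
# slices, and the .Lower_avx2 pass later walks $Tbl back over the
# same spill area.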
   1835 $code.=<<___ if ($win64);
   1836 	movaps	%xmm6,16*$SZ+32(%rsp)
   1837 	movaps	%xmm7,16*$SZ+48(%rsp)
   1838 	movaps	%xmm8,16*$SZ+64(%rsp)
   1839 	movaps	%xmm9,16*$SZ+80(%rsp)
   1840 ___
   1841 $code.=<<___ if ($win64 && $SZ>4);
   1842 	movaps	%xmm10,16*$SZ+96(%rsp)
   1843 	movaps	%xmm11,16*$SZ+112(%rsp)
   1844 ___
   1845 $code.=<<___;
   1846 .Lprologue_avx2:
   1847 
   1848 	vzeroupper
   1849 	sub	\$-16*$SZ,$inp		# inp++, size optimization
   1850 	mov	$SZ*0($ctx),$A
   1851 	mov	$inp,%r12		# borrow $T1
   1852 	mov	$SZ*1($ctx),$B
   1853 	cmp	%rdx,$inp		# $_end
   1854 	mov	$SZ*2($ctx),$C
   1855 	cmove	%rsp,%r12		# next block or random data
   1856 	mov	$SZ*3($ctx),$D
   1857 	mov	$SZ*4($ctx),$E
   1858 	mov	$SZ*5($ctx),$F
   1859 	mov	$SZ*6($ctx),$G
   1860 	mov	$SZ*7($ctx),$H
   1861 ___
   1862 					if ($SZ==4) {	# SHA256
   1863     my @X = map("%ymm$_",(0..3));
   1864     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
   1865 
   1866 $code.=<<___;
   1867 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1868 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1869 	jmp	.Loop_avx2
   1870 .align	16
   1871 .Loop_avx2:
   1872 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1873 	vmovdqu	-16*$SZ+0($inp),%xmm0
   1874 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1875 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1876 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1877 	#mov		$inp,$_inp	# offload $inp
   1878 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1879 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1880 	vpshufb		$t3,@X[0],@X[0]
   1881 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1882 	vpshufb		$t3,@X[1],@X[1]
   1883 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1884 
   1885 	lea	$TABLE(%rip),$Tbl
   1886 	vpshufb	$t3,@X[2],@X[2]
   1887 	vpaddd	0x00($Tbl),@X[0],$t0
   1888 	vpshufb	$t3,@X[3],@X[3]
   1889 	vpaddd	0x20($Tbl),@X[1],$t1
   1890 	vpaddd	0x40($Tbl),@X[2],$t2
   1891 	vpaddd	0x60($Tbl),@X[3],$t3
   1892 	vmovdqa	$t0,0x00(%rsp)
   1893 	xor	$a1,$a1
   1894 	vmovdqa	$t1,0x20(%rsp)
   1895 	lea	-$PUSH8(%rsp),%rsp
   1896 	mov	$B,$a3
   1897 	vmovdqa	$t2,0x00(%rsp)
   1898 	xor	$C,$a3			# magic
   1899 	vmovdqa	$t3,0x20(%rsp)
   1900 	mov	$F,$a4
   1901 	sub	\$-16*2*$SZ,$Tbl	# size optimization
   1902 	jmp	.Lavx2_00_47
   1903 
   1904 .align	16
   1905 .Lavx2_00_47:
   1906 ___
   1907 
   1908 sub AVX2_256_00_47 () {
   1909 my $j = shift;
   1910 my $body = shift;
   1911 my @X = @_;
   1912 my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
   1913 my $base = "+2*$PUSH8(%rsp)";
   1914 
   1915 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
   1916 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1917 	    eval;
   1918 	    eval(shift(@insns));
   1919 	    eval(shift(@insns));
   1920 	    eval(shift(@insns));
   1921 	}
   1922 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1923 	  foreach (@insns) { eval; }		# remaining instructions
   1924 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   1925 }
   1926 
   1927     for ($i=0,$j=0; $j<4; $j++) {
   1928 	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
   1929 	push(@X,shift(@X));			# rotate(@X)
   1930     }
   1931 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   1932 	&cmpb	(($SZ-1)."($Tbl)",0);
   1933 	&jne	(".Lavx2_00_47");
   1934 
   1935     for ($i=0; $i<16; ) {
   1936 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   1937 	foreach(bodyx_00_15()) { eval; }
   1938     }
   1939 					} else {	# SHA512
   1940     my @X = map("%ymm$_",(0..7));
   1941     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
   1942 
   1943 $code.=<<___;
   1944 	jmp	.Loop_avx2
   1945 .align	16
   1946 .Loop_avx2:
   1947 	vmovdqu	-16*$SZ($inp),%xmm0
   1948 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1949 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1950 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1951 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1952 	vmovdqu	-16*$SZ+64($inp),%xmm4
   1953 	vmovdqu	-16*$SZ+80($inp),%xmm5
   1954 	vmovdqu	-16*$SZ+96($inp),%xmm6
   1955 	vmovdqu	-16*$SZ+112($inp),%xmm7
   1956 	#mov	$inp,$_inp	# offload $inp
   1957 	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
   1958 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1959 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1960 	 vpshufb	$t2,@X[0],@X[0]
   1961 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1962 	 vpshufb	$t2,@X[1],@X[1]
   1963 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1964 	 vpshufb	$t2,@X[2],@X[2]
   1965 	vinserti128	\$1,64(%r12),@X[4],@X[4]
   1966 	 vpshufb	$t2,@X[3],@X[3]
   1967 	vinserti128	\$1,80(%r12),@X[5],@X[5]
   1968 	 vpshufb	$t2,@X[4],@X[4]
   1969 	vinserti128	\$1,96(%r12),@X[6],@X[6]
   1970 	 vpshufb	$t2,@X[5],@X[5]
   1971 	vinserti128	\$1,112(%r12),@X[7],@X[7]
   1972 
   1973 	vpaddq	-0x80($Tbl),@X[0],$t0
   1974 	vpshufb	$t2,@X[6],@X[6]
   1975 	vpaddq	-0x60($Tbl),@X[1],$t1
   1976 	vpshufb	$t2,@X[7],@X[7]
   1977 	vpaddq	-0x40($Tbl),@X[2],$t2
   1978 	vpaddq	-0x20($Tbl),@X[3],$t3
   1979 	vmovdqa	$t0,0x00(%rsp)
   1980 	vpaddq	0x00($Tbl),@X[4],$t0
   1981 	vmovdqa	$t1,0x20(%rsp)
   1982 	vpaddq	0x20($Tbl),@X[5],$t1
   1983 	vmovdqa	$t2,0x40(%rsp)
   1984 	vpaddq	0x40($Tbl),@X[6],$t2
   1985 	vmovdqa	$t3,0x60(%rsp)
   1986 	lea	-$PUSH8(%rsp),%rsp
   1987 	vpaddq	0x60($Tbl),@X[7],$t3
   1988 	vmovdqa	$t0,0x00(%rsp)
   1989 	xor	$a1,$a1
   1990 	vmovdqa	$t1,0x20(%rsp)
   1991 	mov	$B,$a3
   1992 	vmovdqa	$t2,0x40(%rsp)
   1993 	xor	$C,$a3			# magic
   1994 	vmovdqa	$t3,0x60(%rsp)
   1995 	mov	$F,$a4
   1996 	add	\$16*2*$SZ,$Tbl
   1997 	jmp	.Lavx2_00_47
   1998 
   1999 .align	16
   2000 .Lavx2_00_47:
   2001 ___
   2002 
   2003 sub AVX2_512_00_47 () {
   2004 my $j = shift;
   2005 my $body = shift;
   2006 my @X = @_;
   2007 my @insns = (&$body,&$body);			# 48 instructions
   2008 my $base = "+2*$PUSH8(%rsp)";
   2009 
   2010 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
   2011 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   2012 	    eval;
   2013 	    if ($_ !~ /\;$/) {
   2014 		eval(shift(@insns));
   2015 		eval(shift(@insns));
   2016 		eval(shift(@insns));
   2017 	    }
   2018 	}
   2019 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   2020 	  foreach (@insns) { eval; }		# remaining instructions
   2021 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   2022 }
   2023 
   2024     for ($i=0,$j=0; $j<8; $j++) {
   2025 	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
   2026 	push(@X,shift(@X));			# rotate(@X)
   2027     }
   2028 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   2029 	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
   2030 	&jne	(".Lavx2_00_47");
   2031 
   2032     for ($i=0; $i<16; ) {
   2033 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   2034 	foreach(bodyx_00_15()) { eval; }
   2035     }
   2036 }
   2037 $code.=<<___;
   2038 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   2039 	add	$a1,$A
   2040 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   2041 	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
   2042 
   2043 	add	$SZ*0($ctx),$A
   2044 	add	$SZ*1($ctx),$B
   2045 	add	$SZ*2($ctx),$C
   2046 	add	$SZ*3($ctx),$D
   2047 	add	$SZ*4($ctx),$E
   2048 	add	$SZ*5($ctx),$F
   2049 	add	$SZ*6($ctx),$G
   2050 	add	$SZ*7($ctx),$H
   2051 
   2052 	mov	$A,$SZ*0($ctx)
   2053 	mov	$B,$SZ*1($ctx)
   2054 	mov	$C,$SZ*2($ctx)
   2055 	mov	$D,$SZ*3($ctx)
   2056 	mov	$E,$SZ*4($ctx)
   2057 	mov	$F,$SZ*5($ctx)
   2058 	mov	$G,$SZ*6($ctx)
   2059 	mov	$H,$SZ*7($ctx)
   2060 
   2061 	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
   2062 	je	.Ldone_avx2
   2063 
   2064 	xor	$a1,$a1
   2065 	mov	$B,$a3
   2066 	xor	$C,$a3			# magic
   2067 	mov	$F,$a4
   2068 	jmp	.Lower_avx2
   2069 .align	16
   2070 .Lower_avx2:
   2071 ___
   2072     for ($i=0; $i<8; ) {
   2073 	my $base="+16($Tbl)";
   2074 	foreach(bodyx_00_15()) { eval; }
   2075     }
   2076 $code.=<<___;
   2077 	lea	-$PUSH8($Tbl),$Tbl
   2078 	cmp	%rsp,$Tbl
   2079 	jae	.Lower_avx2
   2080 
   2081 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   2082 	add	$a1,$A
   2083 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   2084 	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
   2085 
   2086 	add	$SZ*0($ctx),$A
   2087 	add	$SZ*1($ctx),$B
   2088 	add	$SZ*2($ctx),$C
   2089 	add	$SZ*3($ctx),$D
   2090 	add	$SZ*4($ctx),$E
   2091 	add	$SZ*5($ctx),$F
   2092 	lea	`2*16*$SZ`($inp),$inp	# inp+=2
   2093 	add	$SZ*6($ctx),$G
   2094 	mov	$inp,%r12
   2095 	add	$SZ*7($ctx),$H
   2096 	cmp	$_end,$inp
   2097 
   2098 	mov	$A,$SZ*0($ctx)
   2099 	cmove	%rsp,%r12		# next block or stale data
   2100 	mov	$B,$SZ*1($ctx)
   2101 	mov	$C,$SZ*2($ctx)
   2102 	mov	$D,$SZ*3($ctx)
   2103 	mov	$E,$SZ*4($ctx)
   2104 	mov	$F,$SZ*5($ctx)
   2105 	mov	$G,$SZ*6($ctx)
   2106 	mov	$H,$SZ*7($ctx)
   2107 
   2108 	jbe	.Loop_avx2
   2109 	lea	(%rsp),$Tbl
   2110 
   2111 .Ldone_avx2:
   2112 	lea	($Tbl),%rsp
   2113 	mov	$_rsp,%rsi
   2114 	vzeroupper
   2115 ___
   2116 $code.=<<___ if ($win64);
   2117 	movaps	16*$SZ+32(%rsp),%xmm6
   2118 	movaps	16*$SZ+48(%rsp),%xmm7
   2119 	movaps	16*$SZ+64(%rsp),%xmm8
   2120 	movaps	16*$SZ+80(%rsp),%xmm9
   2121 ___
   2122 $code.=<<___ if ($win64 && $SZ>4);
   2123 	movaps	16*$SZ+96(%rsp),%xmm10
   2124 	movaps	16*$SZ+112(%rsp),%xmm11
   2125 ___
   2126 $code.=<<___;
   2127 	mov	(%rsi),%r15
   2128 	mov	8(%rsi),%r14
   2129 	mov	16(%rsi),%r13
   2130 	mov	24(%rsi),%r12
   2131 	mov	32(%rsi),%rbp
   2132 	mov	40(%rsi),%rbx
   2133 	lea	48(%rsi),%rsp
   2134 .Lepilogue_avx2:
   2135 	ret
   2136 .size	${func}_avx2,.-${func}_avx2
   2137 ___
   2138 }}
   2139 }}}}}
   2140 
   2141 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2142 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   2143 if ($win64) {
   2144 $rec="%rcx";
   2145 $frame="%rdx";
   2146 $context="%r8";
   2147 $disp="%r9";
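# se_handler: if an exception hits between .Lprologue* and
# .Lepilogue* [see HandlerData in .xdata below], recover the caller's
# %rsp from $_rsp [with an extra re-alignment step for the AVX2
# frame], feed the six saved GPRs - and, for the SIMD paths, xmm6 and
# up from the save area - back into the CONTEXT record, then let
# RtlVirtualUnwind continue the unwind.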
   2148 
   2149 $code.=<<___;
   2150 .extern	__imp_RtlVirtualUnwind
   2151 .type	se_handler,\@abi-omnipotent
   2152 .align	16
   2153 se_handler:
   2154 	push	%rsi
   2155 	push	%rdi
   2156 	push	%rbx
   2157 	push	%rbp
   2158 	push	%r12
   2159 	push	%r13
   2160 	push	%r14
   2161 	push	%r15
   2162 	pushfq
   2163 	sub	\$64,%rsp
   2164 
   2165 	mov	120($context),%rax	# pull context->Rax
   2166 	mov	248($context),%rbx	# pull context->Rip
   2167 
   2168 	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
   2170 
   2171 	mov	0(%r11),%r10d		# HandlerData[0]
   2172 	lea	(%rsi,%r10),%r10	# prologue label
   2173 	cmp	%r10,%rbx		# context->Rip<prologue label
   2174 	jb	.Lin_prologue
   2175 
   2176 	mov	152($context),%rax	# pull context->Rsp
   2177 
   2178 	mov	4(%r11),%r10d		# HandlerData[1]
   2179 	lea	(%rsi,%r10),%r10	# epilogue label
   2180 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2181 	jae	.Lin_prologue
   2182 ___
   2183 $code.=<<___ if ($avx>1);
   2184 	lea	.Lavx2_shortcut(%rip),%r10
   2185 	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
   2186 	jb	.Lnot_in_avx2
   2187 
   2188 	and	\$-256*$SZ,%rax
   2189 	add	\$`2*$SZ*($rounds-8)`,%rax
   2190 .Lnot_in_avx2:
   2191 ___
   2192 $code.=<<___;
   2193 	mov	%rax,%rsi		# put aside Rsp
   2194 	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
   2195 	lea	48(%rax),%rax
   2196 
   2197 	mov	-8(%rax),%rbx
   2198 	mov	-16(%rax),%rbp
   2199 	mov	-24(%rax),%r12
   2200 	mov	-32(%rax),%r13
   2201 	mov	-40(%rax),%r14
   2202 	mov	-48(%rax),%r15
   2203 	mov	%rbx,144($context)	# restore context->Rbx
   2204 	mov	%rbp,160($context)	# restore context->Rbp
   2205 	mov	%r12,216($context)	# restore context->R12
   2206 	mov	%r13,224($context)	# restore context->R13
   2207 	mov	%r14,232($context)	# restore context->R14
   2208 	mov	%r15,240($context)	# restore context->R15
   2209 
   2210 	lea	.Lepilogue(%rip),%r10
   2211 	cmp	%r10,%rbx
   2212 	jb	.Lin_prologue		# non-AVX code
   2213 
   2214 	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
   2215 	lea	512($context),%rdi	# &context.Xmm6
   2216 	mov	\$`$SZ==4?8:12`,%ecx
   2217 	.long	0xa548f3fc		# cld; rep movsq
   2218 
   2219 .Lin_prologue:
   2220 	mov	8(%rax),%rdi
   2221 	mov	16(%rax),%rsi
   2222 	mov	%rax,152($context)	# restore context->Rsp
   2223 	mov	%rsi,168($context)	# restore context->Rsi
   2224 	mov	%rdi,176($context)	# restore context->Rdi
   2225 
   2226 	mov	40($disp),%rdi		# disp->ContextRecord
   2227 	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
   2229 	.long	0xa548f3fc		# cld; rep movsq
   2230 
   2231 	mov	$disp,%rsi
   2232 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2233 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2234 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2235 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2236 	mov	40(%rsi),%r10		# disp->ContextRecord
   2237 	lea	56(%rsi),%r11		# &disp->HandlerData
   2238 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2239 	mov	%r10,32(%rsp)		# arg5
   2240 	mov	%r11,40(%rsp)		# arg6
   2241 	mov	%r12,48(%rsp)		# arg7
   2242 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2243 	call	*__imp_RtlVirtualUnwind(%rip)
   2244 
   2245 	mov	\$1,%eax		# ExceptionContinueSearch
   2246 	add	\$64,%rsp
   2247 	popfq
   2248 	pop	%r15
   2249 	pop	%r14
   2250 	pop	%r13
   2251 	pop	%r12
   2252 	pop	%rbp
   2253 	pop	%rbx
   2254 	pop	%rdi
   2255 	pop	%rsi
   2256 	ret
   2257 .size	se_handler,.-se_handler
   2258 ___
   2259 
   2260 $code.=<<___ if ($SZ==4 && $shaext);
   2261 .type	shaext_handler,\@abi-omnipotent
   2262 .align	16
   2263 shaext_handler:
   2264 	push	%rsi
   2265 	push	%rdi
   2266 	push	%rbx
   2267 	push	%rbp
   2268 	push	%r12
   2269 	push	%r13
   2270 	push	%r14
   2271 	push	%r15
   2272 	pushfq
   2273 	sub	\$64,%rsp
   2274 
   2275 	mov	120($context),%rax	# pull context->Rax
   2276 	mov	248($context),%rbx	# pull context->Rip
   2277 
   2278 	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
   2280 	jb	.Lin_prologue
   2281 
   2282 	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
   2284 	jae	.Lin_prologue
   2285 
   2286 	lea	-8-5*16(%rax),%rsi
   2287 	lea	512($context),%rdi	# &context.Xmm6
   2288 	mov	\$10,%ecx
   2289 	.long	0xa548f3fc		# cld; rep movsq
   2290 
   2291 	jmp	.Lin_prologue
   2292 .size	shaext_handler,.-shaext_handler
   2293 ___
   2294 
   2295 $code.=<<___;
   2296 .section	.pdata
   2297 .align	4
   2298 	.rva	.LSEH_begin_$func
   2299 	.rva	.LSEH_end_$func
   2300 	.rva	.LSEH_info_$func
   2301 ___
   2302 $code.=<<___ if ($SZ==4 && $shaext);
   2303 	.rva	.LSEH_begin_${func}_shaext
   2304 	.rva	.LSEH_end_${func}_shaext
   2305 	.rva	.LSEH_info_${func}_shaext
   2306 ___
   2307 $code.=<<___ if ($SZ==4);
   2308 	.rva	.LSEH_begin_${func}_ssse3
   2309 	.rva	.LSEH_end_${func}_ssse3
   2310 	.rva	.LSEH_info_${func}_ssse3
   2311 ___
   2312 $code.=<<___ if ($avx && $SZ==8);
   2313 	.rva	.LSEH_begin_${func}_xop
   2314 	.rva	.LSEH_end_${func}_xop
   2315 	.rva	.LSEH_info_${func}_xop
   2316 ___
   2317 $code.=<<___ if ($avx);
   2318 	.rva	.LSEH_begin_${func}_avx
   2319 	.rva	.LSEH_end_${func}_avx
   2320 	.rva	.LSEH_info_${func}_avx
   2321 ___
   2322 $code.=<<___ if ($avx>1);
   2323 	.rva	.LSEH_begin_${func}_avx2
   2324 	.rva	.LSEH_end_${func}_avx2
   2325 	.rva	.LSEH_info_${func}_avx2
   2326 ___
   2327 $code.=<<___;
   2328 .section	.xdata
   2329 .align	8
   2330 .LSEH_info_$func:
   2331 	.byte	9,0,0,0
   2332 	.rva	se_handler
   2333 	.rva	.Lprologue,.Lepilogue			# HandlerData[]
   2334 ___
   2335 $code.=<<___ if ($SZ==4 && $shaext);
   2336 .LSEH_info_${func}_shaext:
   2337 	.byte	9,0,0,0
   2338 	.rva	shaext_handler
   2339 ___
   2340 $code.=<<___ if ($SZ==4);
   2341 .LSEH_info_${func}_ssse3:
   2342 	.byte	9,0,0,0
   2343 	.rva	se_handler
   2344 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   2345 ___
   2346 $code.=<<___ if ($avx && $SZ==8);
   2347 .LSEH_info_${func}_xop:
   2348 	.byte	9,0,0,0
   2349 	.rva	se_handler
   2350 	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
   2351 ___
   2352 $code.=<<___ if ($avx);
   2353 .LSEH_info_${func}_avx:
   2354 	.byte	9,0,0,0
   2355 	.rva	se_handler
   2356 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   2357 ___
   2358 $code.=<<___ if ($avx>1);
   2359 .LSEH_info_${func}_avx2:
   2360 	.byte	9,0,0,0
   2361 	.rva	se_handler
   2362 	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
   2363 ___
   2364 }
   2365 
   2366 sub sha256op38 {
   2367     my $instr = shift;
   2368     my %opcodelet = (
   2369 		"sha256rnds2" => 0xcb,
   2370   		"sha256msg1"  => 0xcc,
   2371 		"sha256msg2"  => 0xcd	);
   2372 
   2373     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
   2374       my @opcode=(0x0f,0x38);
   2375 	push @opcode,$opcodelet{$instr};
   2376 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2377 	return ".byte\t".join(',',@opcode);
   2378     } else {
   2379 	return $instr."\t".@_[0];
   2380     }
   2381 }
   2382 
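# Final pass over $code: evaluate the `...` arithmetic placeholders
# and rewrite sha256* mnemonics through sha256op38, presumably so the
# output assembles even with toolchains that predate the SHA
# extensions.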
   2383 foreach (split("\n",$code)) {
   2384 	s/\`([^\`]*)\`/eval $1/geo;
   2385 
   2386 	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
   2387 
   2388 	print $_,"\n";
   2389 }
   2390 close STDOUT;
   2391