      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. Rights for redistribution and usage in source and binary
      6 # forms are granted according to the OpenSSL license.
      7 # ====================================================================
      8 #
      9 # sha256/512_block procedure for x86_64.
     10 #
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder why
# gcc [even armed with inline assembler] fails to generate code as fast.
# The only thing which is cool about this module is that the very same
# instruction sequence is used for both SHA-256 and SHA-512. In the
# former case the instructions operate on 32-bit operands, in the latter
# on 64-bit ones. All I had to do was get one flavor right; the other
# one passed the test right away:-)
     20 #
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], sustains close to 4 instructions per
# CPU clock cycle and runs in 1003 cycles, then 1275 is a very good
# result for the 3-way issue Opteron pipeline with X[16] maintained in
# memory. So *if* there is a way to improve it, *then* the only way
# would be to offload the X[16] updates to the SSE unit, but that would
# require "deeper" loop unrolling, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# only *if* it's actually possible to noticeably improve the overall
# instruction-level parallelism (ILP) on a given CPU implementation.
     35 #
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions there, but are implemented
# in microcode.
     42 #
     43 # May 2012.
     44 #
# An optimization including one of Pavel Semjanov's ideas, an
# alternative Maj, resulted in a >=5% improvement on most CPUs, +20%
# for SHA256 and unfortunately -2% for SHA512 on P4 [which nobody
# should care about that much].
     49 #
     50 # June 2012.
     51 #
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with the specifics of their architecture [which is a topic for
# a separate discussion].
     62 #
     63 # November 2012.
     64 #
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD instruction
# sequence as for AVX, but with %ymm registers as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256 and
# 1152 in SHA512, plus a 1.2KB code size increase.
     72 #
     73 # March 2014.
     74 #
     75 # Add support for Intel SHA Extensions.
     76 
     77 ######################################################################
     78 # Current performance in cycles per processed byte (less is better):
     79 #
     80 #		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
     81 #
     82 # AMD K8	14.9	-	    -		    9.57    -
     83 # P4		17.3	-	    -		    30.8    -
     84 # Core 2	15.6	13.8(+13%)  -		    9.97    -
     85 # Westmere	14.8	12.3(+19%)  -		    9.58    -
     86 # Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
     87 # Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
     88 # Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
     89 # Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
     90 # Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
     91 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
     92 # Atom		23.0	18.9(+22%)  -		    14.7    -
     93 # Silvermont	27.4	20.6(+33%)  -               17.5    -
     94 # Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
     95 #
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below a certain limit makes no difference/sense; to conserve
#	space the SHA256 XOP code path is therefore omitted;
    102 
    103 $flavour = shift;
    104 $output  = shift;
    105 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
    106 
    107 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    108 
    109 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    110 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    111 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    112 die "can't locate x86_64-xlate.pl";
    113 
    114 # In upstream, this is controlled by shelling out to the compiler to check
    115 # versions, but BoringSSL is intended to be used with pre-generated perlasm
    116 # output, so this isn't useful anyway.
    117 #
    118 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
    119 # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
    120 # did not tie them together until after $shaext was added.
    121 $avx = 1;
    122 
    123 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
    124 # been tested.
    125 $shaext=0;	### set to zero if compiling for 1.0.1
    126 $avx=1		if (!$shaext && $avx);
    127 
    128 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    129 *STDOUT=*OUT;
    130 
    131 if ($output =~ /512/) {
    132 	$func="sha512_block_data_order";
    133 	$TABLE="K512";
    134 	$SZ=8;
    135 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
    136 					"%r8", "%r9", "%r10","%r11");
    137 	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
    138 	@Sigma0=(28,34,39);
    139 	@Sigma1=(14,18,41);
    140 	@sigma0=(1,  8, 7);
    141 	@sigma1=(19,61, 6);
    142 	$rounds=80;
    143 } else {
    144 	$func="sha256_block_data_order";
    145 	$TABLE="K256";
    146 	$SZ=4;
    147 	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
    148 					"%r8d","%r9d","%r10d","%r11d");
    149 	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
    150 	@Sigma0=( 2,13,22);
    151 	@Sigma1=( 6,11,25);
    152 	@sigma0=( 7,18, 3);
    153 	@sigma1=(17,19,10);
    154 	$rounds=64;
    155 }
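
# A minimal reference sketch of one compression round, for illustration only:
# nothing below calls it and it has no effect on the generated code. It
# assumes the 32-bit SHA-256 flavour and spells out the T1/T2 updates that
# ROUND_00_15 below implements; instead of moving eight values around, the
# generated code renames the registers via @ROT every round.
sub _ref_round256 {
	my ($Xi,$Ki,@S) = @_;		# schedule word, round constant, (a..h)
	my ($a,$b,$c,$d,$e,$f,$g,$h) = @S;
	my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff };
	my $S1  = $rotr->($e,6)^$rotr->($e,11)^$rotr->($e,25);	# Sigma1(e)
	my $S0  = $rotr->($a,2)^$rotr->($a,13)^$rotr->($a,22);	# Sigma0(a)
	my $ch  = ($e&$f)^((~$e&0xffffffff)&$g);		# Ch(e,f,g)
	my $maj = ($a&$b)^($a&$c)^($b&$c);			# Maj(a,b,c)
	my $T1  = ($h+$S1+$ch+$Ki+$Xi)&0xffffffff;
	my $T2  = ($S0+$maj)&0xffffffff;
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}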
    156 
    157 $ctx="%rdi";	# 1st arg, zapped by $a3
    158 $inp="%rsi";	# 2nd arg
    159 $Tbl="%rbp";
    160 
    161 $_ctx="16*$SZ+0*8(%rsp)";
    162 $_inp="16*$SZ+1*8(%rsp)";
    163 $_end="16*$SZ+2*8(%rsp)";
    164 $_rsp="16*$SZ+3*8(%rsp)";
    165 $framesz="16*$SZ+4*8";
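
# Stack frame used by the scalar code path (laid out by the prologue below):
#	0(%rsp) .. 16*$SZ-1	X[0..15] message schedule ring
#	16*$SZ+0*8(%rsp)	saved ctx pointer	($_ctx)
#	16*$SZ+1*8(%rsp)	saved input pointer	($_inp)
#	16*$SZ+2*8(%rsp)	input end pointer	($_end)
#	16*$SZ+3*8(%rsp)	caller's %rsp		($_rsp)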
    166 
    167 
    168 sub ROUND_00_15()
    169 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    170   my $STRIDE=$SZ;
    171      $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
    172 
    173 $code.=<<___;
    174 	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
    175 	mov	$f,$a2
    176 
    177 	xor	$e,$a0
    178 	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
    179 	xor	$g,$a2			# f^g
    180 
    181 	mov	$T1,`$SZ*($i&0xf)`(%rsp)
    182 	xor	$a,$a1
    183 	and	$e,$a2			# (f^g)&e
    184 
    185 	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
    186 	add	$h,$T1			# T1+=h
    187 	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
    188 
    189 	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
    190 	xor	$e,$a0
    191 	add	$a2,$T1			# T1+=Ch(e,f,g)
    192 
    193 	mov	$a,$a2
    194 	add	($Tbl),$T1		# T1+=K[round]
    195 	xor	$a,$a1
    196 
    197 	xor	$b,$a2			# a^b, b^c in next round
    198 	ror	\$$Sigma1[0],$a0	# Sigma1(e)
    199 	mov	$b,$h
    200 
    201 	and	$a2,$a3
    202 	ror	\$$Sigma0[0],$a1	# Sigma0(a)
    203 	add	$a0,$T1			# T1+=Sigma1(e)
    204 
    205 	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
    206 	add	$T1,$d			# d+=T1
    207 	add	$T1,$h			# h+=T1
    208 
    209 	lea	$STRIDE($Tbl),$Tbl	# round++
    210 ___
    211 $code.=<<___ if ($i<15);
    212 	add	$a1,$h			# h+=Sigma0(a)
    213 ___
    214 	($a2,$a3) = ($a3,$a2);
    215 }
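# The ror/xor chains above evaluate Sigma1(e) (in $a0) and Sigma0(a) (in $a1)
# using a single temporary each: rotations are done by the *differences*
# between the counts, xor-ing $e (resp. $a) back in between steps. A sketch
# of the identity for the 32-bit flavour, where @Sigma1=(6,11,25):
#
#	Sigma1(e) = ROTR^6(e) ^ ROTR^11(e) ^ ROTR^25(e)
#	          = ROTR^6( ROTR^(11-6)( ROTR^(25-11)(e) ^ e ) ^ e )
#
# i.e. exactly the $Sigma1[2]-$Sigma1[1], $Sigma1[1]-$Sigma1[0], $Sigma1[0]
# rotate sequence interleaved with the Ch/Maj arithmetic above.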
    216 
    217 sub ROUND_16_XX()
    218 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    219 
    220 $code.=<<___;
    221 	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
    222 	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
    223 
    224 	mov	$a0,$T1
    225 	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
    226 	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
    227 	mov	$a2,$a1
    228 	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
    229 
    230 	xor	$T1,$a0
    231 	shr	\$$sigma0[2],$T1
    232 	ror	\$$sigma0[0],$a0
    233 	xor	$a1,$a2
    234 	shr	\$$sigma1[2],$a1
    235 
    236 	ror	\$$sigma1[0],$a2
    237 	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
    238 	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
    239 	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
    240 
    241 	add	`$SZ*($i&0xf)`(%rsp),$T1
    242 	mov	$e,$a0
    243 	add	$a2,$T1
    244 	mov	$a,$a1
    245 ___
    246 	&ROUND_00_15(@_);
    247 }
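
# A reference sketch of the message-schedule update performed by ROUND_16_XX,
# for illustration only: it is never called and assumes the 32-bit SHA-256
# flavour. The schedule lives in a 16-word ring on the stack, indexed modulo
# 16, so W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] becomes:
sub _ref_Xupdate256 {
	my ($X,$i) = @_;		# $X = ref to 16-word ring, $i >= 16
	my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff };
	my $w1  = $X->[($i+1)&15];	# W[i-15]
	my $w14 = $X->[($i+14)&15];	# W[i-2]
	my $s0  = $rotr->($w1,7)^$rotr->($w1,18)^($w1>>3);	# sigma0(W[i-15])
	my $s1  = $rotr->($w14,17)^$rotr->($w14,19)^($w14>>10);# sigma1(W[i-2])
	$X->[$i&15] = ($X->[$i&15]+$s0+$s1+$X->[($i+9)&15])&0xffffffff;
}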
    248 
    249 $code=<<___;
    250 .text
    251 
    252 .extern	OPENSSL_ia32cap_P
    253 .globl	$func
    254 .type	$func,\@function,3
    255 .align	16
    256 $func:
    257 ___
    258 $code.=<<___ if ($SZ==4 || $avx);
    259 	leaq	OPENSSL_ia32cap_P(%rip),%r11
    260 	mov	0(%r11),%r9d
    261 	mov	4(%r11),%r10d
    262 	mov	8(%r11),%r11d
    263 ___
    264 $code.=<<___ if ($SZ==4 && $shaext);
    265 	test	\$`1<<29`,%r11d		# check for SHA
    266 	jnz	_shaext_shortcut
    267 ___
    268 $code.=<<___ if ($avx && $SZ==8);
    269 	test	\$`1<<11`,%r10d		# check for XOP
    270 	jnz	.Lxop_shortcut
    271 ___
    272 $code.=<<___ if ($avx>1);
    273 	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
    274 	cmp	\$`1<<8|1<<5|1<<3`,%r11d
    275 	je	.Lavx2_shortcut
    276 ___
    277 $code.=<<___ if ($avx);
    278 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    279 	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
    280 	or	%r9d,%r10d
    281 	cmp	\$`1<<28|1<<9|1<<30`,%r10d
    282 	je	.Lavx_shortcut
    283 ___
    284 $code.=<<___ if ($SZ==4);
    285 	test	\$`1<<9`,%r10d
    286 	jnz	.Lssse3_shortcut
    287 ___
    288 $code.=<<___;
    289 	mov	%rsp,%rax		# copy %rsp
    290 	push	%rbx
    291 	push	%rbp
    292 	push	%r12
    293 	push	%r13
    294 	push	%r14
    295 	push	%r15
    296 	shl	\$4,%rdx		# num*16
    297 	sub	\$$framesz,%rsp
    298 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    299 	and	\$-64,%rsp		# align stack frame
    300 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
    302 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    303 	mov	%rax,$_rsp		# save copy of %rsp
    304 .Lprologue:
    305 
    306 	mov	$SZ*0($ctx),$A
    307 	mov	$SZ*1($ctx),$B
    308 	mov	$SZ*2($ctx),$C
    309 	mov	$SZ*3($ctx),$D
    310 	mov	$SZ*4($ctx),$E
    311 	mov	$SZ*5($ctx),$F
    312 	mov	$SZ*6($ctx),$G
    313 	mov	$SZ*7($ctx),$H
    314 
    315 	jmp	.Lloop
    316 
    317 .align	16
    318 .Lloop:
    319 	mov	$B,$a3
    320 	lea	$TABLE(%rip),$Tbl
    321 	xor	$C,$a3			# magic
    322 ___
    323 	for($i=0;$i<16;$i++) {
    324 		$code.="	mov	$SZ*$i($inp),$T1\n";
    325 		$code.="	mov	@ROT[4],$a0\n";
    326 		$code.="	mov	@ROT[0],$a1\n";
    327 		$code.="	bswap	$T1\n";
    328 		&ROUND_00_15($i,@ROT);
    329 		unshift(@ROT,pop(@ROT));
    330 	}
    331 $code.=<<___;
    332 	jmp	.Lrounds_16_xx
    333 .align	16
    334 .Lrounds_16_xx:
    335 ___
    336 	for(;$i<32;$i++) {
    337 		&ROUND_16_XX($i,@ROT);
    338 		unshift(@ROT,pop(@ROT));
    339 	}
    340 
    341 $code.=<<___;
    342 	cmpb	\$0,`$SZ-1`($Tbl)
    343 	jnz	.Lrounds_16_xx
    344 
    345 	mov	$_ctx,$ctx
    346 	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
    347 	lea	16*$SZ($inp),$inp
    348 
    349 	add	$SZ*0($ctx),$A
    350 	add	$SZ*1($ctx),$B
    351 	add	$SZ*2($ctx),$C
    352 	add	$SZ*3($ctx),$D
    353 	add	$SZ*4($ctx),$E
    354 	add	$SZ*5($ctx),$F
    355 	add	$SZ*6($ctx),$G
    356 	add	$SZ*7($ctx),$H
    357 
    358 	cmp	$_end,$inp
    359 
    360 	mov	$A,$SZ*0($ctx)
    361 	mov	$B,$SZ*1($ctx)
    362 	mov	$C,$SZ*2($ctx)
    363 	mov	$D,$SZ*3($ctx)
    364 	mov	$E,$SZ*4($ctx)
    365 	mov	$F,$SZ*5($ctx)
    366 	mov	$G,$SZ*6($ctx)
    367 	mov	$H,$SZ*7($ctx)
    368 	jb	.Lloop
    369 
    370 	mov	$_rsp,%rsi
    371 	mov	-48(%rsi),%r15
    372 	mov	-40(%rsi),%r14
    373 	mov	-32(%rsi),%r13
    374 	mov	-24(%rsi),%r12
    375 	mov	-16(%rsi),%rbp
    376 	mov	-8(%rsi),%rbx
    377 	lea	(%rsi),%rsp
    378 .Lepilogue:
    379 	ret
    380 .size	$func,.-$func
    381 ___
    382 
    383 if ($SZ==4) {
    384 $code.=<<___;
    385 .align	64
    386 .type	$TABLE,\@object
    387 $TABLE:
    388 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    389 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    390 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    391 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    392 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    393 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    394 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    395 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    396 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    397 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    398 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    399 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    400 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    401 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    402 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    403 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    404 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    405 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    406 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    407 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    408 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    409 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    410 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    411 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    412 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    413 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    414 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    415 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    416 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    417 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    418 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    419 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    420 
    421 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    422 	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    423 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    424 	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
    425 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    426 	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
    427 	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    428 ___
    429 } else {
    430 $code.=<<___;
    431 .align	64
    432 .type	$TABLE,\@object
    433 $TABLE:
    434 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    435 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    436 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    437 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    438 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    439 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    440 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    441 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    442 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    443 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    444 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    445 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    446 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    447 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    448 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    449 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    450 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    451 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    452 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    453 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    454 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    455 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    456 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    457 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    458 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    459 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    460 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    461 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    462 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    463 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    464 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    465 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    466 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    467 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    468 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    469 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    470 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    471 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    472 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    473 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    474 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    475 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    476 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    477 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    478 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    479 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    480 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    481 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    482 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    483 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    484 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    485 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    486 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    487 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    488 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    489 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    490 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    491 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    492 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    493 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    494 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    495 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    496 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    497 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    498 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    499 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    500 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    501 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    502 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    503 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    504 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    505 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    506 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    507 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    508 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    509 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    510 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    511 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    512 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    513 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    514 
    515 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    516 	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
    517 	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    518 ___
    519 }
    520 
    521 ######################################################################
    522 # SIMD code paths
    523 #
    524 if ($SZ==4 && $shaext) {{{
    525 ######################################################################
    526 # Intel SHA Extensions implementation of SHA256 update function.
    527 #
    528 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
    529 
    530 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
    531 my @MSG=map("%xmm$_",(3..6));
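
# Register roles below: $ABEF and $CDGH hold the state words in the
# {A,B,E,F}/{C,D,G,H} order sha256rnds2 expects (the pshufd/palignr/
# punpcklqdq shuffles convert from the little-endian DCBA/HGFE layout in
# memory); $Wi is %xmm0, the implicit sha256rnds2 operand carrying W[t]+K[t]
# for two rounds at a time (pshufd 0x0e moves the next pair down); $BSWAP
# keeps the byte-swap mask, and $ABEF_SAVE/$CDGH_SAVE preserve the state
# across the 64 rounds of each block.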
    532 
    533 $code.=<<___;
    534 .type	sha256_block_data_order_shaext,\@function,3
    535 .align	64
    536 sha256_block_data_order_shaext:
    537 _shaext_shortcut:
    538 ___
    539 $code.=<<___ if ($win64);
    540 	lea	`-8-5*16`(%rsp),%rsp
    541 	movaps	%xmm6,-8-5*16(%rax)
    542 	movaps	%xmm7,-8-4*16(%rax)
    543 	movaps	%xmm8,-8-3*16(%rax)
    544 	movaps	%xmm9,-8-2*16(%rax)
    545 	movaps	%xmm10,-8-1*16(%rax)
    546 .Lprologue_shaext:
    547 ___
    548 $code.=<<___;
    549 	lea		K256+0x80(%rip),$Tbl
    550 	movdqu		($ctx),$ABEF		# DCBA
    551 	movdqu		16($ctx),$CDGH		# HGFE
    552 	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
    553 
    554 	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
    555 	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
    556 	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
    557 	movdqa		$TMP,$BSWAP		# offload
    558 	palignr		\$8,$CDGH,$ABEF		# ABEF
    559 	punpcklqdq	$Wi,$CDGH		# CDGH
    560 	jmp		.Loop_shaext
    561 
    562 .align	16
    563 .Loop_shaext:
    564 	movdqu		($inp),@MSG[0]
    565 	movdqu		0x10($inp),@MSG[1]
    566 	movdqu		0x20($inp),@MSG[2]
    567 	pshufb		$TMP,@MSG[0]
    568 	movdqu		0x30($inp),@MSG[3]
    569 
    570 	movdqa		0*32-0x80($Tbl),$Wi
    571 	paddd		@MSG[0],$Wi
    572 	pshufb		$TMP,@MSG[1]
    573 	movdqa		$CDGH,$CDGH_SAVE	# offload
    574 	sha256rnds2	$ABEF,$CDGH		# 0-3
    575 	pshufd		\$0x0e,$Wi,$Wi
    576 	nop
    577 	movdqa		$ABEF,$ABEF_SAVE	# offload
    578 	sha256rnds2	$CDGH,$ABEF
    579 
    580 	movdqa		1*32-0x80($Tbl),$Wi
    581 	paddd		@MSG[1],$Wi
    582 	pshufb		$TMP,@MSG[2]
    583 	sha256rnds2	$ABEF,$CDGH		# 4-7
    584 	pshufd		\$0x0e,$Wi,$Wi
    585 	lea		0x40($inp),$inp
    586 	sha256msg1	@MSG[1],@MSG[0]
    587 	sha256rnds2	$CDGH,$ABEF
    588 
    589 	movdqa		2*32-0x80($Tbl),$Wi
    590 	paddd		@MSG[2],$Wi
    591 	pshufb		$TMP,@MSG[3]
    592 	sha256rnds2	$ABEF,$CDGH		# 8-11
    593 	pshufd		\$0x0e,$Wi,$Wi
    594 	movdqa		@MSG[3],$TMP
    595 	palignr		\$4,@MSG[2],$TMP
    596 	nop
    597 	paddd		$TMP,@MSG[0]
    598 	sha256msg1	@MSG[2],@MSG[1]
    599 	sha256rnds2	$CDGH,$ABEF
    600 
    601 	movdqa		3*32-0x80($Tbl),$Wi
    602 	paddd		@MSG[3],$Wi
    603 	sha256msg2	@MSG[3],@MSG[0]
    604 	sha256rnds2	$ABEF,$CDGH		# 12-15
    605 	pshufd		\$0x0e,$Wi,$Wi
    606 	movdqa		@MSG[0],$TMP
    607 	palignr		\$4,@MSG[3],$TMP
    608 	nop
    609 	paddd		$TMP,@MSG[1]
    610 	sha256msg1	@MSG[3],@MSG[2]
    611 	sha256rnds2	$CDGH,$ABEF
    612 ___
    613 for($i=4;$i<16-3;$i++) {
    614 $code.=<<___;
    615 	movdqa		$i*32-0x80($Tbl),$Wi
    616 	paddd		@MSG[0],$Wi
    617 	sha256msg2	@MSG[0],@MSG[1]
    618 	sha256rnds2	$ABEF,$CDGH		# 16-19...
    619 	pshufd		\$0x0e,$Wi,$Wi
    620 	movdqa		@MSG[1],$TMP
    621 	palignr		\$4,@MSG[0],$TMP
    622 	nop
    623 	paddd		$TMP,@MSG[2]
    624 	sha256msg1	@MSG[0],@MSG[3]
    625 	sha256rnds2	$CDGH,$ABEF
    626 ___
    627 	push(@MSG,shift(@MSG));
    628 }
    629 $code.=<<___;
    630 	movdqa		13*32-0x80($Tbl),$Wi
    631 	paddd		@MSG[0],$Wi
    632 	sha256msg2	@MSG[0],@MSG[1]
    633 	sha256rnds2	$ABEF,$CDGH		# 52-55
    634 	pshufd		\$0x0e,$Wi,$Wi
    635 	movdqa		@MSG[1],$TMP
    636 	palignr		\$4,@MSG[0],$TMP
    637 	sha256rnds2	$CDGH,$ABEF
    638 	paddd		$TMP,@MSG[2]
    639 
    640 	movdqa		14*32-0x80($Tbl),$Wi
    641 	paddd		@MSG[1],$Wi
    642 	sha256rnds2	$ABEF,$CDGH		# 56-59
    643 	pshufd		\$0x0e,$Wi,$Wi
    644 	sha256msg2	@MSG[1],@MSG[2]
    645 	movdqa		$BSWAP,$TMP
    646 	sha256rnds2	$CDGH,$ABEF
    647 
    648 	movdqa		15*32-0x80($Tbl),$Wi
    649 	paddd		@MSG[2],$Wi
    650 	nop
    651 	sha256rnds2	$ABEF,$CDGH		# 60-63
    652 	pshufd		\$0x0e,$Wi,$Wi
    653 	dec		$num
    654 	nop
    655 	sha256rnds2	$CDGH,$ABEF
    656 
    657 	paddd		$CDGH_SAVE,$CDGH
    658 	paddd		$ABEF_SAVE,$ABEF
    659 	jnz		.Loop_shaext
    660 
    661 	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
    662 	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
    663 	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
    664 	punpckhqdq	$CDGH,$ABEF		# DCBA
    665 	palignr		\$8,$TMP,$CDGH		# HGFE
    666 
    667 	movdqu	$ABEF,($ctx)
    668 	movdqu	$CDGH,16($ctx)
    669 ___
    670 $code.=<<___ if ($win64);
    671 	movaps	-8-5*16(%rax),%xmm6
    672 	movaps	-8-4*16(%rax),%xmm7
    673 	movaps	-8-3*16(%rax),%xmm8
    674 	movaps	-8-2*16(%rax),%xmm9
    675 	movaps	-8-1*16(%rax),%xmm10
    676 	mov	%rax,%rsp
    677 .Lepilogue_shaext:
    678 ___
    679 $code.=<<___;
    680 	ret
    681 .size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
    682 ___
    683 }}}
    684 {{{
    685 
    686 my $a4=$T1;
    687 my ($a,$b,$c,$d,$e,$f,$g,$h);
    688 
    689 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    690 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    691   my $arg = pop;
    692     $arg = "\$$arg" if ($arg*1 eq $arg);
    693     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    694 }
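# E.g. for the SHA-256 flavour &ror($a0,$Sigma1[0]) resolves here and emits
# "\tror\t\$6,%r13d\n": the last argument becomes the (immediate) operand,
# prefixed with '$' when numeric, and the remaining arguments follow in
# reverse, giving AT&T operand order.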
    695 
    696 sub body_00_15 () {
    697 	(
    698 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
    699 
    700 	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
    701 	'&mov	($a,$a1)',
    702 	'&mov	($a4,$f)',
    703 
    704 	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
    705 	'&xor	($a0,$e)',
    706 	'&xor	($a4,$g)',			# f^g
    707 
    708 	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
    709 	'&xor	($a1,$a)',
    710 	'&and	($a4,$e)',			# (f^g)&e
    711 
    712 	'&xor	($a0,$e)',
    713 	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
    714 	'&mov	($a2,$a)',
    715 
    716 	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
    717 	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
    718 	'&xor	($a2,$b)',			# a^b, b^c in next round
    719 
    720 	'&add	($h,$a4)',			# h+=Ch(e,f,g)
    721 	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
    722 	'&and	($a3,$a2)',			# (b^c)&(a^b)
    723 
    724 	'&xor	($a1,$a)',
    725 	'&add	($h,$a0)',			# h+=Sigma1(e)
    726 	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
    727 
    728 	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
    729 	'&add	($d,$h)',			# d+=h
    730 	'&add	($h,$a3)',			# h+=Maj(a,b,c)
    731 
    732 	'&mov	($a0,$d)',
    733 	'&add	($a1,$h);'.			# h+=Sigma0(a)
    734 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
    735 	);
    736 }
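# body_00_15 returns its round as a list of perlasm strings instead of
# emitting code directly, so that the SIMD paths below can interleave it
# with their own message-schedule instructions. A sketch of the consumption
# pattern (&simd_step is a hypothetical stand-in for any Xupdate step):
#
#	my @insns = (&body_00_15(),&body_00_15());	# two scalar rounds
#	&simd_step(...);				# one SIMD instruction
#	eval(shift(@insns));				# a couple of scalar
#	eval(shift(@insns));				# ones, and so on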
    737 
    738 ######################################################################
    739 # SSSE3 code path
    740 #
    741 if ($SZ==4) {	# SHA256 only
    742 my @X = map("%xmm$_",(0..3));
    743 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
    744 
    745 $code.=<<___;
    746 .type	${func}_ssse3,\@function,3
    747 .align	64
    748 ${func}_ssse3:
    749 .Lssse3_shortcut:
    750 	mov	%rsp,%rax		# copy %rsp
    751 	push	%rbx
    752 	push	%rbp
    753 	push	%r12
    754 	push	%r13
    755 	push	%r14
    756 	push	%r15
    757 	shl	\$4,%rdx		# num*16
    758 	sub	\$`$framesz+$win64*16*4`,%rsp
    759 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
    760 	and	\$-64,%rsp		# align stack frame
    761 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
    763 	mov	%rdx,$_end		# save end pointer, "3rd" arg
    764 	mov	%rax,$_rsp		# save copy of %rsp
    765 ___
    766 $code.=<<___ if ($win64);
    767 	movaps	%xmm6,16*$SZ+32(%rsp)
    768 	movaps	%xmm7,16*$SZ+48(%rsp)
    769 	movaps	%xmm8,16*$SZ+64(%rsp)
    770 	movaps	%xmm9,16*$SZ+80(%rsp)
    771 ___
    772 $code.=<<___;
    773 .Lprologue_ssse3:
    774 
    775 	mov	$SZ*0($ctx),$A
    776 	mov	$SZ*1($ctx),$B
    777 	mov	$SZ*2($ctx),$C
    778 	mov	$SZ*3($ctx),$D
    779 	mov	$SZ*4($ctx),$E
    780 	mov	$SZ*5($ctx),$F
    781 	mov	$SZ*6($ctx),$G
    782 	mov	$SZ*7($ctx),$H
    783 ___
    784 
    785 $code.=<<___;
    786 	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
    787 	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
    788 	jmp	.Lloop_ssse3
    789 .align	16
    790 .Lloop_ssse3:
    791 	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
    792 	movdqu	0x00($inp),@X[0]
    793 	movdqu	0x10($inp),@X[1]
    794 	movdqu	0x20($inp),@X[2]
    795 	pshufb	$t3,@X[0]
    796 	movdqu	0x30($inp),@X[3]
    797 	lea	$TABLE(%rip),$Tbl
    798 	pshufb	$t3,@X[1]
    799 	movdqa	0x00($Tbl),$t0
    800 	movdqa	0x20($Tbl),$t1
    801 	pshufb	$t3,@X[2]
    802 	paddd	@X[0],$t0
    803 	movdqa	0x40($Tbl),$t2
    804 	pshufb	$t3,@X[3]
    805 	movdqa	0x60($Tbl),$t3
    806 	paddd	@X[1],$t1
    807 	paddd	@X[2],$t2
    808 	paddd	@X[3],$t3
    809 	movdqa	$t0,0x00(%rsp)
    810 	mov	$A,$a1
    811 	movdqa	$t1,0x10(%rsp)
    812 	mov	$B,$a3
    813 	movdqa	$t2,0x20(%rsp)
    814 	xor	$C,$a3			# magic
    815 	movdqa	$t3,0x30(%rsp)
    816 	mov	$E,$a0
    817 	jmp	.Lssse3_00_47
    818 
    819 .align	16
    820 .Lssse3_00_47:
    821 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
    822 ___
    823 sub Xupdate_256_SSSE3 () {
    824 	(
    825 	'&movdqa	($t0,@X[1]);',
    826 	'&movdqa	($t3,@X[3])',
    827 	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
    828 	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
    829 	'&movdqa	($t1,$t0)',
    830 	'&movdqa	($t2,$t0);',
    831 	'&psrld		($t0,$sigma0[2])',
    832 	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
    833 	'&psrld		($t2,$sigma0[0])',
    834 	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
    835 	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
    836 	'&pxor		($t0,$t2)',
    837 	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
    838 	'&pxor		($t0,$t1)',
    839 	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
    840 	'&pxor		($t0,$t2);',
    841 	 '&movdqa	($t2,$t3)',
    842 	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
    843 	 '&psrld	($t3,$sigma1[2])',
    844 	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
    845 	 '&psrlq	($t2,$sigma1[0])',
    846 	 '&pxor		($t3,$t2);',
    847 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    848 	 '&pxor		($t3,$t2)',
    849 	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
    850 	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
    851 	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
    852 	 '&movdqa	($t2,$t3);',
    853 	 '&psrld	($t3,$sigma1[2])',
    854 	 '&psrlq	($t2,$sigma1[0])',
    855 	 '&pxor		($t3,$t2);',
    856 	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
    857 	 '&pxor		($t3,$t2);',
    858 	'&movdqa	($t2,16*2*$j."($Tbl)")',
    859 	 '&pshufb	($t3,$t5)',
    860 	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
    861 	);
    862 }
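# Note on the sigma1 half above: there is no packed 32-bit rotate in
# SSE/SSSE3, so X[14] and X[15] are each duplicated into both halves of a
# 64-bit lane (the pshufd with 0b11111010); a 64-bit psrlq by n then leaves
# ROTR^n of the 32-bit word in the low half (for n<32), psrld supplies the
# plain shift, and the two dword results are gathered back with pshufb
# ($t4/$t5) before being added into @X[0].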
    863 
    864 sub SSSE3_256_00_47 () {
    865 my $j = shift;
    866 my $body = shift;
    867 my @X = @_;
    868 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
    869 
    870     if (0) {
    871 	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
    872 	    eval;
    873 	    eval(shift(@insns));
    874 	    eval(shift(@insns));
    875 	    eval(shift(@insns));
    876 	}
    877     } else {			# squeeze extra 4% on Westmere and 19% on Atom
    878 	  eval(shift(@insns));	#@
    879 	&movdqa		($t0,@X[1]);
    880 	  eval(shift(@insns));
    881 	  eval(shift(@insns));
    882 	&movdqa		($t3,@X[3]);
    883 	  eval(shift(@insns));	#@
    884 	  eval(shift(@insns));
    885 	  eval(shift(@insns));
    886 	  eval(shift(@insns));	#@
    887 	  eval(shift(@insns));
    888 	&palignr	($t0,@X[0],$SZ);	# X[1..4]
    889 	  eval(shift(@insns));
    890 	  eval(shift(@insns));
    891 	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
    892 	  eval(shift(@insns));
    893 	  eval(shift(@insns));
    894 	  eval(shift(@insns));
    895 	  eval(shift(@insns));	#@
    896 	&movdqa		($t1,$t0);
    897 	  eval(shift(@insns));
    898 	  eval(shift(@insns));
    899 	&movdqa		($t2,$t0);
    900 	  eval(shift(@insns));	#@
    901 	  eval(shift(@insns));
    902 	&psrld		($t0,$sigma0[2]);
    903 	  eval(shift(@insns));
    904 	  eval(shift(@insns));
    905 	  eval(shift(@insns));
    906 	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
    907 	  eval(shift(@insns));	#@
    908 	  eval(shift(@insns));
    909 	&psrld		($t2,$sigma0[0]);
    910 	  eval(shift(@insns));
    911 	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
    913 	  eval(shift(@insns));
    914 	  eval(shift(@insns));	#@
    915 	&pslld		($t1,8*$SZ-$sigma0[1]);
    916 	  eval(shift(@insns));
    917 	  eval(shift(@insns));
    918 	&pxor		($t0,$t2);
    919 	  eval(shift(@insns));	#@
    920 	  eval(shift(@insns));
    921 	  eval(shift(@insns));
    922 	  eval(shift(@insns));	#@
    923 	&psrld		($t2,$sigma0[1]-$sigma0[0]);
    924 	  eval(shift(@insns));
    925 	&pxor		($t0,$t1);
    926 	  eval(shift(@insns));
    927 	  eval(shift(@insns));
    928 	&pslld		($t1,$sigma0[1]-$sigma0[0]);
    929 	  eval(shift(@insns));
    930 	  eval(shift(@insns));
    931 	&pxor		($t0,$t2);
    932 	  eval(shift(@insns));
    933 	  eval(shift(@insns));	#@
    934 	 &movdqa	($t2,$t3);
    935 	  eval(shift(@insns));
    936 	  eval(shift(@insns));
    937 	&pxor		($t0,$t1);		# sigma0(X[1..4])
    938 	  eval(shift(@insns));	#@
    939 	  eval(shift(@insns));
    940 	  eval(shift(@insns));
    941 	 &psrld		($t3,$sigma1[2]);
    942 	  eval(shift(@insns));
    943 	  eval(shift(@insns));
    944 	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
    945 	  eval(shift(@insns));	#@
    946 	  eval(shift(@insns));
    947 	 &psrlq		($t2,$sigma1[0]);
    948 	  eval(shift(@insns));
    949 	  eval(shift(@insns));
    950 	  eval(shift(@insns));
    951 	 &pxor		($t3,$t2);
    952 	  eval(shift(@insns));	#@
    953 	  eval(shift(@insns));
    954 	  eval(shift(@insns));
    955 	  eval(shift(@insns));	#@
    956 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
    957 	  eval(shift(@insns));
    958 	  eval(shift(@insns));
    959 	 &pxor		($t3,$t2);
    960 	  eval(shift(@insns));	#@
    961 	  eval(shift(@insns));
    962 	  eval(shift(@insns));
    963 	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
    964 	 &pshufd	($t3,$t3,0b10000000);
    965 	  eval(shift(@insns));
    966 	  eval(shift(@insns));
    967 	  eval(shift(@insns));
    968 	 &psrldq	($t3,8);
    969 	  eval(shift(@insns));
    970 	  eval(shift(@insns));	#@
    971 	  eval(shift(@insns));
    972 	  eval(shift(@insns));
    973 	  eval(shift(@insns));	#@
    974 	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
    975 	  eval(shift(@insns));
    976 	  eval(shift(@insns));
    977 	  eval(shift(@insns));
    978 	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
    979 	  eval(shift(@insns));
    980 	  eval(shift(@insns));	#@
    981 	  eval(shift(@insns));
    982 	 &movdqa	($t2,$t3);
    983 	  eval(shift(@insns));
    984 	  eval(shift(@insns));
    985 	 &psrld		($t3,$sigma1[2]);
    986 	  eval(shift(@insns));
    987 	  eval(shift(@insns));	#@
    988 	 &psrlq		($t2,$sigma1[0]);
    989 	  eval(shift(@insns));
    990 	  eval(shift(@insns));
    991 	 &pxor		($t3,$t2);
    992 	  eval(shift(@insns));	#@
    993 	  eval(shift(@insns));
    994 	  eval(shift(@insns));
    995 	  eval(shift(@insns));	#@
    996 	  eval(shift(@insns));
    997 	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
    998 	  eval(shift(@insns));
    999 	  eval(shift(@insns));
   1000 	  eval(shift(@insns));
   1001 	 &pxor		($t3,$t2);
   1002 	  eval(shift(@insns));
   1003 	  eval(shift(@insns));
   1004 	  eval(shift(@insns));	#@
   1005 	 #&pshufb	($t3,$t5);
   1006 	 &pshufd	($t3,$t3,0b00001000);
   1007 	  eval(shift(@insns));
   1008 	  eval(shift(@insns));
   1009 	&movdqa		($t2,16*2*$j."($Tbl)");
   1010 	  eval(shift(@insns));	#@
   1011 	  eval(shift(@insns));
   1012 	 &pslldq	($t3,8);
   1013 	  eval(shift(@insns));
   1014 	  eval(shift(@insns));
   1015 	  eval(shift(@insns));
   1016 	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
   1017 	  eval(shift(@insns));	#@
   1018 	  eval(shift(@insns));
   1019 	  eval(shift(@insns));
   1020     }
   1021 	&paddd		($t2,@X[0]);
   1022 	  foreach (@insns) { eval; }		# remaining instructions
   1023 	&movdqa		(16*$j."(%rsp)",$t2);
   1024 }
   1025 
   1026     for ($i=0,$j=0; $j<4; $j++) {
   1027 	&SSSE3_256_00_47($j,\&body_00_15,@X);
   1028 	push(@X,shift(@X));			# rotate(@X)
   1029     }
   1030 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1031 	&jne	(".Lssse3_00_47");
   1032 
   1033     for ($i=0; $i<16; ) {
   1034 	foreach(body_00_15()) { eval; }
   1035     }
   1036 $code.=<<___;
   1037 	mov	$_ctx,$ctx
   1038 	mov	$a1,$A
   1039 
   1040 	add	$SZ*0($ctx),$A
   1041 	lea	16*$SZ($inp),$inp
   1042 	add	$SZ*1($ctx),$B
   1043 	add	$SZ*2($ctx),$C
   1044 	add	$SZ*3($ctx),$D
   1045 	add	$SZ*4($ctx),$E
   1046 	add	$SZ*5($ctx),$F
   1047 	add	$SZ*6($ctx),$G
   1048 	add	$SZ*7($ctx),$H
   1049 
   1050 	cmp	$_end,$inp
   1051 
   1052 	mov	$A,$SZ*0($ctx)
   1053 	mov	$B,$SZ*1($ctx)
   1054 	mov	$C,$SZ*2($ctx)
   1055 	mov	$D,$SZ*3($ctx)
   1056 	mov	$E,$SZ*4($ctx)
   1057 	mov	$F,$SZ*5($ctx)
   1058 	mov	$G,$SZ*6($ctx)
   1059 	mov	$H,$SZ*7($ctx)
   1060 	jb	.Lloop_ssse3
   1061 
   1062 	mov	$_rsp,%rsi
   1063 ___
   1064 $code.=<<___ if ($win64);
   1065 	movaps	16*$SZ+32(%rsp),%xmm6
   1066 	movaps	16*$SZ+48(%rsp),%xmm7
   1067 	movaps	16*$SZ+64(%rsp),%xmm8
   1068 	movaps	16*$SZ+80(%rsp),%xmm9
   1069 ___
   1070 $code.=<<___;
   1071 	mov	-48(%rsi),%r15
   1072 	mov	-40(%rsi),%r14
   1073 	mov	-32(%rsi),%r13
   1074 	mov	-24(%rsi),%r12
   1075 	mov	-16(%rsi),%rbp
   1076 	mov	-8(%rsi),%rbx
   1077 	lea	(%rsi),%rsp
   1078 .Lepilogue_ssse3:
   1079 	ret
   1080 .size	${func}_ssse3,.-${func}_ssse3
   1081 ___
   1082 }
   1083 
   1084 if ($avx) {{
   1085 ######################################################################
   1086 # XOP code path
   1087 #
   1088 if ($SZ==8) {	# SHA512 only
   1089 $code.=<<___;
   1090 .type	${func}_xop,\@function,3
   1091 .align	64
   1092 ${func}_xop:
   1093 .Lxop_shortcut:
   1094 	mov	%rsp,%rax		# copy %rsp
   1095 	push	%rbx
   1096 	push	%rbp
   1097 	push	%r12
   1098 	push	%r13
   1099 	push	%r14
   1100 	push	%r15
   1101 	shl	\$4,%rdx		# num*16
   1102 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1103 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1104 	and	\$-64,%rsp		# align stack frame
   1105 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
   1107 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1108 	mov	%rax,$_rsp		# save copy of %rsp
   1109 ___
   1110 $code.=<<___ if ($win64);
   1111 	movaps	%xmm6,16*$SZ+32(%rsp)
   1112 	movaps	%xmm7,16*$SZ+48(%rsp)
   1113 	movaps	%xmm8,16*$SZ+64(%rsp)
   1114 	movaps	%xmm9,16*$SZ+80(%rsp)
   1115 ___
   1116 $code.=<<___ if ($win64 && $SZ>4);
   1117 	movaps	%xmm10,16*$SZ+96(%rsp)
   1118 	movaps	%xmm11,16*$SZ+112(%rsp)
   1119 ___
   1120 $code.=<<___;
   1121 .Lprologue_xop:
   1122 
   1123 	vzeroupper
   1124 	mov	$SZ*0($ctx),$A
   1125 	mov	$SZ*1($ctx),$B
   1126 	mov	$SZ*2($ctx),$C
   1127 	mov	$SZ*3($ctx),$D
   1128 	mov	$SZ*4($ctx),$E
   1129 	mov	$SZ*5($ctx),$F
   1130 	mov	$SZ*6($ctx),$G
   1131 	mov	$SZ*7($ctx),$H
   1132 	jmp	.Lloop_xop
   1133 ___
   1134 					if ($SZ==4) {	# SHA256
   1135     my @X = map("%xmm$_",(0..3));
   1136     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
   1137 
   1138 $code.=<<___;
   1139 .align	16
   1140 .Lloop_xop:
   1141 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1142 	vmovdqu	0x00($inp),@X[0]
   1143 	vmovdqu	0x10($inp),@X[1]
   1144 	vmovdqu	0x20($inp),@X[2]
   1145 	vmovdqu	0x30($inp),@X[3]
   1146 	vpshufb	$t3,@X[0],@X[0]
   1147 	lea	$TABLE(%rip),$Tbl
   1148 	vpshufb	$t3,@X[1],@X[1]
   1149 	vpshufb	$t3,@X[2],@X[2]
   1150 	vpaddd	0x00($Tbl),@X[0],$t0
   1151 	vpshufb	$t3,@X[3],@X[3]
   1152 	vpaddd	0x20($Tbl),@X[1],$t1
   1153 	vpaddd	0x40($Tbl),@X[2],$t2
   1154 	vpaddd	0x60($Tbl),@X[3],$t3
   1155 	vmovdqa	$t0,0x00(%rsp)
   1156 	mov	$A,$a1
   1157 	vmovdqa	$t1,0x10(%rsp)
   1158 	mov	$B,$a3
   1159 	vmovdqa	$t2,0x20(%rsp)
   1160 	xor	$C,$a3			# magic
   1161 	vmovdqa	$t3,0x30(%rsp)
   1162 	mov	$E,$a0
   1163 	jmp	.Lxop_00_47
   1164 
   1165 .align	16
   1166 .Lxop_00_47:
   1167 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1168 ___
   1169 sub XOP_256_00_47 () {
   1170 my $j = shift;
   1171 my $body = shift;
   1172 my @X = @_;
   1173 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1174 
   1175 	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
   1176 	  eval(shift(@insns));
   1177 	  eval(shift(@insns));
   1178 	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
   1179 	  eval(shift(@insns));
   1180 	  eval(shift(@insns));
   1181 	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
   1182 	  eval(shift(@insns));
   1183 	  eval(shift(@insns));
   1184 	&vpsrld		($t0,$t0,$sigma0[2]);
   1185 	  eval(shift(@insns));
   1186 	  eval(shift(@insns));
   1187 	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
   1188 	  eval(shift(@insns));
   1189 	  eval(shift(@insns));
   1190 	  eval(shift(@insns));
   1191 	  eval(shift(@insns));
   1192 	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
   1193 	  eval(shift(@insns));
   1194 	  eval(shift(@insns));
   1195 	&vpxor		($t0,$t0,$t1);
   1196 	  eval(shift(@insns));
   1197 	  eval(shift(@insns));
   1198 	  eval(shift(@insns));
   1199 	  eval(shift(@insns));
   1200 	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
   1201 	  eval(shift(@insns));
   1202 	  eval(shift(@insns));
   1203 	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
   1204 	  eval(shift(@insns));
   1205 	  eval(shift(@insns));
   1206 	 &vpsrld	($t2,@X[3],$sigma1[2]);
   1207 	  eval(shift(@insns));
   1208 	  eval(shift(@insns));
   1209 	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
   1210 	  eval(shift(@insns));
   1211 	  eval(shift(@insns));
   1212 	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1213 	  eval(shift(@insns));
   1214 	  eval(shift(@insns));
   1215 	 &vpxor		($t3,$t3,$t2);
   1216 	  eval(shift(@insns));
   1217 	  eval(shift(@insns));
   1218 	  eval(shift(@insns));
   1219 	  eval(shift(@insns));
   1220 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
   1221 	  eval(shift(@insns));
   1222 	  eval(shift(@insns));
   1223 	  eval(shift(@insns));
   1224 	  eval(shift(@insns));
   1225 	&vpsrldq	($t3,$t3,8);
   1226 	  eval(shift(@insns));
   1227 	  eval(shift(@insns));
   1228 	  eval(shift(@insns));
   1229 	  eval(shift(@insns));
   1230 	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
   1231 	  eval(shift(@insns));
   1232 	  eval(shift(@insns));
   1233 	  eval(shift(@insns));
   1234 	  eval(shift(@insns));
   1235 	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
   1236 	  eval(shift(@insns));
   1237 	  eval(shift(@insns));
   1238 	 &vpsrld	($t2,@X[0],$sigma1[2]);
   1239 	  eval(shift(@insns));
   1240 	  eval(shift(@insns));
   1241 	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1242 	  eval(shift(@insns));
   1243 	  eval(shift(@insns));
   1244 	 &vpxor		($t3,$t3,$t2);
   1245 	  eval(shift(@insns));
   1246 	  eval(shift(@insns));
   1247 	  eval(shift(@insns));
   1248 	  eval(shift(@insns));
   1249 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
   1250 	  eval(shift(@insns));
   1251 	  eval(shift(@insns));
   1252 	  eval(shift(@insns));
   1253 	  eval(shift(@insns));
   1254 	&vpslldq	($t3,$t3,8);		# 22 instructions
   1255 	  eval(shift(@insns));
   1256 	  eval(shift(@insns));
   1257 	  eval(shift(@insns));
   1258 	  eval(shift(@insns));
   1259 	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
   1260 	  eval(shift(@insns));
   1261 	  eval(shift(@insns));
   1262 	  eval(shift(@insns));
   1263 	  eval(shift(@insns));
   1264 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1265 	  foreach (@insns) { eval; }		# remaining instructions
   1266 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1267 }
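# Unlike the SSSE3 path, XOP provides genuine packed rotates: vprotd by
# 8*$SZ-n above is a left rotate by 32-n, i.e. ROTR^n, so sigma0/sigma1
# need no shift-and-xor emulation here.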
   1268 
   1269     for ($i=0,$j=0; $j<4; $j++) {
   1270 	&XOP_256_00_47($j,\&body_00_15,@X);
   1271 	push(@X,shift(@X));			# rotate(@X)
   1272     }
   1273 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1274 	&jne	(".Lxop_00_47");
   1275 
   1276     for ($i=0; $i<16; ) {
   1277 	foreach(body_00_15()) { eval; }
   1278     }
   1279 
   1280 					} else {	# SHA512
   1281     my @X = map("%xmm$_",(0..7));
   1282     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1283 
   1284 $code.=<<___;
   1285 .align	16
   1286 .Lloop_xop:
   1287 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1288 	vmovdqu	0x00($inp),@X[0]
   1289 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1290 	vmovdqu	0x10($inp),@X[1]
   1291 	vmovdqu	0x20($inp),@X[2]
   1292 	vpshufb	$t3,@X[0],@X[0]
   1293 	vmovdqu	0x30($inp),@X[3]
   1294 	vpshufb	$t3,@X[1],@X[1]
   1295 	vmovdqu	0x40($inp),@X[4]
   1296 	vpshufb	$t3,@X[2],@X[2]
   1297 	vmovdqu	0x50($inp),@X[5]
   1298 	vpshufb	$t3,@X[3],@X[3]
   1299 	vmovdqu	0x60($inp),@X[6]
   1300 	vpshufb	$t3,@X[4],@X[4]
   1301 	vmovdqu	0x70($inp),@X[7]
   1302 	vpshufb	$t3,@X[5],@X[5]
   1303 	vpaddq	-0x80($Tbl),@X[0],$t0
   1304 	vpshufb	$t3,@X[6],@X[6]
   1305 	vpaddq	-0x60($Tbl),@X[1],$t1
   1306 	vpshufb	$t3,@X[7],@X[7]
   1307 	vpaddq	-0x40($Tbl),@X[2],$t2
   1308 	vpaddq	-0x20($Tbl),@X[3],$t3
   1309 	vmovdqa	$t0,0x00(%rsp)
   1310 	vpaddq	0x00($Tbl),@X[4],$t0
   1311 	vmovdqa	$t1,0x10(%rsp)
   1312 	vpaddq	0x20($Tbl),@X[5],$t1
   1313 	vmovdqa	$t2,0x20(%rsp)
   1314 	vpaddq	0x40($Tbl),@X[6],$t2
   1315 	vmovdqa	$t3,0x30(%rsp)
   1316 	vpaddq	0x60($Tbl),@X[7],$t3
   1317 	vmovdqa	$t0,0x40(%rsp)
   1318 	mov	$A,$a1
   1319 	vmovdqa	$t1,0x50(%rsp)
   1320 	mov	$B,$a3
   1321 	vmovdqa	$t2,0x60(%rsp)
   1322 	xor	$C,$a3			# magic
   1323 	vmovdqa	$t3,0x70(%rsp)
   1324 	mov	$E,$a0
   1325 	jmp	.Lxop_00_47
   1326 
   1327 .align	16
   1328 .Lxop_00_47:
   1329 	add	\$`16*2*$SZ`,$Tbl
   1330 ___
   1331 sub XOP_512_00_47 () {
   1332 my $j = shift;
   1333 my $body = shift;
   1334 my @X = @_;
   1335 my @insns = (&$body,&$body);			# 52 instructions
   1336 
   1337 	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
   1338 	  eval(shift(@insns));
   1339 	  eval(shift(@insns));
   1340 	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
   1341 	  eval(shift(@insns));
   1342 	  eval(shift(@insns));
   1343 	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
   1344 	  eval(shift(@insns));
   1345 	  eval(shift(@insns));
   1346 	&vpsrlq		($t0,$t0,$sigma0[2]);
   1347 	  eval(shift(@insns));
   1348 	  eval(shift(@insns));
   1349 	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
   1350 	  eval(shift(@insns));
   1351 	  eval(shift(@insns));
   1352 	  eval(shift(@insns));
   1353 	  eval(shift(@insns));
   1354 	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
   1355 	  eval(shift(@insns));
   1356 	  eval(shift(@insns));
   1357 	&vpxor		($t0,$t0,$t1);
   1358 	  eval(shift(@insns));
   1359 	  eval(shift(@insns));
   1360 	  eval(shift(@insns));
   1361 	  eval(shift(@insns));
   1362 	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
   1363 	  eval(shift(@insns));
   1364 	  eval(shift(@insns));
   1365 	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
   1366 	  eval(shift(@insns));
   1367 	  eval(shift(@insns));
   1368 	 &vpsrlq	($t2,@X[7],$sigma1[2]);
   1369 	  eval(shift(@insns));
   1370 	  eval(shift(@insns));
   1371 	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
   1372 	  eval(shift(@insns));
   1373 	  eval(shift(@insns));
   1374 	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
   1375 	  eval(shift(@insns));
   1376 	  eval(shift(@insns));
   1377 	 &vpxor		($t3,$t3,$t2);
   1378 	  eval(shift(@insns));
   1379 	  eval(shift(@insns));
   1380 	  eval(shift(@insns));
   1381 	  eval(shift(@insns));
   1382 	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
   1383 	  eval(shift(@insns));
   1384 	  eval(shift(@insns));
   1385 	  eval(shift(@insns));
   1386 	  eval(shift(@insns));
   1387 	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
   1388 	  eval(shift(@insns));
   1389 	  eval(shift(@insns));
   1390 	  eval(shift(@insns));
   1391 	  eval(shift(@insns));
   1392 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1393 	  foreach (@insns) { eval; }		# remaining instructions
   1394 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1395 }
   1396 
   1397     for ($i=0,$j=0; $j<8; $j++) {
   1398 	&XOP_512_00_47($j,\&body_00_15,@X);
   1399 	push(@X,shift(@X));			# rotate(@X)
   1400     }
   1401 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1402 	&jne	(".Lxop_00_47");
   1403 
   1404     for ($i=0; $i<16; ) {
   1405 	foreach(body_00_15()) { eval; }
   1406     }
   1407 }
   1408 $code.=<<___;
   1409 	mov	$_ctx,$ctx
   1410 	mov	$a1,$A
   1411 
   1412 	add	$SZ*0($ctx),$A
   1413 	lea	16*$SZ($inp),$inp
   1414 	add	$SZ*1($ctx),$B
   1415 	add	$SZ*2($ctx),$C
   1416 	add	$SZ*3($ctx),$D
   1417 	add	$SZ*4($ctx),$E
   1418 	add	$SZ*5($ctx),$F
   1419 	add	$SZ*6($ctx),$G
   1420 	add	$SZ*7($ctx),$H
   1421 
   1422 	cmp	$_end,$inp
   1423 
   1424 	mov	$A,$SZ*0($ctx)
   1425 	mov	$B,$SZ*1($ctx)
   1426 	mov	$C,$SZ*2($ctx)
   1427 	mov	$D,$SZ*3($ctx)
   1428 	mov	$E,$SZ*4($ctx)
   1429 	mov	$F,$SZ*5($ctx)
   1430 	mov	$G,$SZ*6($ctx)
   1431 	mov	$H,$SZ*7($ctx)
   1432 	jb	.Lloop_xop
   1433 
   1434 	mov	$_rsp,%rsi
   1435 	vzeroupper
   1436 ___
   1437 $code.=<<___ if ($win64);
   1438 	movaps	16*$SZ+32(%rsp),%xmm6
   1439 	movaps	16*$SZ+48(%rsp),%xmm7
   1440 	movaps	16*$SZ+64(%rsp),%xmm8
   1441 	movaps	16*$SZ+80(%rsp),%xmm9
   1442 ___
   1443 $code.=<<___ if ($win64 && $SZ>4);
   1444 	movaps	16*$SZ+96(%rsp),%xmm10
   1445 	movaps	16*$SZ+112(%rsp),%xmm11
   1446 ___
   1447 $code.=<<___;
   1448 	mov	-48(%rsi),%r15
   1449 	mov	-40(%rsi),%r14
   1450 	mov	-32(%rsi),%r13
   1451 	mov	-24(%rsi),%r12
   1452 	mov	-16(%rsi),%rbp
   1453 	mov	-8(%rsi),%rbx
   1454 	lea	(%rsi),%rsp
   1455 .Lepilogue_xop:
   1456 	ret
   1457 .size	${func}_xop,.-${func}_xop
   1458 ___
   1459 }
   1460 ######################################################################
   1461 # AVX+shrd code path
   1462 #
   1463 local *ror = sub { &shrd(@_[0],@_) };
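# body_00_15 is reused unchanged on this path: &ror is locally aliased so
# that "ror \$n,%reg" comes out as "shrd \$n,%reg,%reg", which computes the
# same rotation (both shrd operands being the same register) and, per the
# (**) footnote in the header, accounts for a fair share of the AVX speed-up
# on Sandy Bridge.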
   1464 
   1465 $code.=<<___;
   1466 .type	${func}_avx,\@function,3
   1467 .align	64
   1468 ${func}_avx:
   1469 .Lavx_shortcut:
   1470 	mov	%rsp,%rax		# copy %rsp
   1471 	push	%rbx
   1472 	push	%rbp
   1473 	push	%r12
   1474 	push	%r13
   1475 	push	%r14
   1476 	push	%r15
   1477 	shl	\$4,%rdx		# num*16
   1478 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
   1479 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1480 	and	\$-64,%rsp		# align stack frame
   1481 	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
   1483 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1484 	mov	%rax,$_rsp		# save copy of %rsp
   1485 ___
   1486 $code.=<<___ if ($win64);
   1487 	movaps	%xmm6,16*$SZ+32(%rsp)
   1488 	movaps	%xmm7,16*$SZ+48(%rsp)
   1489 	movaps	%xmm8,16*$SZ+64(%rsp)
   1490 	movaps	%xmm9,16*$SZ+80(%rsp)
   1491 ___
   1492 $code.=<<___ if ($win64 && $SZ>4);
   1493 	movaps	%xmm10,16*$SZ+96(%rsp)
   1494 	movaps	%xmm11,16*$SZ+112(%rsp)
   1495 ___
   1496 $code.=<<___;
   1497 .Lprologue_avx:
   1498 
   1499 	vzeroupper
   1500 	mov	$SZ*0($ctx),$A
   1501 	mov	$SZ*1($ctx),$B
   1502 	mov	$SZ*2($ctx),$C
   1503 	mov	$SZ*3($ctx),$D
   1504 	mov	$SZ*4($ctx),$E
   1505 	mov	$SZ*5($ctx),$F
   1506 	mov	$SZ*6($ctx),$G
   1507 	mov	$SZ*7($ctx),$H
   1508 ___
   1509 					if ($SZ==4) {	# SHA256
   1510     my @X = map("%xmm$_",(0..3));
   1511     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
   1512 
   1513 $code.=<<___;
   1514 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1515 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1516 	jmp	.Lloop_avx
   1517 .align	16
   1518 .Lloop_avx:
   1519 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1520 	vmovdqu	0x00($inp),@X[0]
   1521 	vmovdqu	0x10($inp),@X[1]
   1522 	vmovdqu	0x20($inp),@X[2]
   1523 	vmovdqu	0x30($inp),@X[3]
   1524 	vpshufb	$t3,@X[0],@X[0]
   1525 	lea	$TABLE(%rip),$Tbl
   1526 	vpshufb	$t3,@X[1],@X[1]
   1527 	vpshufb	$t3,@X[2],@X[2]
   1528 	vpaddd	0x00($Tbl),@X[0],$t0
   1529 	vpshufb	$t3,@X[3],@X[3]
   1530 	vpaddd	0x20($Tbl),@X[1],$t1
   1531 	vpaddd	0x40($Tbl),@X[2],$t2
   1532 	vpaddd	0x60($Tbl),@X[3],$t3
   1533 	vmovdqa	$t0,0x00(%rsp)
   1534 	mov	$A,$a1
   1535 	vmovdqa	$t1,0x10(%rsp)
   1536 	mov	$B,$a3
   1537 	vmovdqa	$t2,0x20(%rsp)
   1538 	xor	$C,$a3			# magic
   1539 	vmovdqa	$t3,0x30(%rsp)
   1540 	mov	$E,$a0
   1541 	jmp	.Lavx_00_47
   1542 
   1543 .align	16
   1544 .Lavx_00_47:
   1545 	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
   1546 ___
   1547 sub Xupdate_256_AVX () {
   1548 	(
   1549 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
   1550 	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
   1551 	'&vpsrld	($t2,$t0,$sigma0[0]);',
   1552 	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
   1553 	'&vpsrld	($t3,$t0,$sigma0[2])',
   1554 	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
   1555 	'&vpxor		($t0,$t3,$t2)',
   1556 	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
   1557 	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1558 	'&vpxor		($t0,$t0,$t1)',
   1559 	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1560 	'&vpxor		($t0,$t0,$t2)',
   1561 	 '&vpsrld	($t2,$t3,$sigma1[2]);',
   1562 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
   1563 	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
   1564 	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
   1565 	 '&vpxor	($t2,$t2,$t3);',
   1566 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1567 	 '&vpxor	($t2,$t2,$t3)',
   1568 	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
   1569 	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
   1570 	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
   1571 	 '&vpsrld	($t2,$t3,$sigma1[2])',
   1572 	 '&vpsrlq	($t3,$t3,$sigma1[0])',
   1573 	 '&vpxor	($t2,$t2,$t3);',
   1574 	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
   1575 	 '&vpxor	($t2,$t2,$t3)',
   1576 	 '&vpshufb	($t2,$t2,$t5)',
   1577 	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
   1578 	);
   1579 }
   1580 
   1581 sub AVX_256_00_47 () {
   1582 my $j = shift;
   1583 my $body = shift;
   1584 my @X = @_;
   1585 my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
   1586 
   1587 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1588 	    eval;
   1589 	    eval(shift(@insns));
   1590 	    eval(shift(@insns));
   1591 	    eval(shift(@insns));
   1592 	}
   1593 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1594 	  foreach (@insns) { eval; }		# remaining instructions
   1595 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1596 }
   1597 
   1598     for ($i=0,$j=0; $j<4; $j++) {
   1599 	&AVX_256_00_47($j,\&body_00_15,@X);
   1600 	push(@X,shift(@X));			# rotate(@X)
   1601     }
   1602 	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
   1603 	&jne	(".Lavx_00_47");
   1604 
   1605     for ($i=0; $i<16; ) {
   1606 	foreach(body_00_15()) { eval; }
   1607     }
   1608 
   1609 					} else {	# SHA512
   1610     my @X = map("%xmm$_",(0..7));
   1611     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
   1612 
   1613 $code.=<<___;
   1614 	jmp	.Lloop_avx
   1615 .align	16
   1616 .Lloop_avx:
   1617 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1618 	vmovdqu	0x00($inp),@X[0]
   1619 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1620 	vmovdqu	0x10($inp),@X[1]
   1621 	vmovdqu	0x20($inp),@X[2]
   1622 	vpshufb	$t3,@X[0],@X[0]
   1623 	vmovdqu	0x30($inp),@X[3]
   1624 	vpshufb	$t3,@X[1],@X[1]
   1625 	vmovdqu	0x40($inp),@X[4]
   1626 	vpshufb	$t3,@X[2],@X[2]
   1627 	vmovdqu	0x50($inp),@X[5]
   1628 	vpshufb	$t3,@X[3],@X[3]
   1629 	vmovdqu	0x60($inp),@X[6]
   1630 	vpshufb	$t3,@X[4],@X[4]
   1631 	vmovdqu	0x70($inp),@X[7]
   1632 	vpshufb	$t3,@X[5],@X[5]
   1633 	vpaddq	-0x80($Tbl),@X[0],$t0
   1634 	vpshufb	$t3,@X[6],@X[6]
   1635 	vpaddq	-0x60($Tbl),@X[1],$t1
   1636 	vpshufb	$t3,@X[7],@X[7]
   1637 	vpaddq	-0x40($Tbl),@X[2],$t2
   1638 	vpaddq	-0x20($Tbl),@X[3],$t3
   1639 	vmovdqa	$t0,0x00(%rsp)
   1640 	vpaddq	0x00($Tbl),@X[4],$t0
   1641 	vmovdqa	$t1,0x10(%rsp)
   1642 	vpaddq	0x20($Tbl),@X[5],$t1
   1643 	vmovdqa	$t2,0x20(%rsp)
   1644 	vpaddq	0x40($Tbl),@X[6],$t2
   1645 	vmovdqa	$t3,0x30(%rsp)
   1646 	vpaddq	0x60($Tbl),@X[7],$t3
   1647 	vmovdqa	$t0,0x40(%rsp)
   1648 	mov	$A,$a1
   1649 	vmovdqa	$t1,0x50(%rsp)
   1650 	mov	$B,$a3
   1651 	vmovdqa	$t2,0x60(%rsp)
   1652 	xor	$C,$a3			# magic
   1653 	vmovdqa	$t3,0x70(%rsp)
   1654 	mov	$E,$a0
   1655 	jmp	.Lavx_00_47
   1656 
   1657 .align	16
   1658 .Lavx_00_47:
   1659 	add	\$`16*2*$SZ`,$Tbl
   1660 ___
   1661 sub Xupdate_512_AVX () {
   1662 	(
   1663 	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
   1664 	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
   1665 	'&vpsrlq	($t2,$t0,$sigma0[0])',
   1666 	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
   1667 	'&vpsrlq	($t3,$t0,$sigma0[2])',
   1668 	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
   1669 	 '&vpxor	($t0,$t3,$t2)',
   1670 	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
   1671 	 '&vpxor	($t0,$t0,$t1)',
   1672 	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
   1673 	 '&vpxor	($t0,$t0,$t2)',
   1674 	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
   1675 	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
   1676 	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
   1677 	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
   1678 	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
   1679 	 '&vpxor	($t3,$t3,$t2)',
   1680 	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
   1681 	 '&vpxor	($t3,$t3,$t1)',
   1682 	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
   1683 	 '&vpxor	($t3,$t3,$t2)',
   1684 	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
   1685 	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
   1686 	);
   1687 }
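#
# Xupdate_512_AVX above is the SHA-512 analogue: each xmm register holds
# two 64-bit schedule words, so one pass advances X[t..t+1] through
# W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].  Because
# AVX offers no 64-bit vector rotate, each rotation is synthesized as
# (x>>n)^(x<<(64-n)) from vpsrlq/vpsllq/vpxor pairs, using the
# @sigma0/@sigma1 amounts.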
   1688 
   1689 sub AVX_512_00_47 () {
   1690 my $j = shift;
   1691 my $body = shift;
   1692 my @X = @_;
   1693 my @insns = (&$body,&$body);			# 52 instructions
   1694 
   1695 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   1696 	    eval;
   1697 	    eval(shift(@insns));
   1698 	    eval(shift(@insns));
   1699 	}
   1700 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   1701 	  foreach (@insns) { eval; }		# remaining instructions
   1702 	&vmovdqa	(16*$j."(%rsp)",$t2);
   1703 }
   1704 
   1705     for ($i=0,$j=0; $j<8; $j++) {
   1706 	&AVX_512_00_47($j,\&body_00_15,@X);
   1707 	push(@X,shift(@X));			# rotate(@X)
   1708     }
   1709 	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
   1710 	&jne	(".Lavx_00_47");
   1711 
   1712     for ($i=0; $i<16; ) {
   1713 	foreach(body_00_15()) { eval; }
   1714     }
   1715 }
   1716 $code.=<<___;
   1717 	mov	$_ctx,$ctx
   1718 	mov	$a1,$A
   1719 
   1720 	add	$SZ*0($ctx),$A
   1721 	lea	16*$SZ($inp),$inp
   1722 	add	$SZ*1($ctx),$B
   1723 	add	$SZ*2($ctx),$C
   1724 	add	$SZ*3($ctx),$D
   1725 	add	$SZ*4($ctx),$E
   1726 	add	$SZ*5($ctx),$F
   1727 	add	$SZ*6($ctx),$G
   1728 	add	$SZ*7($ctx),$H
   1729 
   1730 	cmp	$_end,$inp
   1731 
   1732 	mov	$A,$SZ*0($ctx)
   1733 	mov	$B,$SZ*1($ctx)
   1734 	mov	$C,$SZ*2($ctx)
   1735 	mov	$D,$SZ*3($ctx)
   1736 	mov	$E,$SZ*4($ctx)
   1737 	mov	$F,$SZ*5($ctx)
   1738 	mov	$G,$SZ*6($ctx)
   1739 	mov	$H,$SZ*7($ctx)
   1740 	jb	.Lloop_avx
   1741 
   1742 	mov	$_rsp,%rsi
   1743 	vzeroupper
   1744 ___
   1745 $code.=<<___ if ($win64);
   1746 	movaps	16*$SZ+32(%rsp),%xmm6
   1747 	movaps	16*$SZ+48(%rsp),%xmm7
   1748 	movaps	16*$SZ+64(%rsp),%xmm8
   1749 	movaps	16*$SZ+80(%rsp),%xmm9
   1750 ___
   1751 $code.=<<___ if ($win64 && $SZ>4);
   1752 	movaps	16*$SZ+96(%rsp),%xmm10
   1753 	movaps	16*$SZ+112(%rsp),%xmm11
   1754 ___
   1755 $code.=<<___;
   1756 	mov	-48(%rsi),%r15
   1757 	mov	-40(%rsi),%r14
   1758 	mov	-32(%rsi),%r13
   1759 	mov	-24(%rsi),%r12
   1760 	mov	-16(%rsi),%rbp
   1761 	mov	-8(%rsi),%rbx
   1762 	lea	(%rsi),%rsp
   1763 .Lepilogue_avx:
   1764 	ret
   1765 .size	${func}_avx,.-${func}_avx
   1766 ___
   1767 
   1768 if ($avx>1) {{
   1769 ######################################################################
   1770 # AVX2+BMI code path
   1771 #
   1772 my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
   1773 my $PUSH8=8*2*$SZ;
   1774 use integer;
   1775 
   1776 sub bodyx_00_15 () {
   1777 	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
   1778 	(
   1779 	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
   1780 
   1781 	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
   1782 	'&and	($a4,$e)',		# f&e
   1783 	'&rorx	($a0,$e,$Sigma1[2])',
   1784 	'&rorx	($a2,$e,$Sigma1[1])',
   1785 
   1786 	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
   1787 	'&lea	($h,"($h,$a4)")',
   1788 	'&andn	($a4,$e,$g)',		# ~e&g
   1789 	'&xor	($a0,$a2)',
   1790 
   1791 	'&rorx	($a1,$e,$Sigma1[0])',
   1792 	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
   1793 	'&xor	($a0,$a1)',		# Sigma1(e)
   1794 	'&mov	($a2,$a)',
   1795 
   1796 	'&rorx	($a4,$a,$Sigma0[2])',
   1797 	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
   1798 	'&xor	($a2,$b)',		# a^b, b^c in next round
   1799 	'&rorx	($a1,$a,$Sigma0[1])',
   1800 
   1801 	'&rorx	($a0,$a,$Sigma0[0])',
   1802 	'&lea	($d,"($d,$h)")',	# d+=h
   1803 	'&and	($a3,$a2)',		# (b^c)&(a^b)
   1804 	'&xor	($a1,$a4)',
   1805 
   1806 	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
   1807 	'&xor	($a1,$a0)',		# Sigma0(a)
   1808 	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
   1809 	'&mov	($a4,$e)',		# copy of f in future
   1810 
   1811 	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
   1812 	);
   1813 	# and at the finish one still has to do $a+=$a1
   1814 }
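#
# The Maj term above is computed as Ch(a^b,c,b), which equals Maj(a,b,c)
# and lets consecutive rounds share the b^c value carried in $a3; the
# Sigma0(a) contribution is likewise deferred by one round (the "from
# the past" lea), which is why $a+=$a1 is still owed at the finish.
# A throwaway Perl check of the identity, not tied to this module and
# never called:
sub _check_alt_maj {				# illustrative only
	for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
		my $maj = ($a&$b)^($a&$c)^($b&$c);		# Maj(a,b,c)
		my $ch  = ((($a^$b)&$c)^((~($a^$b))&$b))&1;	# Ch(a^b,c,b)
		return 0 if ($maj!=$ch);
	}}}
	return 1;					# identity holds
}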
   1815 
   1816 $code.=<<___;
   1817 .type	${func}_avx2,\@function,3
   1818 .align	64
   1819 ${func}_avx2:
   1820 .Lavx2_shortcut:
   1821 	mov	%rsp,%rax		# copy %rsp
   1822 	push	%rbx
   1823 	push	%rbp
   1824 	push	%r12
   1825 	push	%r13
   1826 	push	%r14
   1827 	push	%r15
   1828 	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
   1829 	shl	\$4,%rdx		# num*16
   1830 	and	\$-256*$SZ,%rsp		# align stack frame
   1831 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
   1832 	add	\$`2*$SZ*($rounds-8)`,%rsp
   1833 	mov	$ctx,$_ctx		# save ctx, 1st arg
   1834 	mov	$inp,$_inp		# save inp, 2nd arg
   1835 	mov	%rdx,$_end		# save end pointer, "3rd" arg
   1836 	mov	%rax,$_rsp		# save copy of %rsp
   1837 ___
   1838 $code.=<<___ if ($win64);
   1839 	movaps	%xmm6,16*$SZ+32(%rsp)
   1840 	movaps	%xmm7,16*$SZ+48(%rsp)
   1841 	movaps	%xmm8,16*$SZ+64(%rsp)
   1842 	movaps	%xmm9,16*$SZ+80(%rsp)
   1843 ___
   1844 $code.=<<___ if ($win64 && $SZ>4);
   1845 	movaps	%xmm10,16*$SZ+96(%rsp)
   1846 	movaps	%xmm11,16*$SZ+112(%rsp)
   1847 ___
   1848 $code.=<<___;
   1849 .Lprologue_avx2:
   1850 
   1851 	vzeroupper
   1852 	sub	\$-16*$SZ,$inp		# inp++, size optimization
   1853 	mov	$SZ*0($ctx),$A
   1854 	mov	$inp,%r12		# borrow $T1
   1855 	mov	$SZ*1($ctx),$B
   1856 	cmp	%rdx,$inp		# $_end
   1857 	mov	$SZ*2($ctx),$C
   1858 	cmove	%rsp,%r12		# next block or random data
   1859 	mov	$SZ*3($ctx),$D
   1860 	mov	$SZ*4($ctx),$E
   1861 	mov	$SZ*5($ctx),$F
   1862 	mov	$SZ*6($ctx),$G
   1863 	mov	$SZ*7($ctx),$H
   1864 ___
   1865 					if ($SZ==4) {	# SHA256
   1866     my @X = map("%ymm$_",(0..3));
   1867     my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
   1868 
   1869 $code.=<<___;
   1870 	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
   1871 	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
   1872 	jmp	.Loop_avx2
   1873 .align	16
   1874 .Loop_avx2:
   1875 	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
   1876 	vmovdqu	-16*$SZ+0($inp),%xmm0
   1877 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1878 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1879 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1880 	#mov		$inp,$_inp	# offload $inp
   1881 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1882 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1883 	vpshufb		$t3,@X[0],@X[0]
   1884 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1885 	vpshufb		$t3,@X[1],@X[1]
   1886 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1887 
   1888 	lea	$TABLE(%rip),$Tbl
   1889 	vpshufb	$t3,@X[2],@X[2]
   1890 	vpaddd	0x00($Tbl),@X[0],$t0
   1891 	vpshufb	$t3,@X[3],@X[3]
   1892 	vpaddd	0x20($Tbl),@X[1],$t1
   1893 	vpaddd	0x40($Tbl),@X[2],$t2
   1894 	vpaddd	0x60($Tbl),@X[3],$t3
   1895 	vmovdqa	$t0,0x00(%rsp)
   1896 	xor	$a1,$a1
   1897 	vmovdqa	$t1,0x20(%rsp)
   1898 	lea	-$PUSH8(%rsp),%rsp
   1899 	mov	$B,$a3
   1900 	vmovdqa	$t2,0x00(%rsp)
   1901 	xor	$C,$a3			# magic
   1902 	vmovdqa	$t3,0x20(%rsp)
   1903 	mov	$F,$a4
   1904 	sub	\$-16*2*$SZ,$Tbl	# size optimization
   1905 	jmp	.Lavx2_00_47
   1906 
   1907 .align	16
   1908 .Lavx2_00_47:
   1909 ___
   1910 
   1911 sub AVX2_256_00_47 () {
   1912 my $j = shift;
   1913 my $body = shift;
   1914 my @X = @_;
   1915 my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
   1916 my $base = "+2*$PUSH8(%rsp)";
   1917 
   1918 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
   1919 	foreach (Xupdate_256_AVX()) {		# 29 instructions
   1920 	    eval;
   1921 	    eval(shift(@insns));
   1922 	    eval(shift(@insns));
   1923 	    eval(shift(@insns));
   1924 	}
   1925 	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
   1926 	  foreach (@insns) { eval; }		# remaining instructions
   1927 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   1928 }
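#
# In the AVX2 flavour the frame slides: every other call above drops
# %rsp by $PUSH8 so the freshly scheduled X[i]+K[i] pairs land below,
# while the round bodies still read the previous batch through $base
# ("+2*$PUSH8(%rsp)").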
   1929 
   1930     for ($i=0,$j=0; $j<4; $j++) {
   1931 	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
   1932 	push(@X,shift(@X));			# rotate(@X)
   1933     }
   1934 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   1935 	&cmpb	(($SZ-1)."($Tbl)",0);
   1936 	&jne	(".Lavx2_00_47");
   1937 
   1938     for ($i=0; $i<16; ) {
   1939 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   1940 	foreach(bodyx_00_15()) { eval; }
   1941     }
   1942 					} else {	# SHA512
   1943     my @X = map("%ymm$_",(0..7));
   1944     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
   1945 
   1946 $code.=<<___;
   1947 	jmp	.Loop_avx2
   1948 .align	16
   1949 .Loop_avx2:
   1950 	vmovdqu	-16*$SZ($inp),%xmm0
   1951 	vmovdqu	-16*$SZ+16($inp),%xmm1
   1952 	vmovdqu	-16*$SZ+32($inp),%xmm2
   1953 	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
   1954 	vmovdqu	-16*$SZ+48($inp),%xmm3
   1955 	vmovdqu	-16*$SZ+64($inp),%xmm4
   1956 	vmovdqu	-16*$SZ+80($inp),%xmm5
   1957 	vmovdqu	-16*$SZ+96($inp),%xmm6
   1958 	vmovdqu	-16*$SZ+112($inp),%xmm7
   1959 	#mov	$inp,$_inp	# offload $inp
   1960 	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
   1961 	vinserti128	\$1,(%r12),@X[0],@X[0]
   1962 	vinserti128	\$1,16(%r12),@X[1],@X[1]
   1963 	 vpshufb	$t2,@X[0],@X[0]
   1964 	vinserti128	\$1,32(%r12),@X[2],@X[2]
   1965 	 vpshufb	$t2,@X[1],@X[1]
   1966 	vinserti128	\$1,48(%r12),@X[3],@X[3]
   1967 	 vpshufb	$t2,@X[2],@X[2]
   1968 	vinserti128	\$1,64(%r12),@X[4],@X[4]
   1969 	 vpshufb	$t2,@X[3],@X[3]
   1970 	vinserti128	\$1,80(%r12),@X[5],@X[5]
   1971 	 vpshufb	$t2,@X[4],@X[4]
   1972 	vinserti128	\$1,96(%r12),@X[6],@X[6]
   1973 	 vpshufb	$t2,@X[5],@X[5]
   1974 	vinserti128	\$1,112(%r12),@X[7],@X[7]
   1975 
   1976 	vpaddq	-0x80($Tbl),@X[0],$t0
   1977 	vpshufb	$t2,@X[6],@X[6]
   1978 	vpaddq	-0x60($Tbl),@X[1],$t1
   1979 	vpshufb	$t2,@X[7],@X[7]
   1980 	vpaddq	-0x40($Tbl),@X[2],$t2
   1981 	vpaddq	-0x20($Tbl),@X[3],$t3
   1982 	vmovdqa	$t0,0x00(%rsp)
   1983 	vpaddq	0x00($Tbl),@X[4],$t0
   1984 	vmovdqa	$t1,0x20(%rsp)
   1985 	vpaddq	0x20($Tbl),@X[5],$t1
   1986 	vmovdqa	$t2,0x40(%rsp)
   1987 	vpaddq	0x40($Tbl),@X[6],$t2
   1988 	vmovdqa	$t3,0x60(%rsp)
   1989 	lea	-$PUSH8(%rsp),%rsp
   1990 	vpaddq	0x60($Tbl),@X[7],$t3
   1991 	vmovdqa	$t0,0x00(%rsp)
   1992 	xor	$a1,$a1
   1993 	vmovdqa	$t1,0x20(%rsp)
   1994 	mov	$B,$a3
   1995 	vmovdqa	$t2,0x40(%rsp)
   1996 	xor	$C,$a3			# magic
   1997 	vmovdqa	$t3,0x60(%rsp)
   1998 	mov	$F,$a4
   1999 	add	\$16*2*$SZ,$Tbl
   2000 	jmp	.Lavx2_00_47
   2001 
   2002 .align	16
   2003 .Lavx2_00_47:
   2004 ___
   2005 
   2006 sub AVX2_512_00_47 () {
   2007 my $j = shift;
   2008 my $body = shift;
   2009 my @X = @_;
   2010 my @insns = (&$body,&$body);			# 48 instructions
   2011 my $base = "+2*$PUSH8(%rsp)";
   2012 
   2013 	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
   2014 	foreach (Xupdate_512_AVX()) {		# 23 instructions
   2015 	    eval;
   2016 	    if ($_ !~ /\;$/) {
   2017 		eval(shift(@insns));
   2018 		eval(shift(@insns));
   2019 		eval(shift(@insns));
   2020 	    }
   2021 	}
   2022 	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
   2023 	  foreach (@insns) { eval; }		# remaining instructions
   2024 	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
   2025 }
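#
# Note the trailing-";" convention: Xupdate_512_AVX entries ending in a
# semicolon get no interleaved scalar instructions here, which spreads
# the two bodyx_00_15 round bodies (48 instructions) over the remaining
# vector steps rather than exhausting them early.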
   2026 
   2027     for ($i=0,$j=0; $j<8; $j++) {
   2028 	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
   2029 	push(@X,shift(@X));			# rotate(@X)
   2030     }
   2031 	&lea	($Tbl,16*2*$SZ."($Tbl)");
   2032 	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
   2033 	&jne	(".Lavx2_00_47");
   2034 
   2035     for ($i=0; $i<16; ) {
   2036 	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
   2037 	foreach(bodyx_00_15()) { eval; }
   2038     }
   2039 }
   2040 $code.=<<___;
   2041 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   2042 	add	$a1,$A
   2043 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   2044 	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
   2045 
   2046 	add	$SZ*0($ctx),$A
   2047 	add	$SZ*1($ctx),$B
   2048 	add	$SZ*2($ctx),$C
   2049 	add	$SZ*3($ctx),$D
   2050 	add	$SZ*4($ctx),$E
   2051 	add	$SZ*5($ctx),$F
   2052 	add	$SZ*6($ctx),$G
   2053 	add	$SZ*7($ctx),$H
   2054 
   2055 	mov	$A,$SZ*0($ctx)
   2056 	mov	$B,$SZ*1($ctx)
   2057 	mov	$C,$SZ*2($ctx)
   2058 	mov	$D,$SZ*3($ctx)
   2059 	mov	$E,$SZ*4($ctx)
   2060 	mov	$F,$SZ*5($ctx)
   2061 	mov	$G,$SZ*6($ctx)
   2062 	mov	$H,$SZ*7($ctx)
   2063 
   2064 	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
   2065 	je	.Ldone_avx2
   2066 
   2067 	xor	$a1,$a1
   2068 	mov	$B,$a3
   2069 	xor	$C,$a3			# magic
   2070 	mov	$F,$a4
   2071 	jmp	.Lower_avx2
   2072 .align	16
   2073 .Lower_avx2:
   2074 ___
   2075     for ($i=0; $i<8; ) {
   2076 	my $base="+16($Tbl)";
   2077 	foreach(bodyx_00_15()) { eval; }
   2078     }
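#
# This second pass (.Lower_avx2) runs the rounds for the data that
# vinserti128 placed in the upper 128-bit lanes, i.e. the second input
# block: $base points 16 bytes into each stacked X[i]+K[i] slot, and
# $Tbl steps back down the frame by $PUSH8 until it meets %rsp.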
   2079 $code.=<<___;
   2080 	lea	-$PUSH8($Tbl),$Tbl
   2081 	cmp	%rsp,$Tbl
   2082 	jae	.Lower_avx2
   2083 
   2084 	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
   2085 	add	$a1,$A
   2086 	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
   2087 	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
   2088 
   2089 	add	$SZ*0($ctx),$A
   2090 	add	$SZ*1($ctx),$B
   2091 	add	$SZ*2($ctx),$C
   2092 	add	$SZ*3($ctx),$D
   2093 	add	$SZ*4($ctx),$E
   2094 	add	$SZ*5($ctx),$F
   2095 	lea	`2*16*$SZ`($inp),$inp	# inp+=2
   2096 	add	$SZ*6($ctx),$G
   2097 	mov	$inp,%r12
   2098 	add	$SZ*7($ctx),$H
   2099 	cmp	$_end,$inp
   2100 
   2101 	mov	$A,$SZ*0($ctx)
   2102 	cmove	%rsp,%r12		# next block or stale data
   2103 	mov	$B,$SZ*1($ctx)
   2104 	mov	$C,$SZ*2($ctx)
   2105 	mov	$D,$SZ*3($ctx)
   2106 	mov	$E,$SZ*4($ctx)
   2107 	mov	$F,$SZ*5($ctx)
   2108 	mov	$G,$SZ*6($ctx)
   2109 	mov	$H,$SZ*7($ctx)
   2110 
   2111 	jbe	.Loop_avx2
   2112 	lea	(%rsp),$Tbl
   2113 
   2114 .Ldone_avx2:
   2115 	lea	($Tbl),%rsp
   2116 	mov	$_rsp,%rsi
   2117 	vzeroupper
   2118 ___
   2119 $code.=<<___ if ($win64);
   2120 	movaps	16*$SZ+32(%rsp),%xmm6
   2121 	movaps	16*$SZ+48(%rsp),%xmm7
   2122 	movaps	16*$SZ+64(%rsp),%xmm8
   2123 	movaps	16*$SZ+80(%rsp),%xmm9
   2124 ___
   2125 $code.=<<___ if ($win64 && $SZ>4);
   2126 	movaps	16*$SZ+96(%rsp),%xmm10
   2127 	movaps	16*$SZ+112(%rsp),%xmm11
   2128 ___
   2129 $code.=<<___;
   2130 	mov	-48(%rsi),%r15
   2131 	mov	-40(%rsi),%r14
   2132 	mov	-32(%rsi),%r13
   2133 	mov	-24(%rsi),%r12
   2134 	mov	-16(%rsi),%rbp
   2135 	mov	-8(%rsi),%rbx
   2136 	lea	(%rsi),%rsp
   2137 .Lepilogue_avx2:
   2138 	ret
   2139 .size	${func}_avx2,.-${func}_avx2
   2140 ___
   2141 }}
   2142 }}}}}
   2143 
   2144 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2145 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   2146 if ($win64) {
   2147 $rec="%rcx";
   2148 $frame="%rdx";
   2149 $context="%r8";
   2150 $disp="%r9";
   2151 
   2152 $code.=<<___;
   2153 .extern	__imp_RtlVirtualUnwind
   2154 .type	se_handler,\@abi-omnipotent
   2155 .align	16
   2156 se_handler:
   2157 	push	%rsi
   2158 	push	%rdi
   2159 	push	%rbx
   2160 	push	%rbp
   2161 	push	%r12
   2162 	push	%r13
   2163 	push	%r14
   2164 	push	%r15
   2165 	pushfq
   2166 	sub	\$64,%rsp
   2167 
   2168 	mov	120($context),%rax	# pull context->Rax
   2169 	mov	248($context),%rbx	# pull context->Rip
   2170 
   2171 	mov	8($disp),%rsi		# disp->ImageBase
   2172 	mov	56($disp),%r11		# disp->HandlerData
   2173 
   2174 	mov	0(%r11),%r10d		# HandlerData[0]
   2175 	lea	(%rsi,%r10),%r10	# prologue label
   2176 	cmp	%r10,%rbx		# context->Rip<prologue label
   2177 	jb	.Lin_prologue
   2178 
   2179 	mov	152($context),%rax	# pull context->Rsp
   2180 
   2181 	mov	4(%r11),%r10d		# HandlerData[1]
   2182 	lea	(%rsi,%r10),%r10	# epilogue label
   2183 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2184 	jae	.Lin_prologue
   2185 ___
   2186 $code.=<<___ if ($avx>1);
   2187 	lea	.Lavx2_shortcut(%rip),%r10
   2188 	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
   2189 	jb	.Lnot_in_avx2
   2190 
   2191 	and	\$-256*$SZ,%rax
   2192 	add	\$`2*$SZ*($rounds-8)`,%rax
   2193 .Lnot_in_avx2:
   2194 ___
   2195 $code.=<<___;
   2196 	mov	%rax,%rsi		# put aside Rsp
   2197 	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
   2198 
   2199 	mov	-8(%rax),%rbx
   2200 	mov	-16(%rax),%rbp
   2201 	mov	-24(%rax),%r12
   2202 	mov	-32(%rax),%r13
   2203 	mov	-40(%rax),%r14
   2204 	mov	-48(%rax),%r15
   2205 	mov	%rbx,144($context)	# restore context->Rbx
   2206 	mov	%rbp,160($context)	# restore context->Rbp
   2207 	mov	%r12,216($context)	# restore context->R12
   2208 	mov	%r13,224($context)	# restore context->R13
   2209 	mov	%r14,232($context)	# restore context->R14
   2210 	mov	%r15,240($context)	# restore context->R15
   2211 
   2212 	lea	.Lepilogue(%rip),%r10
   2213 	cmp	%r10,%rbx
   2214 	jb	.Lin_prologue		# non-AVX code
   2215 
   2216 	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
   2217 	lea	512($context),%rdi	# &context.Xmm6
   2218 	mov	\$`$SZ==4?8:12`,%ecx
   2219 	.long	0xa548f3fc		# cld; rep movsq
   2220 
   2221 .Lin_prologue:
   2222 	mov	8(%rax),%rdi
   2223 	mov	16(%rax),%rsi
   2224 	mov	%rax,152($context)	# restore context->Rsp
   2225 	mov	%rsi,168($context)	# restore context->Rsi
   2226 	mov	%rdi,176($context)	# restore context->Rdi
   2227 
   2228 	mov	40($disp),%rdi		# disp->ContextRecord
   2229 	mov	$context,%rsi		# context
   2230 	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
   2231 	.long	0xa548f3fc		# cld; rep movsq
   2232 
   2233 	mov	$disp,%rsi
   2234 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2235 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2236 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2237 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2238 	mov	40(%rsi),%r10		# disp->ContextRecord
   2239 	lea	56(%rsi),%r11		# &disp->HandlerData
   2240 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2241 	mov	%r10,32(%rsp)		# arg5
   2242 	mov	%r11,40(%rsp)		# arg6
   2243 	mov	%r12,48(%rsp)		# arg7
   2244 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2245 	call	*__imp_RtlVirtualUnwind(%rip)
   2246 
   2247 	mov	\$1,%eax		# ExceptionContinueSearch
   2248 	add	\$64,%rsp
   2249 	popfq
   2250 	pop	%r15
   2251 	pop	%r14
   2252 	pop	%r13
   2253 	pop	%r12
   2254 	pop	%rbp
   2255 	pop	%rbx
   2256 	pop	%rdi
   2257 	pop	%rsi
   2258 	ret
   2259 .size	se_handler,.-se_handler
   2260 ___
   2261 
   2262 $code.=<<___ if ($SZ==4 && $shaext);
   2263 .type	shaext_handler,\@abi-omnipotent
   2264 .align	16
   2265 shaext_handler:
   2266 	push	%rsi
   2267 	push	%rdi
   2268 	push	%rbx
   2269 	push	%rbp
   2270 	push	%r12
   2271 	push	%r13
   2272 	push	%r14
   2273 	push	%r15
   2274 	pushfq
   2275 	sub	\$64,%rsp
   2276 
   2277 	mov	120($context),%rax	# pull context->Rax
   2278 	mov	248($context),%rbx	# pull context->Rip
   2279 
   2280 	lea	.Lprologue_shaext(%rip),%r10
   2281 	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
   2282 	jb	.Lin_prologue
   2283 
   2284 	lea	.Lepilogue_shaext(%rip),%r10
   2285 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
   2286 	jae	.Lin_prologue
   2287 
   2288 	lea	-8-5*16(%rax),%rsi
   2289 	lea	512($context),%rdi	# &context.Xmm6
   2290 	mov	\$10,%ecx
   2291 	.long	0xa548f3fc		# cld; rep movsq
   2292 
   2293 	jmp	.Lin_prologue
   2294 .size	shaext_handler,.-shaext_handler
   2295 ___
   2296 
   2297 $code.=<<___;
   2298 .section	.pdata
   2299 .align	4
   2300 	.rva	.LSEH_begin_$func
   2301 	.rva	.LSEH_end_$func
   2302 	.rva	.LSEH_info_$func
   2303 ___
   2304 $code.=<<___ if ($SZ==4 && $shaext);
   2305 	.rva	.LSEH_begin_${func}_shaext
   2306 	.rva	.LSEH_end_${func}_shaext
   2307 	.rva	.LSEH_info_${func}_shaext
   2308 ___
   2309 $code.=<<___ if ($SZ==4);
   2310 	.rva	.LSEH_begin_${func}_ssse3
   2311 	.rva	.LSEH_end_${func}_ssse3
   2312 	.rva	.LSEH_info_${func}_ssse3
   2313 ___
   2314 $code.=<<___ if ($avx && $SZ==8);
   2315 	.rva	.LSEH_begin_${func}_xop
   2316 	.rva	.LSEH_end_${func}_xop
   2317 	.rva	.LSEH_info_${func}_xop
   2318 ___
   2319 $code.=<<___ if ($avx);
   2320 	.rva	.LSEH_begin_${func}_avx
   2321 	.rva	.LSEH_end_${func}_avx
   2322 	.rva	.LSEH_info_${func}_avx
   2323 ___
   2324 $code.=<<___ if ($avx>1);
   2325 	.rva	.LSEH_begin_${func}_avx2
   2326 	.rva	.LSEH_end_${func}_avx2
   2327 	.rva	.LSEH_info_${func}_avx2
   2328 ___
   2329 $code.=<<___;
   2330 .section	.xdata
   2331 .align	8
   2332 .LSEH_info_$func:
   2333 	.byte	9,0,0,0
   2334 	.rva	se_handler
   2335 	.rva	.Lprologue,.Lepilogue			# HandlerData[]
   2336 ___
   2337 $code.=<<___ if ($SZ==4 && $shaext);
   2338 .LSEH_info_${func}_shaext:
   2339 	.byte	9,0,0,0
   2340 	.rva	shaext_handler
   2341 ___
   2342 $code.=<<___ if ($SZ==4);
   2343 .LSEH_info_${func}_ssse3:
   2344 	.byte	9,0,0,0
   2345 	.rva	se_handler
   2346 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   2347 ___
   2348 $code.=<<___ if ($avx && $SZ==8);
   2349 .LSEH_info_${func}_xop:
   2350 	.byte	9,0,0,0
   2351 	.rva	se_handler
   2352 	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
   2353 ___
   2354 $code.=<<___ if ($avx);
   2355 .LSEH_info_${func}_avx:
   2356 	.byte	9,0,0,0
   2357 	.rva	se_handler
   2358 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   2359 ___
   2360 $code.=<<___ if ($avx>1);
   2361 .LSEH_info_${func}_avx2:
   2362 	.byte	9,0,0,0
   2363 	.rva	se_handler
   2364 	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
   2365 ___
   2366 }
   2367 
   2368 sub sha256op38 {
   2369     my $instr = shift;
   2370     my %opcodelet = (
   2371 		"sha256rnds2" => 0xcb,
   2372 		"sha256msg1"  => 0xcc,
   2373 		"sha256msg2"  => 0xcd	);
   2374 
   2375     if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
   2376       my @opcode=(0x0f,0x38);
   2377 	push @opcode,$opcodelet{$instr};
   2378 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2379 	return ".byte\t".join(',',@opcode);
   2380     } else {
   2381 	return $instr."\t".$_[0];
   2382     }
   2383 }
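#
# sha256op38() hand-assembles the SHA extension mnemonics so the module
# still builds with assemblers that predate them.  As an example of what
# the table and ModR/M formula above produce, "sha256msg1 %xmm3,%xmm4"
# comes out as
#
#	.byte	0x0f,0x38,0xcc,0xe3		# 0xc0|3|(4<<3)=0xe3
#
# while any operand string that does not match the two-xmm pattern is
# passed through to the assembler untouched.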
   2384 
   2385 foreach (split("\n",$code)) {
   2386 	s/\`([^\`]*)\`/eval $1/geo;
   2387 
   2388 	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
   2389 
   2390 	print $_,"\n";
   2391 }
   2392 close STDOUT;
   2393