#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result
     96 $flavour = shift;
     97 $output  = shift;
     98 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     99 
    100 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    101 
    102 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    103 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    104 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    105 die "can't locate x86_64-xlate.pl";
    106 
    107 # In upstream, this is controlled by shelling out to the compiler to check
    108 # versions, but BoringSSL is intended to be used with pre-generated perlasm
    109 # output, so this isn't useful anyway.
    110 #
    111 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
    112 # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
    113 # did not tie them together until after $shaext was added.
    114 $avx = 1;
    115 
    116 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
    117 # been tested.
    118 $shaext=0;	### set to zero if compiling for 1.0.1
    119 $avx=1		if (!$shaext && $avx);
    120 
    121 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    122 *STDOUT=*OUT;
    123 
    124 $ctx="%rdi";	# 1st arg
    125 $inp="%rsi";	# 2nd arg
    126 $num="%rdx";	# 3rd arg
    127 
    128 # reassign arguments in order to produce more compact code
    129 $ctx="%r8";
    130 $inp="%r9";
    131 $num="%r10";
    132 
    133 $t0="%eax";
    134 $t1="%ebx";
    135 $t2="%ecx";
    136 @xi=("%edx","%ebp","%r14d");
    137 $A="%esi";
    138 $B="%edi";
    139 $C="%r11d";
    140 $D="%r12d";
    141 $E="%r13d";
    142 
    143 @V=($A,$B,$C,$D,$E);
    144 
# Emit one scalar round for rounds 0..19 (K = 0x5a827999).  $i is the
# round index; ($a..$e) name the registers holding the working state for
# this round.  F(b,c,d) is computed as d^(b&(c^d)), which equals
# Ch(b,c,d) = (b&c)|(~b&d).  @xi is a small ring: $xi[0] holds the current
# schedule word X[i], $xi[1] the word being prepared for round $i+1.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
# Round 0 only: fetch and byte-swap the first (big-endian) message word.
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
# Rounds 0..14: the next word comes straight from the input block; the
# current word is spilled to the stack buffer for later schedule updates.
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
# Rounds 15..19: X[j] = rol(X[j-16]^X[j-14]^X[j-8]^X[j-3], 1), with the
# X[j-16], X[j-14], X[j-8] terms read from the 16-word circular buffer at
# (%rsp) (offsets j%16, (j+2)%16, (j+8)%16) and xored into $xi[1].
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
# Rotate @xi so that next round's word becomes $xi[0].
push(@xi,shift(@xi));
}
    186 
# Emit one scalar round for rounds 20..39 and 60..79, where
# F(b,c,d) = b^c^d.  K is 0x6ed9eba1 for $i<40 and 0xca62c1d6 otherwise.
# Rounds >= 72 skip the store of the current word (those stack slots are
# never read again), and round 79 performs no schedule work at all.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
# Final round: no next schedule word to prepare.
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));	# rotate the schedule-word ring
}
    220 
# Emit one scalar round for rounds 40..59 (K = 0x8f1bbcdc).  The majority
# function Maj(b,c,d) is computed additively as (c&d) + (b&(c^d)): the two
# halves live in $t0 and $t1 and are folded into $e independently, which
# gives the scheduler more freedom.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));	# rotate the schedule-word ring
}
    245 
# Dispatcher plus the plain integer (IALU) implementation.  The public
# entry point probes OPENSSL_ia32cap_P and tail-jumps to the fastest
# available code path.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
# The SHAEXT/AVX2/AVX shortcuts are only emitted when the corresponding
# feature flags were enabled above.
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
# Integer path: save callee-saved registers, carve out a 64-byte-aligned
# 16-word schedule buffer, and stash the original %rsp just above it.
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Fully unroll the 80 rounds; @V is rotated after each round so the round
# generators always receive the state words in (a,b,c,d,e) order.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Fold the block's result into the context, advance to the next 64-byte
# block, and loop; then restore callee-saved registers and return.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
# NOTE(review): the Win64 prologue below stores through %rax before any
# visible instruction initializes it -- confirm against upstream, which
# is expected to capture the old %rsp in %rax first.  Dead code here,
# since $shaext is 0.
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the state, flip it into the word order sha1rnds4 expects, and
# byte-swap the first block's four 16-byte message vectors.
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
# Rounds 0..63: each loop iteration emits two sha1rnds4 (four rounds
# apiece) plus the sha1msg1/pxor/sha1msg2 message-schedule updates; @MSG
# is rotated twice per iteration.
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
# Rounds 64..79, interleaved with loading and byte-swapping the next
# block.  The trailing jnz consumes the flags set by "dec $num" at the
# loop top -- none of the intervening SSE/lea/cmov/mov instructions
# write EFLAGS.
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
# NOTE(review): .cfi_endproc is emitted before the ret, so the unwind
# info does not cover the return instruction; upstream places it after
# ret.  Confirm before enabling this path.
$code.=<<___;
.cfi_endproc
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
# SSSE3 (and AVX) code-path state shared by the Xupdate_*/body_* helpers.
my $Xi=4;				# schedule-vector index; starts at 4
my @X=map("%xmm$_",(4..7,0..3));	# sliding window of schedule vectors
my @Tx=map("%xmm$_",(8..10));		# SIMD temporaries / staged K constant
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");			# rotating pair of scalar temporaries
my $j=0;				# scalar round counter
my $rx=0;				# cumulative round count, drives body_* handoff
my $K_XX_XX="%r14";			# points 64 bytes into the constant table
my $fp="%r11";				# frame pointer: %rsp as on entry

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

# Emit a jump over alignment padding so execution never runs through the
# pad bytes; $sn makes each emitted label unique.
{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}
    494 
# SSSE3 entry: standard prologue with the incoming %rsp kept in $fp so the
# stack can be realigned, 64 bytes of X[]+K transfer area, and the first
# block's message words loaded, byte-swapped and pre-biased with K_00_19.
$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
# Win64 only: preserve the non-volatile XMM registers we clobber.
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
# $num is converted from a block count into an end-of-input pointer so the
# main loop can simply compare $inp against it.
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___
    566 
# Catch-all for otherwise-undefined instruction subs, 32-bit perlasm
# style: the missing sub's name becomes the mnemonic and a line is
# appended to $code.  Arguments are given destination-first; the last
# argument is printed first and the remainder reversed, yielding AT&T
# operand order.  A purely numeric last argument becomes an immediate.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
    573 
# SIMD message-schedule update for rounds 16..31, interleaved with four
# rounds' worth of scalar instructions drawn from $body.  Four new
# schedule words are produced per call; since X[i] depends on X[i-3]
# inside the same vector, the rotate-left-by-1 of the top word is fixed
# up with the pslldq/psrld/pslld/pxor sequence on @Tx.  The X[]+K value
# for the IALU is also stored to the stack here.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
    655 
# SIMD schedule update for rounds 32..79.  Uses the doubled recurrence
# X[i] = rol(X[i-6]^X[i-16]^X[i-28]^X[i-32], 2) (see the inline "X[-..]"
# annotations), which has no intra-vector dependency, so four words come
# out with plain xors plus a rotate by 2.  A fresh K constant is loaded
# every fifth call; otherwise the staged one is carried forward.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
    727 
# Final schedule step: stores the last X[]+K quad for the IALU, then
# decides whether more input remains (cmp $inp,$num -> .Ldone_ssse3).
# If another block follows, it is loaded here so Xloop_ssse3 can overlap
# byte-swapping with the first scalar rounds.  Resets $Xi for the next
# block.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
    762 
# Four scalar rounds interleaved with preparing one vector of the NEXT
# block: byte-swap it, add the K constant, store X[]+K to the stack for
# the IALU, then strip K again to keep the raw schedule word.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
    792 
    793 sub Xtail_ssse3()
    794 { use integer;
    795   my $body = shift;
    796   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    797   my ($a,$b,$c,$d,$e);
    798 
    799 	foreach (@insns) { eval; }
    800 }
    801 
# Round-body generator for rounds 0..19: returns a list of Perl snippets
# (one per instruction slot) that the Xupdate_* helpers eval between SIMD
# instructions.  F = ((c^d)&b)^d; @T[0] carries the partially-computed F
# for the NEXT round ("magic seed" pipelining).  At $rx==19 control is
# handed to body_20_39, which finishes round 19's skewed tail.
sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
    822 
# Round-body generator for rounds 20..39 and 60..79: F = b^d^c, with the
# parity accumulated incrementally in @T[0] across rounds.  At $rx==39
# control is handed to body_40_59 for the skewed transition.
sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
    841 
# Round-body generator for rounds 40..59: Maj(b,c,d) expressed as
# ((b^c)&(c^d))^c, again with the (b^c) term pipelined one round ahead
# in @T[0] and $c temporarily holding c^d.
sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
# Main SSSE3 loop: rounds 0..15 of each block were pre-staged on the
# stack, so one trip through .Loop_ssse3 runs rounds 16..79 of the
# current block interleaved with SIMD schedule updates, then rounds 0..15
# of the NEXT block interleaved with loading and byte-swapping its data.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	# The final 12 scalar rounds are generated twice: once here for the
	# "another block follows" path and once after .Ldone_ssse3.  Save
	# the generator state and restore it before the second expansion.
				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

# Fold this block's result into the context and re-derive the pipelined
# @T values expected by the first round of the next iteration.
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
	# No further input: replay the last rounds without next-block work.
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
# Epilogue: restore Win64 XMM registers, callee-saved GPRs and %rsp.
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
    955 ___
    956 
# AVX code path.  Same round structure as the SSSE3 version, but the message
# schedule uses non-destructive 3-operand AVX forms, and the rotate helpers
# are redirected to shld/shrd below.
if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

# Rotates emitted as double-shifts for this code path.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

# Prologue: anchor $fp, push callee-saved GPRs, then reserve 64 bytes of
# X[]+K scratch (plus 6*16 bytes of xmm spill space on Win64).
$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
# Win64 ABI: spill callee-saved xmm6-xmm11 below the saved frame pointer.
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
# Load the hash state, byte-swap the first 64-byte block, and precompute
# X[0..2]+K_00_19 into the stack scratch area before entering the loop.
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___
   1038 
# Emit one AVX message-schedule step for rounds 16..31: builds the next four
# W[] dwords in @X[0] and spills W+K to the stack, interleaved with ~40
# scalar round instructions produced by $body.  In this range W[i] depends
# on W[i-3], which lies inside the very xmm being computed, hence the extra
# dword-shift fixup (vpslldq/vpsrld/vpslld) relative to the 32..79 variant.
# Advances $Xi and rotates @X on exit.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
   1113 
# Message-schedule step for rounds 32..79: per the inline comments this uses
# the cheaper form operating on dwords 6/16/28/32 back with a rotate by 2
# ("X[0]"<<<=2), so no cross-dword fixup is needed.  Scalar round code from
# $body is interleaved around the vector instructions; advances $Xi and
# rotates @X on exit.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
   1172 
# Final schedule step of an 80-round pass: stores the last W+K batch, then
# emits the end-of-input check -- if $inp has reached $num, jump to the
# "done" tail; otherwise load and byte-swap the next 64-byte block and
# advance $inp.  Resets $Xi to 0 for the next pass.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
   1204 
# One step of the next-block lead-in: byte-swaps one freshly loaded input
# vector and precomputes its W+K spill to the stack, interleaved with 32
# scalar round instructions from $body.  Advances $Xi (no @X rotation here).
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
   1228 
   1229 sub Xtail_avx()
   1230 { use integer;
   1231   my $body = shift;
   1232   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1233   my ($a,$b,$c,$d,$e);
   1234 
   1235 	foreach (@insns) { eval; }
   1236 }
   1237 
# Generate the 80-round main AVX loop.  Rounds 16..31 need the "short"
# schedule update, 32..79 the rotate-by-2 form; the F-function generator
# passed in switches at the 20/40/60 round boundaries.
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				# Snapshot the generator's round counter and
				# register rotation so the "done" tail below is
				# emitted from the same state as this point.
				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

# Fall-through path: another block was loaded by Xuplast; fold the working
# registers into the context, reseed the F-function helpers and loop.
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

# "Done" tail: the last block has been processed; update the context and
# fall into the epilogue.
$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
# Win64: restore callee-saved xmm registers spilled by the prologue.
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
# Restore callee-saved GPRs and the stack pointer, then return.
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
   1332 
# AVX2 code path: ymm registers carry two 64-byte blocks at once (low/high
# 128-bit lanes -- see the vinserti128 loads below), and the scalar rounds
# use BMI andn/rorx via the bodyx_* generators.
if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
# Win64: spill callee-saved xmm6-xmm11 below the saved frame pointer.
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
# Load context; $frame points at the second block (or the same block again
# when only one remains -- see cmovae).  Both blocks are loaded into the
# two 128-bit lanes of each ymm register and byte-swapped.
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
# Precompute the schedule for X[] slots 4..7 up front: an inlined copy of
# Xupdate_avx2_16_31's vector sequence, without interleaved scalar rounds.
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
# Loop entry: seed the F helpers ($B=b>>>2, $F=Ch(b,c,d)) before round 0.
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
# Returns the perl snippet strings for one AVX2 scalar round in the 0..19
# range: F = Ch(b,c,d) assembled from andn/and/xor, a<<<5 and b>>>2 via
# rorx.  The snippet list also advances $frame every 32 rounds and rotates
# @ROTX; delegates to bodyx_20_39 once $rx reaches 19.
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
   1476 
# Round snippets for the 20..39 (and, after 59, 60..79) ranges:
# F = Parity = b^c^d, the cheapest round form.  Next-round F setup is
# suppressed for the final round ($j>=79).  Hands off to bodyx_40_59 once
# $rx reaches 39.
sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
   1497 
# Round snippets for the 40..59 range: F = Maj(b,c,d) computed as
# (b^c)&(c^d)^c.  At $j==59 the next-round seed switches to F=b^c^d so the
# 60..79 rounds (emitted via bodyx_20_39) start correctly.
sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
   1522 
# AVX2 analogue of Xupdate_avx_16_31: builds the next four W[] dwords for
# both blocks (ymm lanes) including the cross-dword fixup, adds K and spills
# W+K at 32-byte stride, interleaved with 35 scalar round instructions from
# $body.  Advances $Xi and rotates @X on exit.
sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}
   1588 
# AVX2 analogue of Xupdate_avx_32_79: the rotate-by-2 schedule form for
# rounds 32..79 (no cross-dword fixup needed), with W+K spilled at 32-byte
# stride, interleaved with up to 50 scalar round instructions from $body.
# Advances $Xi and rotates @X on exit.
sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}
   1641 
   1642 sub Xloop_avx2()
   1643 { use integer;
   1644   my $body = shift;
   1645   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
   1646   my ($a,$b,$c,$d,$e);
   1647 
   1648 	 foreach (@insns) { eval; }
   1649 }
   1650 
# Body of .Loop_avx2: the scalar rounds (F switching at 20/40/60 via $rx
# inside the bodyx_* generators) interleaved with schedule updates for the
# following blocks; the last 20 rounds (Xloop_avx2) carry no vector work.
	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

# Update the context from the permuted register assignment (see the
# "output is d-e-[a]-f-b-c" comment) and decide whether input remains.
$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___
   1703 
# Tail handling: reset the X[] rotation, then either load the final one or
# two blocks (.Last_avx2 path) or fall through when the remaining data was
# already scheduled; byte-swapping and K-addition for the freshly loaded
# data are folded in between the scalar-only round batches below.
$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	# Restart the round generators from round 0 with the current
	# register permutation.
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);
   1766 
# Final context update (same register permutation as above), loop-or-exit
# decision, and the AVX2 epilogue.
$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
# Win64: restore callee-saved xmm registers.
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
# Restore callee-saved GPRs and the stack pointer, then return.
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
# end of: if ($avx>1)
}
# end of: if ($avx)
}
# Round-constant and byte-swap tables, each line replicated so a ymm load
# gets the same constant in both 128-bit lanes.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
   1845 
   1846 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1847 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   1848 if ($win64) {
   1849 $rec="%rcx";
   1850 $frame="%rdx";
   1851 $context="%r8";
   1852 $disp="%r9";
   1853 
   1854 $code.=<<___;
   1855 .extern	__imp_RtlVirtualUnwind
   1856 .type	se_handler,\@abi-omnipotent
   1857 .align	16
   1858 se_handler:
   1859 	push	%rsi
   1860 	push	%rdi
   1861 	push	%rbx
   1862 	push	%rbp
   1863 	push	%r12
   1864 	push	%r13
   1865 	push	%r14
   1866 	push	%r15
   1867 	pushfq
   1868 	sub	\$64,%rsp
   1869 
   1870 	mov	120($context),%rax	# pull context->Rax
   1871 	mov	248($context),%rbx	# pull context->Rip
   1872 
   1873 	lea	.Lprologue(%rip),%r10
   1874 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1875 	jb	.Lcommon_seh_tail
   1876 
   1877 	mov	152($context),%rax	# pull context->Rsp
   1878 
   1879 	lea	.Lepilogue(%rip),%r10
   1880 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1881 	jae	.Lcommon_seh_tail
   1882 
   1883 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
   1884 
   1885 	mov	-8(%rax),%rbx
   1886 	mov	-16(%rax),%rbp
   1887 	mov	-24(%rax),%r12
   1888 	mov	-32(%rax),%r13
   1889 	mov	-40(%rax),%r14
   1890 	mov	%rbx,144($context)	# restore context->Rbx
   1891 	mov	%rbp,160($context)	# restore context->Rbp
   1892 	mov	%r12,216($context)	# restore context->R12
   1893 	mov	%r13,224($context)	# restore context->R13
   1894 	mov	%r14,232($context)	# restore context->R14
   1895 
   1896 	jmp	.Lcommon_seh_tail
   1897 .size	se_handler,.-se_handler
   1898 ___
   1899 
   1900 $code.=<<___ if ($shaext);
   1901 .type	shaext_handler,\@abi-omnipotent
   1902 .align	16
   1903 shaext_handler:
   1904 	push	%rsi
   1905 	push	%rdi
   1906 	push	%rbx
   1907 	push	%rbp
   1908 	push	%r12
   1909 	push	%r13
   1910 	push	%r14
   1911 	push	%r15
   1912 	pushfq
   1913 	sub	\$64,%rsp
   1914 
   1915 	mov	120($context),%rax	# pull context->Rax
   1916 	mov	248($context),%rbx	# pull context->Rip
   1917 
   1918 	lea	.Lprologue_shaext(%rip),%r10
   1919 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1920 	jb	.Lcommon_seh_tail
   1921 
   1922 	lea	.Lepilogue_shaext(%rip),%r10
   1923 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1924 	jae	.Lcommon_seh_tail
   1925 
   1926 	lea	-8-4*16(%rax),%rsi
   1927 	lea	512($context),%rdi	# &context.Xmm6
   1928 	mov	\$8,%ecx
   1929 	.long	0xa548f3fc		# cld; rep movsq
   1930 
   1931 	jmp	.Lcommon_seh_tail
   1932 .size	shaext_handler,.-shaext_handler
   1933 ___
   1934 
# Win64 structured-exception handler shared by the SSSE3/AVX/AVX2 code
# paths.  It reads the prologue/epilogue labels from HandlerData[] to
# decide whether the frame is live, copies XMM6-XMM11 and the five
# callee-saved GPRs back into the CONTEXT record, then falls through to
# .Lcommon_seh_tail, which forwards the unwind via RtlVirtualUnwind.
$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
# Remaining .pdata RUNTIME_FUNCTION entries: one begin/end/info triple
# per implementation, each emitted only when the corresponding code path
# was generated above ($shaext / ssse3-always / $avx / $avx>1).
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
# The SSSE3 path is emitted unconditionally, hence no guard here.
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
# .xdata UNWIND_INFO blocks referenced from .pdata above.  ".byte 9,0,0,0"
# is version 1 with UNW_FLAG_EHANDLER set (language-specific handler
# follows), zero prologue size and zero unwind codes; the .rva names the
# handler routine, optionally followed by its HandlerData[] label pair.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
# The SSSE3/AVX/AVX2 frames share ssse3_handler and differ only in the
# prologue/epilogue labels passed as HandlerData[].
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
   2080 }
   2081 
   2082 ####################################################################
   2083 
   2084 sub sha1rnds4 {
   2085     if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
   2086       my @opcode=(0x0f,0x3a,0xcc);
   2087 	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
   2088 	my $c=$1;
   2089 	push @opcode,$c=~/^0/?oct($c):$c;
   2090 	return ".byte\t".join(',',@opcode);
   2091     } else {
   2092 	return "sha1rnds4\t".@_[0];
   2093     }
   2094 }
   2095 
   2096 sub sha1op38 {
   2097     my $instr = shift;
   2098     my %opcodelet = (
   2099 		"sha1nexte" => 0xc8,
   2100   		"sha1msg1"  => 0xc9,
   2101 		"sha1msg2"  => 0xca	);
   2102 
   2103     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   2104       my @opcode=(0x0f,0x38);
   2105       my $rex=0;
   2106 	$rex|=0x04			if ($2>=8);
   2107 	$rex|=0x01			if ($1>=8);
   2108 	unshift @opcode,0x40|$rex	if ($rex);
   2109 	push @opcode,$opcodelet{$instr};
   2110 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2111 	return ".byte\t".join(',',@opcode);
   2112     } else {
   2113 	return $instr."\t".@_[0];
   2114     }
   2115 }
   2116 
   2117 foreach (split("\n",$code)) {
   2118 	s/\`([^\`]*)\`/eval $1/geo;
   2119 
   2120 	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
   2121 	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
   2122 
   2123 	print $_,"\n";
   2124 }
   2125 close STDOUT;
   2126