Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # sha1_block procedure for x86_64.
     11 #
     12 # It was brought to my attention that on EM64T compiler-generated code
     13 # was far behind 32-bit assembler implementation. This is unlike on
     14 # Opteron where compiler-generated code was only 15% behind 32-bit
     15 # assembler, which originally made it hard to motivate the effort.
     16 # There was a suggestion to mechanically translate 32-bit code, but I
     17 # dismissed it, reasoning that x86_64 offers enough register bank
     18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
     19 # implementation:-) However! While 64-bit code does perform better
     20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
     21 # x86_64 does offer larger *addressable* bank, but out-of-order core
     22 # reaches for even more registers through dynamic aliasing, and EM64T
     23 # core must have managed to run-time optimize even 32-bit code just as
     24 # good as 64-bit one. Performance improvement is summarized in the
     25 # following table:
     26 #
     27 #		gcc 3.4		32-bit asm	cycles/byte
     28 # Opteron	+45%		+20%		6.8
     29 # Xeon P4	+65%		+0%		9.9
     30 # Core2		+60%		+10%		7.0
     31 
     32 # August 2009.
     33 #
     34 # The code was revised to minimize code size and to maximize
     35 # "distance" between instructions producing input to 'lea'
     36 # instruction and the 'lea' instruction itself, which is essential
     37 # for Intel Atom core.
     38 
     39 # October 2010.
     40 #
     41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
     42 # is to offload message schedule denoted by Wt in NIST specification,
     43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
     44 # for background and implementation details. The only difference from
     45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
     46 # to free temporary registers.
     47 
     48 # April 2011.
     49 #
     50 # Add AVX code path. See sha1-586.pl for further information.
     51 
     52 # May 2013.
     53 #
     54 # Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
     55 # and loading pair of consecutive blocks to 256-bit %ymm registers)
     56 # did not provide impressive performance improvement till a crucial
     57 # hint regarding the number of Xupdate iterations to pre-compute in
     58 # advance was provided by Ilya Albrekht of Intel Corp.
     59 
     60 # March 2014.
     61 #
     62 # Add support for Intel SHA Extensions.
     63 
     64 ######################################################################
     65 # Current performance is summarized in the following table. Numbers are
     66 # CPU clock cycles spent to process single byte (less is better).
     67 #
     68 #		x86_64		SSSE3		AVX[2]
     69 # P4		9.05		-
     70 # Opteron	6.26		-
     71 # Core2		6.55		6.05/+8%	-
     72 # Westmere	6.73		5.30/+27%	-
     73 # Sandy Bridge	7.70		6.10/+26%	4.99/+54%
     74 # Ivy Bridge	6.06		4.67/+30%	4.60/+32%
     75 # Haswell	5.45		4.15/+31%	3.57/+53%
     76 # Bulldozer	9.11		5.95/+53%
     77 # VIA Nano	9.32		7.15/+30%
     78 # Atom		10.3		9.17/+12%
     79 # Silvermont	13.1(*)		9.37/+40%
     80 #
     81 # (*)	obviously suboptimal result, nothing was done about it,
     82 #	because SSSE3 code is compiled unconditionally;
     83 
     84 $flavour = shift;
     85 $output  = shift;
        # Command line is "[flavour] [output]".  A first argument containing a
        # dot is taken to be the output file name and $flavour is left undefined.
     86 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     87 
        # Win64 variants of the code are emitted for nasm/masm/mingw64 flavours
        # or when the output file ends in ".asm".
     88 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     89 
        # Locate x86_64-xlate.pl next to this script or under ../../perlasm/.
     90 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     91 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     92 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     93 die "can't locate x86_64-xlate.pl";
     94 
     95 # In upstream, this is controlled by shelling out to the compiler to check
     96 # versions, but BoringSSL is intended to be used with pre-generated perlasm
     97 # output, so this isn't useful anyway.
     98 #
     99 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
    100 # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
    101 # did not tie them together until after $shaext was added.
    102 $avx = 1;
    103 
    104 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
    105 # been tested.
    106 $shaext=0;	### set to zero if compiling for 1.0.1
        # Without the SHA-extensions path, $avx is capped at 1 (no AVX2 path).
    107 $avx=1		if (!$shaext && $avx);
    108 
        # Everything printed to STDOUT from here on is piped through the
        # translator.  The script path and output name are quoted so that paths
        # containing spaces survive the shell, and a failure to start the pipe
        # is fatal instead of being silently ignored.
    109 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
    110 *STDOUT=*OUT;
    111 
        # Incoming argument registers as named in the original comments ...
    112 $ctx="%rdi";	# 1st arg
    113 $inp="%rsi";	# 2nd arg
    114 $num="%rdx";	# 3rd arg
    115 
        # ... which are immediately overridden: the generated prologues copy
        # %rdi/%rsi/%rdx into these registers.
    116 # reassign arguments in order to produce more compact code
    117 $ctx="%r8";
    118 $inp="%r9";
    119 $num="%r10";
    120 
        # Scratch registers, the rotating triple of message-word registers and
        # the five working variables used by the integer-only code path.
    121 $t0="%eax";
    122 $t1="%ebx";
    123 $t2="%ecx";
    124 @xi=("%edx","%ebp","%r14d");
    125 $A="%esi";
    126 $B="%edi";
    127 $C="%r11d";
    128 $D="%r12d";
    129 $E="%r13d";
    130 
    131 @V=($A,$B,$C,$D,$E);
    132 
    133 sub BODY_00_19 {
        # Appends the instructions for integer-only round $i (0..19) to $code.
        #   $i              - round number
        #   $a,$b,$c,$d,$e  - register names holding the working variables for
        #                     this round (the caller rotates @V between rounds)
        # Side effects: the emitted code stores the current word into a 16-word
        # ring at (%rsp); the global register triple @xi is rotated by one.
    134 my ($i,$a,$b,$c,$d,$e)=@_;
    135 my $j=$i+1;
        # Round 0 only: fetch and byte-swap the first input word.
    136 $code.=<<___ if ($i==0);
    137 	mov	`4*$i`($inp),$xi[0]
    138 	bswap	$xi[0]
    139 ___
        # Rounds 0..14: load and byte-swap the next input word ($j) into $xi[1]
        # while the current word in $xi[0] is consumed.  The emitted code
        # computes $t0 = (($c^$d)&$b)^$d and
        # $e += 0x5a827999 + $xi[0] + rol($a,5) + $t0, and rotates $b left by 30.
    140 $code.=<<___ if ($i<15);
    141 	mov	`4*$j`($inp),$xi[1]
    142 	mov	$d,$t0
    143 	mov	$xi[0],`4*$i`(%rsp)
    144 	mov	$a,$t2
    145 	bswap	$xi[1]
    146 	xor	$c,$t0
    147 	rol	\$5,$t2
    148 	and	$b,$t0
    149 	lea	0x5a827999($xi[0],$e),$e
    150 	add	$t2,$e
    151 	xor	$d,$t0
    152 	rol	\$30,$b
    153 	add	$t0,$e
    154 ___
        # Rounds 15..19: the next word is derived rather than loaded.  $xi[1]
        # is xor-ed with ring slots $j, $j+2 and $j+8 (mod 16) and rotated left
        # by 1; the round function itself is the same as above.
    155 $code.=<<___ if ($i>=15);
    156 	xor	`4*($j%16)`(%rsp),$xi[1]
    157 	mov	$d,$t0
    158 	mov	$xi[0],`4*($i%16)`(%rsp)
    159 	mov	$a,$t2
    160 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    161 	xor	$c,$t0
    162 	rol	\$5,$t2
    163 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    164 	and	$b,$t0
    165 	lea	0x5a827999($xi[0],$e),$e
    166 	rol	\$30,$b
    167 	xor	$d,$t0
    168 	add	$t2,$e
    169 	rol	\$1,$xi[1]
    170 	add	$t0,$e
    171 ___
        # Rotate the register triple: the "next" register becomes "current".
    172 push(@xi,shift(@xi));
    173 }
    174 
    175 sub BODY_20_39 {
        # Appends the instructions for an integer-only round that uses the
        # xor function $b^$d^$c.  The top-level loops call it for rounds 20..39
        # and again for rounds 60..79; $K is chosen from $i accordingly.
        #   $i              - round number
        #   $a,$b,$c,$d,$e  - register names of the working variables
        # Side effect: rotates the global register triple @xi by one.
    176 my ($i,$a,$b,$c,$d,$e)=@_;
    177 my $j=$i+1;
    178 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
        # All rounds but the last: also derive the next message word in $xi[1]
        # (xor with ring slots $j, $j+2, $j+8 mod 16, then rotate left by 1).
        # The store of the current word to the ring is only requested for
        # $i<72 -- the back-ticked Perl expression below carries that
        # condition (NOTE(review): back-ticks are presumably expanded by a
        # post-processing pass outside this block -- confirm).
    179 $code.=<<___ if ($i<79);
    180 	xor	`4*($j%16)`(%rsp),$xi[1]
    181 	mov	$b,$t0
    182 	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
    183 	mov	$a,$t2
    184 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    185 	xor	$d,$t0
    186 	rol	\$5,$t2
    187 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    188 	lea	$K($xi[0],$e),$e
    189 	xor	$c,$t0
    190 	add	$t2,$e
    191 	rol	\$30,$b
    192 	add	$t0,$e
    193 	rol	\$1,$xi[1]
    194 ___
        # Round 79: same round function, but no further message word is needed.
    195 $code.=<<___ if ($i==79);
    196 	mov	$b,$t0
    197 	mov	$a,$t2
    198 	xor	$d,$t0
    199 	lea	$K($xi[0],$e),$e
    200 	rol	\$5,$t2
    201 	xor	$c,$t0
    202 	add	$t2,$e
    203 	rol	\$30,$b
    204 	add	$t0,$e
    205 ___
    206 push(@xi,shift(@xi));
    207 }
    208 
    209 sub BODY_40_59 {
        # Appends the instructions for integer-only round $i (40..59) to $code.
        #   $i              - round number
        #   $a,$b,$c,$d,$e  - register names of the working variables
        # The emitted code forms $t0 = $c&$d and $t1 = ($c^$d)&$b and adds both
        # to $e together with 0x8f1bbcdc, the current word $xi[0] and
        # rol($a,5); $b is rotated left by 30.  The next message word is
        # derived in $xi[1] exactly as in the other round bodies.
        # Side effect: rotates the global register triple @xi by one.
    210 my ($i,$a,$b,$c,$d,$e)=@_;
    211 my $j=$i+1;
    212 $code.=<<___;
    213 	xor	`4*($j%16)`(%rsp),$xi[1]
    214 	mov	$d,$t0
    215 	mov	$xi[0],`4*($i%16)`(%rsp)
    216 	mov	$d,$t1
    217 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    218 	and	$c,$t0
    219 	mov	$a,$t2
    220 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    221 	lea	0x8f1bbcdc($xi[0],$e),$e
    222 	xor	$c,$t1
    223 	rol	\$5,$t2
    224 	add	$t0,$e
    225 	rol	\$1,$xi[1]
    226 	and	$b,$t1
    227 	add	$t2,$e
    228 	rol	\$30,$b
    229 	add	$t1,$e
    230 ___
    231 push(@xi,shift(@xi));
    232 }
    233 
    234 $code.=<<___;	# public entry point: read OPENSSL_ia32cap_P, take .Lialu when the SSSE3 bit is clear
    235 .text
    236 .extern	OPENSSL_ia32cap_P
    237 
    238 .globl	sha1_block_data_order
    239 .type	sha1_block_data_order,\@function,3
    240 .align	16
    241 sha1_block_data_order:
    242 	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
    243 	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
    244 	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
    245 	test	\$`1<<9`,%r8d		# check SSSE3 bit
    246 	jz	.Lialu
    247 ___
        # Optional dispatch to the SHA-extensions code; only emitted if $shaext.
    248 $code.=<<___ if ($shaext);
    249 	test	\$`1<<29`,%r10d		# check SHA bit	
    250 	jnz	_shaext_shortcut
    251 ___
        # Optional dispatch to the AVX2+BMI code; only emitted if $avx>1.
    252 $code.=<<___ if ($avx>1);
    253 	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
    254 	cmp	\$`1<<3|1<<5|1<<8`,%r10d
    255 	je	_avx2_shortcut
    256 ___
        # Optional dispatch to the AVX code; taken only when both the AVX bit
        # and the "Intel CPU" bit are set.
    257 $code.=<<___ if ($avx);
    258 	and	\$`1<<28`,%r8d		# mask AVX bit
    259 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    260 	or	%r9d,%r8d
    261 	cmp	\$`1<<28|1<<30`,%r8d
    262 	je	_avx_shortcut
    263 ___
        # Default for SSSE3-capable CPUs, followed by the integer-only path:
        # its prologue pushes %rbx,%rbp,%r12-%r14, reserves 8+64 bytes, aligns
        # %rsp to 64 and keeps the entry %rsp at 64(%rsp); then the five
        # context words are loaded and the block loop begins.
    264 $code.=<<___;
    265 	jmp	_ssse3_shortcut
    266 
    267 .align	16
    268 .Lialu:
    269 	mov	%rsp,%rax
    270 	push	%rbx
    271 	push	%rbp
    272 	push	%r12
    273 	push	%r13
    274 	push	%r14
    275 	mov	%rdi,$ctx	# reassigned argument
    276 	sub	\$`8+16*4`,%rsp
    277 	mov	%rsi,$inp	# reassigned argument
    278 	and	\$-64,%rsp
    279 	mov	%rdx,$num	# reassigned argument
    280 	mov	%rax,`16*4`(%rsp)
    281 .Lprologue:
    282 
    283 	mov	0($ctx),$A
    284 	mov	4($ctx),$B
    285 	mov	8($ctx),$C
    286 	mov	12($ctx),$D
    287 	mov	16($ctx),$E
    288 	jmp	.Lloop
    289 
    290 .align	16
    291 .Lloop:
    292 ___
        # Unroll all 80 rounds.  After each round @V is rotated right by one so
        # that the register roles shift without any data movement; rounds
        # 60..79 reuse BODY_20_39, which picks its constant from $i.
    293 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    294 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    295 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    296 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
        # Add the result into the context, advance $inp by one 64-byte block
        # and loop while $num is non-zero (the jnz consumes the flags of the
        # "sub"; "lea" does not modify flags).  The epilogue restores the saved
        # registers relative to the entry %rsp kept at 64(%rsp).
    297 $code.=<<___;
    298 	add	0($ctx),$A
    299 	add	4($ctx),$B
    300 	add	8($ctx),$C
    301 	add	12($ctx),$D
    302 	add	16($ctx),$E
    303 	mov	$A,0($ctx)
    304 	mov	$B,4($ctx)
    305 	mov	$C,8($ctx)
    306 	mov	$D,12($ctx)
    307 	mov	$E,16($ctx)
    308 
    309 	sub	\$1,$num
    310 	lea	`16*4`($inp),$inp
    311 	jnz	.Lloop
    312 
    313 	mov	`16*4`(%rsp),%rsi
    314 	mov	-40(%rsi),%r14
    315 	mov	-32(%rsi),%r13
    316 	mov	-24(%rsi),%r12
    317 	mov	-16(%rsi),%rbp
    318 	mov	-8(%rsi),%rbx
    319 	lea	(%rsi),%rsp
    320 .Lepilogue:
    321 	ret
    322 .size	sha1_block_data_order,.-sha1_block_data_order
    323 ___
    324 if ($shaext) {{{
    325 ######################################################################
    326 # Intel SHA Extensions implementation of SHA1 update function.
    327 #
        # This path uses the incoming argument registers directly (lexical
        # $ctx/$inp/$num shadow the reassigned globals) and keeps the whole
        # state in %xmm registers:
        #   $ABCD/$E/$E_           - working state
        #   $BSWAP                 - shuffle mask loaded from K_XX_XX+0xa0
        #   $ABCD_SAVE/$E_SAVE     - state at the start of the current block
        #   @MSG                   - four 16-byte message registers
    328 my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
    329 my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
    330 my @MSG=map("%xmm$_",(4..7));
    331 
    332 $code.=<<___;
    333 .type	sha1_block_data_order_shaext,\@function,3
    334 .align	32
    335 sha1_block_data_order_shaext:
    336 _shaext_shortcut:
    337 ___
        # Win64 only: save %xmm6-%xmm9 below the entry stack pointer.  The save
        # area is addressed through %rax, so %rax must hold the entry %rsp here
        # and must stay intact until the epilogue.
        # NOTE(review): %rax is not set in this file for this path; presumably
        # the Win64 function prologue generated by x86_64-xlate.pl does it --
        # confirm.
    338 $code.=<<___ if ($win64);
    339 	lea	`-8-4*16`(%rsp),%rsp
    340 	movaps	%xmm6,-8-4*16(%rax)
    341 	movaps	%xmm7,-8-3*16(%rax)
    342 	movaps	%xmm8,-8-2*16(%rax)
    343 	movaps	%xmm9,-8-1*16(%rax)
    344 .Lprologue_shaext:
    345 ___
        # Load the state and the first block.  In the loop head, %r8 is the
        # scratch register for the "next block" pointer; it must not be %rax,
        # which the Win64 epilogue below still needs (using %rax here would
        # corrupt the %xmm restore and the final %rsp).
    346 $code.=<<___;
    347 	movdqu	($ctx),$ABCD
    348 	movd	16($ctx),$E
    349 	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
    350 
    351 	movdqu	($inp),@MSG[0]
    352 	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
    353 	movdqu	0x10($inp),@MSG[1]
    354 	pshufd	\$0b00011011,$E,$E		# flip word order
    355 	movdqu	0x20($inp),@MSG[2]
    356 	pshufb	$BSWAP,@MSG[0]
    357 	movdqu	0x30($inp),@MSG[3]
    358 	pshufb	$BSWAP,@MSG[1]
    359 	pshufb	$BSWAP,@MSG[2]
    360 	movdqa	$E,$E_SAVE			# offload $E
    361 	pshufb	$BSWAP,@MSG[3]
    362 	jmp	.Loop_shaext
    363 
    364 .align	16
    365 .Loop_shaext:
    366 	dec		$num
    367 	lea		0x40($inp),%r8		# next input block
    368 	paddd		@MSG[0],$E
    369 	cmovne		%r8,$inp
    370 	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
    371 ___
        # Eight iterations, each emitting two sha1rnds4 groups (rounds 0..63)
        # interleaved with the message-schedule instructions.  The immediate
        # int($i/5) resp. int(($i+1)/5) selects the round function/constant for
        # the group; @MSG is rotated by two registers per iteration.
    372 for($i=0;$i<20-4;$i+=2) {
    373 $code.=<<___;
    374 	sha1msg1	@MSG[1],@MSG[0]
    375 	movdqa		$ABCD,$E_
    376 	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
    377 	sha1nexte	@MSG[1],$E_
    378 	pxor		@MSG[2],@MSG[0]
    379 	sha1msg1	@MSG[2],@MSG[1]
    380 	sha1msg2	@MSG[3],@MSG[0]
    381 
    382 	movdqa		$ABCD,$E
    383 	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
    384 	sha1nexte	@MSG[2],$E
    385 	pxor		@MSG[3],@MSG[1]
    386 	sha1msg2	@MSG[0],@MSG[1]
    387 ___
    388 	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
    389 }
        # Rounds 64..79, overlapped with loading and byte-swapping the next
        # block from $inp ($inp was only advanced if $num did not reach zero).
        # The saved state is then added back in; the jnz still sees the flags
        # of the "dec" at the loop head.
    390 $code.=<<___;
    391 	movdqu		($inp),@MSG[0]
    392 	movdqa		$ABCD,$E_
    393 	sha1rnds4	\$3,$E,$ABCD		# 64-67
    394 	sha1nexte	@MSG[1],$E_
    395 	movdqu		0x10($inp),@MSG[1]
    396 	pshufb		$BSWAP,@MSG[0]
    397 
    398 	movdqa		$ABCD,$E
    399 	sha1rnds4	\$3,$E_,$ABCD		# 68-71
    400 	sha1nexte	@MSG[2],$E
    401 	movdqu		0x20($inp),@MSG[2]
    402 	pshufb		$BSWAP,@MSG[1]
    403 
    404 	movdqa		$ABCD,$E_
    405 	sha1rnds4	\$3,$E,$ABCD		# 72-75
    406 	sha1nexte	@MSG[3],$E_
    407 	movdqu		0x30($inp),@MSG[3]
    408 	pshufb		$BSWAP,@MSG[2]
    409 
    410 	movdqa		$ABCD,$E
    411 	sha1rnds4	\$3,$E_,$ABCD		# 76-79
    412 	sha1nexte	$E_SAVE,$E
    413 	pshufb		$BSWAP,@MSG[3]
    414 
    415 	paddd		$ABCD_SAVE,$ABCD
    416 	movdqa		$E,$E_SAVE		# offload $E
    417 
    418 	jnz		.Loop_shaext
    419 
    420 	pshufd	\$0b00011011,$ABCD,$ABCD
    421 	pshufd	\$0b00011011,$E,$E
    422 	movdqu	$ABCD,($ctx)
    423 	movd	$E,16($ctx)
    424 ___
        # Win64 only: restore %xmm6-%xmm9 and the stack pointer through %rax.
    425 $code.=<<___ if ($win64);
    426 	movaps	-8-4*16(%rax),%xmm6
    427 	movaps	-8-3*16(%rax),%xmm7
    428 	movaps	-8-2*16(%rax),%xmm8
    429 	movaps	-8-1*16(%rax),%xmm9
    430 	mov	%rax,%rsp
    431 .Lepilogue_shaext:
    432 ___
    433 $code.=<<___;
    434 	ret
    435 .size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
    436 ___
    437 }}}
    438 {{{
    439 my $Xi=4;	# index of the next 4-word message group handled by the Xupdate_* subs
    440 my @X=map("%xmm$_",(4..7,0..3));	# rotating window of eight message registers
    441 my @Tx=map("%xmm$_",(8..10));	# rotating SIMD temporaries
    442 my $Kx="%xmm11";	# round-constant register (used by the AVX prologue)
        # Note: this also re-assigns the *global* $A..$E to the registers used
        # by the SIMD code paths, in addition to creating the lexical @V.
    443 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    444 my @T=("%esi","%edi");	# two scalar temporaries, swapped after every round
    445 my $j=0;	# round counter advanced by the eval'ed round snippets
    446 my $rx=0;	# counts calls to the body_* snippet generators
    447 my $K_XX_XX="%r11";	# register pointing into the K_XX_XX constant table
    448 
        # Rotate helpers called from the round snippets; here they forward to
        # the plain rol/ror mnemonics (emitted through AUTOLOAD).
    449 my $_rol=sub { &rol(@_) };
    450 my $_ror=sub { &ror(@_) };
    451 
    452 { my $sn;	# private counter that makes each generated label unique
    453 sub align32() {
        # Appends a jump over a 32-byte alignment gap to $code, so execution
        # resumes at a freshly numbered, 32-byte aligned .Lalign32_N label.
    454   ++$sn;
    455 $code.=<<___;
    456 	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
    457 .align	32
    458 .Lalign32_$sn:
    459 ___
    460 }
    461 }
    462 
    463 $code.=<<___;	# SSSE3 entry: push callee-saved GPRs and reserve the stack frame
    464 .type	sha1_block_data_order_ssse3,\@function,3
    465 .align	16
    466 sha1_block_data_order_ssse3:
    467 _ssse3_shortcut:
    468 	mov	%rsp,%rax
    469 	push	%rbx
    470 	push	%rbp
    471 	push	%r12
    472 	push	%r13		# redundant, done to share Win64 SE handler
    473 	push	%r14
    474 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
    475 ___
        # Win64 only: save %xmm6-%xmm11 in the extra 6*16 bytes reserved above,
        # addressed relative to the entry %rsp held in %rax.
    476 $code.=<<___ if ($win64);
    477 	movaps	%xmm6,-40-6*16(%rax)
    478 	movaps	%xmm7,-40-5*16(%rax)
    479 	movaps	%xmm8,-40-4*16(%rax)
    480 	movaps	%xmm9,-40-3*16(%rax)
    481 	movaps	%xmm10,-40-2*16(%rax)
    482 	movaps	%xmm11,-40-1*16(%rax)
    483 .Lprologue_ssse3:
    484 ___
        # Common set-up: keep the entry %rsp in %r14, align the frame to 64,
        # turn $num into an end-of-input pointer ($inp + $num*64), load the
        # context and pre-compute the first round's (c^d)&b in @T[0].  Then
        # load and byte-swap the first block, and store X[]+K for the first
        # three 4-word groups at 0/16/32(%rsp); K is subtracted again so the
        # registers keep the plain message words.
    485 $code.=<<___;
    486 	mov	%rax,%r14	# original %rsp
    487 	and	\$-64,%rsp
    488 	mov	%rdi,$ctx	# reassigned argument
    489 	mov	%rsi,$inp	# reassigned argument
    490 	mov	%rdx,$num	# reassigned argument
    491 
    492 	shl	\$6,$num
    493 	add	$inp,$num
    494 	lea	K_XX_XX+64(%rip),$K_XX_XX
    495 
    496 	mov	0($ctx),$A		# load context
    497 	mov	4($ctx),$B
    498 	mov	8($ctx),$C
    499 	mov	12($ctx),$D
    500 	mov	$B,@T[0]		# magic seed
    501 	mov	16($ctx),$E
    502 	mov	$C,@T[1]
    503 	xor	$D,@T[1]
    504 	and	@T[1],@T[0]
    505 
    506 	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
    507 	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
    508 	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    509 	movdqu	16($inp),@X[-3&7]
    510 	movdqu	32($inp),@X[-2&7]
    511 	movdqu	48($inp),@X[-1&7]
    512 	pshufb	@X[2],@X[-4&7]		# byte swap
    513 	pshufb	@X[2],@X[-3&7]
    514 	pshufb	@X[2],@X[-2&7]
    515 	add	\$64,$inp
    516 	paddd	@Tx[1],@X[-4&7]		# add K_00_19
    517 	pshufb	@X[2],@X[-1&7]
    518 	paddd	@Tx[1],@X[-3&7]
    519 	paddd	@Tx[1],@X[-2&7]
    520 	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
    521 	psubd	@Tx[1],@X[-4&7]		# restore X[]
    522 	movdqa	@X[-3&7],16(%rsp)
    523 	psubd	@Tx[1],@X[-3&7]
    524 	movdqa	@X[-2&7],32(%rsp)
    525 	psubd	@Tx[1],@X[-2&7]
    526 	jmp	.Loop_ssse3
    527 ___
    528 
    529 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
        # Catches calls to undefined subs such as &pxor(dst,src) and appends
        # one instruction line to $code.  The sub name (package prefix
        # stripped) is the mnemonic; the argument list is emitted in reverse
        # order, i.e. &op(dst,src,...) becomes "op ...,src,dst".  If the last
        # argument is numeric it is emitted as an immediate with a '$' prefix.
    530 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    531   my $arg = pop;
    532     $arg = "\$$arg" if ($arg*1 eq $arg);	# numeric => immediate operand
    533     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    534 }
    535 
    536 sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
    537 { use integer;
        # Emits four scalar rounds interleaved with the SIMD computation of the
        # next four message words (built in @X[0]).
        #   $body - code ref returning the list of Perl snippets for one round;
        #           it is called four times.
        # Each eval(shift(@insns)) emits the next scalar snippet.  The snippets
        # are evaluated here, so the $a..$e they assign are the lexicals
        # declared below.  SIMD calls go through AUTOLOAD, which writes the
        # operands in reverse order.
        # Side effects: X[]+K for the previous group (@Tx[1]) is stored at
        # 16*(($Xi-1)&3)(%rsp), the round constant for this group is loaded
        # into @Tx[2] from offset 2*16*($Xi/5)-64 of $K_XX_XX (integer
        # division), $Xi is incremented and @X/@Tx are rotated by one.
    538   my $body = shift;
    539   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    540   my ($a,$b,$c,$d,$e);
    541 
    542 	 eval(shift(@insns));		# ror
    543 	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
    544 	 eval(shift(@insns));
    545 	&movdqa	(@Tx[0],@X[-1&7]);
    546 	  &paddd	(@Tx[1],@X[-1&7]);
    547 	 eval(shift(@insns));
    548 	 eval(shift(@insns));
    549 
    550 	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
    551 	 eval(shift(@insns));
    552 	 eval(shift(@insns));		# rol
    553 	 eval(shift(@insns));
    554 	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
    555 	 eval(shift(@insns));
    556 	 eval(shift(@insns));
    557 
    558 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
    559 	 eval(shift(@insns));
    560 	 eval(shift(@insns));		# ror
    561 	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    562 	 eval(shift(@insns));
    563 	 eval(shift(@insns));
    564 	 eval(shift(@insns));
    565 
    566 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    567 	 eval(shift(@insns));
    568 	 eval(shift(@insns));		# rol
    569 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    570 	 eval(shift(@insns));
    571 	 eval(shift(@insns));
    572 
    573 	&movdqa	(@Tx[2],@X[0]);
    574 	 eval(shift(@insns));
    575 	 eval(shift(@insns));
    576 	 eval(shift(@insns));		# ror
    577 	&movdqa	(@Tx[0],@X[0]);
    578 	 eval(shift(@insns));
    579 
    580 	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
    581 	&paddd	(@X[0],@X[0]);
    582 	 eval(shift(@insns));
    583 	 eval(shift(@insns));
    584 
    585 	&psrld	(@Tx[0],31);
    586 	 eval(shift(@insns));
    587 	 eval(shift(@insns));		# rol
    588 	 eval(shift(@insns));
    589 	&movdqa	(@Tx[1],@Tx[2]);
    590 	 eval(shift(@insns));
    591 	 eval(shift(@insns));
    592 
    593 	&psrld	(@Tx[2],30);
    594 	 eval(shift(@insns));
    595 	 eval(shift(@insns));		# ror
    596 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
    597 	 eval(shift(@insns));
    598 	 eval(shift(@insns));
    599 	 eval(shift(@insns));
    600 
    601 	&pslld	(@Tx[1],2);
    602 	&pxor	(@X[0],@Tx[2]);
    603 	 eval(shift(@insns));
    604 	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
    605 	 eval(shift(@insns));		# rol
    606 	 eval(shift(@insns));
    607 	 eval(shift(@insns));
    608 
    609 	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
    610 	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
    611 
    612 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    613 
    614   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    615 		push(@Tx,shift(@Tx));
    616 }
    617 
    618 sub Xupdate_ssse3_32_79()
    619 { use integer;
        # Emits four scalar rounds interleaved with the SIMD computation of the
        # next four message words, using the form that xors four earlier
        # groups and rotates left by 2 (pslld 2 / psrld 30 / por).
        #   $body - code ref returning the list of Perl snippets for one round;
        #           it is called four times.
        # Round bodies differ in length, so some eval steps are conditional:
        # they look at the text of the upcoming snippet (/_ror/, /_rol/) or at
        # $Xi to decide whether to emit one more scalar instruction first.
        # Side effects: stores X[]+K (@Tx[1]) at 16*(($Xi-1)&3)(%rsp); the round
        # constant in @Tx[2] is either copied from @Tx[1] or, when $Xi is a
        # multiple of 5, loaded from the $K_XX_XX table; $Xi is incremented and
        # @X/@Tx are rotated by one.
    620   my $body = shift;
    621   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
    622   my ($a,$b,$c,$d,$e);
    623 
    624 	 eval(shift(@insns))		if ($Xi==8);
    625 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
    626 	 eval(shift(@insns))		if ($Xi==8);
    627 	 eval(shift(@insns));		# body_20_39
    628 	 eval(shift(@insns));
    629 	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
    630 	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
    631 	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
    632 	 eval(shift(@insns));
    633 	 eval(shift(@insns));		# rol
    634 
    635 	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
    636 	 eval(shift(@insns));
    637 	 eval(shift(@insns));
    638 	if ($Xi%5) {
    639 	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    640 	} else {			# ... or load next one
    641 	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
    642 	}
    643 	 eval(shift(@insns));		# ror
    644 	  &paddd	(@Tx[1],@X[-1&7]);
    645 	 eval(shift(@insns));
    646 
    647 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    648 	 eval(shift(@insns));		# body_20_39
    649 	 eval(shift(@insns));
    650 	 eval(shift(@insns));
    651 	 eval(shift(@insns));		# rol
    652 	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
    653 
    654 	&movdqa	(@Tx[0],@X[0]);
    655 	 eval(shift(@insns));
    656 	 eval(shift(@insns));
    657 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    658 	 eval(shift(@insns));		# ror
    659 	 eval(shift(@insns));
    660 	 eval(shift(@insns));		# body_20_39
    661 
    662 	&pslld	(@X[0],2);
    663 	 eval(shift(@insns));
    664 	 eval(shift(@insns));
    665 	&psrld	(@Tx[0],30);
    666 	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
    667 	 eval(shift(@insns));
    668 	 eval(shift(@insns));
    669 	 eval(shift(@insns));		# ror
    670 
    671 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
    672 	 eval(shift(@insns));
    673 	 eval(shift(@insns));		# body_20_39
    674 	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
    675 	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
    676 	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
    677 	 eval(shift(@insns));
    678 	 eval(shift(@insns));		# rol
    679 	 eval(shift(@insns));
    680 	 eval(shift(@insns));
    681 	 eval(shift(@insns));		# rol
    682 	 eval(shift(@insns));
    683 
    684 	 foreach (@insns) { eval; }	# remaining instructions
    685 
    686   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    687 		push(@Tx,shift(@Tx));
    688 }
    689 
    690 sub Xuplast_ssse3_80()
    691 { use integer;
        # Emits four scalar rounds together with the last X[]+K store of the
        # current block, then the end-of-input test and the loads for the
        # next block.
        #   $body - code ref returning the list of Perl snippets for one round;
        #           it is called four times.
        # Emitted tail: compare $inp with the end pointer in $num and branch to
        # .Ldone_ssse3 when equal; otherwise reload the byte-swap mask and
        # K_00_19, load the next 64 input bytes, byte-swap the first 16 and
        # advance $inp by 64.
        # Side effects: @Tx is rotated back by one and $Xi is reset to 0.
    692   my $body = shift;
    693   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    694   my ($a,$b,$c,$d,$e);
    695 
    696 	 eval(shift(@insns));
    697 	 eval(shift(@insns));
    698 	 eval(shift(@insns));
    699 	 eval(shift(@insns));
    700 	  &paddd	(@Tx[1],@X[-1&7]);
    701 	 eval(shift(@insns));
    702 	 eval(shift(@insns));
    703 
    704 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    705 
    706 	 foreach (@insns) { eval; }		# remaining instructions
    707 
    708 	&cmp	($inp,$num);
    709 	&je	(".Ldone_ssse3");
    710 
    711 	unshift(@Tx,pop(@Tx));
    712 
    713 	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
    714 	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
    715 	&movdqu	(@X[-4&7],"0($inp)");		# load input
    716 	&movdqu	(@X[-3&7],"16($inp)");
    717 	&movdqu	(@X[-2&7],"32($inp)");
    718 	&movdqu	(@X[-1&7],"48($inp)");
    719 	&pshufb	(@X[-4&7],@X[2]);		# byte swap
    720 	&add	($inp,64);
    721 
    722   $Xi=0;
    723 }
    724 
    725 sub Xloop_ssse3()
    726 { use integer;
        # Emits four scalar rounds of the current block interleaved with the
        # preparation of the next block's input: byte-swap group ($Xi-3)&7,
        # add the constant in @Tx[1] to group ($Xi-4)&7, store the sum at
        # 16*$Xi(%rsp) and subtract the constant again to restore the words.
        #   $body - code ref returning the list of Perl snippets for one round;
        #           it is called four times.
        # Side effect: increments $Xi.
    727   my $body = shift;
    728   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    729   my ($a,$b,$c,$d,$e);
    730 
    731 	 eval(shift(@insns));
    732 	 eval(shift(@insns));
    733 	 eval(shift(@insns));
    734 	&pshufb	(@X[($Xi-3)&7],@X[2]);
    735 	 eval(shift(@insns));
    736 	 eval(shift(@insns));
    737 	 eval(shift(@insns));
    738 	 eval(shift(@insns));
    739 	&paddd	(@X[($Xi-4)&7],@Tx[1]);
    740 	 eval(shift(@insns));
    741 	 eval(shift(@insns));
    742 	 eval(shift(@insns));
    743 	 eval(shift(@insns));
    744 	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
    745 	 eval(shift(@insns));
    746 	 eval(shift(@insns));
    747 	 eval(shift(@insns));
    748 	 eval(shift(@insns));
    749 	&psubd	(@X[($Xi-4)&7],@Tx[1]);
    750 
    751 	foreach (@insns) { eval; }
    752   $Xi++;
    753 }
    754 
    755 sub Xtail_ssse3()
    756 { use integer;
        # Emits four scalar rounds with no SIMD work interleaved; used for the
        # final rounds when there is no further input block to prepare.
        #   $body - code ref returning the list of Perl snippets for one round;
        #           it is called four times.
        # $a..$e are the lexicals the eval'ed snippets assign to.
    757   my $body = shift;
    758   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    759   my ($a,$b,$c,$d,$e);
    760 
    761 	foreach (@insns) { eval; }
    762 }
    763 
    764 sub body_00_19 () {	# ((c^d)&b)^d
        # Returns the list of Perl snippets (10 strings) that, when eval'ed in
        # order, emit one scalar round using the function above.  The strings
        # are single-quoted: $a..$e, $j, @T, @V, $_rol and $_ror are resolved at
        # the eval site, not here.  The last snippet also advances $j and
        # rotates @V and @T.
        # The 20th call ($rx==19) hands over to body_20_39 instead.  The first
        # snippet rotates $b right by 2 in the very first round ($j==0) and by
        # 7 in later rounds.
    765 	# on start @T[0]=(c^d)&b
    766 	return &body_20_39() if ($rx==19); $rx++;
    767 	(
    768 	'($a,$b,$c,$d,$e)=@V;'.
    769 	'&$_ror	($b,$j?7:2)',	# $b>>>2
    770 	'&xor	(@T[0],$d)',
    771 	'&mov	(@T[1],$a)',	# $b for next round
    772 
    773 	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
    774 	'&xor	($b,$c)',	# $c^$d for next round
    775 
    776 	'&$_rol	($a,5)',
    777 	'&add	($e,@T[0])',
    778 	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
    779 
    780 	'&xor	($b,$c)',	# restore $b
    781 	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    782 	);
    783 }
    784 
    785 sub body_20_39 () {	# b^d^c
        # Returns the list of Perl snippets (8 strings) that, when eval'ed in
        # order, emit one scalar round using the xor function above.  Names in
        # the single-quoted strings are resolved at the eval site.  The last
        # snippet advances $j and rotates @V and @T.
        # The 40th generator call overall ($rx==39) hands over to body_40_59.
        # The conditions on $j inside the snippets are tested when the snippet
        # is eval'ed: round 19 xors with $d instead of $c, and the final round
        # (79) omits the preparation for a following round.
    786 	# on entry @T[0]=b^d
    787 	return &body_40_59() if ($rx==39); $rx++;
    788 	(
    789 	'($a,$b,$c,$d,$e)=@V;'.
    790 	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
    791 	'&xor	(@T[0],$d)	if($j==19);'.
    792 	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
    793 	'&mov	(@T[1],$a)',	# $b for next round
    794 
    795 	'&$_rol	($a,5)',
    796 	'&add	($e,@T[0])',
    797 	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
    798 
    799 	'&$_ror	($b,7)',	# $b>>>2
    800 	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    801 	);
    802 }
    803 
    804 sub body_40_59 () {	# ((b^c)&(c^d))^c
        # Returns the list of Perl snippets (11 strings) that, when eval'ed in
        # order, emit one scalar round using the function above.  Names in the
        # single-quoted strings are resolved at the eval site.  The last
        # snippet advances $j and rotates @V and @T.
        # Unlike the other two generators this one never delegates; it only
        # counts the call in $rx.  The conditions on $j are tested when the
        # snippet is eval'ed and handle the entry into ($j>=40) and the exit
        # from ($j==59) this group of rounds.
    805 	# on entry @T[0]=(b^c), (c^=d)
    806 	$rx++;
    807 	(
    808 	'($a,$b,$c,$d,$e)=@V;'.
    809 	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
    810 	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
    811 	'&xor	($c,$d)		if ($j>=40)',	# restore $c
    812 
    813 	'&$_ror	($b,7)',	# $b>>>2
    814 	'&mov	(@T[1],$a)',	# $b for next round
    815 	'&xor	(@T[0],$c)',
    816 
    817 	'&$_rol	($a,5)',
    818 	'&add	($e,@T[0])',
    819 	'&xor	(@T[1],$c)	if ($j==59);'.
    820 	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
    821 
    822 	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
    823 	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    824 	);
    825 }
    826 $code.=<<___;	# top of the per-block loop of the SSSE3 path
    827 .align	16
    828 .Loop_ssse3:
    829 ___
        # Seventeen generator calls of four rounds each (68 rounds).  The body
        # passed in is only a starting point: body_00_19 and body_20_39 hand
        # over to the next generator by themselves once $rx reaches 19 / 39.
    830 	&Xupdate_ssse3_16_31(\&body_00_19);
    831 	&Xupdate_ssse3_16_31(\&body_00_19);
    832 	&Xupdate_ssse3_16_31(\&body_00_19);
    833 	&Xupdate_ssse3_16_31(\&body_00_19);
    834 	&Xupdate_ssse3_32_79(\&body_00_19);
    835 	&Xupdate_ssse3_32_79(\&body_20_39);
    836 	&Xupdate_ssse3_32_79(\&body_20_39);
    837 	&Xupdate_ssse3_32_79(\&body_20_39);
    838 	&Xupdate_ssse3_32_79(\&body_20_39);
    839 	&Xupdate_ssse3_32_79(\&body_20_39);
    840 	&Xupdate_ssse3_32_79(\&body_40_59);
    841 	&Xupdate_ssse3_32_79(\&body_40_59);
    842 	&Xupdate_ssse3_32_79(\&body_40_59);
    843 	&Xupdate_ssse3_32_79(\&body_40_59);
    844 	&Xupdate_ssse3_32_79(\&body_40_59);
    845 	&Xupdate_ssse3_32_79(\&body_20_39);
    846 	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
    847 
        # The last 12 rounds are generated twice: once interleaved with input
        # preparation (loop path) and once plain (.Ldone_ssse3 path).  Save the
        # generator state so the second copy starts from the same point.
    848 				$saved_j=$j; @saved_V=@V;
    849 
    850 	&Xloop_ssse3(\&body_20_39);
    851 	&Xloop_ssse3(\&body_20_39);
    852 	&Xloop_ssse3(\&body_20_39);
    853 
        # Loop path: fold the result into the context, re-create the @T[0]
        # seed (b&(c^d)) for the next block's first round and iterate.
    854 $code.=<<___;
    855 	add	0($ctx),$A			# update context
    856 	add	4($ctx),@T[0]
    857 	add	8($ctx),$C
    858 	add	12($ctx),$D
    859 	mov	$A,0($ctx)
    860 	add	16($ctx),$E
    861 	mov	@T[0],4($ctx)
    862 	mov	@T[0],$B			# magic seed
    863 	mov	$C,8($ctx)
    864 	mov	$C,@T[1]
    865 	mov	$D,12($ctx)
    866 	xor	$D,@T[1]
    867 	mov	$E,16($ctx)
    868 	and	@T[1],@T[0]
    869 	jmp	.Loop_ssse3
    870 
    871 .align	16
    872 .Ldone_ssse3:
    873 ___
        # Exit path: restore the generator state saved above and emit the same
        # 12 rounds without any SIMD work.
    874 				$j=$saved_j; @V=@saved_V;
    875 
    876 	&Xtail_ssse3(\&body_20_39);
    877 	&Xtail_ssse3(\&body_20_39);
    878 	&Xtail_ssse3(\&body_20_39);
    879 
    880 $code.=<<___;
    881 	add	0($ctx),$A			# update context
    882 	add	4($ctx),@T[0]
    883 	add	8($ctx),$C
    884 	mov	$A,0($ctx)
    885 	add	12($ctx),$D
    886 	mov	@T[0],4($ctx)
    887 	add	16($ctx),$E
    888 	mov	$C,8($ctx)
    889 	mov	$D,12($ctx)
    890 	mov	$E,16($ctx)
    891 ___
        # Win64 only: restore %xmm6-%xmm11 relative to the entry %rsp in %r14.
    892 $code.=<<___ if ($win64);
    893 	movaps	-40-6*16(%r14),%xmm6
    894 	movaps	-40-5*16(%r14),%xmm7
    895 	movaps	-40-4*16(%r14),%xmm8
    896 	movaps	-40-3*16(%r14),%xmm9
    897 	movaps	-40-2*16(%r14),%xmm10
    898 	movaps	-40-1*16(%r14),%xmm11
    899 ___
        # Common epilogue: reload the pushed registers from below the entry
        # %rsp and return.
    900 $code.=<<___;
    901 	lea	(%r14),%rsi
    902 	mov	-40(%rsi),%r14
    903 	mov	-32(%rsi),%r13
    904 	mov	-24(%rsi),%r12
    905 	mov	-16(%rsi),%rbp
    906 	mov	-8(%rsi),%rbx
    907 	lea	(%rsi),%rsp
    908 .Lepilogue_ssse3:
    909 	ret
    910 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
    911 ___
    912 
    913 if ($avx) {
    914 $Xi=4;				# reset variables
        # Bring the generator state back to the values it had before the SSSE3
        # code was generated, so the AVX path starts from the same point.
    915 @X=map("%xmm$_",(4..7,0..3));
    916 @Tx=map("%xmm$_",(8..10));
    917 $j=0;
    918 $rx=0;
    919 
    920 my $done_avx_label=".Ldone_avx";
    921 
        # For this path the rotate helpers emit shld/shrd with the first
        # operand passed twice (destination and source are the same register).
    922 my $_rol=sub { &shld(@_[0],@_) };
    923 my $_ror=sub { &shrd(@_[0],@_) };
    924 
        # AVX entry: same frame layout as the SSSE3 entry, plus vzeroupper.
    925 $code.=<<___;
    926 .type	sha1_block_data_order_avx,\@function,3
    927 .align	16
    928 sha1_block_data_order_avx:
    929 _avx_shortcut:
    930 	mov	%rsp,%rax
    931 	push	%rbx
    932 	push	%rbp
    933 	push	%r12
    934 	push	%r13		# redundant, done to share Win64 SE handler
    935 	push	%r14
    936 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
    937 	vzeroupper
    938 ___
        # Win64 only: save %xmm6-%xmm11 relative to the entry %rsp in %rax.
    939 $code.=<<___ if ($win64);
    940 	vmovaps	%xmm6,-40-6*16(%rax)
    941 	vmovaps	%xmm7,-40-5*16(%rax)
    942 	vmovaps	%xmm8,-40-4*16(%rax)
    943 	vmovaps	%xmm9,-40-3*16(%rax)
    944 	vmovaps	%xmm10,-40-2*16(%rax)
    945 	vmovaps	%xmm11,-40-1*16(%rax)
    946 .Lprologue_avx:
    947 ___
        # Common set-up, mirroring the SSSE3 prologue.  The round constant
        # lives in $Kx, and the three-operand vpaddd writes X[]+K into
        # @X[0..2] without modifying the message words, so no subtraction is
        # needed afterwards.
    948 $code.=<<___;
    949 	mov	%rax,%r14	# original %rsp
    950 	and	\$-64,%rsp
    951 	mov	%rdi,$ctx	# reassigned argument
    952 	mov	%rsi,$inp	# reassigned argument
    953 	mov	%rdx,$num	# reassigned argument
    954 
    955 	shl	\$6,$num
    956 	add	$inp,$num
    957 	lea	K_XX_XX+64(%rip),$K_XX_XX
    958 
    959 	mov	0($ctx),$A		# load context
    960 	mov	4($ctx),$B
    961 	mov	8($ctx),$C
    962 	mov	12($ctx),$D
    963 	mov	$B,@T[0]		# magic seed
    964 	mov	16($ctx),$E
    965 	mov	$C,@T[1]
    966 	xor	$D,@T[1]
    967 	and	@T[1],@T[0]
    968 
    969 	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
    970 	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
    971 	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    972 	vmovdqu	16($inp),@X[-3&7]
    973 	vmovdqu	32($inp),@X[-2&7]
    974 	vmovdqu	48($inp),@X[-1&7]
    975 	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
    976 	add	\$64,$inp
    977 	vpshufb	@X[2],@X[-3&7],@X[-3&7]
    978 	vpshufb	@X[2],@X[-2&7],@X[-2&7]
    979 	vpshufb	@X[2],@X[-1&7],@X[-1&7]
    980 	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
    981 	vpaddd	$Kx,@X[-3&7],@X[1]
    982 	vpaddd	$Kx,@X[-2&7],@X[2]
    983 	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
    984 	vmovdqa	@X[1],16(%rsp)
    985 	vmovdqa	@X[2],32(%rsp)
    986 	jmp	.Loop_avx
    987 ___
    988 
    989 sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
    990 { use integer;
        # Emit one group of scalar rounds interleaved with the AVX computation
        # of the next message-schedule vector,
        #   "X[0]" = ("X[-3]" ^ "X[-8]" ^ "X[-14]" ^ "X[-16]") <<< 1,
        # and the store of the previous vector plus K to the stack.
        #
        # $body - code ref; each call returns a list of Perl statements (as
        #         strings) that emit scalar instructions when eval'ed.
        #         Presumably one call corresponds to one SHA-1 round; the
        #         body_* subs are defined outside this section.
        #
        # The sequence of eval()/vector calls below IS the instruction
        # schedule of the generated code, so statements must stay in order.
        # Side effects: increments $Xi and rotates @X by one register.
    991   my $body = shift;
    992   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    993   my ($a,$b,$c,$d,$e);	# lexicals visible to the string evals below
    994 
    995 	 eval(shift(@insns));
    996 	 eval(shift(@insns));
    997 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    998 	 eval(shift(@insns));
    999 	 eval(shift(@insns));
   1000 
   1001 	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
   1002 	 eval(shift(@insns));
   1003 	 eval(shift(@insns));
   1004 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1005 	 eval(shift(@insns));
   1006 	 eval(shift(@insns));
   1007 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1008 	 eval(shift(@insns));
   1009 	 eval(shift(@insns));
   1010 
   1011 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1012 	 eval(shift(@insns));
   1013 	 eval(shift(@insns));
   1014 	 eval(shift(@insns));
   1015 	 eval(shift(@insns));
   1016 
   1017 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1018 	 eval(shift(@insns));
   1019 	 eval(shift(@insns));
   1020 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1021 	 eval(shift(@insns));
   1022 	 eval(shift(@insns));
   1023 
        # Rotate left by one: the bit shifted out (vpsrld 31) is OR-ed into
        # the doubled value (vpaddd x,x == x<<1).
   1024 	&vpsrld	(@Tx[0],@X[0],31);
   1025 	 eval(shift(@insns));
   1026 	 eval(shift(@insns));
   1027 	 eval(shift(@insns));
   1028 	 eval(shift(@insns));
   1029 
        # Only three dwords of "X[-3]" were available above (the fourth
        # presumably lives in the vector being computed), so the lowest dword
        # of the pre-rotation value is moved to the top lane, rotated by 2 and
        # XOR-ed in.
   1030 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1031 	&vpaddd	(@X[0],@X[0],@X[0]);
   1032 	 eval(shift(@insns));
   1033 	 eval(shift(@insns));
   1034 	 eval(shift(@insns));
   1035 	 eval(shift(@insns));
   1036 
   1037 	&vpsrld	(@Tx[1],@Tx[2],30);
   1038 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1039 	 eval(shift(@insns));
   1040 	 eval(shift(@insns));
   1041 	 eval(shift(@insns));
   1042 	 eval(shift(@insns));
   1043 
   1044 	&vpslld	(@Tx[2],@Tx[2],2);
   1045 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1046 	 eval(shift(@insns));
   1047 	 eval(shift(@insns));
   1048 	 eval(shift(@insns));
   1049 	 eval(shift(@insns));
   1050 
   1051 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1052 	 eval(shift(@insns));
   1053 	 eval(shift(@insns));
        # Every 5th vector switches to the next round constant; table entries
        # are 32 bytes apart and $K_XX_XX points 64 bytes into the table.
   1054 	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1055 	 eval(shift(@insns));
   1056 	 eval(shift(@insns));
   1057 
   1058 
   1059 	 foreach (@insns) { eval; }	# remaining instructions [if any]
   1060 
   1061   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
   1062 }
   1063 
   1064 sub Xupdate_avx_32_79()
   1065 { use integer;
        # Emit one group of scalar rounds interleaved with the AVX computation
        # of the next message-schedule vector using the form
        #   "X[0]" = ("X[-32]" ^ "X[-16]" ^ "X[-28]" ^ "X[-6]") <<< 2,
        # where @X[0] still holds "X[-32]" on entry (the register is recycled).
        #
        # $body - code ref; each call returns a list of Perl statements (as
        #         strings) that emit scalar instructions when eval'ed.
        #
        # The "# rol"/"# ror"/"# body_20_39" markers describe which scalar
        # statement is expected in that slot; statement order is the schedule
        # of the generated code and must be kept.
        # Side effects: increments $Xi and rotates @X by one register.
   1066   my $body = shift;
   1067   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
   1068   my ($a,$b,$c,$d,$e);	# lexicals visible to the string evals below
   1069 
   1070 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
   1071 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
   1072 	 eval(shift(@insns));		# body_20_39
   1073 	 eval(shift(@insns));
   1074 	 eval(shift(@insns));
   1075 	 eval(shift(@insns));		# rol
   1076 
   1077 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
   1078 	 eval(shift(@insns));
        # Consume one more scalar statement only if it is not a rotate, so
        # the pending rotate lands in the "# ror" slot below.
   1079 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
   1080 	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
        # Every 5th vector switches to the next round constant (32-byte
        # table stride, $K_XX_XX biased by 64 bytes).
   1081 	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
   1082 	 eval(shift(@insns));		# ror
   1083 	 eval(shift(@insns));
   1084 
   1085 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
   1086 	 eval(shift(@insns));		# body_20_39
   1087 	 eval(shift(@insns));
   1088 	 eval(shift(@insns));
   1089 	 eval(shift(@insns));		# rol
   1090 
   1091 	&vpsrld	(@Tx[0],@X[0],30);
   1092 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1093 	 eval(shift(@insns));
   1094 	 eval(shift(@insns));
   1095 	 eval(shift(@insns));		# ror
   1096 	 eval(shift(@insns));
   1097 
   1098 	&vpslld	(@X[0],@X[0],2);
   1099 	 eval(shift(@insns));		# body_20_39
   1100 	 eval(shift(@insns));
   1101 	 eval(shift(@insns));
   1102 	 eval(shift(@insns));		# rol
   1103 	 eval(shift(@insns));
   1104 	 eval(shift(@insns));
   1105 	 eval(shift(@insns));		# ror
   1106 	 eval(shift(@insns));
   1107 
   1108 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
   1109 	 eval(shift(@insns));		# body_20_39
   1110 	 eval(shift(@insns));
   1111 	 eval(shift(@insns));
   1112 	 eval(shift(@insns));		# rol
   1113 	 eval(shift(@insns));
   1114 	 eval(shift(@insns));
   1115 	 eval(shift(@insns));		# rol
   1116 	 eval(shift(@insns));
   1117 
   1118 	 foreach (@insns) { eval; }	# remaining instructions
   1119 
   1120   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
   1121 }
   1122 
   1123 sub Xuplast_avx_80()
   1124 { use integer;
        # Emit the last schedule-producing group: store the final X[]+K
        # vector, finish the scalar statements of this group, then branch to
        # $done_avx_label when $inp has reached the end pointer in $num.
        # Otherwise start loading and byte-swapping the next 64-byte block.
        #
        # $body - code ref; each call returns a list of Perl statements (as
        #         strings) that emit scalar instructions when eval'ed.
        #
        # Side effect: resets $Xi to 0 for the Xloop_avx calls that follow.
        # NOTE(review): the &cmp/&vpshufb emitters here take destination-first
        # operands, unlike the AT&T-order heredocs; presumably they reverse the
        # arguments when emitting (they are defined outside this section).
   1125   my $body = shift;
   1126   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1127   my ($a,$b,$c,$d,$e);	# lexicals visible to the string evals below
   1128 
   1129 	 eval(shift(@insns));
   1130 	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
   1131 	 eval(shift(@insns));
   1132 	 eval(shift(@insns));
   1133 	 eval(shift(@insns));
   1134 	 eval(shift(@insns));
   1135 
   1136 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
   1137 
   1138 	 foreach (@insns) { eval; }		# remaining instructions
   1139 
   1140 	&cmp	($inp,$num);
   1141 	&je	($done_avx_label);
   1142 
   1143 	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
   1144 	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
   1145 	&vmovdqu(@X[-4&7],"0($inp)");		# load input
   1146 	&vmovdqu(@X[-3&7],"16($inp)");
   1147 	&vmovdqu(@X[-2&7],"32($inp)");
   1148 	&vmovdqu(@X[-1&7],"48($inp)");
   1149 	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
   1150 	&add	($inp,64);
   1151 
   1152   $Xi=0;
   1153 }
   1154 
   1155 sub Xloop_avx()
   1156 { use integer;
        # Emit one group of scalar rounds for the current block while
        # preparing the NEXT block: byte-swap one more input vector, add the
        # round constant in $Kx to the vector selected by $Xi and store the sum
        # at 16*$Xi(%rsp).
        #
        # $body - code ref; each call returns a list of Perl statements (as
        #         strings) that emit scalar instructions when eval'ed.
        #
        # Side effect: increments $Xi (reset to 0 by Xuplast_avx_80).
   1157   my $body = shift;
   1158   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1159   my ($a,$b,$c,$d,$e);	# lexicals visible to the string evals below
   1160 
   1161 	 eval(shift(@insns));
   1162 	 eval(shift(@insns));
   1163 	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
   1164 	 eval(shift(@insns));
   1165 	 eval(shift(@insns));
   1166 	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
   1167 	 eval(shift(@insns));
   1168 	 eval(shift(@insns));
   1169 	 eval(shift(@insns));
   1170 	 eval(shift(@insns));
   1171 	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
   1172 	 eval(shift(@insns));
   1173 	 eval(shift(@insns));
   1174 
   1175 	foreach (@insns) { eval; }
   1176   $Xi++;
   1177 }
   1178 
   1179 sub Xtail_avx()
   1180 { use integer;
        # Emit one group of scalar rounds with no vector work interleaved;
        # used after the last input block, when there is nothing to prefetch.
        #
        # $body - code ref; each call returns a list of Perl statements (as
        #         strings) that emit scalar instructions when eval'ed.
   1181   my $body = shift;
   1182   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1183   my ($a,$b,$c,$d,$e);	# lexicals visible to the string evals below
   1184 
   1185 	foreach (@insns) { eval; }
   1186 }
   1187 
   1188 $code.=<<___;	# main AVX loop: one 64-byte block per iteration
   1189 .align	16
   1190 .Loop_avx:
   1191 ___
        # 17 schedule-producing helper calls followed by 3 Xloop_avx calls,
        # four $body invocations each (20*4 = 80, presumably one per round).
        # The $body passed selects the round function for that group.
   1192 	&Xupdate_avx_16_31(\&body_00_19);
   1193 	&Xupdate_avx_16_31(\&body_00_19);
   1194 	&Xupdate_avx_16_31(\&body_00_19);
   1195 	&Xupdate_avx_16_31(\&body_00_19);
   1196 	&Xupdate_avx_32_79(\&body_00_19);
   1197 	&Xupdate_avx_32_79(\&body_20_39);
   1198 	&Xupdate_avx_32_79(\&body_20_39);
   1199 	&Xupdate_avx_32_79(\&body_20_39);
   1200 	&Xupdate_avx_32_79(\&body_20_39);
   1201 	&Xupdate_avx_32_79(\&body_20_39);
   1202 	&Xupdate_avx_32_79(\&body_40_59);
   1203 	&Xupdate_avx_32_79(\&body_40_59);
   1204 	&Xupdate_avx_32_79(\&body_40_59);
   1205 	&Xupdate_avx_32_79(\&body_40_59);
   1206 	&Xupdate_avx_32_79(\&body_40_59);
   1207 	&Xupdate_avx_32_79(\&body_20_39);
   1208 	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
   1209 
        # Snapshot the generator state ($j and @V) at the point where the
        # "done" branch leaves the loop, so the tail emitted further down
        # continues from exactly this state.
   1210 				$saved_j=$j; @saved_V=@V;
   1211 
   1212 	&Xloop_avx(\&body_20_39);
   1213 	&Xloop_avx(\&body_20_39);
   1214 	&Xloop_avx(\&body_20_39);
   1215 
        # Loop back-edge: fold the working variables into the context,
        # recompute the seed in @T[0] and go around for the next block.
   1216 $code.=<<___;
   1217 	add	0($ctx),$A			# update context
   1218 	add	4($ctx),@T[0]
   1219 	add	8($ctx),$C
   1220 	add	12($ctx),$D
   1221 	mov	$A,0($ctx)
   1222 	add	16($ctx),$E
   1223 	mov	@T[0],4($ctx)
   1224 	mov	@T[0],$B			# magic seed
   1225 	mov	$C,8($ctx)
   1226 	mov	$C,@T[1]
   1227 	mov	$D,12($ctx)
   1228 	xor	$D,@T[1]
   1229 	mov	$E,16($ctx)
   1230 	and	@T[1],@T[0]
   1231 	jmp	.Loop_avx
   1232 
   1233 .align	16
   1234 $done_avx_label:
   1235 ___
        # Restore the snapshot taken above and emit the last three groups
        # without any next-block preparation.
   1236 				$j=$saved_j; @V=@saved_V;
   1237 
   1238 	&Xtail_avx(\&body_20_39);
   1239 	&Xtail_avx(\&body_20_39);
   1240 	&Xtail_avx(\&body_20_39);
   1241 
   1242 $code.=<<___;
   1243 	vzeroupper
   1244 
   1245 	add	0($ctx),$A			# update context
   1246 	add	4($ctx),@T[0]
   1247 	add	8($ctx),$C
   1248 	mov	$A,0($ctx)
   1249 	add	12($ctx),$D
   1250 	mov	@T[0],4($ctx)
   1251 	add	16($ctx),$E
   1252 	mov	$C,8($ctx)
   1253 	mov	$D,12($ctx)
   1254 	mov	$E,16($ctx)
   1255 ___
        # Win64 only: reload %xmm6-%xmm11 from the slots used by the prologue
        # (%r14 holds the entry-time %rsp).
   1256 $code.=<<___ if ($win64);
   1257 	movaps	-40-6*16(%r14),%xmm6
   1258 	movaps	-40-5*16(%r14),%xmm7
   1259 	movaps	-40-4*16(%r14),%xmm8
   1260 	movaps	-40-3*16(%r14),%xmm9
   1261 	movaps	-40-2*16(%r14),%xmm10
   1262 	movaps	-40-1*16(%r14),%xmm11
   1263 ___
        # Epilogue: reload the five pushed GPRs relative to the original
        # %rsp and restore the stack pointer.
   1264 $code.=<<___;
   1265 	lea	(%r14),%rsi
   1266 	mov	-40(%rsi),%r14
   1267 	mov	-32(%rsi),%r13
   1268 	mov	-24(%rsi),%r12
   1269 	mov	-16(%rsi),%rbp
   1270 	mov	-8(%rsi),%rbx
   1271 	lea	(%rsi),%rsp
   1272 .Lepilogue_avx:
   1273 	ret
   1274 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
   1275 ___
   1276 
   1277 if ($avx>1) {
   1278 use integer;
        # Generator state for the AVX2 code path: 256-bit registers replace
        # the xmm ones, and $j restarts the round counter used by the bodyx_*
        # subs.
   1279 $Xi=4;					# reset variables
   1280 @X=map("%ymm$_",(4..7,0..3));
   1281 @Tx=map("%ymm$_",(8..10));
   1282 $Kx="%ymm11";
   1283 $j=0;
   1284 
        # @ROTX is the rotating set of scalar registers playing the roles
        # (a,f,b,c,d,e); the bodyx_* subs rotate it by one after every round.
   1285 my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
   1286 my ($a5,$t0)=("%r12d","%edi");	# scratch: a<<<5 and a temporary
   1287 
   1288 my ($A,$F,$B,$C,$D,$E)=@ROTX;
   1289 my $rx=0;	# counts rounds handed out by bodyx_*; drives the body hand-over
   1290 my $frame="%r13";	# pointer into the X[]+K area on the stack
   1291 
   1292 $code.=<<___;
   1293 .type	sha1_block_data_order_avx2,\@function,3
   1294 .align	16
   1295 sha1_block_data_order_avx2:
   1296 _avx2_shortcut:
   1297 	mov	%rsp,%rax
   1298 	push	%rbx
   1299 	push	%rbp
   1300 	push	%r12
   1301 	push	%r13
   1302 	push	%r14
   1303 	vzeroupper
   1304 ___
        # Win64 only: reserve 6*16 bytes and save %xmm6-%xmm11 relative to
        # the entry-time %rsp held in %rax.
   1305 $code.=<<___ if ($win64);
   1306 	lea	-6*16(%rsp),%rsp
   1307 	vmovaps	%xmm6,-40-6*16(%rax)
   1308 	vmovaps	%xmm7,-40-5*16(%rax)
   1309 	vmovaps	%xmm8,-40-4*16(%rax)
   1310 	vmovaps	%xmm9,-40-3*16(%rax)
   1311 	vmovaps	%xmm10,-40-2*16(%rax)
   1312 	vmovaps	%xmm11,-40-1*16(%rax)
   1313 .Lprologue_avx2:
   1314 ___
        # Two blocks are processed per iteration: the low 128-bit lanes are
        # loaded from $inp and the high lanes from $frame, which points to the
        # following block, or to the same block again when that would run past
        # the end pointer in $num (cmp/cmovae). 640 bytes of 128-byte-aligned
        # stack are reserved for the X[]+K values.
   1315 $code.=<<___;
   1316 	mov	%rax,%r14		# original %rsp
   1317 	mov	%rdi,$ctx		# reassigned argument
   1318 	mov	%rsi,$inp		# reassigned argument
   1319 	mov	%rdx,$num		# reassigned argument
   1320 
   1321 	lea	-640(%rsp),%rsp
   1322 	shl	\$6,$num
   1323 	 lea	64($inp),$frame
   1324 	and	\$-128,%rsp
   1325 	add	$inp,$num
   1326 	lea	K_XX_XX+64(%rip),$K_XX_XX
   1327 
   1328 	mov	0($ctx),$A		# load context
   1329 	 cmp	$num,$frame
   1330 	 cmovae	$inp,$frame		# next or same block
   1331 	mov	4($ctx),$F
   1332 	mov	8($ctx),$C
   1333 	mov	12($ctx),$D
   1334 	mov	16($ctx),$E
   1335 	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
   1336 
   1337 	vmovdqu		($inp),%xmm0
   1338 	vmovdqu		16($inp),%xmm1
   1339 	vmovdqu		32($inp),%xmm2
   1340 	vmovdqu		48($inp),%xmm3
   1341 	lea		64($inp),$inp
   1342 	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
   1343 	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
   1344 	vpshufb		@X[2],@X[-4&7],@X[-4&7]
   1345 	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
   1346 	vpshufb		@X[2],@X[-3&7],@X[-3&7]
   1347 	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
   1348 	vpshufb		@X[2],@X[-2&7],@X[-2&7]
   1349 	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
   1350 	vpshufb		@X[2],@X[-1&7],@X[-1&7]
   1351 
   1352 	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
   1353 	vpaddd	$Kx,@X[-3&7],@X[1]
   1354 	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
   1355 	vpaddd	$Kx,@X[-2&7],@X[2]
   1356 	vmovdqu	@X[1],32(%rsp)
   1357 	vpaddd	$Kx,@X[-1&7],@X[3]
   1358 	vmovdqu	@X[2],64(%rsp)
   1359 	vmovdqu	@X[3],96(%rsp)
   1360 ___
   1361 for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
   1362     use integer;
   1363 
        # Straight-line version of the 16..31 schedule step, with no scalar
        # rounds interleaved: it runs for $Xi = 4..7 before the main loop and
        # stores each new vector plus K at 32*$Xi(%rsp). The rotate-by-one and
        # top-lane fix-up follow the same sequence as in Xupdate_avx_16_31.
   1364 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
   1365 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1366 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1367 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1368 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1369 	&vpsrld	(@Tx[0],@X[0],31);
   1370 	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1371 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1372 	&vpaddd	(@X[0],@X[0],@X[0]);
   1373 	&vpsrld	(@Tx[1],@Tx[2],30);
   1374 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1375 	&vpslld	(@Tx[2],@Tx[2],2);
   1376 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1377 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1378 	&vpaddd	(@Tx[1],@X[0],$Kx);
        # The offset is passed as the unevaluated text "32*N(%rsp)".
   1379 	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1380 
   1381 	push(@X,shift(@X));	# "rotate" X[]
   1382 }
   1383 $code.=<<___;	# loop head: bias $frame by 128 and seed F=(F&C)^(~F&D), B=F>>>2
   1384 	lea	128(%rsp),$frame
   1385 	jmp	.Loop_avx2
   1386 .align	32
   1387 .Loop_avx2:
   1388 	rorx	\$2,$F,$B
   1389 	andn	$D,$F,$t0
   1390 	and	$C,$F
   1391 	xor	$t0,$F
   1392 ___
   1393 sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
        # Returns a list of Perl statement strings which, when eval'ed in
        # order, emit one round using f=(b&c)^(~b&d) and pre-compute f and
        # b>>>2 for the following round. Strings joined with '.' form a single
        # list element, so the X[]+K add, the conditional $frame advance and
        # the final @ROTX rotation / $j++ ride along with their neighbours.
        #
        # The X[]+K operand lives at ((32*($j/4)+4*($j%4))%256-128)($frame):
        # groups of four rounds are 32 bytes apart, the offset wraps every 256
        # bytes and $frame is advanced by 256 after every 32nd round.
        #
        # Once 19 rounds have been handed out ($rx==19) the call is forwarded
        # to bodyx_20_39, so the "for next round" values match that function.
   1394 	# at start $f=(b&c)^(~b&d), $b>>>=2
   1395 	return &bodyx_20_39() if ($rx==19); $rx++;
   1396 	(
   1397 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1398 
   1399 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1400 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1401 	'&andn	($t0,$a,$c)',			# ~b&d for next round
   1402 
   1403 	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
   1404 	'&rorx	($a5,$a,27)',			# a<<<5
   1405 	'&rorx	($f,$a,2)',			# b>>>2 for next round
   1406 	'&and	($a,$b)',			# b&c for next round
   1407 
   1408 	'&add	($e,$a5)',			# e+=a<<<5
   1409 	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
   1410 
   1411 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1412 	)
   1413 }
   1414 
   1415 sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
        # Returns a list of Perl statement strings which, when eval'ed in
        # order, emit one round using f=b^c^d and pre-compute f and b>>>2 for
        # the following round. The "if ($j<79)" guards are part of the strings,
        # so they are evaluated at eval time and drop the look-ahead work in
        # the last round.
        #
        # Once 39 rounds have been handed out ($rx==39) the call is forwarded
        # to bodyx_40_59. The X[]+K addressing and the trailing @ROTX rotation
        # and $j++ are the same as in bodyx_00_19.
   1416 	# on entry $f=b^c^d, $b>>>=2
   1417 	return &bodyx_40_59() if ($rx==39); $rx++;
   1418 	(
   1419 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1420 
   1421 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1422 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1423 
   1424 	'&lea	($e,"($e,$f)")',		# e+=b^c^d
   1425 	'&rorx	($a5,$a,27)',			# a<<<5
   1426 	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
   1427 	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
   1428 
   1429 	'&add	($e,$a5)',			# e+=a<<<5
   1430 	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
   1431 
   1432 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1433 	)
   1434 }
   1435 
   1436 sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
        # Returns a list of Perl statement strings which, when eval'ed in
        # order, emit one round using f=((b^c)&(c^d))^c. The $j guards inside
        # the strings are evaluated at eval time: the first round of the range
        # ($j==39) skips the "^c" fix-up, and the last one ($j==59) prepares
        # f=b^c^d for the next round instead of the 40..59 form.
        #
        # Unlike the other two bodies this one never forwards the call; it
        # only increments $rx. X[]+K addressing and the trailing @ROTX rotation
        # and $j++ are the same as in bodyx_00_19.
   1437 	# on entry $f=((b^c)&(c^d)), $b>>>=2
   1438 	$rx++;
   1439 	(
   1440 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1441 
   1442 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1443 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1444 	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
   1445 	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
   1446 	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
   1447 
   1448 	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
   1449 	'&rorx	($a5,$a,27)',			# a<<<5
   1450 	'&rorx	($f,$a,2)',			# b>>>2 in next round
   1451 	'&xor	($a,$b)',			# b^c for next round
   1452 
   1453 	'&add	($e,$a5)',			# e+=a<<<5
   1454 	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
   1455 	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
   1456 
   1457 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1458 	)
   1459 }
   1460 
   1461 sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
   1462 { use integer;
        # AVX2 counterpart of Xupdate_avx_16_31: emit five scalar rounds
        # (five $body invocations) interleaved with the computation of
        #   "X[0]" = ("X[-3]" ^ "X[-8]" ^ "X[-14]" ^ "X[-16]") <<< 1
        # on 256-bit registers, then store X[0]+K at 32*$Xi(%rsp).
        #
        # $body - code ref (one of the bodyx_* subs) returning a list of Perl
        #         statement strings that emit scalar instructions when
        #         eval'ed.
        #
        # Statement order is the schedule of the generated code.
        # Side effects: increments $Xi and rotates @X by one register.
   1463   my $body = shift;
   1464   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
   1465   my ($a,$b,$c,$d,$e);	# lexicals assigned by the eval'ed body statements
   1466 
   1467 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
   1468 	 eval(shift(@insns));
   1469 	 eval(shift(@insns));
   1470 	 eval(shift(@insns));
   1471 	 eval(shift(@insns));
   1472 
   1473 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1474 	 eval(shift(@insns));
   1475 	 eval(shift(@insns));
   1476 	 eval(shift(@insns));
   1477 
   1478 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1479 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1480 	 eval(shift(@insns));
   1481 	 eval(shift(@insns));
   1482 
   1483 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1484 	 eval(shift(@insns));
   1485 	 eval(shift(@insns));
   1486 	 eval(shift(@insns));
   1487 	 eval(shift(@insns));
   1488 
   1489 	&vpsrld	(@Tx[0],@X[0],31);
        # Every 5th vector switches to the next round constant (32-byte
        # table stride, $K_XX_XX biased by 64 bytes).
   1490 	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1491 	 eval(shift(@insns));
   1492 	 eval(shift(@insns));
   1493 	 eval(shift(@insns));
   1494 
   1495 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1496 	&vpaddd	(@X[0],@X[0],@X[0]);
   1497 	 eval(shift(@insns));
   1498 	 eval(shift(@insns));
   1499 
   1500 	&vpsrld	(@Tx[1],@Tx[2],30);
   1501 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1502 	 eval(shift(@insns));
   1503 	 eval(shift(@insns));
   1504 
   1505 	&vpslld	(@Tx[2],@Tx[2],2);
   1506 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1507 	 eval(shift(@insns));
   1508 	 eval(shift(@insns));
   1509 
   1510 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1511 	 eval(shift(@insns));
   1512 	 eval(shift(@insns));
   1513 	 eval(shift(@insns));
   1514 
   1515 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1516 	 eval(shift(@insns));
   1517 	 eval(shift(@insns));
   1518 	 eval(shift(@insns));
   1519 	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1520 
   1521 	 foreach (@insns) { eval; }	# remaining instructions [if any]
   1522 
   1523 	$Xi++;
   1524 	push(@X,shift(@X));	# "rotate" X[]
   1525 }
   1526 
   1527 sub Xupdate_avx2_32_79()
   1528 { use integer;
        # AVX2 counterpart of Xupdate_avx_32_79: emit five scalar rounds
        # (five $body invocations) interleaved with the computation of
        #   "X[0]" = ("X[-32]" ^ "X[-16]" ^ "X[-28]" ^ "X[-6]") <<< 2
        # on 256-bit registers, then store X[0]+K at 32*$Xi(%rsp).
        #
        # $body - code ref (one of the bodyx_* subs) returning a list of Perl
        #         statement strings that emit scalar instructions when
        #         eval'ed.
        #
        # Statement order is the schedule of the generated code.
        # Side effects: increments $Xi and rotates @X by one register.
   1529   my $body = shift;
   1530   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
   1531   my ($a,$b,$c,$d,$e);	# lexicals assigned by the eval'ed body statements
   1532 
   1533 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
   1534 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
   1535 	 eval(shift(@insns));
   1536 	 eval(shift(@insns));
   1537 
   1538 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
   1539 	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
   1540 	 eval(shift(@insns));
   1541 	 eval(shift(@insns));
   1542 	 eval(shift(@insns));
   1543 
   1544 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
   1545 	 eval(shift(@insns));
   1546 	 eval(shift(@insns));
   1547 	 eval(shift(@insns));
   1548 
   1549 	&vpsrld	(@Tx[0],@X[0],30);
   1550 	&vpslld	(@X[0],@X[0],2);
   1551 	 eval(shift(@insns));
   1552 	 eval(shift(@insns));
   1553 	 eval(shift(@insns));
   1554 
        # The commented-out vpslld marks an alternative slot for the shift
        # that is issued right after vpsrld above.
   1555 	#&vpslld	(@X[0],@X[0],2);
   1556 	 eval(shift(@insns));
   1557 	 eval(shift(@insns));
   1558 	 eval(shift(@insns));
   1559 
   1560 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
   1561 	 eval(shift(@insns));
   1562 	 eval(shift(@insns));
   1563 	 eval(shift(@insns));
   1564 	 eval(shift(@insns));
   1565 
   1566 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1567 	 eval(shift(@insns));
   1568 	 eval(shift(@insns));
   1569 	 eval(shift(@insns));
   1570 	 eval(shift(@insns));
   1571 
        # The offset is passed as the unevaluated text "32*N(%rsp)".
   1572 	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1573 
   1574 	 foreach (@insns) { eval; }	# remaining instructions
   1575 
   1576 	$Xi++;
   1577 	push(@X,shift(@X));	# "rotate" X[]
   1578 }
   1579 
   1580 sub Xloop_avx2()
   1581 { use integer;
        # Emit five scalar rounds (five $body invocations) with no vector
        # work interleaved by this sub itself; callers place vector
        # instructions between successive Xloop_avx2 calls where needed.
        #
        # $body - code ref (one of the bodyx_* subs) returning a list of Perl
        #         statement strings that emit scalar instructions when
        #         eval'ed.
   1582   my $body = shift;
   1583   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
   1584   my ($a,$b,$c,$d,$e);	# lexicals assigned by the eval'ed body statements
   1585 
   1586 	 foreach (@insns) { eval; }
   1587 }
   1588 
   1589 	&align32();
        # First block of the pair (low 128-bit lanes, $frame = 128(%rsp)):
        # 12 schedule-producing calls plus 4 plain calls, five rounds each,
        # give 80 rounds. X[16..31] were already computed before the loop (or
        # by the previous iteration), so only the 32..79 form is used here.
   1590 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1591 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1592 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1593 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1594 
   1595 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1596 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1597 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1598 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1599 
   1600 	&align32();
   1601 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1602 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1603 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1604 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1605 
   1606 	&Xloop_avx2(\&bodyx_20_39);
   1607 	&Xloop_avx2(\&bodyx_20_39);
   1608 	&Xloop_avx2(\&bodyx_20_39);
   1609 	&Xloop_avx2(\&bodyx_20_39);
   1610 
        # Pick the source of the high lanes for the next pair, fold the
        # working registers into the context and re-map them to the A/F/C/D/E
        # roles; exit when $inp has reached the end pointer.
   1611 $code.=<<___;
   1612 	lea	128($inp),$frame
   1613 	lea	128($inp),%rdi			# borrow $t0
   1614 	cmp	$num,$frame
   1615 	cmovae	$inp,$frame			# next or previous block
   1616 
   1617 	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
   1618 	add	0($ctx),@ROTX[0]		# update context
   1619 	add	4($ctx),@ROTX[1]
   1620 	add	8($ctx),@ROTX[3]
   1621 	mov	@ROTX[0],0($ctx)
   1622 	add	12($ctx),@ROTX[4]
   1623 	mov	@ROTX[1],4($ctx)
   1624 	 mov	@ROTX[0],$A			# A=d
   1625 	add	16($ctx),@ROTX[5]
   1626 	 mov	@ROTX[3],$a5
   1627 	mov	@ROTX[3],8($ctx)
   1628 	 mov	@ROTX[4],$D			# D=b
   1629 	 #xchg	@ROTX[5],$F			# F=c, C=f
   1630 	mov	@ROTX[4],12($ctx)
   1631 	 mov	@ROTX[1],$F			# F=e
   1632 	mov	@ROTX[5],16($ctx)
   1633 	#mov	$F,16($ctx)
   1634 	 mov	@ROTX[5],$E			# E=c
   1635 	 mov	$a5,$C				# C=f
   1636 	 #xchg	$F,$E				# E=c, F=e
   1637 
   1638 	cmp	$num,$inp
   1639 	je	.Ldone_avx2
   1640 ___
   1641 
   1642 $Xi=4;				# reset variables
   1643 @X=map("%ymm$_",(4..7,0..3));
   1644 
        # Load the next pair of blocks unless %rdi (the old $inp + 128) is
        # already past the end pointer, then seed F and B for the second block
        # of the current pair, whose X[]+K values sit in the high lanes (hence
        # the extra +16 in $frame).
   1645 $code.=<<___;
   1646 	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
   1647 	cmp	$num,%rdi			# borrowed $t0
   1648 	ja	.Last_avx2
   1649 
   1650 	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
   1651 	vmovdqu		-48(%rdi),%xmm1
   1652 	vmovdqu		-32(%rdi),%xmm2
   1653 	vmovdqu		-16(%rdi),%xmm3
   1654 	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
   1655 	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
   1656 	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
   1657 	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
   1658 	jmp	.Last_avx2
   1659 
   1660 .align	32
   1661 .Last_avx2:
   1662 	lea	128+16(%rsp),$frame
   1663 	rorx	\$2,$F,$B
   1664 	andn	$D,$F,$t0
   1665 	and	$C,$F
   1666 	xor	$t0,$F
   1667 	sub	\$-128,$inp
   1668 ___
   1669 	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);	# restart round generator
   1670 
        # Second block of the pair: 16 helper calls, five rounds each. The
        # vector instructions placed between the calls byte-swap the freshly
        # loaded next pair, add K_00_19 and store X[0..3]+K; the four
        # Xupdate_avx2_16_31 calls then produce the next pair's 16..31 vectors.
   1671 	&Xloop_avx2	(\&bodyx_00_19);
   1672 	&Xloop_avx2	(\&bodyx_00_19);
   1673 	&Xloop_avx2	(\&bodyx_00_19);
   1674 	&Xloop_avx2	(\&bodyx_00_19);
   1675 
   1676 	&Xloop_avx2	(\&bodyx_20_39);
   1677 	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
   1678 	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
   1679 	&Xloop_avx2	(\&bodyx_20_39);
   1680 	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
   1681 	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
   1682 	&Xloop_avx2	(\&bodyx_20_39);
   1683 	  &vmovdqu	("0(%rsp)",@Tx[0]);
   1684 	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
   1685 	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
   1686 	&Xloop_avx2	(\&bodyx_20_39);
   1687 	  &vmovdqu	("32(%rsp)",@Tx[1]);
   1688 	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
   1689 	  &vpaddd	(@X[2],@X[-2&7],$Kx);
   1690 
   1691 	&Xloop_avx2	(\&bodyx_40_59);
   1692 	&align32	();
   1693 	  &vmovdqu	("64(%rsp)",@X[2]);
   1694 	  &vpaddd	(@X[3],@X[-1&7],$Kx);
   1695 	&Xloop_avx2	(\&bodyx_40_59);
   1696 	  &vmovdqu	("96(%rsp)",@X[3]);
   1697 	&Xloop_avx2	(\&bodyx_40_59);
   1698 	&Xupdate_avx2_16_31(\&bodyx_40_59);
   1699 
   1700 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1701 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1702 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1703 	&Xloop_avx2	(\&bodyx_20_39);
   1704 
   1704 
   1705 $code.=<<___;	# fold second block into the context, loop while $inp <= end pointer
   1706 	lea	128(%rsp),$frame
   1707 
   1708 	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
   1709 	add	0($ctx),@ROTX[0]		# update context
   1710 	add	4($ctx),@ROTX[1]
   1711 	add	8($ctx),@ROTX[3]
   1712 	mov	@ROTX[0],0($ctx)
   1713 	add	12($ctx),@ROTX[4]
   1714 	mov	@ROTX[1],4($ctx)
   1715 	 mov	@ROTX[0],$A			# A=d
   1716 	add	16($ctx),@ROTX[5]
   1717 	 mov	@ROTX[3],$a5
   1718 	mov	@ROTX[3],8($ctx)
   1719 	 mov	@ROTX[4],$D			# D=b
   1720 	 #xchg	@ROTX[5],$F			# F=c, C=f
   1721 	mov	@ROTX[4],12($ctx)
   1722 	 mov	@ROTX[1],$F			# F=e
   1723 	mov	@ROTX[5],16($ctx)
   1724 	#mov	$F,16($ctx)
   1725 	 mov	@ROTX[5],$E			# E=c
   1726 	 mov	$a5,$C				# C=f
   1727 	 #xchg	$F,$E				# E=c, F=e
   1728 
   1729 	cmp	$num,$inp
   1730 	jbe	.Loop_avx2
   1731 
   1732 .Ldone_avx2:
   1733 	vzeroupper
   1734 ___
        # Win64 only: reload %xmm6-%xmm11 from the slots used by the prologue
        # (%r14 holds the entry-time %rsp).
   1735 $code.=<<___ if ($win64);
   1736 	movaps	-40-6*16(%r14),%xmm6
   1737 	movaps	-40-5*16(%r14),%xmm7
   1738 	movaps	-40-4*16(%r14),%xmm8
   1739 	movaps	-40-3*16(%r14),%xmm9
   1740 	movaps	-40-2*16(%r14),%xmm10
   1741 	movaps	-40-1*16(%r14),%xmm11
   1742 ___
        # Epilogue: reload the five pushed GPRs relative to the original
        # %rsp and restore the stack pointer.
   1743 $code.=<<___;
   1744 	lea	(%r14),%rsi
   1745 	mov	-40(%rsi),%r14
   1746 	mov	-32(%rsi),%r13
   1747 	mov	-24(%rsi),%r12
   1748 	mov	-16(%rsi),%rbp
   1749 	mov	-8(%rsi),%rbx
   1750 	lea	(%rsi),%rsp
   1751 .Lepilogue_avx2:
   1752 	ret
   1753 .size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
   1754 ___
   1755 }
   1756 }
   1757 $code.=<<___;	# constants: four round constants (32 bytes each), then byte-swap masks
   1758 .align	64
   1759 K_XX_XX:
   1760 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1761 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1762 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1763 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1764 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1765 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1766 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1767 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1768 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1769 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1770 .byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
   1771 ___
   1772 }}}
   1773 $code.=<<___;	# identification string embedded in the generated object
   1774 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1775 .align	64
   1776 ___
   1777 
   1778 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1779 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   1780 if ($win64) {
   1781 $rec="%rcx";	# the four handler arguments, in the order of the prototype above
   1782 $frame="%rdx";
   1783 $context="%r8";
   1784 $disp="%r9";
   1785 
        # se_handler: if the faulting Rip lies between .Lprologue and
        # .Lepilogue, fetch the saved stack pointer from 64 bytes above
        # context->Rsp and restore %rbx, %rbp and %r12-%r14 into the CONTEXT
        # from just below it. In every case it continues in .Lcommon_seh_tail,
        # which is emitted further down as part of ssse3_handler.
   1786 $code.=<<___;
   1787 .extern	__imp_RtlVirtualUnwind
   1788 .type	se_handler,\@abi-omnipotent
   1789 .align	16
   1790 se_handler:
   1791 	push	%rsi
   1792 	push	%rdi
   1793 	push	%rbx
   1794 	push	%rbp
   1795 	push	%r12
   1796 	push	%r13
   1797 	push	%r14
   1798 	push	%r15
   1799 	pushfq
   1800 	sub	\$64,%rsp
   1801 
   1802 	mov	120($context),%rax	# pull context->Rax
   1803 	mov	248($context),%rbx	# pull context->Rip
   1804 
   1805 	lea	.Lprologue(%rip),%r10
   1806 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1807 	jb	.Lcommon_seh_tail
   1808 
   1809 	mov	152($context),%rax	# pull context->Rsp
   1810 
   1811 	lea	.Lepilogue(%rip),%r10
   1812 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1813 	jae	.Lcommon_seh_tail
   1814 
   1815 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
   1816 
   1817 	mov	-8(%rax),%rbx
   1818 	mov	-16(%rax),%rbp
   1819 	mov	-24(%rax),%r12
   1820 	mov	-32(%rax),%r13
   1821 	mov	-40(%rax),%r14
   1822 	mov	%rbx,144($context)	# restore context->Rbx
   1823 	mov	%rbp,160($context)	# restore context->Rbp
   1824 	mov	%r12,216($context)	# restore context->R12
   1825 	mov	%r13,224($context)	# restore context->R13
   1826 	mov	%r14,232($context)	# restore context->R14
   1827 
   1828 	jmp	.Lcommon_seh_tail
   1829 .size	se_handler,.-se_handler
   1830 ___
   1831 
   1832 $code.=<<___ if ($shaext);	# handler for the SHA-extension path: restores XMM registers only
   1833 .type	shaext_handler,\@abi-omnipotent
   1834 .align	16
   1835 shaext_handler:
   1836 	push	%rsi
   1837 	push	%rdi
   1838 	push	%rbx
   1839 	push	%rbp
   1840 	push	%r12
   1841 	push	%r13
   1842 	push	%r14
   1843 	push	%r15
   1844 	pushfq
   1845 	sub	\$64,%rsp
   1846 
   1847 	mov	120($context),%rax	# pull context->Rax
   1848 	mov	248($context),%rbx	# pull context->Rip
   1849 
   1850 	lea	.Lprologue_shaext(%rip),%r10
   1851 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1852 	jb	.Lcommon_seh_tail
   1853 
   1854 	lea	.Lepilogue_shaext(%rip),%r10
   1855 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1856 	jae	.Lcommon_seh_tail
   1857 
   1858 	lea	-8-4*16(%rax),%rsi
   1859 	lea	512($context),%rdi	# &context.Xmm6
   1860 	mov	\$8,%ecx
   1861 	.long	0xa548f3fc		# cld; rep movsq
   1862 
   1863 	jmp	.Lcommon_seh_tail
   1864 .size	shaext_handler,.-shaext_handler
   1865 ___
   1866 
   1867 $code.=<<___;	# shared handler for the SSSE3/AVX/AVX2 paths, the common unwind tail, and the first .pdata entry
   1868 .type	ssse3_handler,\@abi-omnipotent
   1869 .align	16
   1870 ssse3_handler:
   1871 	push	%rsi
   1872 	push	%rdi
   1873 	push	%rbx
   1874 	push	%rbp
   1875 	push	%r12
   1876 	push	%r13
   1877 	push	%r14
   1878 	push	%r15
   1879 	pushfq
   1880 	sub	\$64,%rsp
   1881 
   1882 	mov	120($context),%rax	# pull context->Rax
   1883 	mov	248($context),%rbx	# pull context->Rip
   1884 
   1885 	mov	8($disp),%rsi		# disp->ImageBase
   1886 	mov	56($disp),%r11		# disp->HandlerData
   1887 
   1888 	mov	0(%r11),%r10d		# HandlerData[0]
   1889 	lea	(%rsi,%r10),%r10	# prologue label
   1890 	cmp	%r10,%rbx		# context->Rip<prologue label
   1891 	jb	.Lcommon_seh_tail
   1892 
   1893 	mov	152($context),%rax	# pull context->Rsp
   1894 
   1895 	mov	4(%r11),%r10d		# HandlerData[1]
   1896 	lea	(%rsi,%r10),%r10	# epilogue label
   1897 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1898 	jae	.Lcommon_seh_tail
   1899 
   1900 	mov	232($context),%rax	# pull context->R14
   1901 
   1902 	lea	-40-6*16(%rax),%rsi
   1903 	lea	512($context),%rdi	# &context.Xmm6
   1904 	mov	\$12,%ecx
   1905 	.long	0xa548f3fc		# cld; rep movsq
   1906 
   1907 	mov	-8(%rax),%rbx
   1908 	mov	-16(%rax),%rbp
   1909 	mov	-24(%rax),%r12
   1910 	mov	-32(%rax),%r13
   1911 	mov	-40(%rax),%r14
   1912 	mov	%rbx,144($context)	# restore context->Rbx
   1913 	mov	%rbp,160($context)	# restore context->Rbp
   1914 	mov	%r12,216($context)	# restore cotnext->R12
   1915 	mov	%r13,224($context)	# restore cotnext->R13
   1916 	mov	%r14,232($context)	# restore cotnext->R14
   1917 
   1918 .Lcommon_seh_tail:
   1919 	mov	8(%rax),%rdi
   1920 	mov	16(%rax),%rsi
   1921 	mov	%rax,152($context)	# restore context->Rsp
   1922 	mov	%rsi,168($context)	# restore context->Rsi
   1923 	mov	%rdi,176($context)	# restore context->Rdi
   1924 
   1925 	mov	40($disp),%rdi		# disp->ContextRecord
   1926 	mov	$context,%rsi		# context
   1927 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1928 	.long	0xa548f3fc		# cld; rep movsq
   1929 
   1930 	mov	$disp,%rsi
   1931 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1932 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1933 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1934 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1935 	mov	40(%rsi),%r10		# disp->ContextRecord
   1936 	lea	56(%rsi),%r11		# &disp->HandlerData
   1937 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1938 	mov	%r10,32(%rsp)		# arg5
   1939 	mov	%r11,40(%rsp)		# arg6
   1940 	mov	%r12,48(%rsp)		# arg7
   1941 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1942 	call	*__imp_RtlVirtualUnwind(%rip)
   1943 
   1944 	mov	\$1,%eax		# ExceptionContinueSearch
   1945 	add	\$64,%rsp
   1946 	popfq
   1947 	pop	%r15
   1948 	pop	%r14
   1949 	pop	%r13
   1950 	pop	%r12
   1951 	pop	%rbp
   1952 	pop	%rbx
   1953 	pop	%rdi
   1954 	pop	%rsi
   1955 	ret
   1956 .size	ssse3_handler,.-ssse3_handler
   1957 
   1958 .section	.pdata
   1959 .align	4
   1960 	.rva	.LSEH_begin_sha1_block_data_order
   1961 	.rva	.LSEH_end_sha1_block_data_order
   1962 	.rva	.LSEH_info_sha1_block_data_order
   1963 ___
   1964 $code.=<<___ if ($shaext);	# .pdata: one begin/end/info triple per code path that is built
   1965 	.rva	.LSEH_begin_sha1_block_data_order_shaext
   1966 	.rva	.LSEH_end_sha1_block_data_order_shaext
   1967 	.rva	.LSEH_info_sha1_block_data_order_shaext
   1968 ___
   1969 $code.=<<___;
   1970 	.rva	.LSEH_begin_sha1_block_data_order_ssse3
   1971 	.rva	.LSEH_end_sha1_block_data_order_ssse3
   1972 	.rva	.LSEH_info_sha1_block_data_order_ssse3
   1973 ___
   1974 $code.=<<___ if ($avx);
   1975 	.rva	.LSEH_begin_sha1_block_data_order_avx
   1976 	.rva	.LSEH_end_sha1_block_data_order_avx
   1977 	.rva	.LSEH_info_sha1_block_data_order_avx
   1978 ___
   1979 $code.=<<___ if ($avx>1);
   1980 	.rva	.LSEH_begin_sha1_block_data_order_avx2
   1981 	.rva	.LSEH_end_sha1_block_data_order_avx2
   1982 	.rva	.LSEH_info_sha1_block_data_order_avx2
   1983 ___
   1984 $code.=<<___;
   1985 .section	.xdata
   1986 .align	8
   1987 .LSEH_info_sha1_block_data_order:
   1988 	.byte	9,0,0,0
   1989 	.rva	se_handler
   1990 ___
   1991 $code.=<<___ if ($shaext);
   1992 .LSEH_info_sha1_block_data_order_shaext:
   1993 	.byte	9,0,0,0
   1994 	.rva	shaext_handler
   1995 ___
   1996 $code.=<<___;
   1997 .LSEH_info_sha1_block_data_order_ssse3:
   1998 	.byte	9,0,0,0
   1999 	.rva	ssse3_handler
   2000 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   2001 ___
   2002 $code.=<<___ if ($avx);
   2003 .LSEH_info_sha1_block_data_order_avx:
   2004 	.byte	9,0,0,0
   2005 	.rva	ssse3_handler
   2006 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   2007 ___
   2008 $code.=<<___ if ($avx>1);
   2009 .LSEH_info_sha1_block_data_order_avx2:
   2010 	.byte	9,0,0,0
   2011 	.rva	ssse3_handler
   2012 	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
   2013 ___
   2014 }
   2015 
   2016 ####################################################################
   2017 
   2018 sub sha1rnds4 {
   2019     if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
   2020       my @opcode=(0x0f,0x3a,0xcc);
   2021 	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
   2022 	my $c=$1;
   2023 	push @opcode,$c=~/^0/?oct($c):$c;
   2024 	return ".byte\t".join(',',@opcode);
   2025     } else {
   2026 	return "sha1rnds4\t".@_[0];
   2027     }
   2028 }
   2029 
   2030 sub sha1op38 {
   2031     my $instr = shift;
   2032     my %opcodelet = (
   2033 		"sha1nexte" => 0xc8,
   2034   		"sha1msg1"  => 0xc9,
   2035 		"sha1msg2"  => 0xca	);
   2036 
   2037     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   2038       my @opcode=(0x0f,0x38);
   2039       my $rex=0;
   2040 	$rex|=0x04			if ($2>=8);
   2041 	$rex|=0x01			if ($1>=8);
   2042 	unshift @opcode,0x40|$rex	if ($rex);
   2043 	push @opcode,$opcodelet{$instr};
   2044 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2045 	return ".byte\t".join(',',@opcode);
   2046     } else {
   2047 	return $instr."\t".@_[0];
   2048     }
   2049 }
   2050 
   2051 foreach (split("\n",$code)) {
   2052 	s/\`([^\`]*)\`/eval $1/geo;
   2053 
   2054 	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
   2055 	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
   2056 
   2057 	print $_,"\n";
   2058 }
   2059 close STDOUT;
   2060