Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # sha1_block procedure for x86_64.
     11 #
     12 # It was brought to my attention that on EM64T compiler-generated code
     13 # was far behind 32-bit assembler implementation. This is unlike on
     14 # Opteron where compiler-generated code was only 15% behind 32-bit
     15 # assembler, which originally made it hard to motivate the effort.
     16 # There was suggestion to mechanically translate 32-bit code, but I
     17 # dismissed it, reasoning that x86_64 offers enough register bank
     18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
     19 # implementation:-) However! While 64-bit code does perform better
     20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
     21 # x86_64 does offer larger *addressable* bank, but out-of-order core
     22 # reaches for even more registers through dynamic aliasing, and EM64T
     23 # core must have managed to run-time optimize even 32-bit code just as
     24 # good as 64-bit one. Performance improvement is summarized in the
     25 # following table:
     26 #
     27 #		gcc 3.4		32-bit asm	cycles/byte
     28 # Opteron	+45%		+20%		6.8
     29 # Xeon P4	+65%		+0%		9.9
     30 # Core2		+60%		+10%		7.0
     31 
     32 # August 2009.
     33 #
     34 # The code was revised to minimize code size and to maximize
     35 # "distance" between instructions producing input to 'lea'
     36 # instruction and the 'lea' instruction itself, which is essential
     37 # for Intel Atom core.
     38 
     39 # October 2010.
     40 #
     41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
     42 # is to offload message schedule denoted by Wt in NIST specification,
     43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
     44 # for background and implementation details. The only difference from
     45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
     46 # to free temporary registers.
     47 
     48 # April 2011.
     49 #
     50 # Add AVX code path. See sha1-586.pl for further information.
     51 
     52 # May 2013.
     53 #
     54 # Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
     55 # and loading pair of consecutive blocks to 256-bit %ymm registers)
     56 # did not provide impressive performance improvement till a crucial
     57 # hint regarding the number of Xupdate iterations to pre-compute in
     58 # advance was provided by Ilya Albrekht of Intel Corp.
     59 
     60 # March 2014.
     61 #
     62 # Add support for Intel SHA Extensions.
     63 
     64 ######################################################################
     65 # Current performance is summarized in following table. Numbers are
     66 # CPU clock cycles spent to process single byte (less is better).
     67 #
     68 #		x86_64		SSSE3		AVX[2]
     69 # P4		9.05		-
     70 # Opteron	6.26		-
     71 # Core2		6.55		6.05/+8%	-
     72 # Westmere	6.73		5.30/+27%	-
     73 # Sandy Bridge	7.70		6.10/+26%	4.99/+54%
     74 # Ivy Bridge	6.06		4.67/+30%	4.60/+32%
     75 # Haswell	5.45		4.15/+31%	3.57/+53%
     76 # Bulldozer	9.11		5.95/+53%
     77 # VIA Nano	9.32		7.15/+30%
     78 # Atom		10.3		9.17/+12%
     79 # Silvermont	13.1(*)		9.37/+40%
     80 #
     81 # (*)	obviously suboptimal result, nothing was done about it,
     82 #	because SSSE3 code is compiled unconditionally;
     83 
     84 $flavour = shift;
     85 $output  = shift;
     86 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     87 
     88 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     89 
     90 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     91 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     92 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     93 die "can't locate x86_64-xlate.pl";
     94 
     95 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     96 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
     97 	$avx = ($1>=2.19) + ($1>=2.22);
     98 }
     99 
    100 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    101 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    102 	$avx = ($1>=2.09) + ($1>=2.10);
    103 }
    104 
    105 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    106 	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    107 	$avx = ($1>=10) + ($1>=11);
    108 }
    109 
    110 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([2-9]\.[0-9]+)/) {
    111 	$avx = ($2>=3.0) + ($2>3.0);
    112 }
    113 
    114 $shaext=0;	### set to zero if compiling for 1.0.1
    115 $avx=1		if (!$shaext && $avx);
    116 
    117 open OUT,"| \"$^X\" $xlate $flavour $output";
    118 *STDOUT=*OUT;
    119 
    120 $ctx="%rdi";	# 1st arg
    121 $inp="%rsi";	# 2nd arg
    122 $num="%rdx";	# 3rd arg
    123 
    124 # reassign arguments in order to produce more compact code
    125 $ctx="%r8";
    126 $inp="%r9";
    127 $num="%r10";
    128 
    129 $t0="%eax";
    130 $t1="%ebx";
    131 $t2="%ecx";
    132 @xi=("%edx","%ebp","%r14d");
    133 $A="%esi";
    134 $B="%edi";
    135 $C="%r11d";
    136 $D="%r12d";
    137 $E="%r13d";
    138 
    139 @V=($A,$B,$C,$D,$E);
    140 
    141 sub BODY_00_19 {
    142 my ($i,$a,$b,$c,$d,$e)=@_;
    143 my $j=$i+1;
    144 $code.=<<___ if ($i==0);
    145 	mov	`4*$i`($inp),$xi[0]
    146 	bswap	$xi[0]
    147 ___
    148 $code.=<<___ if ($i<15);
    149 	mov	`4*$j`($inp),$xi[1]
    150 	mov	$d,$t0
    151 	mov	$xi[0],`4*$i`(%rsp)
    152 	mov	$a,$t2
    153 	bswap	$xi[1]
    154 	xor	$c,$t0
    155 	rol	\$5,$t2
    156 	and	$b,$t0
    157 	lea	0x5a827999($xi[0],$e),$e
    158 	add	$t2,$e
    159 	xor	$d,$t0
    160 	rol	\$30,$b
    161 	add	$t0,$e
    162 ___
    163 $code.=<<___ if ($i>=15);
    164 	xor	`4*($j%16)`(%rsp),$xi[1]
    165 	mov	$d,$t0
    166 	mov	$xi[0],`4*($i%16)`(%rsp)
    167 	mov	$a,$t2
    168 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    169 	xor	$c,$t0
    170 	rol	\$5,$t2
    171 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    172 	and	$b,$t0
    173 	lea	0x5a827999($xi[0],$e),$e
    174 	rol	\$30,$b
    175 	xor	$d,$t0
    176 	add	$t2,$e
    177 	rol	\$1,$xi[1]
    178 	add	$t0,$e
    179 ___
    180 push(@xi,shift(@xi));
    181 }
    182 
    183 sub BODY_20_39 {
    184 my ($i,$a,$b,$c,$d,$e)=@_;
    185 my $j=$i+1;
    186 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
    187 $code.=<<___ if ($i<79);
    188 	xor	`4*($j%16)`(%rsp),$xi[1]
    189 	mov	$b,$t0
    190 	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
    191 	mov	$a,$t2
    192 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    193 	xor	$d,$t0
    194 	rol	\$5,$t2
    195 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    196 	lea	$K($xi[0],$e),$e
    197 	xor	$c,$t0
    198 	add	$t2,$e
    199 	rol	\$30,$b
    200 	add	$t0,$e
    201 	rol	\$1,$xi[1]
    202 ___
    203 $code.=<<___ if ($i==79);
    204 	mov	$b,$t0
    205 	mov	$a,$t2
    206 	xor	$d,$t0
    207 	lea	$K($xi[0],$e),$e
    208 	rol	\$5,$t2
    209 	xor	$c,$t0
    210 	add	$t2,$e
    211 	rol	\$30,$b
    212 	add	$t0,$e
    213 ___
    214 push(@xi,shift(@xi));
    215 }
    216 
    217 sub BODY_40_59 {
    218 my ($i,$a,$b,$c,$d,$e)=@_;
    219 my $j=$i+1;
    220 $code.=<<___;
    221 	xor	`4*($j%16)`(%rsp),$xi[1]
    222 	mov	$d,$t0
    223 	mov	$xi[0],`4*($i%16)`(%rsp)
    224 	mov	$d,$t1
    225 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    226 	and	$c,$t0
    227 	mov	$a,$t2
    228 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    229 	lea	0x8f1bbcdc($xi[0],$e),$e
    230 	xor	$c,$t1
    231 	rol	\$5,$t2
    232 	add	$t0,$e
    233 	rol	\$1,$xi[1]
    234 	and	$b,$t1
    235 	add	$t2,$e
    236 	rol	\$30,$b
    237 	add	$t1,$e
    238 ___
    239 push(@xi,shift(@xi));
    240 }
    241 
    242 $code.=<<___;
    243 .text
    244 .extern	OPENSSL_ia32cap_P
    245 
    246 .globl	sha1_block_data_order
    247 .type	sha1_block_data_order,\@function,3
    248 .align	16
    249 sha1_block_data_order:
    250 	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
    251 	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
    252 	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
    253 	test	\$`1<<9`,%r8d		# check SSSE3 bit
    254 	jz	.Lialu
    255 ___
    256 $code.=<<___ if ($shaext);
    257 	test	\$`1<<29`,%r10d		# check SHA bit	
    258 	jnz	_shaext_shortcut
    259 ___
    260 $code.=<<___ if ($avx>1);
    261 	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
    262 	cmp	\$`1<<3|1<<5|1<<8`,%r10d
    263 	je	_avx2_shortcut
    264 ___
    265 $code.=<<___ if ($avx);
    266 	and	\$`1<<28`,%r8d		# mask AVX bit
    267 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    268 	or	%r9d,%r8d
    269 	cmp	\$`1<<28|1<<30`,%r8d
    270 	je	_avx_shortcut
    271 ___
    272 $code.=<<___;
    273 	jmp	_ssse3_shortcut
    274 
    275 .align	16
    276 .Lialu:
    277 	mov	%rsp,%rax
    278 	push	%rbx
    279 	push	%rbp
    280 	push	%r12
    281 	push	%r13
    282 	push	%r14
    283 	mov	%rdi,$ctx	# reassigned argument
    284 	sub	\$`8+16*4`,%rsp
    285 	mov	%rsi,$inp	# reassigned argument
    286 	and	\$-64,%rsp
    287 	mov	%rdx,$num	# reassigned argument
    288 	mov	%rax,`16*4`(%rsp)
    289 .Lprologue:
    290 
    291 	mov	0($ctx),$A
    292 	mov	4($ctx),$B
    293 	mov	8($ctx),$C
    294 	mov	12($ctx),$D
    295 	mov	16($ctx),$E
    296 	jmp	.Lloop
    297 
    298 .align	16
    299 .Lloop:
    300 ___
    301 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    302 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    303 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    304 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    305 $code.=<<___;
    306 	add	0($ctx),$A
    307 	add	4($ctx),$B
    308 	add	8($ctx),$C
    309 	add	12($ctx),$D
    310 	add	16($ctx),$E
    311 	mov	$A,0($ctx)
    312 	mov	$B,4($ctx)
    313 	mov	$C,8($ctx)
    314 	mov	$D,12($ctx)
    315 	mov	$E,16($ctx)
    316 
    317 	sub	\$1,$num
    318 	lea	`16*4`($inp),$inp
    319 	jnz	.Lloop
    320 
    321 	mov	`16*4`(%rsp),%rsi
    322 	mov	-40(%rsi),%r14
    323 	mov	-32(%rsi),%r13
    324 	mov	-24(%rsi),%r12
    325 	mov	-16(%rsi),%rbp
    326 	mov	-8(%rsi),%rbx
    327 	lea	(%rsi),%rsp
    328 .Lepilogue:
    329 	ret
    330 .size	sha1_block_data_order,.-sha1_block_data_order
    331 ___
    332 if ($shaext) {{{
    333 ######################################################################
    334 # Intel SHA Extensions implementation of SHA1 update function.
    335 #
    336 my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
    337 my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
    338 my @MSG=map("%xmm$_",(4..7));
    339 
    340 $code.=<<___;
    341 .type	sha1_block_data_order_shaext,\@function,3
    342 .align	32
    343 sha1_block_data_order_shaext:
    344 _shaext_shortcut:
    345 ___
    346 $code.=<<___ if ($win64);
    347 	lea	`-8-4*16`(%rsp),%rsp
    348 	movaps	%xmm6,-8-4*16(%rax)
    349 	movaps	%xmm7,-8-3*16(%rax)
    350 	movaps	%xmm8,-8-2*16(%rax)
    351 	movaps	%xmm9,-8-1*16(%rax)
    352 .Lprologue_shaext:
    353 ___
    354 $code.=<<___;
    355 	movdqu	($ctx),$ABCD
    356 	movd	16($ctx),$E
    357 	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
    358 
    359 	movdqu	($inp),@MSG[0]
    360 	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
    361 	movdqu	0x10($inp),@MSG[1]
    362 	pshufd	\$0b00011011,$E,$E		# flip word order
    363 	movdqu	0x20($inp),@MSG[2]
    364 	pshufb	$BSWAP,@MSG[0]
    365 	movdqu	0x30($inp),@MSG[3]
    366 	pshufb	$BSWAP,@MSG[1]
    367 	pshufb	$BSWAP,@MSG[2]
    368 	movdqa	$E,$E_SAVE			# offload $E
    369 	pshufb	$BSWAP,@MSG[3]
    370 	jmp	.Loop_shaext
    371 
    372 .align	16
    373 .Loop_shaext:
    374 	dec		$num
    375 	lea		0x40($inp),%rax		# next input block
    376 	paddd		@MSG[0],$E
    377 	cmovne		%rax,$inp
    378 	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
    379 ___
    380 for($i=0;$i<20-4;$i+=2) {
    381 $code.=<<___;
    382 	sha1msg1	@MSG[1],@MSG[0]
    383 	movdqa		$ABCD,$E_
    384 	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
    385 	sha1nexte	@MSG[1],$E_
    386 	pxor		@MSG[2],@MSG[0]
    387 	sha1msg1	@MSG[2],@MSG[1]
    388 	sha1msg2	@MSG[3],@MSG[0]
    389 
    390 	movdqa		$ABCD,$E
    391 	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
    392 	sha1nexte	@MSG[2],$E
    393 	pxor		@MSG[3],@MSG[1]
    394 	sha1msg2	@MSG[0],@MSG[1]
    395 ___
    396 	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
    397 }
    398 $code.=<<___;
    399 	movdqu		($inp),@MSG[0]
    400 	movdqa		$ABCD,$E_
    401 	sha1rnds4	\$3,$E,$ABCD		# 64-67
    402 	sha1nexte	@MSG[1],$E_
    403 	movdqu		0x10($inp),@MSG[1]
    404 	pshufb		$BSWAP,@MSG[0]
    405 
    406 	movdqa		$ABCD,$E
    407 	sha1rnds4	\$3,$E_,$ABCD		# 68-71
    408 	sha1nexte	@MSG[2],$E
    409 	movdqu		0x20($inp),@MSG[2]
    410 	pshufb		$BSWAP,@MSG[1]
    411 
    412 	movdqa		$ABCD,$E_
    413 	sha1rnds4	\$3,$E,$ABCD		# 72-75
    414 	sha1nexte	@MSG[3],$E_
    415 	movdqu		0x30($inp),@MSG[3]
    416 	pshufb		$BSWAP,@MSG[2]
    417 
    418 	movdqa		$ABCD,$E
    419 	sha1rnds4	\$3,$E_,$ABCD		# 76-79
    420 	sha1nexte	$E_SAVE,$E
    421 	pshufb		$BSWAP,@MSG[3]
    422 
    423 	paddd		$ABCD_SAVE,$ABCD
    424 	movdqa		$E,$E_SAVE		# offload $E
    425 
    426 	jnz		.Loop_shaext
    427 
    428 	pshufd	\$0b00011011,$ABCD,$ABCD
    429 	pshufd	\$0b00011011,$E,$E
    430 	movdqu	$ABCD,($ctx)
    431 	movd	$E,16($ctx)
    432 ___
    433 $code.=<<___ if ($win64);
    434 	movaps	-8-4*16(%rax),%xmm6
    435 	movaps	-8-3*16(%rax),%xmm7
    436 	movaps	-8-2*16(%rax),%xmm8
    437 	movaps	-8-1*16(%rax),%xmm9
    438 	mov	%rax,%rsp
    439 .Lepilogue_shaext:
    440 ___
    441 $code.=<<___;
    442 	ret
    443 .size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
    444 ___
    445 }}}
    446 {{{
    447 my $Xi=4;
    448 my @X=map("%xmm$_",(4..7,0..3));
    449 my @Tx=map("%xmm$_",(8..10));
    450 my $Kx="%xmm11";
    451 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    452 my @T=("%esi","%edi");
    453 my $j=0;
    454 my $rx=0;
    455 my $K_XX_XX="%r11";
    456 
    457 my $_rol=sub { &rol(@_) };
    458 my $_ror=sub { &ror(@_) };
    459 
    460 { my $sn;
    461 sub align32() {
    462   ++$sn;
    463 $code.=<<___;
    464 	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
    465 .align	32
    466 .Lalign32_$sn:
    467 ___
    468 }
    469 }
    470 
    471 $code.=<<___;
    472 .type	sha1_block_data_order_ssse3,\@function,3
    473 .align	16
    474 sha1_block_data_order_ssse3:
    475 _ssse3_shortcut:
    476 	mov	%rsp,%rax
    477 	push	%rbx
    478 	push	%rbp
    479 	push	%r12
    480 	push	%r13		# redundant, done to share Win64 SE handler
    481 	push	%r14
    482 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
    483 ___
    484 $code.=<<___ if ($win64);
    485 	movaps	%xmm6,-40-6*16(%rax)
    486 	movaps	%xmm7,-40-5*16(%rax)
    487 	movaps	%xmm8,-40-4*16(%rax)
    488 	movaps	%xmm9,-40-3*16(%rax)
    489 	movaps	%xmm10,-40-2*16(%rax)
    490 	movaps	%xmm11,-40-1*16(%rax)
    491 .Lprologue_ssse3:
    492 ___
    493 $code.=<<___;
    494 	mov	%rax,%r14	# original %rsp
    495 	and	\$-64,%rsp
    496 	mov	%rdi,$ctx	# reassigned argument
    497 	mov	%rsi,$inp	# reassigned argument
    498 	mov	%rdx,$num	# reassigned argument
    499 
    500 	shl	\$6,$num
    501 	add	$inp,$num
    502 	lea	K_XX_XX+64(%rip),$K_XX_XX
    503 
    504 	mov	0($ctx),$A		# load context
    505 	mov	4($ctx),$B
    506 	mov	8($ctx),$C
    507 	mov	12($ctx),$D
    508 	mov	$B,@T[0]		# magic seed
    509 	mov	16($ctx),$E
    510 	mov	$C,@T[1]
    511 	xor	$D,@T[1]
    512 	and	@T[1],@T[0]
    513 
    514 	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
    515 	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
    516 	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    517 	movdqu	16($inp),@X[-3&7]
    518 	movdqu	32($inp),@X[-2&7]
    519 	movdqu	48($inp),@X[-1&7]
    520 	pshufb	@X[2],@X[-4&7]		# byte swap
    521 	pshufb	@X[2],@X[-3&7]
    522 	pshufb	@X[2],@X[-2&7]
    523 	add	\$64,$inp
    524 	paddd	@Tx[1],@X[-4&7]		# add K_00_19
    525 	pshufb	@X[2],@X[-1&7]
    526 	paddd	@Tx[1],@X[-3&7]
    527 	paddd	@Tx[1],@X[-2&7]
    528 	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
    529 	psubd	@Tx[1],@X[-4&7]		# restore X[]
    530 	movdqa	@X[-3&7],16(%rsp)
    531 	psubd	@Tx[1],@X[-3&7]
    532 	movdqa	@X[-2&7],32(%rsp)
    533 	psubd	@Tx[1],@X[-2&7]
    534 	jmp	.Loop_ssse3
    535 ___
    536 
    537 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    538 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    539   my $arg = pop;
    540     $arg = "\$$arg" if ($arg*1 eq $arg);
    541     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    542 }
    543 
    544 sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
    545 { use integer;
    546   my $body = shift;
    547   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    548   my ($a,$b,$c,$d,$e);
    549 
    550 	 eval(shift(@insns));		# ror
    551 	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
    552 	 eval(shift(@insns));
    553 	&movdqa	(@Tx[0],@X[-1&7]);
    554 	  &paddd	(@Tx[1],@X[-1&7]);
    555 	 eval(shift(@insns));
    556 	 eval(shift(@insns));
    557 
    558 	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
    559 	 eval(shift(@insns));
    560 	 eval(shift(@insns));		# rol
    561 	 eval(shift(@insns));
    562 	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
    563 	 eval(shift(@insns));
    564 	 eval(shift(@insns));
    565 
    566 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
    567 	 eval(shift(@insns));
    568 	 eval(shift(@insns));		# ror
    569 	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    570 	 eval(shift(@insns));
    571 	 eval(shift(@insns));
    572 	 eval(shift(@insns));
    573 
    574 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    575 	 eval(shift(@insns));
    576 	 eval(shift(@insns));		# rol
    577 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    578 	 eval(shift(@insns));
    579 	 eval(shift(@insns));
    580 
    581 	&movdqa	(@Tx[2],@X[0]);
    582 	 eval(shift(@insns));
    583 	 eval(shift(@insns));
    584 	 eval(shift(@insns));		# ror
    585 	&movdqa	(@Tx[0],@X[0]);
    586 	 eval(shift(@insns));
    587 
    588 	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
    589 	&paddd	(@X[0],@X[0]);
    590 	 eval(shift(@insns));
    591 	 eval(shift(@insns));
    592 
    593 	&psrld	(@Tx[0],31);
    594 	 eval(shift(@insns));
    595 	 eval(shift(@insns));		# rol
    596 	 eval(shift(@insns));
    597 	&movdqa	(@Tx[1],@Tx[2]);
    598 	 eval(shift(@insns));
    599 	 eval(shift(@insns));
    600 
    601 	&psrld	(@Tx[2],30);
    602 	 eval(shift(@insns));
    603 	 eval(shift(@insns));		# ror
    604 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
    605 	 eval(shift(@insns));
    606 	 eval(shift(@insns));
    607 	 eval(shift(@insns));
    608 
    609 	&pslld	(@Tx[1],2);
    610 	&pxor	(@X[0],@Tx[2]);
    611 	 eval(shift(@insns));
    612 	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
    613 	 eval(shift(@insns));		# rol
    614 	 eval(shift(@insns));
    615 	 eval(shift(@insns));
    616 
    617 	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
    618 	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
    619 
    620 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    621 
    622   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    623 		push(@Tx,shift(@Tx));
    624 }
    625 
    626 sub Xupdate_ssse3_32_79()
    627 { use integer;
    628   my $body = shift;
    629   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
    630   my ($a,$b,$c,$d,$e);
    631 
    632 	 eval(shift(@insns))		if ($Xi==8);
    633 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
    634 	 eval(shift(@insns))		if ($Xi==8);
    635 	 eval(shift(@insns));		# body_20_39
    636 	 eval(shift(@insns));
    637 	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
    638 	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
    639 	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
    640 	 eval(shift(@insns));
    641 	 eval(shift(@insns));		# rol
    642 
    643 	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
    644 	 eval(shift(@insns));
    645 	 eval(shift(@insns));
    646 	if ($Xi%5) {
    647 	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    648 	} else {			# ... or load next one
    649 	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
    650 	}
    651 	 eval(shift(@insns));		# ror
    652 	  &paddd	(@Tx[1],@X[-1&7]);
    653 	 eval(shift(@insns));
    654 
    655 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    656 	 eval(shift(@insns));		# body_20_39
    657 	 eval(shift(@insns));
    658 	 eval(shift(@insns));
    659 	 eval(shift(@insns));		# rol
    660 	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
    661 
    662 	&movdqa	(@Tx[0],@X[0]);
    663 	 eval(shift(@insns));
    664 	 eval(shift(@insns));
    665 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    666 	 eval(shift(@insns));		# ror
    667 	 eval(shift(@insns));
    668 	 eval(shift(@insns));		# body_20_39
    669 
    670 	&pslld	(@X[0],2);
    671 	 eval(shift(@insns));
    672 	 eval(shift(@insns));
    673 	&psrld	(@Tx[0],30);
    674 	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
    675 	 eval(shift(@insns));
    676 	 eval(shift(@insns));
    677 	 eval(shift(@insns));		# ror
    678 
    679 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
    680 	 eval(shift(@insns));
    681 	 eval(shift(@insns));		# body_20_39
    682 	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
    683 	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
    684 	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
    685 	 eval(shift(@insns));
    686 	 eval(shift(@insns));		# rol
    687 	 eval(shift(@insns));
    688 	 eval(shift(@insns));
    689 	 eval(shift(@insns));		# rol
    690 	 eval(shift(@insns));
    691 
    692 	 foreach (@insns) { eval; }	# remaining instructions
    693 
    694   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    695 		push(@Tx,shift(@Tx));
    696 }
    697 
    698 sub Xuplast_ssse3_80()
    699 { use integer;
    700   my $body = shift;
    701   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    702   my ($a,$b,$c,$d,$e);
    703 
    704 	 eval(shift(@insns));
    705 	 eval(shift(@insns));
    706 	 eval(shift(@insns));
    707 	 eval(shift(@insns));
    708 	  &paddd	(@Tx[1],@X[-1&7]);
    709 	 eval(shift(@insns));
    710 	 eval(shift(@insns));
    711 
    712 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    713 
    714 	 foreach (@insns) { eval; }		# remaining instructions
    715 
    716 	&cmp	($inp,$num);
    717 	&je	(".Ldone_ssse3");
    718 
    719 	unshift(@Tx,pop(@Tx));
    720 
    721 	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
    722 	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
    723 	&movdqu	(@X[-4&7],"0($inp)");		# load input
    724 	&movdqu	(@X[-3&7],"16($inp)");
    725 	&movdqu	(@X[-2&7],"32($inp)");
    726 	&movdqu	(@X[-1&7],"48($inp)");
    727 	&pshufb	(@X[-4&7],@X[2]);		# byte swap
    728 	&add	($inp,64);
    729 
    730   $Xi=0;
    731 }
    732 
    733 sub Xloop_ssse3()
    734 { use integer;
    735   my $body = shift;
    736   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    737   my ($a,$b,$c,$d,$e);
    738 
    739 	 eval(shift(@insns));
    740 	 eval(shift(@insns));
    741 	 eval(shift(@insns));
    742 	&pshufb	(@X[($Xi-3)&7],@X[2]);
    743 	 eval(shift(@insns));
    744 	 eval(shift(@insns));
    745 	 eval(shift(@insns));
    746 	 eval(shift(@insns));
    747 	&paddd	(@X[($Xi-4)&7],@Tx[1]);
    748 	 eval(shift(@insns));
    749 	 eval(shift(@insns));
    750 	 eval(shift(@insns));
    751 	 eval(shift(@insns));
    752 	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
    753 	 eval(shift(@insns));
    754 	 eval(shift(@insns));
    755 	 eval(shift(@insns));
    756 	 eval(shift(@insns));
    757 	&psubd	(@X[($Xi-4)&7],@Tx[1]);
    758 
    759 	foreach (@insns) { eval; }
    760   $Xi++;
    761 }
    762 
    763 sub Xtail_ssse3()
    764 { use integer;
    765   my $body = shift;
    766   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    767   my ($a,$b,$c,$d,$e);
    768 
    769 	foreach (@insns) { eval; }
    770 }
    771 
    772 sub body_00_19 () {	# ((c^d)&b)^d
    773 	# on start @T[0]=(c^d)&b
    774 	return &body_20_39() if ($rx==19); $rx++;
    775 	(
    776 	'($a,$b,$c,$d,$e)=@V;'.
    777 	'&$_ror	($b,$j?7:2)',	# $b>>>2
    778 	'&xor	(@T[0],$d)',
    779 	'&mov	(@T[1],$a)',	# $b for next round
    780 
    781 	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
    782 	'&xor	($b,$c)',	# $c^$d for next round
    783 
    784 	'&$_rol	($a,5)',
    785 	'&add	($e,@T[0])',
    786 	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
    787 
    788 	'&xor	($b,$c)',	# restore $b
    789 	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    790 	);
    791 }
    792 
    793 sub body_20_39 () {	# b^d^c
    794 	# on entry @T[0]=b^d
    795 	return &body_40_59() if ($rx==39); $rx++;
    796 	(
    797 	'($a,$b,$c,$d,$e)=@V;'.
    798 	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
    799 	'&xor	(@T[0],$d)	if($j==19);'.
    800 	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
    801 	'&mov	(@T[1],$a)',	# $b for next round
    802 
    803 	'&$_rol	($a,5)',
    804 	'&add	($e,@T[0])',
    805 	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
    806 
    807 	'&$_ror	($b,7)',	# $b>>>2
    808 	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    809 	);
    810 }
    811 
    812 sub body_40_59 () {	# ((b^c)&(c^d))^c
    813 	# on entry @T[0]=(b^c), (c^=d)
    814 	$rx++;
    815 	(
    816 	'($a,$b,$c,$d,$e)=@V;'.
    817 	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
    818 	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
    819 	'&xor	($c,$d)		if ($j>=40)',	# restore $c
    820 
    821 	'&$_ror	($b,7)',	# $b>>>2
    822 	'&mov	(@T[1],$a)',	# $b for next round
    823 	'&xor	(@T[0],$c)',
    824 
    825 	'&$_rol	($a,5)',
    826 	'&add	($e,@T[0])',
    827 	'&xor	(@T[1],$c)	if ($j==59);'.
    828 	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
    829 
    830 	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
    831 	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    832 	);
    833 }
    834 $code.=<<___;
    835 .align	16
    836 .Loop_ssse3:
    837 ___
    838 	&Xupdate_ssse3_16_31(\&body_00_19);
    839 	&Xupdate_ssse3_16_31(\&body_00_19);
    840 	&Xupdate_ssse3_16_31(\&body_00_19);
    841 	&Xupdate_ssse3_16_31(\&body_00_19);
    842 	&Xupdate_ssse3_32_79(\&body_00_19);
    843 	&Xupdate_ssse3_32_79(\&body_20_39);
    844 	&Xupdate_ssse3_32_79(\&body_20_39);
    845 	&Xupdate_ssse3_32_79(\&body_20_39);
    846 	&Xupdate_ssse3_32_79(\&body_20_39);
    847 	&Xupdate_ssse3_32_79(\&body_20_39);
    848 	&Xupdate_ssse3_32_79(\&body_40_59);
    849 	&Xupdate_ssse3_32_79(\&body_40_59);
    850 	&Xupdate_ssse3_32_79(\&body_40_59);
    851 	&Xupdate_ssse3_32_79(\&body_40_59);
    852 	&Xupdate_ssse3_32_79(\&body_40_59);
    853 	&Xupdate_ssse3_32_79(\&body_20_39);
    854 	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
    855 
    856 				$saved_j=$j; @saved_V=@V;
    857 
    858 	&Xloop_ssse3(\&body_20_39);
    859 	&Xloop_ssse3(\&body_20_39);
    860 	&Xloop_ssse3(\&body_20_39);
    861 
    862 $code.=<<___;
    863 	add	0($ctx),$A			# update context
    864 	add	4($ctx),@T[0]
    865 	add	8($ctx),$C
    866 	add	12($ctx),$D
    867 	mov	$A,0($ctx)
    868 	add	16($ctx),$E
    869 	mov	@T[0],4($ctx)
    870 	mov	@T[0],$B			# magic seed
    871 	mov	$C,8($ctx)
    872 	mov	$C,@T[1]
    873 	mov	$D,12($ctx)
    874 	xor	$D,@T[1]
    875 	mov	$E,16($ctx)
    876 	and	@T[1],@T[0]
    877 	jmp	.Loop_ssse3
    878 
    879 .align	16
    880 .Ldone_ssse3:
    881 ___
    882 				$j=$saved_j; @V=@saved_V;
    883 
    884 	&Xtail_ssse3(\&body_20_39);
    885 	&Xtail_ssse3(\&body_20_39);
    886 	&Xtail_ssse3(\&body_20_39);
    887 
    888 $code.=<<___;
    889 	add	0($ctx),$A			# update context
    890 	add	4($ctx),@T[0]
    891 	add	8($ctx),$C
    892 	mov	$A,0($ctx)
    893 	add	12($ctx),$D
    894 	mov	@T[0],4($ctx)
    895 	add	16($ctx),$E
    896 	mov	$C,8($ctx)
    897 	mov	$D,12($ctx)
    898 	mov	$E,16($ctx)
    899 ___
    900 $code.=<<___ if ($win64);
    901 	movaps	-40-6*16(%r14),%xmm6
    902 	movaps	-40-5*16(%r14),%xmm7
    903 	movaps	-40-4*16(%r14),%xmm8
    904 	movaps	-40-3*16(%r14),%xmm9
    905 	movaps	-40-2*16(%r14),%xmm10
    906 	movaps	-40-1*16(%r14),%xmm11
    907 ___
    908 $code.=<<___;
    909 	lea	(%r14),%rsi
    910 	mov	-40(%rsi),%r14
    911 	mov	-32(%rsi),%r13
    912 	mov	-24(%rsi),%r12
    913 	mov	-16(%rsi),%rbp
    914 	mov	-8(%rsi),%rbx
    915 	lea	(%rsi),%rsp
    916 .Lepilogue_ssse3:
    917 	ret
    918 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
    919 ___
    920 
    921 if ($avx) {
    922 $Xi=4;				# reset variables
    923 @X=map("%xmm$_",(4..7,0..3));
    924 @Tx=map("%xmm$_",(8..10));
    925 $j=0;
    926 $rx=0;
    927 
    928 my $done_avx_label=".Ldone_avx";
    929 
    930 my $_rol=sub { &shld(@_[0],@_) };
    931 my $_ror=sub { &shrd(@_[0],@_) };
    932 
    933 $code.=<<___;
    934 .type	sha1_block_data_order_avx,\@function,3
    935 .align	16
    936 sha1_block_data_order_avx:
    937 _avx_shortcut:
    938 	mov	%rsp,%rax
    939 	push	%rbx
    940 	push	%rbp
    941 	push	%r12
    942 	push	%r13		# redundant, done to share Win64 SE handler
    943 	push	%r14
    944 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
    945 	vzeroupper
    946 ___
    947 $code.=<<___ if ($win64);
    948 	vmovaps	%xmm6,-40-6*16(%rax)
    949 	vmovaps	%xmm7,-40-5*16(%rax)
    950 	vmovaps	%xmm8,-40-4*16(%rax)
    951 	vmovaps	%xmm9,-40-3*16(%rax)
    952 	vmovaps	%xmm10,-40-2*16(%rax)
    953 	vmovaps	%xmm11,-40-1*16(%rax)
    954 .Lprologue_avx:
    955 ___
    956 $code.=<<___;
    957 	mov	%rax,%r14	# original %rsp
    958 	and	\$-64,%rsp
    959 	mov	%rdi,$ctx	# reassigned argument
    960 	mov	%rsi,$inp	# reassigned argument
    961 	mov	%rdx,$num	# reassigned argument
    962 
    963 	shl	\$6,$num
    964 	add	$inp,$num
    965 	lea	K_XX_XX+64(%rip),$K_XX_XX
    966 
    967 	mov	0($ctx),$A		# load context
    968 	mov	4($ctx),$B
    969 	mov	8($ctx),$C
    970 	mov	12($ctx),$D
    971 	mov	$B,@T[0]		# magic seed
    972 	mov	16($ctx),$E
    973 	mov	$C,@T[1]
    974 	xor	$D,@T[1]
    975 	and	@T[1],@T[0]
    976 
    977 	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
    978 	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
    979 	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    980 	vmovdqu	16($inp),@X[-3&7]
    981 	vmovdqu	32($inp),@X[-2&7]
    982 	vmovdqu	48($inp),@X[-1&7]
    983 	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
    984 	add	\$64,$inp
    985 	vpshufb	@X[2],@X[-3&7],@X[-3&7]
    986 	vpshufb	@X[2],@X[-2&7],@X[-2&7]
    987 	vpshufb	@X[2],@X[-1&7],@X[-1&7]
    988 	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
    989 	vpaddd	$Kx,@X[-3&7],@X[1]
    990 	vpaddd	$Kx,@X[-2&7],@X[2]
    991 	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
    992 	vmovdqa	@X[1],16(%rsp)
    993 	vmovdqa	@X[2],32(%rsp)
    994 	jmp	.Loop_avx
    995 ___
    996 
    997 sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
    998 { use integer;
    999   my $body = shift;
   1000   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
   1001   my ($a,$b,$c,$d,$e);
   1002 
   1003 	 eval(shift(@insns));
   1004 	 eval(shift(@insns));
   1005 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
   1006 	 eval(shift(@insns));
   1007 	 eval(shift(@insns));
   1008 
   1009 	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
   1010 	 eval(shift(@insns));
   1011 	 eval(shift(@insns));
   1012 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1013 	 eval(shift(@insns));
   1014 	 eval(shift(@insns));
   1015 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1016 	 eval(shift(@insns));
   1017 	 eval(shift(@insns));
   1018 
   1019 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1020 	 eval(shift(@insns));
   1021 	 eval(shift(@insns));
   1022 	 eval(shift(@insns));
   1023 	 eval(shift(@insns));
   1024 
   1025 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1026 	 eval(shift(@insns));
   1027 	 eval(shift(@insns));
   1028 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1029 	 eval(shift(@insns));
   1030 	 eval(shift(@insns));
   1031 
   1032 	&vpsrld	(@Tx[0],@X[0],31);
   1033 	 eval(shift(@insns));
   1034 	 eval(shift(@insns));
   1035 	 eval(shift(@insns));
   1036 	 eval(shift(@insns));
   1037 
   1038 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1039 	&vpaddd	(@X[0],@X[0],@X[0]);
   1040 	 eval(shift(@insns));
   1041 	 eval(shift(@insns));
   1042 	 eval(shift(@insns));
   1043 	 eval(shift(@insns));
   1044 
   1045 	&vpsrld	(@Tx[1],@Tx[2],30);
   1046 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1047 	 eval(shift(@insns));
   1048 	 eval(shift(@insns));
   1049 	 eval(shift(@insns));
   1050 	 eval(shift(@insns));
   1051 
   1052 	&vpslld	(@Tx[2],@Tx[2],2);
   1053 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1054 	 eval(shift(@insns));
   1055 	 eval(shift(@insns));
   1056 	 eval(shift(@insns));
   1057 	 eval(shift(@insns));
   1058 
   1059 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1060 	 eval(shift(@insns));
   1061 	 eval(shift(@insns));
   1062 	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1063 	 eval(shift(@insns));
   1064 	 eval(shift(@insns));
   1065 
   1066 
   1067 	 foreach (@insns) { eval; }	# remaining instructions [if any]
   1068 
   1069   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
   1070 }
   1071 
   1072 sub Xupdate_avx_32_79()
   1073 { use integer;
   1074   my $body = shift;
   1075   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
   1076   my ($a,$b,$c,$d,$e);
   1077 
   1078 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
   1079 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
   1080 	 eval(shift(@insns));		# body_20_39
   1081 	 eval(shift(@insns));
   1082 	 eval(shift(@insns));
   1083 	 eval(shift(@insns));		# rol
   1084 
   1085 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
   1086 	 eval(shift(@insns));
   1087 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
   1088 	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
   1089 	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
   1090 	 eval(shift(@insns));		# ror
   1091 	 eval(shift(@insns));
   1092 
   1093 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
   1094 	 eval(shift(@insns));		# body_20_39
   1095 	 eval(shift(@insns));
   1096 	 eval(shift(@insns));
   1097 	 eval(shift(@insns));		# rol
   1098 
   1099 	&vpsrld	(@Tx[0],@X[0],30);
   1100 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1101 	 eval(shift(@insns));
   1102 	 eval(shift(@insns));
   1103 	 eval(shift(@insns));		# ror
   1104 	 eval(shift(@insns));
   1105 
   1106 	&vpslld	(@X[0],@X[0],2);
   1107 	 eval(shift(@insns));		# body_20_39
   1108 	 eval(shift(@insns));
   1109 	 eval(shift(@insns));
   1110 	 eval(shift(@insns));		# rol
   1111 	 eval(shift(@insns));
   1112 	 eval(shift(@insns));
   1113 	 eval(shift(@insns));		# ror
   1114 	 eval(shift(@insns));
   1115 
   1116 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
   1117 	 eval(shift(@insns));		# body_20_39
   1118 	 eval(shift(@insns));
   1119 	 eval(shift(@insns));
   1120 	 eval(shift(@insns));		# rol
   1121 	 eval(shift(@insns));
   1122 	 eval(shift(@insns));
   1123 	 eval(shift(@insns));		# rol
   1124 	 eval(shift(@insns));
   1125 
   1126 	 foreach (@insns) { eval; }	# remaining instructions
   1127 
   1128   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
   1129 }
   1130 
   1131 sub Xuplast_avx_80()
   1132 { use integer;
   1133   my $body = shift;
   1134   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1135   my ($a,$b,$c,$d,$e);
   1136 
   1137 	 eval(shift(@insns));
   1138 	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
   1139 	 eval(shift(@insns));
   1140 	 eval(shift(@insns));
   1141 	 eval(shift(@insns));
   1142 	 eval(shift(@insns));
   1143 
   1144 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
   1145 
   1146 	 foreach (@insns) { eval; }		# remaining instructions
   1147 
   1148 	&cmp	($inp,$num);
   1149 	&je	($done_avx_label);
   1150 
   1151 	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
   1152 	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
   1153 	&vmovdqu(@X[-4&7],"0($inp)");		# load input
   1154 	&vmovdqu(@X[-3&7],"16($inp)");
   1155 	&vmovdqu(@X[-2&7],"32($inp)");
   1156 	&vmovdqu(@X[-1&7],"48($inp)");
   1157 	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
   1158 	&add	($inp,64);
   1159 
   1160   $Xi=0;
   1161 }
   1162 
   1163 sub Xloop_avx()
   1164 { use integer;
   1165   my $body = shift;
   1166   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1167   my ($a,$b,$c,$d,$e);
   1168 
   1169 	 eval(shift(@insns));
   1170 	 eval(shift(@insns));
   1171 	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
   1172 	 eval(shift(@insns));
   1173 	 eval(shift(@insns));
   1174 	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
   1175 	 eval(shift(@insns));
   1176 	 eval(shift(@insns));
   1177 	 eval(shift(@insns));
   1178 	 eval(shift(@insns));
   1179 	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
   1180 	 eval(shift(@insns));
   1181 	 eval(shift(@insns));
   1182 
   1183 	foreach (@insns) { eval; }
   1184   $Xi++;
   1185 }
   1186 
   1187 sub Xtail_avx()
   1188 { use integer;
   1189   my $body = shift;
   1190   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
   1191   my ($a,$b,$c,$d,$e);
   1192 
   1193 	foreach (@insns) { eval; }
   1194 }
   1195 
   1196 $code.=<<___;
   1197 .align	16
   1198 .Loop_avx:
   1199 ___
   1200 	&Xupdate_avx_16_31(\&body_00_19);
   1201 	&Xupdate_avx_16_31(\&body_00_19);
   1202 	&Xupdate_avx_16_31(\&body_00_19);
   1203 	&Xupdate_avx_16_31(\&body_00_19);
   1204 	&Xupdate_avx_32_79(\&body_00_19);
   1205 	&Xupdate_avx_32_79(\&body_20_39);
   1206 	&Xupdate_avx_32_79(\&body_20_39);
   1207 	&Xupdate_avx_32_79(\&body_20_39);
   1208 	&Xupdate_avx_32_79(\&body_20_39);
   1209 	&Xupdate_avx_32_79(\&body_20_39);
   1210 	&Xupdate_avx_32_79(\&body_40_59);
   1211 	&Xupdate_avx_32_79(\&body_40_59);
   1212 	&Xupdate_avx_32_79(\&body_40_59);
   1213 	&Xupdate_avx_32_79(\&body_40_59);
   1214 	&Xupdate_avx_32_79(\&body_40_59);
   1215 	&Xupdate_avx_32_79(\&body_20_39);
   1216 	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
   1217 
   1218 				$saved_j=$j; @saved_V=@V;
   1219 
   1220 	&Xloop_avx(\&body_20_39);
   1221 	&Xloop_avx(\&body_20_39);
   1222 	&Xloop_avx(\&body_20_39);
   1223 
   1224 $code.=<<___;
   1225 	add	0($ctx),$A			# update context
   1226 	add	4($ctx),@T[0]
   1227 	add	8($ctx),$C
   1228 	add	12($ctx),$D
   1229 	mov	$A,0($ctx)
   1230 	add	16($ctx),$E
   1231 	mov	@T[0],4($ctx)
   1232 	mov	@T[0],$B			# magic seed
   1233 	mov	$C,8($ctx)
   1234 	mov	$C,@T[1]
   1235 	mov	$D,12($ctx)
   1236 	xor	$D,@T[1]
   1237 	mov	$E,16($ctx)
   1238 	and	@T[1],@T[0]
   1239 	jmp	.Loop_avx
   1240 
   1241 .align	16
   1242 $done_avx_label:
   1243 ___
   1244 				$j=$saved_j; @V=@saved_V;
   1245 
   1246 	&Xtail_avx(\&body_20_39);
   1247 	&Xtail_avx(\&body_20_39);
   1248 	&Xtail_avx(\&body_20_39);
   1249 
   1250 $code.=<<___;
   1251 	vzeroupper
   1252 
   1253 	add	0($ctx),$A			# update context
   1254 	add	4($ctx),@T[0]
   1255 	add	8($ctx),$C
   1256 	mov	$A,0($ctx)
   1257 	add	12($ctx),$D
   1258 	mov	@T[0],4($ctx)
   1259 	add	16($ctx),$E
   1260 	mov	$C,8($ctx)
   1261 	mov	$D,12($ctx)
   1262 	mov	$E,16($ctx)
   1263 ___
   1264 $code.=<<___ if ($win64);
   1265 	movaps	-40-6*16(%r14),%xmm6
   1266 	movaps	-40-5*16(%r14),%xmm7
   1267 	movaps	-40-4*16(%r14),%xmm8
   1268 	movaps	-40-3*16(%r14),%xmm9
   1269 	movaps	-40-2*16(%r14),%xmm10
   1270 	movaps	-40-1*16(%r14),%xmm11
   1271 ___
   1272 $code.=<<___;
   1273 	lea	(%r14),%rsi
   1274 	mov	-40(%rsi),%r14
   1275 	mov	-32(%rsi),%r13
   1276 	mov	-24(%rsi),%r12
   1277 	mov	-16(%rsi),%rbp
   1278 	mov	-8(%rsi),%rbx
   1279 	lea	(%rsi),%rsp
   1280 .Lepilogue_avx:
   1281 	ret
   1282 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
   1283 ___
   1284 
   1285 if ($avx>1) {
   1286 use integer;
   1287 $Xi=4;					# reset variables
   1288 @X=map("%ymm$_",(4..7,0..3));
   1289 @Tx=map("%ymm$_",(8..10));
   1290 $Kx="%ymm11";
   1291 $j=0;
   1292 
   1293 my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
   1294 my ($a5,$t0)=("%r12d","%edi");
   1295 
   1296 my ($A,$F,$B,$C,$D,$E)=@ROTX;
   1297 my $rx=0;
   1298 my $frame="%r13";
   1299 
   1300 $code.=<<___;
   1301 .type	sha1_block_data_order_avx2,\@function,3
   1302 .align	16
   1303 sha1_block_data_order_avx2:
   1304 _avx2_shortcut:
   1305 	mov	%rsp,%rax
   1306 	push	%rbx
   1307 	push	%rbp
   1308 	push	%r12
   1309 	push	%r13
   1310 	push	%r14
   1311 	vzeroupper
   1312 ___
   1313 $code.=<<___ if ($win64);
   1314 	lea	-6*16(%rsp),%rsp
   1315 	vmovaps	%xmm6,-40-6*16(%rax)
   1316 	vmovaps	%xmm7,-40-5*16(%rax)
   1317 	vmovaps	%xmm8,-40-4*16(%rax)
   1318 	vmovaps	%xmm9,-40-3*16(%rax)
   1319 	vmovaps	%xmm10,-40-2*16(%rax)
   1320 	vmovaps	%xmm11,-40-1*16(%rax)
   1321 .Lprologue_avx2:
   1322 ___
   1323 $code.=<<___;
   1324 	mov	%rax,%r14		# original %rsp
   1325 	mov	%rdi,$ctx		# reassigned argument
   1326 	mov	%rsi,$inp		# reassigned argument
   1327 	mov	%rdx,$num		# reassigned argument
   1328 
   1329 	lea	-640(%rsp),%rsp
   1330 	shl	\$6,$num
   1331 	 lea	64($inp),$frame
   1332 	and	\$-128,%rsp
   1333 	add	$inp,$num
   1334 	lea	K_XX_XX+64(%rip),$K_XX_XX
   1335 
   1336 	mov	0($ctx),$A		# load context
   1337 	 cmp	$num,$frame
   1338 	 cmovae	$inp,$frame		# next or same block
   1339 	mov	4($ctx),$F
   1340 	mov	8($ctx),$C
   1341 	mov	12($ctx),$D
   1342 	mov	16($ctx),$E
   1343 	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
   1344 
   1345 	vmovdqu		($inp),%xmm0
   1346 	vmovdqu		16($inp),%xmm1
   1347 	vmovdqu		32($inp),%xmm2
   1348 	vmovdqu		48($inp),%xmm3
   1349 	lea		64($inp),$inp
   1350 	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
   1351 	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
   1352 	vpshufb		@X[2],@X[-4&7],@X[-4&7]
   1353 	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
   1354 	vpshufb		@X[2],@X[-3&7],@X[-3&7]
   1355 	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
   1356 	vpshufb		@X[2],@X[-2&7],@X[-2&7]
   1357 	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
   1358 	vpshufb		@X[2],@X[-1&7],@X[-1&7]
   1359 
   1360 	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
   1361 	vpaddd	$Kx,@X[-3&7],@X[1]
   1362 	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
   1363 	vpaddd	$Kx,@X[-2&7],@X[2]
   1364 	vmovdqu	@X[1],32(%rsp)
   1365 	vpaddd	$Kx,@X[-1&7],@X[3]
   1366 	vmovdqu	@X[2],64(%rsp)
   1367 	vmovdqu	@X[3],96(%rsp)
   1368 ___
   1369 for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
   1370     use integer;
   1371 
   1372 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
   1373 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1374 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1375 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1376 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1377 	&vpsrld	(@Tx[0],@X[0],31);
   1378 	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1379 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1380 	&vpaddd	(@X[0],@X[0],@X[0]);
   1381 	&vpsrld	(@Tx[1],@Tx[2],30);
   1382 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1383 	&vpslld	(@Tx[2],@Tx[2],2);
   1384 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1385 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1386 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1387 	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1388 
   1389 	push(@X,shift(@X));	# "rotate" X[]
   1390 }
   1391 $code.=<<___;
   1392 	lea	128(%rsp),$frame
   1393 	jmp	.Loop_avx2
   1394 .align	32
   1395 .Loop_avx2:
   1396 	rorx	\$2,$F,$B
   1397 	andn	$D,$F,$t0
   1398 	and	$C,$F
   1399 	xor	$t0,$F
   1400 ___
   1401 sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
   1402 	# at start $f=(b&c)^(~b&d), $b>>>=2
   1403 	return &bodyx_20_39() if ($rx==19); $rx++;
   1404 	(
   1405 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1406 
   1407 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1408 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1409 	'&andn	($t0,$a,$c)',			# ~b&d for next round
   1410 
   1411 	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
   1412 	'&rorx	($a5,$a,27)',			# a<<<5
   1413 	'&rorx	($f,$a,2)',			# b>>>2 for next round
   1414 	'&and	($a,$b)',			# b&c for next round
   1415 
   1416 	'&add	($e,$a5)',			# e+=a<<<5
   1417 	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
   1418 
   1419 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1420 	)
   1421 }
   1422 
   1423 sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
   1424 	# on entry $f=b^c^d, $b>>>=2
   1425 	return &bodyx_40_59() if ($rx==39); $rx++;
   1426 	(
   1427 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1428 
   1429 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1430 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1431 
   1432 	'&lea	($e,"($e,$f)")',		# e+=b^c^d
   1433 	'&rorx	($a5,$a,27)',			# a<<<5
   1434 	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
   1435 	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
   1436 
   1437 	'&add	($e,$a5)',			# e+=a<<<5
   1438 	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
   1439 
   1440 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1441 	)
   1442 }
   1443 
   1444 sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
   1445 	# on entry $f=((b^c)&(c^d)), $b>>>=2
   1446 	$rx++;
   1447 	(
   1448 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1449 
   1450 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1451 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1452 	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
   1453 	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
   1454 	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
   1455 
   1456 	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
   1457 	'&rorx	($a5,$a,27)',			# a<<<5
   1458 	'&rorx	($f,$a,2)',			# b>>>2 in next round
   1459 	'&xor	($a,$b)',			# b^c for next round
   1460 
   1461 	'&add	($e,$a5)',			# e+=a<<<5
   1462 	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
   1463 	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
   1464 
   1465 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1466 	)
   1467 }
   1468 
   1469 sub Xupdate_avx2_16_31()		# recall that $Xi starts wtih 4
   1470 { use integer;
   1471   my $body = shift;
   1472   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
   1473   my ($a,$b,$c,$d,$e);
   1474 
   1475 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
   1476 	 eval(shift(@insns));
   1477 	 eval(shift(@insns));
   1478 	 eval(shift(@insns));
   1479 	 eval(shift(@insns));
   1480 
   1481 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1482 	 eval(shift(@insns));
   1483 	 eval(shift(@insns));
   1484 	 eval(shift(@insns));
   1485 
   1486 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1487 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1488 	 eval(shift(@insns));
   1489 	 eval(shift(@insns));
   1490 
   1491 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1492 	 eval(shift(@insns));
   1493 	 eval(shift(@insns));
   1494 	 eval(shift(@insns));
   1495 	 eval(shift(@insns));
   1496 
   1497 	&vpsrld	(@Tx[0],@X[0],31);
   1498 	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1499 	 eval(shift(@insns));
   1500 	 eval(shift(@insns));
   1501 	 eval(shift(@insns));
   1502 
   1503 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1504 	&vpaddd	(@X[0],@X[0],@X[0]);
   1505 	 eval(shift(@insns));
   1506 	 eval(shift(@insns));
   1507 
   1508 	&vpsrld	(@Tx[1],@Tx[2],30);
   1509 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1510 	 eval(shift(@insns));
   1511 	 eval(shift(@insns));
   1512 
   1513 	&vpslld	(@Tx[2],@Tx[2],2);
   1514 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1515 	 eval(shift(@insns));
   1516 	 eval(shift(@insns));
   1517 
   1518 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1519 	 eval(shift(@insns));
   1520 	 eval(shift(@insns));
   1521 	 eval(shift(@insns));
   1522 
   1523 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1524 	 eval(shift(@insns));
   1525 	 eval(shift(@insns));
   1526 	 eval(shift(@insns));
   1527 	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1528 
   1529 	 foreach (@insns) { eval; }	# remaining instructions [if any]
   1530 
   1531 	$Xi++;
   1532 	push(@X,shift(@X));	# "rotate" X[]
   1533 }
   1534 
   1535 sub Xupdate_avx2_32_79()
   1536 { use integer;
   1537   my $body = shift;
   1538   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
   1539   my ($a,$b,$c,$d,$e);
   1540 
   1541 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
   1542 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
   1543 	 eval(shift(@insns));
   1544 	 eval(shift(@insns));
   1545 
   1546 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
   1547 	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
   1548 	 eval(shift(@insns));
   1549 	 eval(shift(@insns));
   1550 	 eval(shift(@insns));
   1551 
   1552 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
   1553 	 eval(shift(@insns));
   1554 	 eval(shift(@insns));
   1555 	 eval(shift(@insns));
   1556 
   1557 	&vpsrld	(@Tx[0],@X[0],30);
   1558 	&vpslld	(@X[0],@X[0],2);
   1559 	 eval(shift(@insns));
   1560 	 eval(shift(@insns));
   1561 	 eval(shift(@insns));
   1562 
   1563 	#&vpslld	(@X[0],@X[0],2);
   1564 	 eval(shift(@insns));
   1565 	 eval(shift(@insns));
   1566 	 eval(shift(@insns));
   1567 
   1568 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
   1569 	 eval(shift(@insns));
   1570 	 eval(shift(@insns));
   1571 	 eval(shift(@insns));
   1572 	 eval(shift(@insns));
   1573 
   1574 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1575 	 eval(shift(@insns));
   1576 	 eval(shift(@insns));
   1577 	 eval(shift(@insns));
   1578 	 eval(shift(@insns));
   1579 
   1580 	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1581 
   1582 	 foreach (@insns) { eval; }	# remaining instructions
   1583 
   1584 	$Xi++;
   1585 	push(@X,shift(@X));	# "rotate" X[]
   1586 }
   1587 
   1588 sub Xloop_avx2()
   1589 { use integer;
   1590   my $body = shift;
   1591   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
   1592   my ($a,$b,$c,$d,$e);
   1593 
   1594 	 foreach (@insns) { eval; }
   1595 }
   1596 
   1597 	&align32();
   1598 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1599 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1600 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1601 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1602 
   1603 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1604 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1605 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1606 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1607 
   1608 	&align32();
   1609 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1610 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1611 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1612 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1613 
   1614 	&Xloop_avx2(\&bodyx_20_39);
   1615 	&Xloop_avx2(\&bodyx_20_39);
   1616 	&Xloop_avx2(\&bodyx_20_39);
   1617 	&Xloop_avx2(\&bodyx_20_39);
   1618 
   1619 $code.=<<___;
   1620 	lea	128($inp),$frame
   1621 	lea	128($inp),%rdi			# borrow $t0
   1622 	cmp	$num,$frame
   1623 	cmovae	$inp,$frame			# next or previous block
   1624 
   1625 	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
   1626 	add	0($ctx),@ROTX[0]		# update context
   1627 	add	4($ctx),@ROTX[1]
   1628 	add	8($ctx),@ROTX[3]
   1629 	mov	@ROTX[0],0($ctx)
   1630 	add	12($ctx),@ROTX[4]
   1631 	mov	@ROTX[1],4($ctx)
   1632 	 mov	@ROTX[0],$A			# A=d
   1633 	add	16($ctx),@ROTX[5]
   1634 	 mov	@ROTX[3],$a5
   1635 	mov	@ROTX[3],8($ctx)
   1636 	 mov	@ROTX[4],$D			# D=b
   1637 	 #xchg	@ROTX[5],$F			# F=c, C=f
   1638 	mov	@ROTX[4],12($ctx)
   1639 	 mov	@ROTX[1],$F			# F=e
   1640 	mov	@ROTX[5],16($ctx)
   1641 	#mov	$F,16($ctx)
   1642 	 mov	@ROTX[5],$E			# E=c
   1643 	 mov	$a5,$C				# C=f
   1644 	 #xchg	$F,$E				# E=c, F=e
   1645 
   1646 	cmp	$num,$inp
   1647 	je	.Ldone_avx2
   1648 ___
   1649 
   1650 $Xi=4;				# reset variables
   1651 @X=map("%ymm$_",(4..7,0..3));
   1652 
   1653 $code.=<<___;
   1654 	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
   1655 	cmp	$num,%rdi			# borrowed $t0
   1656 	ja	.Last_avx2
   1657 
   1658 	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
   1659 	vmovdqu		-48(%rdi),%xmm1
   1660 	vmovdqu		-32(%rdi),%xmm2
   1661 	vmovdqu		-16(%rdi),%xmm3
   1662 	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
   1663 	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
   1664 	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
   1665 	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
   1666 	jmp	.Last_avx2
   1667 
   1668 .align	32
   1669 .Last_avx2:
   1670 	lea	128+16(%rsp),$frame
   1671 	rorx	\$2,$F,$B
   1672 	andn	$D,$F,$t0
   1673 	and	$C,$F
   1674 	xor	$t0,$F
   1675 	sub	\$-128,$inp
   1676 ___
   1677 	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
   1678 
   1679 	&Xloop_avx2	(\&bodyx_00_19);
   1680 	&Xloop_avx2	(\&bodyx_00_19);
   1681 	&Xloop_avx2	(\&bodyx_00_19);
   1682 	&Xloop_avx2	(\&bodyx_00_19);
   1683 
   1684 	&Xloop_avx2	(\&bodyx_20_39);
   1685 	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
   1686 	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
   1687 	&Xloop_avx2	(\&bodyx_20_39);
   1688 	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
   1689 	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
   1690 	&Xloop_avx2	(\&bodyx_20_39);
   1691 	  &vmovdqu	("0(%rsp)",@Tx[0]);
   1692 	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
   1693 	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
   1694 	&Xloop_avx2	(\&bodyx_20_39);
   1695 	  &vmovdqu	("32(%rsp)",@Tx[1]);
   1696 	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
   1697 	  &vpaddd	(@X[2],@X[-2&7],$Kx);
   1698 
   1699 	&Xloop_avx2	(\&bodyx_40_59);
   1700 	&align32	();
   1701 	  &vmovdqu	("64(%rsp)",@X[2]);
   1702 	  &vpaddd	(@X[3],@X[-1&7],$Kx);
   1703 	&Xloop_avx2	(\&bodyx_40_59);
   1704 	  &vmovdqu	("96(%rsp)",@X[3]);
   1705 	&Xloop_avx2	(\&bodyx_40_59);
   1706 	&Xupdate_avx2_16_31(\&bodyx_40_59);
   1707 
   1708 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1709 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1710 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1711 	&Xloop_avx2	(\&bodyx_20_39);
   1712 
   1713 $code.=<<___;
   1714 	lea	128(%rsp),$frame
   1715 
   1716 	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
   1717 	add	0($ctx),@ROTX[0]		# update context
   1718 	add	4($ctx),@ROTX[1]
   1719 	add	8($ctx),@ROTX[3]
   1720 	mov	@ROTX[0],0($ctx)
   1721 	add	12($ctx),@ROTX[4]
   1722 	mov	@ROTX[1],4($ctx)
   1723 	 mov	@ROTX[0],$A			# A=d
   1724 	add	16($ctx),@ROTX[5]
   1725 	 mov	@ROTX[3],$a5
   1726 	mov	@ROTX[3],8($ctx)
   1727 	 mov	@ROTX[4],$D			# D=b
   1728 	 #xchg	@ROTX[5],$F			# F=c, C=f
   1729 	mov	@ROTX[4],12($ctx)
   1730 	 mov	@ROTX[1],$F			# F=e
   1731 	mov	@ROTX[5],16($ctx)
   1732 	#mov	$F,16($ctx)
   1733 	 mov	@ROTX[5],$E			# E=c
   1734 	 mov	$a5,$C				# C=f
   1735 	 #xchg	$F,$E				# E=c, F=e
   1736 
   1737 	cmp	$num,$inp
   1738 	jbe	.Loop_avx2
   1739 
   1740 .Ldone_avx2:
   1741 	vzeroupper
   1742 ___
   1743 $code.=<<___ if ($win64);
   1744 	movaps	-40-6*16(%r14),%xmm6
   1745 	movaps	-40-5*16(%r14),%xmm7
   1746 	movaps	-40-4*16(%r14),%xmm8
   1747 	movaps	-40-3*16(%r14),%xmm9
   1748 	movaps	-40-2*16(%r14),%xmm10
   1749 	movaps	-40-1*16(%r14),%xmm11
   1750 ___
   1751 $code.=<<___;
   1752 	lea	(%r14),%rsi
   1753 	mov	-40(%rsi),%r14
   1754 	mov	-32(%rsi),%r13
   1755 	mov	-24(%rsi),%r12
   1756 	mov	-16(%rsi),%rbp
   1757 	mov	-8(%rsi),%rbx
   1758 	lea	(%rsi),%rsp
   1759 .Lepilogue_avx2:
   1760 	ret
   1761 .size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
   1762 ___
   1763 }
   1764 }
   1765 $code.=<<___;
   1766 .align	64
   1767 K_XX_XX:
   1768 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1769 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1770 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1771 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1772 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1773 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1774 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1775 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1776 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1777 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1778 .byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
   1779 ___
   1780 }}}
   1781 $code.=<<___;
   1782 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1783 .align	64
   1784 ___
   1785 
   1786 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1787 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   1788 if ($win64) {
   1789 $rec="%rcx";
   1790 $frame="%rdx";
   1791 $context="%r8";
   1792 $disp="%r9";
   1793 
   1794 $code.=<<___;
   1795 .extern	__imp_RtlVirtualUnwind
   1796 .type	se_handler,\@abi-omnipotent
   1797 .align	16
   1798 se_handler:
   1799 	push	%rsi
   1800 	push	%rdi
   1801 	push	%rbx
   1802 	push	%rbp
   1803 	push	%r12
   1804 	push	%r13
   1805 	push	%r14
   1806 	push	%r15
   1807 	pushfq
   1808 	sub	\$64,%rsp
   1809 
   1810 	mov	120($context),%rax	# pull context->Rax
   1811 	mov	248($context),%rbx	# pull context->Rip
   1812 
   1813 	lea	.Lprologue(%rip),%r10
   1814 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1815 	jb	.Lcommon_seh_tail
   1816 
   1817 	mov	152($context),%rax	# pull context->Rsp
   1818 
   1819 	lea	.Lepilogue(%rip),%r10
   1820 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1821 	jae	.Lcommon_seh_tail
   1822 
   1823 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
   1824 
   1825 	mov	-8(%rax),%rbx
   1826 	mov	-16(%rax),%rbp
   1827 	mov	-24(%rax),%r12
   1828 	mov	-32(%rax),%r13
   1829 	mov	-40(%rax),%r14
   1830 	mov	%rbx,144($context)	# restore context->Rbx
   1831 	mov	%rbp,160($context)	# restore context->Rbp
   1832 	mov	%r12,216($context)	# restore context->R12
   1833 	mov	%r13,224($context)	# restore context->R13
   1834 	mov	%r14,232($context)	# restore context->R14
   1835 
   1836 	jmp	.Lcommon_seh_tail
   1837 .size	se_handler,.-se_handler
   1838 ___
   1839 
   1840 $code.=<<___ if ($shaext);
   1841 .type	shaext_handler,\@abi-omnipotent
   1842 .align	16
   1843 shaext_handler:
   1844 	push	%rsi
   1845 	push	%rdi
   1846 	push	%rbx
   1847 	push	%rbp
   1848 	push	%r12
   1849 	push	%r13
   1850 	push	%r14
   1851 	push	%r15
   1852 	pushfq
   1853 	sub	\$64,%rsp
   1854 
   1855 	mov	120($context),%rax	# pull context->Rax
   1856 	mov	248($context),%rbx	# pull context->Rip
   1857 
   1858 	lea	.Lprologue_shaext(%rip),%r10
   1859 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1860 	jb	.Lcommon_seh_tail
   1861 
   1862 	lea	.Lepilogue_shaext(%rip),%r10
   1863 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1864 	jae	.Lcommon_seh_tail
   1865 
   1866 	lea	-8-4*16(%rax),%rsi
   1867 	lea	512($context),%rdi	# &context.Xmm6
   1868 	mov	\$8,%ecx
   1869 	.long	0xa548f3fc		# cld; rep movsq
   1870 
   1871 	jmp	.Lcommon_seh_tail
   1872 .size	shaext_handler,.-shaext_handler
   1873 ___
   1874 
   1875 $code.=<<___;
   1876 .type	ssse3_handler,\@abi-omnipotent
   1877 .align	16
   1878 ssse3_handler:
   1879 	push	%rsi
   1880 	push	%rdi
   1881 	push	%rbx
   1882 	push	%rbp
   1883 	push	%r12
   1884 	push	%r13
   1885 	push	%r14
   1886 	push	%r15
   1887 	pushfq
   1888 	sub	\$64,%rsp
   1889 
   1890 	mov	120($context),%rax	# pull context->Rax
   1891 	mov	248($context),%rbx	# pull context->Rip
   1892 
   1893 	mov	8($disp),%rsi		# disp->ImageBase
   1894 	mov	56($disp),%r11		# disp->HandlerData
   1895 
   1896 	mov	0(%r11),%r10d		# HandlerData[0]
   1897 	lea	(%rsi,%r10),%r10	# prologue label
   1898 	cmp	%r10,%rbx		# context->Rip<prologue label
   1899 	jb	.Lcommon_seh_tail
   1900 
   1901 	mov	152($context),%rax	# pull context->Rsp
   1902 
   1903 	mov	4(%r11),%r10d		# HandlerData[1]
   1904 	lea	(%rsi,%r10),%r10	# epilogue label
   1905 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1906 	jae	.Lcommon_seh_tail
   1907 
   1908 	mov	232($context),%rax	# pull context->R14
   1909 
   1910 	lea	-40-6*16(%rax),%rsi
   1911 	lea	512($context),%rdi	# &context.Xmm6
   1912 	mov	\$12,%ecx
   1913 	.long	0xa548f3fc		# cld; rep movsq
   1914 
   1915 	mov	-8(%rax),%rbx
   1916 	mov	-16(%rax),%rbp
   1917 	mov	-24(%rax),%r12
   1918 	mov	-32(%rax),%r13
   1919 	mov	-40(%rax),%r14
   1920 	mov	%rbx,144($context)	# restore context->Rbx
   1921 	mov	%rbp,160($context)	# restore context->Rbp
   1922 	mov	%r12,216($context)	# restore cotnext->R12
   1923 	mov	%r13,224($context)	# restore cotnext->R13
   1924 	mov	%r14,232($context)	# restore cotnext->R14
   1925 
   1926 .Lcommon_seh_tail:
   1927 	mov	8(%rax),%rdi
   1928 	mov	16(%rax),%rsi
   1929 	mov	%rax,152($context)	# restore context->Rsp
   1930 	mov	%rsi,168($context)	# restore context->Rsi
   1931 	mov	%rdi,176($context)	# restore context->Rdi
   1932 
   1933 	mov	40($disp),%rdi		# disp->ContextRecord
   1934 	mov	$context,%rsi		# context
   1935 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1936 	.long	0xa548f3fc		# cld; rep movsq
   1937 
   1938 	mov	$disp,%rsi
   1939 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1940 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1941 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1942 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1943 	mov	40(%rsi),%r10		# disp->ContextRecord
   1944 	lea	56(%rsi),%r11		# &disp->HandlerData
   1945 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1946 	mov	%r10,32(%rsp)		# arg5
   1947 	mov	%r11,40(%rsp)		# arg6
   1948 	mov	%r12,48(%rsp)		# arg7
   1949 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1950 	call	*__imp_RtlVirtualUnwind(%rip)
   1951 
   1952 	mov	\$1,%eax		# ExceptionContinueSearch
   1953 	add	\$64,%rsp
   1954 	popfq
   1955 	pop	%r15
   1956 	pop	%r14
   1957 	pop	%r13
   1958 	pop	%r12
   1959 	pop	%rbp
   1960 	pop	%rbx
   1961 	pop	%rdi
   1962 	pop	%rsi
   1963 	ret
   1964 .size	ssse3_handler,.-ssse3_handler
   1965 
   1966 .section	.pdata
   1967 .align	4
   1968 	.rva	.LSEH_begin_sha1_block_data_order
   1969 	.rva	.LSEH_end_sha1_block_data_order
   1970 	.rva	.LSEH_info_sha1_block_data_order
   1971 ___
   1972 $code.=<<___ if ($shaext);
   1973 	.rva	.LSEH_begin_sha1_block_data_order_shaext
   1974 	.rva	.LSEH_end_sha1_block_data_order_shaext
   1975 	.rva	.LSEH_info_sha1_block_data_order_shaext
   1976 ___
   1977 $code.=<<___;
   1978 	.rva	.LSEH_begin_sha1_block_data_order_ssse3
   1979 	.rva	.LSEH_end_sha1_block_data_order_ssse3
   1980 	.rva	.LSEH_info_sha1_block_data_order_ssse3
   1981 ___
   1982 $code.=<<___ if ($avx);
   1983 	.rva	.LSEH_begin_sha1_block_data_order_avx
   1984 	.rva	.LSEH_end_sha1_block_data_order_avx
   1985 	.rva	.LSEH_info_sha1_block_data_order_avx
   1986 ___
   1987 $code.=<<___ if ($avx>1);
   1988 	.rva	.LSEH_begin_sha1_block_data_order_avx2
   1989 	.rva	.LSEH_end_sha1_block_data_order_avx2
   1990 	.rva	.LSEH_info_sha1_block_data_order_avx2
   1991 ___
   1992 $code.=<<___;
   1993 .section	.xdata
   1994 .align	8
   1995 .LSEH_info_sha1_block_data_order:
   1996 	.byte	9,0,0,0
   1997 	.rva	se_handler
   1998 ___
   1999 $code.=<<___ if ($shaext);
   2000 .LSEH_info_sha1_block_data_order_shaext:
   2001 	.byte	9,0,0,0
   2002 	.rva	shaext_handler
   2003 ___
   2004 $code.=<<___;
   2005 .LSEH_info_sha1_block_data_order_ssse3:
   2006 	.byte	9,0,0,0
   2007 	.rva	ssse3_handler
   2008 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   2009 ___
   2010 $code.=<<___ if ($avx);
   2011 .LSEH_info_sha1_block_data_order_avx:
   2012 	.byte	9,0,0,0
   2013 	.rva	ssse3_handler
   2014 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   2015 ___
   2016 $code.=<<___ if ($avx>1);
   2017 .LSEH_info_sha1_block_data_order_avx2:
   2018 	.byte	9,0,0,0
   2019 	.rva	ssse3_handler
   2020 	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
   2021 ___
   2022 }
   2023 
   2024 ####################################################################
   2025 
   2026 sub sha1rnds4 {
   2027     if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
   2028       my @opcode=(0x0f,0x3a,0xcc);
   2029 	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
   2030 	my $c=$1;
   2031 	push @opcode,$c=~/^0/?oct($c):$c;
   2032 	return ".byte\t".join(',',@opcode);
   2033     } else {
   2034 	return "sha1rnds4\t".@_[0];
   2035     }
   2036 }
   2037 
   2038 sub sha1op38 {
   2039     my $instr = shift;
   2040     my %opcodelet = (
   2041 		"sha1nexte" => 0xc8,
   2042   		"sha1msg1"  => 0xc9,
   2043 		"sha1msg2"  => 0xca	);
   2044 
   2045     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   2046       my @opcode=(0x0f,0x38);
   2047       my $rex=0;
   2048 	$rex|=0x04			if ($2>=8);
   2049 	$rex|=0x01			if ($1>=8);
   2050 	unshift @opcode,0x40|$rex	if ($rex);
   2051 	push @opcode,$opcodelet{$instr};
   2052 	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
   2053 	return ".byte\t".join(',',@opcode);
   2054     } else {
   2055 	return $instr."\t".@_[0];
   2056     }
   2057 }
   2058 
   2059 foreach (split("\n",$code)) {
   2060 	s/\`([^\`]*)\`/eval $1/geo;
   2061 
   2062 	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
   2063 	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
   2064 
   2065 	print $_,"\n";
   2066 }
   2067 close STDOUT;
   2068