#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T the compiler-generated
# code was far behind the 32-bit assembler implementation. This is
# unlike on Opteron, where compiler-generated code was only 15% behind
# the 32-bit assembler, which originally made it hard to motivate the
# effort. There was a suggestion to mechanically translate the 32-bit
# code, but I dismissed it, reasoning that x86_64 offers enough
# register bank capacity to fully utilize SHA-1 parallelism. Therefore
# this fresh implementation:-) However! While the 64-bit code does
# perform better on Opteron, I failed to beat the 32-bit assembler on
# the EM64T core. Well, x86_64 does offer a larger *addressable*
# register bank, but the out-of-order core reaches for even more
# registers through dynamic aliasing, and the EM64T core must have
# managed to run-time optimize even 32-bit code just as well as the
# 64-bit one. The performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for the Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in OpenSSL source, to the SIMD unit. See
# the sha1-586.pl module for background and implementation details.
# The only difference from the 32-bit code is that the 64-bit code
# doesn't have to spill @X[] elements to free temporary registers.
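#
# For reference, the message schedule being offloaded is the standard
# recurrence from FIPS 180-4 (a sketch in pseudocode, not code emitted
# by this module):
#
#	W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]),  16 <= t <= 79
#
# The SIMD paths below evaluate four W[t] values per Xupdate step and
# pre-add the round constant K, so the integer rounds only have to
# fetch ready-made X[]+K words from the stack.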

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. The initial attempt (utilizing BMI
# instructions and loading a pair of consecutive blocks into 256-bit
# %ymm registers) did not provide an impressive performance improvement
# until a crucial hint regarding the number of Xupdate iterations to
# pre-compute in advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}
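
# For reference, rounds 0..19 use the SHA-1 "Ch" function, which
# BODY_00_19 computes with a single temporary via the standard identity
# (a sketch of one scalar round, not literal emitted output):
#
#	Ch(b,c,d) = (b & c) | (~b & d) = ((c ^ d) & b) ^ d
#	e += ROTL5(a) + Ch(b,c,d) + X[i] + 0x5a827999; b = ROTL30(b)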

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}
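
# For reference, rounds 20..39 and 60..79 both use
# Parity(b,c,d) = b ^ c ^ d; only the constant differs
# (0x6ed9eba1 vs 0xca62c1d6), hence the $K selection above.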

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}
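
# For reference, rounds 40..59 use Maj(b,c,d) = (b&c)|(b&d)|(c&d).
# BODY_40_59 relies on (c & d) and (b & (c ^ d)) being bitwise
# disjoint, so that
#
#	Maj(b,c,d) = (c & d) + (b & (c ^ d))
#
# and both terms ($t0 and $t1 above) can simply be added into $e.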

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
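# For reference, the sha1rnds4 immediate selects the 20-round group,
# i.e. its round function and constant (0:Ch/K0, 1:Parity/K1, 2:Maj/K2,
# 3:Parity/K3); int($i/5) above walks through the groups in five
# 4-round steps each.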
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,$fp	# frame pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
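
# AUTOLOAD turns any undefined &opcode(args) call below into a line of
# assembly appended to $code, with operands emitted in AT&T order. For
# example (a sketch, assuming the initial @X/@Tx register mapping):
#
#	&pxor(@X[0],@Tx[0]);	# appends "\tpxor\t%xmm8,%xmm4\n"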

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
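
# Note on the 4-way schedule above: W[t+3] depends on W[t], which is
# produced within the same 4-lane group, so lane 3 is first computed
# with that term taken as zero. The pslldq/psrld/pslld/pxor tail then
# folds the missing contribution back in as rol2 of the pre-rotation
# lane 0, which is what the "X[0]"^=("X[0]">>96)<<<2 comment refers to.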

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
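
# Each body_* routine returns a list of instruction strings rather than
# emitting code directly; the Xupdate helpers above interleave them via
# eval(shift(@insns)), pairing integer rounds with SIMD message-schedule
# work so that the two instruction streams hide each other's latencies.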
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
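# Each 32-byte vmovdqu above stores schedule+K for two blocks at once:
# the low 128-bit lane belongs to the current block and the high lane
# (filled by vinserti128) to the next one. .Loop_avx2 consumes the low
# lanes; .Last_avx2 revisits the high lanes at offset 128+16(%rsp).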
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
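
# With BMI1/BMI2, Ch(b,c,d) = (b&c)^(~b&d) costs just andn+and+xor, and
# rorx provides a non-destructive, flags-free rotate; that is what keeps
# the critical path at the 3 cycles per round noted above.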
   1423 
   1424 sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
   1425 	# on entry $f=b^c^d, $b>>>=2
   1426 	return &bodyx_40_59() if ($rx==39); $rx++;
   1427 	(
   1428 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1429 
   1430 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1431 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1432 
   1433 	'&lea	($e,"($e,$f)")',		# e+=b^c^d
   1434 	'&rorx	($a5,$a,27)',			# a<<<5
   1435 	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
   1436 	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
   1437 
   1438 	'&add	($e,$a5)',			# e+=a<<<5
   1439 	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
   1440 
   1441 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1442 	)
   1443 }
   1444 
   1445 sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
   1446 	# on entry $f=((b^c)&(c^d)), $b>>>=2
   1447 	$rx++;
   1448 	(
   1449 	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
   1450 
   1451 	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
   1452 	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
   1453 	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
   1454 	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
   1455 	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
   1456 
   1457 	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
   1458 	'&rorx	($a5,$a,27)',			# a<<<5
   1459 	'&rorx	($f,$a,2)',			# b>>>2 in next round
   1460 	'&xor	($a,$b)',			# b^c for next round
   1461 
   1462 	'&add	($e,$a5)',			# e+=a<<<5
   1463 	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
   1464 	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
   1465 
   1466 	'unshift(@ROTX,pop(@ROTX)); $j++;'
   1467 	)
   1468 }
   1469 
   1470 sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
   1471 { use integer;
   1472   my $body = shift;
   1473   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
   1474   my ($a,$b,$c,$d,$e);
   1475 
   1476 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
   1477 	 eval(shift(@insns));
   1478 	 eval(shift(@insns));
   1479 	 eval(shift(@insns));
   1480 	 eval(shift(@insns));
   1481 
   1482 	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
   1483 	 eval(shift(@insns));
   1484 	 eval(shift(@insns));
   1485 	 eval(shift(@insns));
   1486 
   1487 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
   1488 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
   1489 	 eval(shift(@insns));
   1490 	 eval(shift(@insns));
   1491 
   1492 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
   1493 	 eval(shift(@insns));
   1494 	 eval(shift(@insns));
   1495 	 eval(shift(@insns));
   1496 	 eval(shift(@insns));
   1497 
   1498 	&vpsrld	(@Tx[0],@X[0],31);
   1499 	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
   1500 	 eval(shift(@insns));
   1501 	 eval(shift(@insns));
   1502 	 eval(shift(@insns));
   1503 
   1504 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
   1505 	&vpaddd	(@X[0],@X[0],@X[0]);
   1506 	 eval(shift(@insns));
   1507 	 eval(shift(@insns));
   1508 
   1509 	&vpsrld	(@Tx[1],@Tx[2],30);
   1510 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
   1511 	 eval(shift(@insns));
   1512 	 eval(shift(@insns));
   1513 
   1514 	&vpslld	(@Tx[2],@Tx[2],2);
   1515 	&vpxor	(@X[0],@X[0],@Tx[1]);
   1516 	 eval(shift(@insns));
   1517 	 eval(shift(@insns));
   1518 
   1519 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
   1520 	 eval(shift(@insns));
   1521 	 eval(shift(@insns));
   1522 	 eval(shift(@insns));
   1523 
   1524 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1525 	 eval(shift(@insns));
   1526 	 eval(shift(@insns));
   1527 	 eval(shift(@insns));
   1528 	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1529 
   1530 	 foreach (@insns) { eval; }	# remaining instructions [if any]
   1531 
   1532 	$Xi++;
   1533 	push(@X,shift(@X));	# "rotate" X[]
   1534 }
   1535 
   1536 sub Xupdate_avx2_32_79()
   1537 { use integer;
   1538   my $body = shift;
   1539   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
   1540   my ($a,$b,$c,$d,$e);
   1541 
   1542 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
   1543 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
   1544 	 eval(shift(@insns));
   1545 	 eval(shift(@insns));
   1546 
   1547 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
   1548 	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
   1549 	 eval(shift(@insns));
   1550 	 eval(shift(@insns));
   1551 	 eval(shift(@insns));
   1552 
   1553 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
   1554 	 eval(shift(@insns));
   1555 	 eval(shift(@insns));
   1556 	 eval(shift(@insns));
   1557 
   1558 	&vpsrld	(@Tx[0],@X[0],30);
   1559 	&vpslld	(@X[0],@X[0],2);
   1560 	 eval(shift(@insns));
   1561 	 eval(shift(@insns));
   1562 	 eval(shift(@insns));
   1563 
   1564 	#&vpslld	(@X[0],@X[0],2);
   1565 	 eval(shift(@insns));
   1566 	 eval(shift(@insns));
   1567 	 eval(shift(@insns));
   1568 
   1569 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
   1570 	 eval(shift(@insns));
   1571 	 eval(shift(@insns));
   1572 	 eval(shift(@insns));
   1573 	 eval(shift(@insns));
   1574 
   1575 	&vpaddd	(@Tx[1],@X[0],$Kx);
   1576 	 eval(shift(@insns));
   1577 	 eval(shift(@insns));
   1578 	 eval(shift(@insns));
   1579 	 eval(shift(@insns));
   1580 
   1581 	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
   1582 
   1583 	 foreach (@insns) { eval; }	# remaining instructions
   1584 
   1585 	$Xi++;
   1586 	push(@X,shift(@X));	# "rotate" X[]
   1587 }
   1588 
   1589 sub Xloop_avx2()
   1590 { use integer;
   1591   my $body = shift;
   1592   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
   1593   my ($a,$b,$c,$d,$e);
   1594 
   1595 	 foreach (@insns) { eval; }
   1596 }
   1597 
   1598 	&align32();
   1599 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1600 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1601 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1602 	&Xupdate_avx2_32_79(\&bodyx_00_19);
   1603 
   1604 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1605 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1606 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1607 	&Xupdate_avx2_32_79(\&bodyx_20_39);
   1608 
   1609 	&align32();
   1610 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1611 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1612 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1613 	&Xupdate_avx2_32_79(\&bodyx_40_59);
   1614 
   1615 	&Xloop_avx2(\&bodyx_20_39);
   1616 	&Xloop_avx2(\&bodyx_20_39);
   1617 	&Xloop_avx2(\&bodyx_20_39);
   1618 	&Xloop_avx2(\&bodyx_20_39);
   1619 
   1620 $code.=<<___;
   1621 	lea	128($inp),$frame
   1622 	lea	128($inp),%rdi			# borrow $t0
   1623 	cmp	$num,$frame
   1624 	cmovae	$inp,$frame			# next or previous block
   1625 
   1626 	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
   1627 	add	0($ctx),@ROTX[0]		# update context
   1628 	add	4($ctx),@ROTX[1]
   1629 	add	8($ctx),@ROTX[3]
   1630 	mov	@ROTX[0],0($ctx)
   1631 	add	12($ctx),@ROTX[4]
   1632 	mov	@ROTX[1],4($ctx)
   1633 	 mov	@ROTX[0],$A			# A=d
   1634 	add	16($ctx),@ROTX[5]
   1635 	 mov	@ROTX[3],$a5
   1636 	mov	@ROTX[3],8($ctx)
   1637 	 mov	@ROTX[4],$D			# D=b
   1638 	 #xchg	@ROTX[5],$F			# F=c, C=f
   1639 	mov	@ROTX[4],12($ctx)
   1640 	 mov	@ROTX[1],$F			# F=e
   1641 	mov	@ROTX[5],16($ctx)
   1642 	#mov	$F,16($ctx)
   1643 	 mov	@ROTX[5],$E			# E=c
   1644 	 mov	$a5,$C				# C=f
   1645 	 #xchg	$F,$E				# E=c, F=e
   1646 
   1647 	cmp	$num,$inp
   1648 	je	.Ldone_avx2
   1649 ___
   1650 
   1651 $Xi=4;				# reset variables
   1652 @X=map("%ymm$_",(4..7,0..3));
   1653 
   1654 $code.=<<___;
   1655 	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
   1656 	cmp	$num,%rdi			# borrowed $t0
   1657 	ja	.Last_avx2
   1658 
   1659 	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
   1660 	vmovdqu		-48(%rdi),%xmm1
   1661 	vmovdqu		-32(%rdi),%xmm2
   1662 	vmovdqu		-16(%rdi),%xmm3
   1663 	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
   1664 	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
   1665 	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
   1666 	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
   1667 	jmp	.Last_avx2
   1668 
   1669 .align	32
   1670 .Last_avx2:
   1671 	lea	128+16(%rsp),$frame
   1672 	rorx	\$2,$F,$B
   1673 	andn	$D,$F,$t0
   1674 	and	$C,$F
   1675 	xor	$t0,$F
   1676 	sub	\$-128,$inp
   1677 ___
   1678 	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
   1679 
   1680 	&Xloop_avx2	(\&bodyx_00_19);
   1681 	&Xloop_avx2	(\&bodyx_00_19);
   1682 	&Xloop_avx2	(\&bodyx_00_19);
   1683 	&Xloop_avx2	(\&bodyx_00_19);
   1684 
   1685 	&Xloop_avx2	(\&bodyx_20_39);
   1686 	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
   1687 	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
   1688 	&Xloop_avx2	(\&bodyx_20_39);
   1689 	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
   1690 	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
   1691 	&Xloop_avx2	(\&bodyx_20_39);
   1692 	  &vmovdqu	("0(%rsp)",@Tx[0]);
   1693 	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
   1694 	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
   1695 	&Xloop_avx2	(\&bodyx_20_39);
   1696 	  &vmovdqu	("32(%rsp)",@Tx[1]);
   1697 	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
   1698 	  &vpaddd	(@X[2],@X[-2&7],$Kx);
   1699 
   1700 	&Xloop_avx2	(\&bodyx_40_59);
   1701 	&align32	();
   1702 	  &vmovdqu	("64(%rsp)",@X[2]);
   1703 	  &vpaddd	(@X[3],@X[-1&7],$Kx);
   1704 	&Xloop_avx2	(\&bodyx_40_59);
   1705 	  &vmovdqu	("96(%rsp)",@X[3]);
   1706 	&Xloop_avx2	(\&bodyx_40_59);
   1707 	&Xupdate_avx2_16_31(\&bodyx_40_59);
   1708 
   1709 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1710 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1711 	&Xupdate_avx2_16_31(\&bodyx_20_39);
   1712 	&Xloop_avx2	(\&bodyx_20_39);
   1713 
   1714 $code.=<<___;
   1715 	lea	128(%rsp),$frame
   1716 
   1717 	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.align	64
K_XX_XX:
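# each constant/mask below appears as two identical 128-bit rows, so a
# single table serves both XMM loads and full-width YMM (AVX2) loads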
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0	# byte-reversal pshufb mask
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
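# se_handler covers the integer-only code path between the fixed
# .Lprologue/.Lepilogue labels; ssse3_handler is shared by the
# SSSE3/AVX/AVX2 paths and finds its prologue/epilogue bounds in
# HandlerData[], additionally restoring the stashed XMM registers.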
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq
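	# i.e. copy the 4 stashed XMM registers (8 qwords) back to
	# context->Xmm6..Xmm9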

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq
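	# i.e. copy the 6 stashed XMM registers (12 qwords) back to
	# context->Xmm6..Xmm11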

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

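# The helpers below hand-assemble SHA-NI instructions as raw .byte
# sequences, so that the module builds even with assemblers that predate
# the SHA extensions; operand forms they don't recognize are passed
# through verbatim.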
sub sha1rnds4 {
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}
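# Illustrative only: "sha1rnds4 \$0,%xmm1,%xmm2" is emitted as
# ".byte 15,58,204,209,0", i.e. opcode 0f 3a cc, ModR/M 0xd1
# (mod=11, reg=dst=%xmm2, r/m=src=%xmm1) and immediate 0.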

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
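# Illustrative only: "sha1nexte %xmm0,%xmm1" is emitted as
# ".byte 15,56,200,200" (0f 38 c8, ModR/M 0xc8); operands in %xmm8..%xmm15
# grow a REX prefix, e.g. "sha1msg1 %xmm8,%xmm1" becomes
# ".byte 65,15,56,201,200".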
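# Final pass: interpolate `...` expressions and route SHA-NI mnemonics
# through the encoders above before printing the assembly.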
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";