Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # sha1_block procedure for x86_64.
     11 #
     12 # It was brought to my attention that on EM64T compiler-generated code
     13 # was far behind 32-bit assembler implementation. This is unlike on
     14 # Opteron where compiler-generated code was only 15% behind 32-bit
     15 # assembler, which originally made it hard to motivate the effort.
     16 # There was suggestion to mechanically translate 32-bit code, but I
     17 # dismissed it, reasoning that x86_64 offers enough register bank
     18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
     19 # implementation:-) However! While 64-bit code does perform better
     20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
     21 # x86_64 does offer larger *addressable* bank, but out-of-order core
     22 # reaches for even more registers through dynamic aliasing, and EM64T
     23 # core must have managed to run-time optimize even 32-bit code just as
     24 # good as 64-bit one. Performance improvement is summarized in the
     25 # following table:
     26 #
     27 #		gcc 3.4		32-bit asm	cycles/byte
     28 # Opteron	+45%		+20%		6.8
     29 # Xeon P4	+65%		+0%		9.9
     30 # Core2		+60%		+10%		7.0
     31 
     32 # August 2009.
     33 #
     34 # The code was revised to minimize code size and to maximize
     35 # "distance" between instructions producing input to 'lea'
     36 # instruction and the 'lea' instruction itself, which is essential
     37 # for Intel Atom core.
     38 
     39 # October 2010.
     40 #
     41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
     42 # is to offload message schedule denoted by Wt in NIST specification,
     43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
     44 # for background and implementation details. The only difference from
     45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
     46 # to free temporary registers.
     47 
     48 # April 2011.
     49 #
     50 # Add AVX code path. See sha1-586.pl for further information.
     51 
     52 ######################################################################
     53 # Current performance is summarized in following table. Numbers are
     54 # CPU clock cycles spent to process single byte (less is better).
     55 #
     56 #		x86_64		SSSE3		AVX
     57 # P4		9.8		-
     58 # Opteron	6.6		-
     59 # Core2		6.7		6.1/+10%	-
     60 # Atom		11.0		9.7/+13%	-
     61 # Westmere	7.1		5.6/+27%	-
     62 # Sandy Bridge	7.9		6.3/+25%	5.2/+51%
     63 
     64 $flavour = shift;
     65 $output  = shift;
     66 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     67 
     68 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     69 
     70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     71 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     72 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     73 die "can't locate x86_64-xlate.pl";
     74 
     75 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     76 		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
     77 	   $1>=2.19);
     78 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
     79 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
     80 	   $1>=2.09);
     81 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
     82 	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
     83 	   $1>=10);
     84 
     85 open OUT,"| \"$^X\" $xlate $flavour $output";
     86 *STDOUT=*OUT;
     87 
     88 $ctx="%rdi";	# 1st arg
     89 $inp="%rsi";	# 2nd arg
     90 $num="%rdx";	# 3rd arg
     91 
     92 # reassign arguments in order to produce more compact code
     93 $ctx="%r8";
     94 $inp="%r9";
     95 $num="%r10";
     96 
     97 $t0="%eax";
     98 $t1="%ebx";
     99 $t2="%ecx";
    100 @xi=("%edx","%ebp");
    101 $A="%esi";
    102 $B="%edi";
    103 $C="%r11d";
    104 $D="%r12d";
    105 $E="%r13d";
    106 
    107 @V=($A,$B,$C,$D,$E);
    108 
    109 sub BODY_00_19 {
    110 my ($i,$a,$b,$c,$d,$e)=@_;
    111 my $j=$i+1;
    112 $code.=<<___ if ($i==0);
    113 	mov	`4*$i`($inp),$xi[0]
    114 	bswap	$xi[0]
    115 	mov	$xi[0],`4*$i`(%rsp)
    116 ___
    117 $code.=<<___ if ($i<15);
    118 	mov	$c,$t0
    119 	mov	`4*$j`($inp),$xi[1]
    120 	mov	$a,$t2
    121 	xor	$d,$t0
    122 	bswap	$xi[1]
    123 	rol	\$5,$t2
    124 	lea	0x5a827999($xi[0],$e),$e
    125 	and	$b,$t0
    126 	mov	$xi[1],`4*$j`(%rsp)
    127 	add	$t2,$e
    128 	xor	$d,$t0
    129 	rol	\$30,$b
    130 	add	$t0,$e
    131 ___
    132 $code.=<<___ if ($i>=15);
    133 	mov	`4*($j%16)`(%rsp),$xi[1]
    134 	mov	$c,$t0
    135 	mov	$a,$t2
    136 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    137 	xor	$d,$t0
    138 	rol	\$5,$t2
    139 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    140 	and	$b,$t0
    141 	lea	0x5a827999($xi[0],$e),$e
    142 	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
    143 	xor	$d,$t0
    144 	rol	\$1,$xi[1]
    145 	add	$t2,$e
    146 	rol	\$30,$b
    147 	mov	$xi[1],`4*($j%16)`(%rsp)
    148 	add	$t0,$e
    149 ___
    150 unshift(@xi,pop(@xi));
    151 }
    152 
    153 sub BODY_20_39 {
    154 my ($i,$a,$b,$c,$d,$e)=@_;
    155 my $j=$i+1;
    156 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
    157 $code.=<<___ if ($i<79);
    158 	mov	`4*($j%16)`(%rsp),$xi[1]
    159 	mov	$c,$t0
    160 	mov	$a,$t2
    161 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    162 	xor	$b,$t0
    163 	rol	\$5,$t2
    164 	lea	$K($xi[0],$e),$e
    165 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    166 	xor	$d,$t0
    167 	add	$t2,$e
    168 	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
    169 	rol	\$30,$b
    170 	add	$t0,$e
    171 	rol	\$1,$xi[1]
    172 ___
    173 $code.=<<___ if ($i<76);
    174 	mov	$xi[1],`4*($j%16)`(%rsp)
    175 ___
    176 $code.=<<___ if ($i==79);
    177 	mov	$c,$t0
    178 	mov	$a,$t2
    179 	xor	$b,$t0
    180 	lea	$K($xi[0],$e),$e
    181 	rol	\$5,$t2
    182 	xor	$d,$t0
    183 	add	$t2,$e
    184 	rol	\$30,$b
    185 	add	$t0,$e
    186 ___
    187 unshift(@xi,pop(@xi));
    188 }
    189 
    190 sub BODY_40_59 {
    191 my ($i,$a,$b,$c,$d,$e)=@_;
    192 my $j=$i+1;
    193 $code.=<<___;
    194 	mov	`4*($j%16)`(%rsp),$xi[1]
    195 	mov	$c,$t0
    196 	mov	$c,$t1
    197 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    198 	and	$d,$t0
    199 	mov	$a,$t2
    200 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    201 	xor	$d,$t1
    202 	lea	0x8f1bbcdc($xi[0],$e),$e
    203 	rol	\$5,$t2
    204 	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
    205 	add	$t0,$e
    206 	and	$b,$t1
    207 	rol	\$1,$xi[1]
    208 	add	$t1,$e
    209 	rol	\$30,$b
    210 	mov	$xi[1],`4*($j%16)`(%rsp)
    211 	add	$t2,$e
    212 ___
    213 unshift(@xi,pop(@xi));
    214 }
    215 
    216 $code.=<<___;
    217 .text
    218 .extern	OPENSSL_ia32cap_P
    219 
    220 .globl	sha1_block_data_order
    221 .type	sha1_block_data_order,\@function,3
    222 .align	16
    223 sha1_block_data_order:
    224 	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
    225 	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
    226 	test	\$`1<<9`,%r8d		# check SSSE3 bit
    227 	jz	.Lialu
    228 ___
    229 $code.=<<___ if ($avx);
    230 	and	\$`1<<28`,%r8d		# mask AVX bit
    231 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    232 	or	%r9d,%r8d
    233 	cmp	\$`1<<28|1<<30`,%r8d
    234 	je	_avx_shortcut
    235 ___
    236 $code.=<<___;
    237 	jmp	_ssse3_shortcut
    238 
    239 .align	16
    240 .Lialu:
    241 	push	%rbx
    242 	push	%rbp
    243 	push	%r12
    244 	push	%r13
    245 	mov	%rsp,%r11
    246 	mov	%rdi,$ctx	# reassigned argument
    247 	sub	\$`8+16*4`,%rsp
    248 	mov	%rsi,$inp	# reassigned argument
    249 	and	\$-64,%rsp
    250 	mov	%rdx,$num	# reassigned argument
    251 	mov	%r11,`16*4`(%rsp)
    252 .Lprologue:
    253 
    254 	mov	0($ctx),$A
    255 	mov	4($ctx),$B
    256 	mov	8($ctx),$C
    257 	mov	12($ctx),$D
    258 	mov	16($ctx),$E
    259 	jmp	.Lloop
    260 
    261 .align	16
    262 .Lloop:
    263 ___
    264 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    265 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    266 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    267 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    268 $code.=<<___;
    269 	add	0($ctx),$A
    270 	add	4($ctx),$B
    271 	add	8($ctx),$C
    272 	add	12($ctx),$D
    273 	add	16($ctx),$E
    274 	mov	$A,0($ctx)
    275 	mov	$B,4($ctx)
    276 	mov	$C,8($ctx)
    277 	mov	$D,12($ctx)
    278 	mov	$E,16($ctx)
    279 
    280 	sub	\$1,$num
    281 	lea	`16*4`($inp),$inp
    282 	jnz	.Lloop
    283 
    284 	mov	`16*4`(%rsp),%rsi
    285 	mov	(%rsi),%r13
    286 	mov	8(%rsi),%r12
    287 	mov	16(%rsi),%rbp
    288 	mov	24(%rsi),%rbx
    289 	lea	32(%rsi),%rsp
    290 .Lepilogue:
    291 	ret
    292 .size	sha1_block_data_order,.-sha1_block_data_order
    293 ___
    294 {{{
    295 my $Xi=4;
    296 my @X=map("%xmm$_",(4..7,0..3));
    297 my @Tx=map("%xmm$_",(8..10));
    298 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    299 my @T=("%esi","%edi");
    300 my $j=0;
    301 my $K_XX_XX="%r11";
    302 
    303 my $_rol=sub { &rol(@_) };
    304 my $_ror=sub { &ror(@_) };
    305 
    306 $code.=<<___;
    307 .type	sha1_block_data_order_ssse3,\@function,3
    308 .align	16
    309 sha1_block_data_order_ssse3:
    310 _ssse3_shortcut:
    311 	push	%rbx
    312 	push	%rbp
    313 	push	%r12
    314 	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
    315 ___
    316 $code.=<<___ if ($win64);
    317 	movaps	%xmm6,64+0(%rsp)
    318 	movaps	%xmm7,64+16(%rsp)
    319 	movaps	%xmm8,64+32(%rsp)
    320 	movaps	%xmm9,64+48(%rsp)
    321 	movaps	%xmm10,64+64(%rsp)
    322 .Lprologue_ssse3:
    323 ___
    324 $code.=<<___;
    325 	mov	%rdi,$ctx	# reassigned argument
    326 	mov	%rsi,$inp	# reassigned argument
    327 	mov	%rdx,$num	# reassigned argument
    328 
    329 	shl	\$6,$num
    330 	add	$inp,$num
    331 	lea	K_XX_XX(%rip),$K_XX_XX
    332 
    333 	mov	0($ctx),$A		# load context
    334 	mov	4($ctx),$B
    335 	mov	8($ctx),$C
    336 	mov	12($ctx),$D
    337 	mov	$B,@T[0]		# magic seed
    338 	mov	16($ctx),$E
    339 
    340 	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
    341 	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
    342 	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    343 	movdqu	16($inp),@X[-3&7]
    344 	movdqu	32($inp),@X[-2&7]
    345 	movdqu	48($inp),@X[-1&7]
    346 	pshufb	@X[2],@X[-4&7]		# byte swap
    347 	add	\$64,$inp
    348 	pshufb	@X[2],@X[-3&7]
    349 	pshufb	@X[2],@X[-2&7]
    350 	pshufb	@X[2],@X[-1&7]
    351 	paddd	@Tx[1],@X[-4&7]		# add K_00_19
    352 	paddd	@Tx[1],@X[-3&7]
    353 	paddd	@Tx[1],@X[-2&7]
    354 	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
    355 	psubd	@Tx[1],@X[-4&7]		# restore X[]
    356 	movdqa	@X[-3&7],16(%rsp)
    357 	psubd	@Tx[1],@X[-3&7]
    358 	movdqa	@X[-2&7],32(%rsp)
    359 	psubd	@Tx[1],@X[-2&7]
    360 	jmp	.Loop_ssse3
    361 ___
    362 
    363 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    364 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    365   my $arg = pop;
    366     $arg = "\$$arg" if ($arg*1 eq $arg);
    367     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    368 }
    369 
    370 sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
    371 { use integer;
    372   my $body = shift;
    373   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    374   my ($a,$b,$c,$d,$e);
    375 
    376 	&movdqa	(@X[0],@X[-3&7]);
    377 	 eval(shift(@insns));
    378 	 eval(shift(@insns));
    379 	&movdqa	(@Tx[0],@X[-1&7]);
    380 	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    381 	 eval(shift(@insns));
    382 	 eval(shift(@insns));
    383 
    384 	  &paddd	(@Tx[1],@X[-1&7]);
    385 	 eval(shift(@insns));
    386 	 eval(shift(@insns));
    387 	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
    388 	 eval(shift(@insns));
    389 	 eval(shift(@insns));
    390 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
    391 	 eval(shift(@insns));
    392 	 eval(shift(@insns));
    393 
    394 	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    395 	 eval(shift(@insns));
    396 	 eval(shift(@insns));
    397 	 eval(shift(@insns));
    398 	 eval(shift(@insns));
    399 
    400 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    401 	 eval(shift(@insns));
    402 	 eval(shift(@insns));
    403 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    404 	 eval(shift(@insns));
    405 	 eval(shift(@insns));
    406 
    407 	&movdqa	(@Tx[2],@X[0]);
    408 	&movdqa	(@Tx[0],@X[0]);
    409 	 eval(shift(@insns));
    410 	 eval(shift(@insns));
    411 	 eval(shift(@insns));
    412 	 eval(shift(@insns));
    413 
    414 	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
    415 	&paddd	(@X[0],@X[0]);
    416 	 eval(shift(@insns));
    417 	 eval(shift(@insns));
    418 	 eval(shift(@insns));
    419 	 eval(shift(@insns));
    420 
    421 	&psrld	(@Tx[0],31);
    422 	 eval(shift(@insns));
    423 	 eval(shift(@insns));
    424 	&movdqa	(@Tx[1],@Tx[2]);
    425 	 eval(shift(@insns));
    426 	 eval(shift(@insns));
    427 
    428 	&psrld	(@Tx[2],30);
    429 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
    430 	 eval(shift(@insns));
    431 	 eval(shift(@insns));
    432 	 eval(shift(@insns));
    433 	 eval(shift(@insns));
    434 
    435 	&pslld	(@Tx[1],2);
    436 	&pxor	(@X[0],@Tx[2]);
    437 	 eval(shift(@insns));
    438 	 eval(shift(@insns));
    439 	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    440 	 eval(shift(@insns));
    441 	 eval(shift(@insns));
    442 
    443 	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
    444 
    445 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    446 
    447   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    448 		push(@Tx,shift(@Tx));
    449 }
    450 
    451 sub Xupdate_ssse3_32_79()
    452 { use integer;
    453   my $body = shift;
    454   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    455   my ($a,$b,$c,$d,$e);
    456 
    457 	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
    458 	 eval(shift(@insns));		# body_20_39
    459 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
    460 	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
    461 	 eval(shift(@insns));
    462 	 eval(shift(@insns));
    463 	 eval(shift(@insns));		# rol
    464 
    465 	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
    466 	 eval(shift(@insns));
    467 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
    468 	if ($Xi%5) {
    469 	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    470 	} else {			# ... or load next one
    471 	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    472 	}
    473 	  &paddd	(@Tx[1],@X[-1&7]);
    474 	 eval(shift(@insns));		# ror
    475 	 eval(shift(@insns));
    476 
    477 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    478 	 eval(shift(@insns));		# body_20_39
    479 	 eval(shift(@insns));
    480 	 eval(shift(@insns));
    481 	 eval(shift(@insns));		# rol
    482 
    483 	&movdqa	(@Tx[0],@X[0]);
    484 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    485 	 eval(shift(@insns));
    486 	 eval(shift(@insns));
    487 	 eval(shift(@insns));		# ror
    488 	 eval(shift(@insns));
    489 
    490 	&pslld	(@X[0],2);
    491 	 eval(shift(@insns));		# body_20_39
    492 	 eval(shift(@insns));
    493 	&psrld	(@Tx[0],30);
    494 	 eval(shift(@insns));
    495 	 eval(shift(@insns));		# rol
    496 	 eval(shift(@insns));
    497 	 eval(shift(@insns));
    498 	 eval(shift(@insns));		# ror
    499 	 eval(shift(@insns));
    500 
    501 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
    502 	 eval(shift(@insns));		# body_20_39
    503 	 eval(shift(@insns));
    504 	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
    505 	 eval(shift(@insns));
    506 	 eval(shift(@insns));		# rol
    507 	 eval(shift(@insns));
    508 	 eval(shift(@insns));
    509 	 eval(shift(@insns));		# rol
    510 	 eval(shift(@insns));
    511 
    512 	 foreach (@insns) { eval; }	# remaining instructions
    513 
    514   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    515 		push(@Tx,shift(@Tx));
    516 }
    517 
    518 sub Xuplast_ssse3_80()
    519 { use integer;
    520   my $body = shift;
    521   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    522   my ($a,$b,$c,$d,$e);
    523 
    524 	 eval(shift(@insns));
    525 	  &paddd	(@Tx[1],@X[-1&7]);
    526 	 eval(shift(@insns));
    527 	 eval(shift(@insns));
    528 	 eval(shift(@insns));
    529 	 eval(shift(@insns));
    530 
    531 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    532 
    533 	 foreach (@insns) { eval; }		# remaining instructions
    534 
    535 	&cmp	($inp,$num);
    536 	&je	(".Ldone_ssse3");
    537 
    538 	unshift(@Tx,pop(@Tx));
    539 
    540 	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
    541 	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
    542 	&movdqu	(@X[-4&7],"0($inp)");		# load input
    543 	&movdqu	(@X[-3&7],"16($inp)");
    544 	&movdqu	(@X[-2&7],"32($inp)");
    545 	&movdqu	(@X[-1&7],"48($inp)");
    546 	&pshufb	(@X[-4&7],@X[2]);		# byte swap
    547 	&add	($inp,64);
    548 
    549   $Xi=0;
    550 }
    551 
    552 sub Xloop_ssse3()
    553 { use integer;
    554   my $body = shift;
    555   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    556   my ($a,$b,$c,$d,$e);
    557 
    558 	 eval(shift(@insns));
    559 	 eval(shift(@insns));
    560 	&pshufb	(@X[($Xi-3)&7],@X[2]);
    561 	 eval(shift(@insns));
    562 	 eval(shift(@insns));
    563 	&paddd	(@X[($Xi-4)&7],@Tx[1]);
    564 	 eval(shift(@insns));
    565 	 eval(shift(@insns));
    566 	 eval(shift(@insns));
    567 	 eval(shift(@insns));
    568 	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
    569 	 eval(shift(@insns));
    570 	 eval(shift(@insns));
    571 	&psubd	(@X[($Xi-4)&7],@Tx[1]);
    572 
    573 	foreach (@insns) { eval; }
    574   $Xi++;
    575 }
    576 
    577 sub Xtail_ssse3()
    578 { use integer;
    579   my $body = shift;
    580   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    581   my ($a,$b,$c,$d,$e);
    582 
    583 	foreach (@insns) { eval; }
    584 }
    585 
    586 sub body_00_19 () {
    587 	(
    588 	'($a,$b,$c,$d,$e)=@V;'.
    589 	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
    590 	'&xor	($c,$d);',
    591 	'&mov	(@T[1],$a);',	# $b in next round
    592 	'&$_rol	($a,5);',
    593 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    594 	'&xor	($c,$d);',	# restore $c
    595 	'&xor	(@T[0],$d);',
    596 	'&add	($e,$a);',
    597 	'&$_ror	($b,$j?7:2);',	# $b>>>2
    598 	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    599 	);
    600 }
    601 
    602 sub body_20_39 () {
    603 	(
    604 	'($a,$b,$c,$d,$e)=@V;'.
    605 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    606 	'&xor	(@T[0],$d);',	# ($b^$d)
    607 	'&mov	(@T[1],$a);',	# $b in next round
    608 	'&$_rol	($a,5);',
    609 	'&xor	(@T[0],$c);',	# ($b^$d^$c)
    610 	'&add	($e,$a);',
    611 	'&$_ror	($b,7);',	# $b>>>2
    612 	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    613 	);
    614 }
    615 
    616 sub body_40_59 () {
    617 	(
    618 	'($a,$b,$c,$d,$e)=@V;'.
    619 	'&mov	(@T[1],$c);',
    620 	'&xor	($c,$d);',
    621 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    622 	'&and	(@T[1],$d);',
    623 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    624 	'&$_ror	($b,7);',	# $b>>>2
    625 	'&add	($e,@T[1]);',
    626 	'&mov	(@T[1],$a);',	# $b in next round
    627 	'&$_rol	($a,5);',
    628 	'&add	($e,@T[0]);',
    629 	'&xor	($c,$d);',	# restore $c
    630 	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    631 	);
    632 }
    633 $code.=<<___;
    634 .align	16
    635 .Loop_ssse3:
    636 ___
    637 	&Xupdate_ssse3_16_31(\&body_00_19);
    638 	&Xupdate_ssse3_16_31(\&body_00_19);
    639 	&Xupdate_ssse3_16_31(\&body_00_19);
    640 	&Xupdate_ssse3_16_31(\&body_00_19);
    641 	&Xupdate_ssse3_32_79(\&body_00_19);
    642 	&Xupdate_ssse3_32_79(\&body_20_39);
    643 	&Xupdate_ssse3_32_79(\&body_20_39);
    644 	&Xupdate_ssse3_32_79(\&body_20_39);
    645 	&Xupdate_ssse3_32_79(\&body_20_39);
    646 	&Xupdate_ssse3_32_79(\&body_20_39);
    647 	&Xupdate_ssse3_32_79(\&body_40_59);
    648 	&Xupdate_ssse3_32_79(\&body_40_59);
    649 	&Xupdate_ssse3_32_79(\&body_40_59);
    650 	&Xupdate_ssse3_32_79(\&body_40_59);
    651 	&Xupdate_ssse3_32_79(\&body_40_59);
    652 	&Xupdate_ssse3_32_79(\&body_20_39);
    653 	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
    654 
    655 				$saved_j=$j; @saved_V=@V;
    656 
    657 	&Xloop_ssse3(\&body_20_39);
    658 	&Xloop_ssse3(\&body_20_39);
    659 	&Xloop_ssse3(\&body_20_39);
    660 
    661 $code.=<<___;
    662 	add	0($ctx),$A			# update context
    663 	add	4($ctx),@T[0]
    664 	add	8($ctx),$C
    665 	add	12($ctx),$D
    666 	mov	$A,0($ctx)
    667 	add	16($ctx),$E
    668 	mov	@T[0],4($ctx)
    669 	mov	@T[0],$B			# magic seed
    670 	mov	$C,8($ctx)
    671 	mov	$D,12($ctx)
    672 	mov	$E,16($ctx)
    673 	jmp	.Loop_ssse3
    674 
    675 .align	16
    676 .Ldone_ssse3:
    677 ___
    678 				$j=$saved_j; @V=@saved_V;
    679 
    680 	&Xtail_ssse3(\&body_20_39);
    681 	&Xtail_ssse3(\&body_20_39);
    682 	&Xtail_ssse3(\&body_20_39);
    683 
    684 $code.=<<___;
    685 	add	0($ctx),$A			# update context
    686 	add	4($ctx),@T[0]
    687 	add	8($ctx),$C
    688 	mov	$A,0($ctx)
    689 	add	12($ctx),$D
    690 	mov	@T[0],4($ctx)
    691 	add	16($ctx),$E
    692 	mov	$C,8($ctx)
    693 	mov	$D,12($ctx)
    694 	mov	$E,16($ctx)
    695 ___
    696 $code.=<<___ if ($win64);
    697 	movaps	64+0(%rsp),%xmm6
    698 	movaps	64+16(%rsp),%xmm7
    699 	movaps	64+32(%rsp),%xmm8
    700 	movaps	64+48(%rsp),%xmm9
    701 	movaps	64+64(%rsp),%xmm10
    702 ___
    703 $code.=<<___;
    704 	lea	`64+($win64?5*16:0)`(%rsp),%rsi
    705 	mov	0(%rsi),%r12
    706 	mov	8(%rsi),%rbp
    707 	mov	16(%rsi),%rbx
    708 	lea	24(%rsi),%rsp
    709 .Lepilogue_ssse3:
    710 	ret
    711 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
    712 ___
    713 
    714 if ($avx) {
    715 my $Xi=4;
    716 my @X=map("%xmm$_",(4..7,0..3));
    717 my @Tx=map("%xmm$_",(8..10));
    718 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    719 my @T=("%esi","%edi");
    720 my $j=0;
    721 my $K_XX_XX="%r11";
    722 
    723 my $_rol=sub { &shld(@_[0],@_) };
    724 my $_ror=sub { &shrd(@_[0],@_) };
    725 
    726 $code.=<<___;
    727 .type	sha1_block_data_order_avx,\@function,3
    728 .align	16
    729 sha1_block_data_order_avx:
    730 _avx_shortcut:
    731 	push	%rbx
    732 	push	%rbp
    733 	push	%r12
    734 	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
    735 ___
    736 $code.=<<___ if ($win64);
    737 	movaps	%xmm6,64+0(%rsp)
    738 	movaps	%xmm7,64+16(%rsp)
    739 	movaps	%xmm8,64+32(%rsp)
    740 	movaps	%xmm9,64+48(%rsp)
    741 	movaps	%xmm10,64+64(%rsp)
    742 .Lprologue_avx:
    743 ___
    744 $code.=<<___;
    745 	mov	%rdi,$ctx	# reassigned argument
    746 	mov	%rsi,$inp	# reassigned argument
    747 	mov	%rdx,$num	# reassigned argument
    748 	vzeroall
    749 
    750 	shl	\$6,$num
    751 	add	$inp,$num
    752 	lea	K_XX_XX(%rip),$K_XX_XX
    753 
    754 	mov	0($ctx),$A		# load context
    755 	mov	4($ctx),$B
    756 	mov	8($ctx),$C
    757 	mov	12($ctx),$D
    758 	mov	$B,@T[0]		# magic seed
    759 	mov	16($ctx),$E
    760 
    761 	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
    762 	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
    763 	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    764 	vmovdqu	16($inp),@X[-3&7]
    765 	vmovdqu	32($inp),@X[-2&7]
    766 	vmovdqu	48($inp),@X[-1&7]
    767 	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
    768 	add	\$64,$inp
    769 	vpshufb	@X[2],@X[-3&7],@X[-3&7]
    770 	vpshufb	@X[2],@X[-2&7],@X[-2&7]
    771 	vpshufb	@X[2],@X[-1&7],@X[-1&7]
    772 	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
    773 	vpaddd	@Tx[1],@X[-3&7],@X[1]
    774 	vpaddd	@Tx[1],@X[-2&7],@X[2]
    775 	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
    776 	vmovdqa	@X[1],16(%rsp)
    777 	vmovdqa	@X[2],32(%rsp)
    778 	jmp	.Loop_avx
    779 ___
    780 
    781 sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
    782 { use integer;
    783   my $body = shift;
    784   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    785   my ($a,$b,$c,$d,$e);
    786 
    787 	 eval(shift(@insns));
    788 	 eval(shift(@insns));
    789 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    790 	 eval(shift(@insns));
    791 	 eval(shift(@insns));
    792 
    793 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    794 	 eval(shift(@insns));
    795 	 eval(shift(@insns));
    796 	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
    797 	 eval(shift(@insns));
    798 	 eval(shift(@insns));
    799 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
    800 	 eval(shift(@insns));
    801 	 eval(shift(@insns));
    802 
    803 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    804 	 eval(shift(@insns));
    805 	 eval(shift(@insns));
    806 	 eval(shift(@insns));
    807 	 eval(shift(@insns));
    808 
    809 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    810 	 eval(shift(@insns));
    811 	 eval(shift(@insns));
    812 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    813 	 eval(shift(@insns));
    814 	 eval(shift(@insns));
    815 
    816 	&vpsrld	(@Tx[0],@X[0],31);
    817 	 eval(shift(@insns));
    818 	 eval(shift(@insns));
    819 	 eval(shift(@insns));
    820 	 eval(shift(@insns));
    821 
    822 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
    823 	&vpaddd	(@X[0],@X[0],@X[0]);
    824 	 eval(shift(@insns));
    825 	 eval(shift(@insns));
    826 	 eval(shift(@insns));
    827 	 eval(shift(@insns));
    828 
    829 	&vpsrld	(@Tx[1],@Tx[2],30);
    830 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
    831 	 eval(shift(@insns));
    832 	 eval(shift(@insns));
    833 	 eval(shift(@insns));
    834 	 eval(shift(@insns));
    835 
    836 	&vpslld	(@Tx[2],@Tx[2],2);
    837 	&vpxor	(@X[0],@X[0],@Tx[1]);
    838 	 eval(shift(@insns));
    839 	 eval(shift(@insns));
    840 	 eval(shift(@insns));
    841 	 eval(shift(@insns));
    842 
    843 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
    844 	 eval(shift(@insns));
    845 	 eval(shift(@insns));
    846 	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    847 	 eval(shift(@insns));
    848 	 eval(shift(@insns));
    849 
    850 
    851 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    852 
    853   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    854 		push(@Tx,shift(@Tx));
    855 }
    856 
    857 sub Xupdate_avx_32_79()
    858 { use integer;
    859   my $body = shift;
    860   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    861   my ($a,$b,$c,$d,$e);
    862 
    863 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
    864 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
    865 	 eval(shift(@insns));		# body_20_39
    866 	 eval(shift(@insns));
    867 	 eval(shift(@insns));
    868 	 eval(shift(@insns));		# rol
    869 
    870 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
    871 	 eval(shift(@insns));
    872 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
    873 	if ($Xi%5) {
    874 	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    875 	} else {			# ... or load next one
    876 	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    877 	}
    878 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    879 	 eval(shift(@insns));		# ror
    880 	 eval(shift(@insns));
    881 
    882 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    883 	 eval(shift(@insns));		# body_20_39
    884 	 eval(shift(@insns));
    885 	 eval(shift(@insns));
    886 	 eval(shift(@insns));		# rol
    887 
    888 	&vpsrld	(@Tx[0],@X[0],30);
    889 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    890 	 eval(shift(@insns));
    891 	 eval(shift(@insns));
    892 	 eval(shift(@insns));		# ror
    893 	 eval(shift(@insns));
    894 
    895 	&vpslld	(@X[0],@X[0],2);
    896 	 eval(shift(@insns));		# body_20_39
    897 	 eval(shift(@insns));
    898 	 eval(shift(@insns));
    899 	 eval(shift(@insns));		# rol
    900 	 eval(shift(@insns));
    901 	 eval(shift(@insns));
    902 	 eval(shift(@insns));		# ror
    903 	 eval(shift(@insns));
    904 
    905 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
    906 	 eval(shift(@insns));		# body_20_39
    907 	 eval(shift(@insns));
    908 	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
    909 	 eval(shift(@insns));
    910 	 eval(shift(@insns));		# rol
    911 	 eval(shift(@insns));
    912 	 eval(shift(@insns));
    913 	 eval(shift(@insns));		# rol
    914 	 eval(shift(@insns));
    915 
    916 	 foreach (@insns) { eval; }	# remaining instructions
    917 
    918   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    919 		push(@Tx,shift(@Tx));
    920 }
    921 
    922 sub Xuplast_avx_80()
    923 { use integer;
    924   my $body = shift;
    925   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    926   my ($a,$b,$c,$d,$e);
    927 
    928 	 eval(shift(@insns));
    929 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    930 	 eval(shift(@insns));
    931 	 eval(shift(@insns));
    932 	 eval(shift(@insns));
    933 	 eval(shift(@insns));
    934 
    935 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    936 
    937 	 foreach (@insns) { eval; }		# remaining instructions
    938 
    939 	&cmp	($inp,$num);
    940 	&je	(".Ldone_avx");
    941 
    942 	unshift(@Tx,pop(@Tx));
    943 
    944 	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
    945 	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
    946 	&vmovdqu(@X[-4&7],"0($inp)");		# load input
    947 	&vmovdqu(@X[-3&7],"16($inp)");
    948 	&vmovdqu(@X[-2&7],"32($inp)");
    949 	&vmovdqu(@X[-1&7],"48($inp)");
    950 	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
    951 	&add	($inp,64);
    952 
    953   $Xi=0;
    954 }
    955 
    956 sub Xloop_avx()
    957 { use integer;
    958   my $body = shift;
    959   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    960   my ($a,$b,$c,$d,$e);
    961 
    962 	 eval(shift(@insns));
    963 	 eval(shift(@insns));
    964 	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
    965 	 eval(shift(@insns));
    966 	 eval(shift(@insns));
    967 	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
    968 	 eval(shift(@insns));
    969 	 eval(shift(@insns));
    970 	 eval(shift(@insns));
    971 	 eval(shift(@insns));
    972 	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
    973 	 eval(shift(@insns));
    974 	 eval(shift(@insns));
    975 
    976 	foreach (@insns) { eval; }
    977   $Xi++;
    978 }
    979 
    980 sub Xtail_avx()
    981 { use integer;
    982   my $body = shift;
    983   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    984   my ($a,$b,$c,$d,$e);
    985 
    986 	foreach (@insns) { eval; }
    987 }
    988 
    # Emit the body of the AVX code path: the .Loop_avx label, the round
    # groups, the per-iteration context update, the .Ldone_avx exit path and
    # the function epilogue.  Everything inside the heredocs is appended to
    # $code verbatim (after variable interpolation), so only the Perl between
    # them is annotated.
    989 $code.=<<___;
    990 .align	16
    991 .Loop_avx:
    992 ___
    # Twenty helper calls in total: 4 x Xupdate_avx_16_31, 12 x
    # Xupdate_avx_32_79, 1 x Xuplast_avx_80 and 3 x Xloop_avx.  The body_*
    # argument selects the round function passed to each helper.
    # NOTE(review): presumably every helper consumes four bodies, giving 80
    # rounds -- the Xupdate_* definitions are not visible here; confirm.
    993 	&Xupdate_avx_16_31(\&body_00_19);
    994 	&Xupdate_avx_16_31(\&body_00_19);
    995 	&Xupdate_avx_16_31(\&body_00_19);
    996 	&Xupdate_avx_16_31(\&body_00_19);
    997 	&Xupdate_avx_32_79(\&body_00_19);
    998 	&Xupdate_avx_32_79(\&body_20_39);
    999 	&Xupdate_avx_32_79(\&body_20_39);
   1000 	&Xupdate_avx_32_79(\&body_20_39);
   1001 	&Xupdate_avx_32_79(\&body_20_39);
   1002 	&Xupdate_avx_32_79(\&body_20_39);
   1003 	&Xupdate_avx_32_79(\&body_40_59);
   1004 	&Xupdate_avx_32_79(\&body_40_59);
   1005 	&Xupdate_avx_32_79(\&body_40_59);
   1006 	&Xupdate_avx_32_79(\&body_40_59);
   1007 	&Xupdate_avx_32_79(\&body_40_59);
   1008 	&Xupdate_avx_32_79(\&body_20_39);
   1009 	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
   1010 
   # Snapshot the generator state ($j and @V) at the point where the emitted
   # code may branch to .Ldone_avx; it is restored below so that the tail
   # path is generated from exactly the same state as the loop path.
   1011 				$saved_j=$j; @saved_V=@V;
   1012 
   # Loop path: the last three groups, interleaved with preparation of the
   # input that Xuplast_avx_80 loaded.
   1013 	&Xloop_avx(\&body_20_39);
   1014 	&Xloop_avx(\&body_20_39);
   1015 	&Xloop_avx(\&body_20_39);
   1016 
   # Loop path epilogue: add the working variables into the context at $ctx,
   # store them back and jump to .Loop_avx; then open the .Ldone_avx label.
   1017 $code.=<<___;
   1018 	add	0($ctx),$A			# update context
   1019 	add	4($ctx),@T[0]
   1020 	add	8($ctx),$C
   1021 	add	12($ctx),$D
   1022 	mov	$A,0($ctx)
   1023 	add	16($ctx),$E
   1024 	mov	@T[0],4($ctx)
   1025 	mov	@T[0],$B			# magic seed
   1026 	mov	$C,8($ctx)
   1027 	mov	$D,12($ctx)
   1028 	mov	$E,16($ctx)
   1029 	jmp	.Loop_avx
   1030 
   1031 .align	16
   1032 .Ldone_avx:
   1033 ___
   # Rewind to the snapshot taken above before generating the tail path.
   1034 				$j=$saved_j; @V=@saved_V;
   1035 
   # Tail path: the same three groups as the loop path, but without any
   # vector work interleaved.
   1036 	&Xtail_avx(\&body_20_39);
   1037 	&Xtail_avx(\&body_20_39);
   1038 	&Xtail_avx(\&body_20_39);
   1039 
   # Tail path epilogue: vzeroall, then the final context update (no jump back).
   1040 $code.=<<___;
   1041 	vzeroall
   1042 
   1043 	add	0($ctx),$A			# update context
   1044 	add	4($ctx),@T[0]
   1045 	add	8($ctx),$C
   1046 	mov	$A,0($ctx)
   1047 	add	12($ctx),$D
   1048 	mov	@T[0],4($ctx)
   1049 	add	16($ctx),$E
   1050 	mov	$C,8($ctx)
   1051 	mov	$D,12($ctx)
   1052 	mov	$E,16($ctx)
   1053 ___
   # Win64 only: reload %xmm6-%xmm10 from the five 16-byte slots that start
   # at 64(%rsp).
   1054 $code.=<<___ if ($win64);
   1055 	movaps	64+0(%rsp),%xmm6
   1056 	movaps	64+16(%rsp),%xmm7
   1057 	movaps	64+32(%rsp),%xmm8
   1058 	movaps	64+48(%rsp),%xmm9
   1059 	movaps	64+64(%rsp),%xmm10
   1060 ___
   # Common epilogue: reload %r12, %rbp and %rbx from just above the local
   # area, then restore %rsp.  The backtick expression is not shell syntax:
   # it is replaced by the value of the enclosed Perl expression in the
   # substitution pass run over $code at the end of the file, so the offset
   # is 64 plus 5*16 when $win64 is set.
   1061 $code.=<<___;
   1062 	lea	`64+($win64?5*16:0)`(%rsp),%rsi
   1063 	mov	0(%rsi),%r12
   1064 	mov	8(%rsi),%rbp
   1065 	mov	16(%rsi),%rbx
   1066 	lea	24(%rsi),%rsp
   1067 .Lepilogue_avx:
   1068 	ret
   1069 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
   1070 ___
   1071 }
   # Emit the 64-byte-aligned constant table K_XX_XX.  It holds four rows,
   # each a 32-bit constant repeated four times (labelled K_00_19 .. K_60_79
   # in the emitted comments), followed by the pbswap mask.  Each row is 16
   # bytes, which is why the generator code above addresses K_00_19 as
   # "0($K_XX_XX)" and the mask as "64($K_XX_XX)".
   1072 $code.=<<___;
   1073 .align	64
   1074 K_XX_XX:
   1075 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1076 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1077 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1078 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1079 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1080 ___
   1081 }}}
   # Emit the identification string followed by a 64-byte alignment
   # directive.  The backslash before '@' keeps Perl from interpolating
   # "@openssl" as an array inside the heredoc.
   1082 $code.=<<___;
   1083 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1084 .align	64
   1085 ___
   1086 
   1087 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1088 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   #
   # Win64 only: emit the two exception handlers (se_handler and
   # ssse3_handler) plus the .pdata/.xdata records that attach them to the
   # sha1_block_data_order* functions.
   1089 if ($win64) {
   # Registers holding the four handler arguments, in the order of the
   # prototype above (the Win64 convention passes the first four integer
   # arguments in rcx, rdx, r8 and r9).
   1090 $rec="%rcx";
   1091 $frame="%rdx";
   1092 $context="%r8";
   1093 $disp="%r9";
   1094 
   # Handler code.  Inside this heredoc "\$" and "\@" are Perl escapes that
   # emit a literal '$' / '@', and backtick expressions are replaced by their
   # evaluated value in the substitution pass at the end of the file.
   #
   # se_handler: compares context->Rip with .Lprologue and .Lepilogue.
   #   Outside that range it goes straight to the common tail.  Inside it, it
   #   pulls the saved stack pointer from offset 16*4 of context->Rsp, steps
   #   32 bytes past it and copies the saved rbx, rbp, r12 and r13 found just
   #   below into the CONTEXT record.
   #
   # ssse3_handler: performs the same range check, but takes the prologue and
   #   epilogue labels from HandlerData[0] and HandlerData[1] (offsets added
   #   to disp->ImageBase), so one handler serves every function whose .xdata
   #   record supplies that pair.  Inside the range it copies 10 quadwords
   #   from 64(Rsp) to context+512 (annotated &context.Xmm6 below), advances
   #   the stack pointer by 24+64+5*16 and restores rbx, rbp and r12.
   #
   # .Lcommon_seh_tail is defined inside ssse3_handler and is shared by both
   #   handlers (se_handler jumps to it).  It stores Rsp, Rsi and Rdi into the
   #   CONTEXT, copies the CONTEXT to disp->ContextRecord, calls
   #   RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).  The value
   #   154 loaded into %ecx is the "rep movsq" count, i.e. the CONTEXT size
   #   expressed in quadwords rather than bytes.
   #
   # The heredoc ends with the .pdata entries (begin/end/info RVA triples)
   # for the integer-only and SSSE3 functions.
   1095 $code.=<<___;
   1096 .extern	__imp_RtlVirtualUnwind
   1097 .type	se_handler,\@abi-omnipotent
   1098 .align	16
   1099 se_handler:
   1100 	push	%rsi
   1101 	push	%rdi
   1102 	push	%rbx
   1103 	push	%rbp
   1104 	push	%r12
   1105 	push	%r13
   1106 	push	%r14
   1107 	push	%r15
   1108 	pushfq
   1109 	sub	\$64,%rsp
   1110 
   1111 	mov	120($context),%rax	# pull context->Rax
   1112 	mov	248($context),%rbx	# pull context->Rip
   1113 
   1114 	lea	.Lprologue(%rip),%r10
   1115 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1116 	jb	.Lcommon_seh_tail
   1117 
   1118 	mov	152($context),%rax	# pull context->Rsp
   1119 
   1120 	lea	.Lepilogue(%rip),%r10
   1121 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1122 	jae	.Lcommon_seh_tail
   1123 
   1124 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
   1125 	lea	32(%rax),%rax
   1126 
   1127 	mov	-8(%rax),%rbx
   1128 	mov	-16(%rax),%rbp
   1129 	mov	-24(%rax),%r12
   1130 	mov	-32(%rax),%r13
   1131 	mov	%rbx,144($context)	# restore context->Rbx
   1132 	mov	%rbp,160($context)	# restore context->Rbp
   1133 	mov	%r12,216($context)	# restore context->R12
   1134 	mov	%r13,224($context)	# restore context->R13
   1135 
   1136 	jmp	.Lcommon_seh_tail
   1137 .size	se_handler,.-se_handler
   1138 
   1139 .type	ssse3_handler,\@abi-omnipotent
   1140 .align	16
   1141 ssse3_handler:
   1142 	push	%rsi
   1143 	push	%rdi
   1144 	push	%rbx
   1145 	push	%rbp
   1146 	push	%r12
   1147 	push	%r13
   1148 	push	%r14
   1149 	push	%r15
   1150 	pushfq
   1151 	sub	\$64,%rsp
   1152 
   1153 	mov	120($context),%rax	# pull context->Rax
   1154 	mov	248($context),%rbx	# pull context->Rip
   1155 
   1156 	mov	8($disp),%rsi		# disp->ImageBase
   1157 	mov	56($disp),%r11		# disp->HandlerData
   1158 
   1159 	mov	0(%r11),%r10d		# HandlerData[0]
   1160 	lea	(%rsi,%r10),%r10	# prologue label
   1161 	cmp	%r10,%rbx		# context->Rip<prologue label
   1162 	jb	.Lcommon_seh_tail
   1163 
   1164 	mov	152($context),%rax	# pull context->Rsp
   1165 
   1166 	mov	4(%r11),%r10d		# HandlerData[1]
   1167 	lea	(%rsi,%r10),%r10	# epilogue label
   1168 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1169 	jae	.Lcommon_seh_tail
   1170 
   1171 	lea	64(%rax),%rsi
   1172 	lea	512($context),%rdi	# &context.Xmm6
   1173 	mov	\$10,%ecx
   1174 	.long	0xa548f3fc		# cld; rep movsq
   1175 	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
   1176 
   1177 	mov	-8(%rax),%rbx
   1178 	mov	-16(%rax),%rbp
   1179 	mov	-24(%rax),%r12
   1180 	mov	%rbx,144($context)	# restore context->Rbx
   1181 	mov	%rbp,160($context)	# restore context->Rbp
   1182 	mov	%r12,216($context)	# restore cotnext->R12
   1183 
   1184 .Lcommon_seh_tail:
   1185 	mov	8(%rax),%rdi
   1186 	mov	16(%rax),%rsi
   1187 	mov	%rax,152($context)	# restore context->Rsp
   1188 	mov	%rsi,168($context)	# restore context->Rsi
   1189 	mov	%rdi,176($context)	# restore context->Rdi
   1190 
   1191 	mov	40($disp),%rdi		# disp->ContextRecord
   1192 	mov	$context,%rsi		# context
   1193 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1194 	.long	0xa548f3fc		# cld; rep movsq
   1195 
   1196 	mov	$disp,%rsi
   1197 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1198 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1199 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1200 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1201 	mov	40(%rsi),%r10		# disp->ContextRecord
   1202 	lea	56(%rsi),%r11		# &disp->HandlerData
   1203 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1204 	mov	%r10,32(%rsp)		# arg5
   1205 	mov	%r11,40(%rsp)		# arg6
   1206 	mov	%r12,48(%rsp)		# arg7
   1207 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1208 	call	*__imp_RtlVirtualUnwind(%rip)
   1209 
   1210 	mov	\$1,%eax		# ExceptionContinueSearch
   1211 	add	\$64,%rsp
   1212 	popfq
   1213 	pop	%r15
   1214 	pop	%r14
   1215 	pop	%r13
   1216 	pop	%r12
   1217 	pop	%rbp
   1218 	pop	%rbx
   1219 	pop	%rdi
   1220 	pop	%rsi
   1221 	ret
   1222 .size	ssse3_handler,.-ssse3_handler
   1223 
   1224 .section	.pdata
   1225 .align	4
   1226 	.rva	.LSEH_begin_sha1_block_data_order
   1227 	.rva	.LSEH_end_sha1_block_data_order
   1228 	.rva	.LSEH_info_sha1_block_data_order
   1229 	.rva	.LSEH_begin_sha1_block_data_order_ssse3
   1230 	.rva	.LSEH_end_sha1_block_data_order_ssse3
   1231 	.rva	.LSEH_info_sha1_block_data_order_ssse3
   1232 ___
   # The AVX function gets its .pdata triple only when $avx is set.
   1233 $code.=<<___ if ($avx);
   1234 	.rva	.LSEH_begin_sha1_block_data_order_avx
   1235 	.rva	.LSEH_end_sha1_block_data_order_avx
   1236 	.rva	.LSEH_info_sha1_block_data_order_avx
   1237 ___
   # .xdata records referenced by the .pdata entries above.  Each starts with
   # the bytes 9,0,0,0 followed by the RVA of its handler; the SSSE3 record
   # additionally carries the prologue/epilogue label pair that
   # ssse3_handler reads as HandlerData[0] and HandlerData[1].
   # NOTE(review): 9 is presumably UNWIND_INFO version 1 combined with
   # UNW_FLAG_EHANDLER -- confirm against the x64 exception-handling docs.
   1238 $code.=<<___;
   1239 .section	.xdata
   1240 .align	8
   1241 .LSEH_info_sha1_block_data_order:
   1242 	.byte	9,0,0,0
   1243 	.rva	se_handler
   1244 .LSEH_info_sha1_block_data_order_ssse3:
   1245 	.byte	9,0,0,0
   1246 	.rva	ssse3_handler
   1247 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   1248 ___
   # The AVX record reuses ssse3_handler with the AVX label pair as its
   # HandlerData; emitted only when $avx is set.
   1249 $code.=<<___ if ($avx);
   1250 .LSEH_info_sha1_block_data_order_avx:
   1251 	.byte	9,0,0,0
   1252 	.rva	ssse3_handler
   1253 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   1254 ___
   1255 }
   1256 
   1257 ####################################################################
   1258 
   # Post-process and emit the generated assembly.
   #
   # Every `...` span in $code is replaced by the result of eval'ing its
   # contents as Perl (for example `16*4` becomes 64); /g applies this to all
   # occurrences and /e makes the replacement an expression.
   1259 $code =~ s/\`([^\`]*)\`/eval $1/gem;
   1260 print $code;
   # close() is where buffered output is actually flushed, and for a piped
   # handle it also reports a failing child process.  Die on failure so that
   # a truncated or rejected output cannot pass as a successful run.
   # NOTE(review): presumably STDOUT was redirected earlier in the file (not
   # visible here) -- the check is equally valid for a plain file or terminal.
   1261 close STDOUT or die "error closing STDOUT: $!";
   1262