Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # sha1_block procedure for x86_64.
     11 #
     12 # It was brought to my attention that on EM64T compiler-generated code
     13 # was far behind 32-bit assembler implementation. This is unlike on
     14 # Opteron where compiler-generated code was only 15% behind 32-bit
     15 # assembler, which originally made it hard to motivate the effort.
     16 # There was suggestion to mechanically translate 32-bit code, but I
     17 # dismissed it, reasoning that x86_64 offers enough register bank
     18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
     19 # implementation:-) However! While 64-bit code does perform better
     20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
     21 # x86_64 does offer larger *addressable* bank, but out-of-order core
     22 # reaches for even more registers through dynamic aliasing, and EM64T
     23 # core must have managed to run-time optimize even 32-bit code just as
     24 # well as the 64-bit one. Performance improvement is summarized in the
     25 # following table:
     26 #
     27 #		gcc 3.4		32-bit asm	cycles/byte
     28 # Opteron	+45%		+20%		6.8
     29 # Xeon P4	+65%		+0%		9.9
     30 # Core2		+60%		+10%		7.0
     31 
     32 # August 2009.
     33 #
     34 # The code was revised to minimize code size and to maximize
     35 # "distance" between instructions producing input to 'lea'
     36 # instruction and the 'lea' instruction itself, which is essential
     37 # for Intel Atom core.
     38 
     39 # October 2010.
     40 #
     41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
     42 # is to offload message schedule denoted by Wt in NIST specification,
     43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
     44 # for background and implementation details. The only difference from
     45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
     46 # to free temporary registers.
     47 
     48 # April 2011.
     49 #
     50 # Add AVX code path. See sha1-586.pl for further information.
     51 
     52 ######################################################################
     53 # Current performance is summarized in the following table. Numbers are
     54 # CPU clock cycles spent to process single byte (less is better).
     55 #
     56 #		x86_64		SSSE3		AVX
     57 # P4		9.8		-
     58 # Opteron	6.6		-
     59 # Core2		6.7		6.1/+10%	-
     60 # Atom		11.0		9.7/+13%	-
     61 # Westmere	7.1		5.6/+27%	-
     62 # Sandy Bridge	7.9		6.3/+25%	5.2/+51%
     63 
     # Command line is "[flavour] output". A first argument that contains a
     # dot is taken to be the output file name, leaving $flavour undefined.
     64 $flavour = shift;
     65 $output  = shift;
     66 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     67 
        # Win64 is selected by a nasm/masm/mingw64 flavour or a ".asm" output name.
     68 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     69 
        # Locate the x86_64-xlate.pl translator next to this script or in
        # ../../perlasm relative to it.
     70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     71 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     72 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     73 die "can't locate x86_64-xlate.pl";
     74 
        # Enable the AVX code path only if the assembler reports a recent enough
        # version: GNU as >= 2.19, nasm >= 2.09 or ml64 >= 10.
        # NOTE(review): versions are compared numerically, so e.g. "2.9" would
        # compare greater than 2.19.
     75 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     76 		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
     77 	   $1>=2.19);
     78 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
     79 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
     80 	   $1>=2.09);
     81 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
     82 	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
     83 	   $1>=10);
     84 
        # Everything printed to STDOUT from here on is piped through the
        # translator. Fail loudly if the pipe cannot be created instead of
        # silently producing no output.
     85 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
     86 
     # Register allocation for the integer-only code path. The three function
     # arguments arrive in %rdi/%rsi/%rdx and are copied to %r8/%r9/%r10 by the
     # emitted prologue, so the names below are rebound to the copies.
     87 $ctx="%rdi";	# 1st arg
     88 $inp="%rsi";	# 2nd arg
     89 $num="%rdx";	# 3rd arg
     90 
     91 # reassign arguments in order to produce more compact code
     92 $ctx="%r8";
     93 $inp="%r9";
     94 $num="%r10";
     95 
        # Scratch registers used inside the round bodies.
     96 $t0="%eax";
     97 $t1="%ebx";
     98 $t2="%ecx";
        # Two registers holding the current and the next message word; the pair
        # is swapped at the end of every BODY_* call.
     99 @xi=("%edx","%ebp");
        # The five SHA-1 working variables.
    100 $A="%esi";
    101 $B="%edi";
    102 $C="%r11d";
    103 $D="%r12d";
    104 $E="%r13d";
    105 
        # Callers rotate this list after every round instead of moving registers.
    106 @V=($A,$B,$C,$D,$E);
    107 
    # Append one integer round that uses constant 0x5a827999 and the function
    # ((c^d)&b)^d to $code.
    #   $i             - round number
    #   $a,$b,$c,$d,$e - registers currently playing the roles a..e
    # For $i<15 the next message word is loaded from the input, byte-swapped
    # and saved on the stack; for $i>=15 it is derived from the 16-word ring
    # buffer kept at (%rsp). $xi[0] is the word consumed by this round and
    # $xi[1] receives the word for the next one.
    108 sub BODY_00_19 {
    109 my ($i,$a,$b,$c,$d,$e)=@_;
    110 my $j=$i+1;
        # Round 0 additionally has to fetch its own message word.
    111 $code.=<<___ if ($i==0);
    112 	mov	`4*$i`($inp),$xi[0]
    113 	bswap	$xi[0]
    114 	mov	$xi[0],`4*$i`(%rsp)
    115 ___
    116 $code.=<<___ if ($i<15);
    117 	mov	$c,$t0
    118 	mov	`4*$j`($inp),$xi[1]
    119 	mov	$a,$t2
    120 	xor	$d,$t0
    121 	bswap	$xi[1]
    122 	rol	\$5,$t2
    123 	lea	0x5a827999($xi[0],$e),$e
    124 	and	$b,$t0
    125 	mov	$xi[1],`4*$j`(%rsp)
    126 	add	$t2,$e
    127 	xor	$d,$t0
    128 	rol	\$30,$b
    129 	add	$t0,$e
    130 ___
        # From round 15 on the next word is the XOR of four ring-buffer
        # entries rotated left by one bit.
    131 $code.=<<___ if ($i>=15);
    132 	mov	`4*($j%16)`(%rsp),$xi[1]
    133 	mov	$c,$t0
    134 	mov	$a,$t2
    135 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    136 	xor	$d,$t0
    137 	rol	\$5,$t2
    138 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    139 	and	$b,$t0
    140 	lea	0x5a827999($xi[0],$e),$e
    141 	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
    142 	xor	$d,$t0
    143 	rol	\$1,$xi[1]
    144 	add	$t2,$e
    145 	rol	\$30,$b
    146 	mov	$xi[1],`4*($j%16)`(%rsp)
    147 	add	$t0,$e
    148 ___
    149 unshift(@xi,pop(@xi));	# next round's word becomes the current one
    150 }
    151 
    # Append one integer round that uses the function b^c^d to $code.
    #   $i             - round number
    #   $a,$b,$c,$d,$e - registers currently playing the roles a..e
    # The generator calls this sub for rounds 20..39 and again for 60..79;
    # the round constant is selected from $i accordingly.
    152 sub BODY_20_39 {
    153 my ($i,$a,$b,$c,$d,$e)=@_;
    154 my $j=$i+1;
    155 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
        # Every round but the last also computes the next message word.
    156 $code.=<<___ if ($i<79);
    157 	mov	`4*($j%16)`(%rsp),$xi[1]
    158 	mov	$c,$t0
    159 	mov	$a,$t2
    160 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    161 	xor	$b,$t0
    162 	rol	\$5,$t2
    163 	lea	$K($xi[0],$e),$e
    164 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    165 	xor	$d,$t0
    166 	add	$t2,$e
    167 	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
    168 	rol	\$30,$b
    169 	add	$t0,$e
    170 	rol	\$1,$xi[1]
    171 ___
        # The store back to the ring buffer is omitted for $i>=76
        # (presumably because no later round reads those slots).
    172 $code.=<<___ if ($i<76);
    173 	mov	$xi[1],`4*($j%16)`(%rsp)
    174 ___
        # Final round: no further message word is needed.
    175 $code.=<<___ if ($i==79);
    176 	mov	$c,$t0
    177 	mov	$a,$t2
    178 	xor	$b,$t0
    179 	lea	$K($xi[0],$e),$e
    180 	rol	\$5,$t2
    181 	xor	$d,$t0
    182 	add	$t2,$e
    183 	rol	\$30,$b
    184 	add	$t0,$e
    185 ___
    186 unshift(@xi,pop(@xi));	# next round's word becomes the current one
    187 }
    188 
    # Append one integer round that uses constant 0x8f1bbcdc to $code. The
    # round function is accumulated into e as two separate terms,
    # (c&d) and ((c^d)&b).
    #   $i             - round number
    #   $a,$b,$c,$d,$e - registers currently playing the roles a..e
    # The next message word is derived from the ring buffer at (%rsp) and
    # written back to it.
    189 sub BODY_40_59 {
    190 my ($i,$a,$b,$c,$d,$e)=@_;
    191 my $j=$i+1;
    192 $code.=<<___;
    193 	mov	`4*($j%16)`(%rsp),$xi[1]
    194 	mov	$c,$t0
    195 	mov	$c,$t1
    196 	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
    197 	and	$d,$t0
    198 	mov	$a,$t2
    199 	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
    200 	xor	$d,$t1
    201 	lea	0x8f1bbcdc($xi[0],$e),$e
    202 	rol	\$5,$t2
    203 	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
    204 	add	$t0,$e
    205 	and	$b,$t1
    206 	rol	\$1,$xi[1]
    207 	add	$t1,$e
    208 	rol	\$30,$b
    209 	mov	$xi[1],`4*($j%16)`(%rsp)
    210 	add	$t2,$e
    211 ___
    212 unshift(@xi,pop(@xi));	# next round's word becomes the current one
    213 }
    214 
    # Emit the public entry point. It inspects OPENSSL_ia32cap_P and jumps to
    # the SSSE3 code (or, when AVX support was detected at generation time and
    # both the AVX and "Intel CPU" bits are set, to the AVX code); otherwise
    # it falls through to the integer-only implementation at .Lialu.
    215 $code.=<<___;
    216 .text
    217 .extern	OPENSSL_ia32cap_P
    218 
    219 .globl	sha1_block_data_order
    220 .type	sha1_block_data_order,\@function,3
    221 .align	16
    222 sha1_block_data_order:
    223 	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
    224 	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
    225 	test	\$`1<<9`,%r8d		# check SSSE3 bit
    226 	jz	.Lialu
    227 ___
    228 $code.=<<___ if ($avx);
    229 	and	\$`1<<28`,%r8d		# mask AVX bit
    230 	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
    231 	or	%r9d,%r8d
    232 	cmp	\$`1<<28|1<<30`,%r8d
    233 	je	_avx_shortcut
    234 ___
        # Integer-only path: save callee-saved registers, carve out a 64-byte
        # aligned frame for the 16-word ring buffer, keep the original stack
        # pointer just above it, and load the five state words.
    235 $code.=<<___;
    236 	jmp	_ssse3_shortcut
    237 
    238 .align	16
    239 .Lialu:
    240 	push	%rbx
    241 	push	%rbp
    242 	push	%r12
    243 	push	%r13
    244 	mov	%rsp,%r11
    245 	mov	%rdi,$ctx	# reassigned argument
    246 	sub	\$`8+16*4`,%rsp
    247 	mov	%rsi,$inp	# reassigned argument
    248 	and	\$-64,%rsp
    249 	mov	%rdx,$num	# reassigned argument
    250 	mov	%r11,`16*4`(%rsp)
    251 .Lprologue:
    252 
    253 	mov	0($ctx),$A
    254 	mov	4($ctx),$B
    255 	mov	8($ctx),$C
    256 	mov	12($ctx),$D
    257 	mov	16($ctx),$E
    258 	jmp	.Lloop
    259 
    260 .align	16
    261 .Lloop:
    262 ___
        # Fully unroll the 80 rounds. @V is rotated after each round so that
        # the register roles shift without any register-to-register moves.
    263 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    264 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    265 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    266 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
        # Fold the result into the context, advance to the next 64-byte block
        # and loop while $num is non-zero; then restore registers and return.
    267 $code.=<<___;
    268 	add	0($ctx),$A
    269 	add	4($ctx),$B
    270 	add	8($ctx),$C
    271 	add	12($ctx),$D
    272 	add	16($ctx),$E
    273 	mov	$A,0($ctx)
    274 	mov	$B,4($ctx)
    275 	mov	$C,8($ctx)
    276 	mov	$D,12($ctx)
    277 	mov	$E,16($ctx)
    278 
    279 	sub	\$1,$num
    280 	lea	`16*4`($inp),$inp
    281 	jnz	.Lloop
    282 
    283 	mov	`16*4`(%rsp),%rsi
    284 	mov	(%rsi),%r13
    285 	mov	8(%rsi),%r12
    286 	mov	16(%rsi),%rbp
    287 	mov	24(%rsi),%rbx
    288 	lea	32(%rsi),%rsp
    289 .Lepilogue:
    290 	ret
    291 .size	sha1_block_data_order,.-sha1_block_data_order
    292 ___
    # SSSE3 code path. The lexicals below are private to this scope and are
    # referenced by name from the instruction strings that the Xupdate_*/X*
    # helpers eval.
    293 {{{
    294 my $Xi=4;				# index of the X[] vector produced next
    295 my @X=map("%xmm$_",(4..7,0..3));	# ring of message-schedule vectors
    296 my @Tx=map("%xmm$_",(8..10));		# SIMD temporaries / round constant
    297 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    298 my @T=("%esi","%edi");			# integer temporaries
    299 my $j=0;				# integer round counter
    300 my $K_XX_XX="%r11";			# pointer to the K_XX_XX table
    301 
        # Rotate helpers; this path uses plain rol/ror.
    302 my $_rol=sub { &rol(@_) };
    303 my $_ror=sub { &ror(@_) };
    304 
        # Function prologue: save %rbx/%rbp/%r12 and reserve 64 bytes of stack
        # (plus room for %xmm6-%xmm10 on Win64).
    305 $code.=<<___;
    306 .type	sha1_block_data_order_ssse3,\@function,3
    307 .align	16
    308 sha1_block_data_order_ssse3:
    309 _ssse3_shortcut:
    310 	push	%rbx
    311 	push	%rbp
    312 	push	%r12
    313 	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
    314 ___
    315 $code.=<<___ if ($win64);
    316 	movaps	%xmm6,64+0(%rsp)
    317 	movaps	%xmm7,64+16(%rsp)
    318 	movaps	%xmm8,64+32(%rsp)
    319 	movaps	%xmm9,64+48(%rsp)
    320 	movaps	%xmm10,64+64(%rsp)
    321 .Lprologue_ssse3:
    322 ___
        # Turn $num into an end-of-input pointer, load the context, then load
        # and byte-swap the first block and put the first three X[]+K vectors
        # on the stack for the integer rounds.
    323 $code.=<<___;
    324 	mov	%rdi,$ctx	# reassigned argument
    325 	mov	%rsi,$inp	# reassigned argument
    326 	mov	%rdx,$num	# reassigned argument
    327 
    328 	shl	\$6,$num
    329 	add	$inp,$num
    330 	lea	K_XX_XX(%rip),$K_XX_XX
    331 
    332 	mov	0($ctx),$A		# load context
    333 	mov	4($ctx),$B
    334 	mov	8($ctx),$C
    335 	mov	12($ctx),$D
    336 	mov	$B,@T[0]		# magic seed
    337 	mov	16($ctx),$E
    338 
    339 	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
    340 	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
    341 	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    342 	movdqu	16($inp),@X[-3&7]
    343 	movdqu	32($inp),@X[-2&7]
    344 	movdqu	48($inp),@X[-1&7]
    345 	pshufb	@X[2],@X[-4&7]		# byte swap
    346 	add	\$64,$inp
    347 	pshufb	@X[2],@X[-3&7]
    348 	pshufb	@X[2],@X[-2&7]
    349 	pshufb	@X[2],@X[-1&7]
    350 	paddd	@Tx[1],@X[-4&7]		# add K_00_19
    351 	paddd	@Tx[1],@X[-3&7]
    352 	paddd	@Tx[1],@X[-2&7]
    353 	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
    354 	psubd	@Tx[1],@X[-4&7]		# restore X[]
    355 	movdqa	@X[-3&7],16(%rsp)
    356 	psubd	@Tx[1],@X[-3&7]
    357 	movdqa	@X[-2&7],32(%rsp)
    358 	psubd	@Tx[1],@X[-2&7]
    359 	jmp	.Loop_ssse3
    360 ___
    361 
    # Catch-all for undefined subs: a call such as &insn(dst,src,...) appends
    # the line "\tinsn\t<last arg>,<remaining args reversed>\n" to $code,
    # i.e. the operands are emitted in the opposite order to the call.
    # A numeric last argument is emitted as an immediate with a '$' prefix.
    362 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    363 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;	# strip the package qualifier
    364   my $arg = pop;				# last argument is emitted first
    365     $arg = "\$$arg" if ($arg*1 eq $arg);	# numeric => immediate
    366     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    367 }
    368 
    # Emit four integer rounds interleaved with the SSSE3 instructions that
    # compute the next message-schedule vector in @X[0].
    #   $body - code ref returning one round's instruction strings
    # Each string is eval'ed here, so it operates on this sub's $a..$e and on
    # the enclosing @V/@T/$j. Also emitted: adding the round constant to the
    # previous vector, storing that sum to the stack slot chosen by ($Xi-1)&3,
    # and loading the constant for the following step. Afterwards $Xi is
    # incremented and @X and @Tx are rotated by one position.
    369 sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
    370 { use integer;
    371   my $body = shift;
    372   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    373   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    374 
    375 	&movdqa	(@X[0],@X[-3&7]);
    376 	 eval(shift(@insns));
    377 	 eval(shift(@insns));
    378 	&movdqa	(@Tx[0],@X[-1&7]);
    379 	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    380 	 eval(shift(@insns));
    381 	 eval(shift(@insns));
    382 
    383 	  &paddd	(@Tx[1],@X[-1&7]);
    384 	 eval(shift(@insns));
    385 	 eval(shift(@insns));
    386 	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
    387 	 eval(shift(@insns));
    388 	 eval(shift(@insns));
    389 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
    390 	 eval(shift(@insns));
    391 	 eval(shift(@insns));
    392 
    393 	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    394 	 eval(shift(@insns));
    395 	 eval(shift(@insns));
    396 	 eval(shift(@insns));
    397 	 eval(shift(@insns));
    398 
    399 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    400 	 eval(shift(@insns));
    401 	 eval(shift(@insns));
    402 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    403 	 eval(shift(@insns));
    404 	 eval(shift(@insns));
    405 
    406 	&movdqa	(@Tx[2],@X[0]);
    407 	&movdqa	(@Tx[0],@X[0]);
    408 	 eval(shift(@insns));
    409 	 eval(shift(@insns));
    410 	 eval(shift(@insns));
    411 	 eval(shift(@insns));
    412 
    413 	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
    414 	&paddd	(@X[0],@X[0]);
    415 	 eval(shift(@insns));
    416 	 eval(shift(@insns));
    417 	 eval(shift(@insns));
    418 	 eval(shift(@insns));
    419 
    420 	&psrld	(@Tx[0],31);
    421 	 eval(shift(@insns));
    422 	 eval(shift(@insns));
    423 	&movdqa	(@Tx[1],@Tx[2]);
    424 	 eval(shift(@insns));
    425 	 eval(shift(@insns));
    426 
    427 	&psrld	(@Tx[2],30);
    428 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
    429 	 eval(shift(@insns));
    430 	 eval(shift(@insns));
    431 	 eval(shift(@insns));
    432 	 eval(shift(@insns));
    433 
    434 	&pslld	(@Tx[1],2);
    435 	&pxor	(@X[0],@Tx[2]);
    436 	 eval(shift(@insns));
    437 	 eval(shift(@insns));
    438 	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    439 	 eval(shift(@insns));
    440 	 eval(shift(@insns));
    441 
    442 	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
    443 
    444 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    445 
    446   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    447 		push(@Tx,shift(@Tx));
    448 }
    449 
    # Emit four integer rounds interleaved with the SSSE3 instructions that
    # compute the next message-schedule vector in @X[0] from four earlier
    # vectors (XOR, then rotate left by 2).
    #   $body - code ref returning one round's instruction strings
    # The round constant is either carried over from the previous step or,
    # when $Xi is a multiple of 5, reloaded from the table at $K_XX_XX.
    # Afterwards $Xi is incremented and @X and @Tx are rotated.
    450 sub Xupdate_ssse3_32_79()
    451 { use integer;
    452   my $body = shift;
    453   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    454   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    455 
    456 	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
    457 	 eval(shift(@insns));		# body_20_39
    458 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
    459 	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
    460 	 eval(shift(@insns));
    461 	 eval(shift(@insns));
    462 	 eval(shift(@insns));		# rol
    463 
    464 	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
    465 	 eval(shift(@insns));
        	# NOTE(review): the round bodies in this file spell rotates as
        	# '&$_rol'/'&$_ror', which /&ro[rl]/ does not match, so this
        	# guard always passes here.
    466 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
    467 	if ($Xi%5) {
    468 	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    469 	} else {			# ... or load next one
    470 	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    471 	}
    472 	  &paddd	(@Tx[1],@X[-1&7]);
    473 	 eval(shift(@insns));		# ror
    474 	 eval(shift(@insns));
    475 
    476 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    477 	 eval(shift(@insns));		# body_20_39
    478 	 eval(shift(@insns));
    479 	 eval(shift(@insns));
    480 	 eval(shift(@insns));		# rol
    481 
    482 	&movdqa	(@Tx[0],@X[0]);
    483 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    484 	 eval(shift(@insns));
    485 	 eval(shift(@insns));
    486 	 eval(shift(@insns));		# ror
    487 	 eval(shift(@insns));
    488 
    489 	&pslld	(@X[0],2);
    490 	 eval(shift(@insns));		# body_20_39
    491 	 eval(shift(@insns));
    492 	&psrld	(@Tx[0],30);
    493 	 eval(shift(@insns));
    494 	 eval(shift(@insns));		# rol
    495 	 eval(shift(@insns));
    496 	 eval(shift(@insns));
    497 	 eval(shift(@insns));		# ror
    498 	 eval(shift(@insns));
    499 
    500 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
    501 	 eval(shift(@insns));		# body_20_39
    502 	 eval(shift(@insns));
    503 	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
    504 	 eval(shift(@insns));
    505 	 eval(shift(@insns));		# rol
    506 	 eval(shift(@insns));
    507 	 eval(shift(@insns));
    508 	 eval(shift(@insns));		# rol
    509 	 eval(shift(@insns));
    510 
    511 	 foreach (@insns) { eval; }	# remaining instructions
    512 
    513   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    514 		push(@Tx,shift(@Tx));
    515 }
    516 
    # Emit four integer rounds together with the final X[]+K store, then the
    # end-of-input test: if $inp has reached $num the emitted code jumps to
    # .Ldone_ssse3. Otherwise it reloads the byte-swap mask and K_00_19,
    # loads the next 64-byte block and starts byte-swapping it.
    #   $body - code ref returning one round's instruction strings
    # Resets $Xi to 0 for the Xloop_ssse3 calls that follow.
    517 sub Xuplast_ssse3_80()
    518 { use integer;
    519   my $body = shift;
    520   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    521   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    522 
    523 	 eval(shift(@insns));
    524 	  &paddd	(@Tx[1],@X[-1&7]);
    525 	 eval(shift(@insns));
    526 	 eval(shift(@insns));
    527 	 eval(shift(@insns));
    528 	 eval(shift(@insns));
    529 
    530 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    531 
    532 	 foreach (@insns) { eval; }		# remaining instructions
    533 
    534 	&cmp	($inp,$num);
    535 	&je	(".Ldone_ssse3");
    536 
    537 	unshift(@Tx,pop(@Tx));		# rotate @Tx back by one position
    538 
    539 	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
    540 	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
    541 	&movdqu	(@X[-4&7],"0($inp)");		# load input
    542 	&movdqu	(@X[-3&7],"16($inp)");
    543 	&movdqu	(@X[-2&7],"32($inp)");
    544 	&movdqu	(@X[-1&7],"48($inp)");
    545 	&pshufb	(@X[-4&7],@X[2]);		# byte swap
    546 	&add	($inp,64);
    547 
    548   $Xi=0;
    549 }
    550 
    # Emit four integer rounds interleaved with preparation of the next input
    # block: byte-swap one vector, add the round constant to another, store
    # that X[]+K to stack slot 16*$Xi and subtract the constant again.
    #   $body - code ref returning one round's instruction strings
    # Increments $Xi; @X is not rotated.
    551 sub Xloop_ssse3()
    552 { use integer;
    553   my $body = shift;
    554   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    555   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    556 
    557 	 eval(shift(@insns));
    558 	 eval(shift(@insns));
    559 	&pshufb	(@X[($Xi-3)&7],@X[2]);
    560 	 eval(shift(@insns));
    561 	 eval(shift(@insns));
    562 	&paddd	(@X[($Xi-4)&7],@Tx[1]);
    563 	 eval(shift(@insns));
    564 	 eval(shift(@insns));
    565 	 eval(shift(@insns));
    566 	 eval(shift(@insns));
    567 	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
    568 	 eval(shift(@insns));
    569 	 eval(shift(@insns));
    570 	&psubd	(@X[($Xi-4)&7],@Tx[1]);
    571 
    572 	foreach (@insns) { eval; }
    573   $Xi++;
    574 }
    575 
    # Emit four integer rounds with no SIMD work interleaved; used after the
    # last input block, when there is nothing left to prepare.
    #   $body - code ref returning one round's instruction strings
    576 sub Xtail_ssse3()
    577 { use integer;
    578   my $body = shift;
    579   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    580   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    581 
    582 	foreach (@insns) { eval; }
    583 }
    584 
    # Return one round as a list of 10 Perl statements (strings). The caller
    # evals them one at a time so SIMD instructions can be placed in between.
    # The first string also binds $a..$e to the current @V; the last one
    # advances $j and rotates @V and @T for the next round.
    585 sub body_00_19 () {
    586 	(
    587 	'($a,$b,$c,$d,$e)=@V;'.
    588 	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
    589 	'&xor	($c,$d);',
    590 	'&mov	(@T[1],$a);',	# $b in next round
    591 	'&$_rol	($a,5);',
    592 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    593 	'&xor	($c,$d);',	# restore $c
    594 	'&xor	(@T[0],$d);',
    595 	'&add	($e,$a);',
    596 	'&$_ror	($b,$j?7:2);',	# $b>>>2; after round 0 the register was already rotated left by 5 in place, hence 7
    597 	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    598 	);
    599 }
    600 
    # Return one round using the function b^c^d as a list of 8 Perl
    # statements (strings) for the caller to eval one at a time. The first
    # string binds $a..$e to the current @V and post-increments $j; the last
    # one rotates @V and @T for the next round.
    601 sub body_20_39 () {
    602 	(
    603 	'($a,$b,$c,$d,$e)=@V;'.
    604 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    605 	'&xor	(@T[0],$d);',	# ($b^$d)
    606 	'&mov	(@T[1],$a);',	# $b in next round
    607 	'&$_rol	($a,5);',
    608 	'&xor	(@T[0],$c);',	# ($b^$d^$c)
    609 	'&add	($e,$a);',
    610 	'&$_ror	($b,7);',	# $b>>>2 (register was rotated left by 5 in place one round earlier)
    611 	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    612 	);
    613 }
    614 
    # Return one round as a list of 12 Perl statements (strings) for the
    # caller to eval one at a time. The round function is added to $e as two
    # terms: ($c&$d) via @T[1] and ($b&($c^$d)) via @T[0]. The first string
    # binds $a..$e to the current @V; the last one rotates @V and @T.
    615 sub body_40_59 () {
    616 	(
    617 	'($a,$b,$c,$d,$e)=@V;'.
    618 	'&mov	(@T[1],$c);',
    619 	'&xor	($c,$d);',
    620 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    621 	'&and	(@T[1],$d);',
    622 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    623 	'&$_ror	($b,7);',	# $b>>>2
    624 	'&add	($e,@T[1]);',
    625 	'&mov	(@T[1],$a);',	# $b in next round
    626 	'&$_rol	($a,5);',
    627 	'&add	($e,@T[0]);',
    628 	'&xor	($c,$d);',	# restore $c
    629 	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    630 	);
    631 }
    # Generate the SSSE3 main loop: 16 schedule-update steps plus the final
    # step cover rounds 0..67 of the current block, each step emitting four
    # rounds.
    632 $code.=<<___;
    633 .align	16
    634 .Loop_ssse3:
    635 ___
    636 	&Xupdate_ssse3_16_31(\&body_00_19);
    637 	&Xupdate_ssse3_16_31(\&body_00_19);
    638 	&Xupdate_ssse3_16_31(\&body_00_19);
    639 	&Xupdate_ssse3_16_31(\&body_00_19);
    640 	&Xupdate_ssse3_32_79(\&body_00_19);
    641 	&Xupdate_ssse3_32_79(\&body_20_39);
    642 	&Xupdate_ssse3_32_79(\&body_20_39);
    643 	&Xupdate_ssse3_32_79(\&body_20_39);
    644 	&Xupdate_ssse3_32_79(\&body_20_39);
    645 	&Xupdate_ssse3_32_79(\&body_20_39);
    646 	&Xupdate_ssse3_32_79(\&body_40_59);
    647 	&Xupdate_ssse3_32_79(\&body_40_59);
    648 	&Xupdate_ssse3_32_79(\&body_40_59);
    649 	&Xupdate_ssse3_32_79(\&body_40_59);
    650 	&Xupdate_ssse3_32_79(\&body_40_59);
    651 	&Xupdate_ssse3_32_79(\&body_20_39);
    652 	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
    653 
        				# Remember the generator state at the point
        				# where the emitted code may branch to
        				# .Ldone_ssse3; it is restored before the
        				# tail rounds are generated.
    654 				$saved_j=$j; @saved_V=@V;
    655 
        # Last 12 rounds, overlapped with preparation of the next block.
    656 	&Xloop_ssse3(\&body_20_39);
    657 	&Xloop_ssse3(\&body_20_39);
    658 	&Xloop_ssse3(\&body_20_39);
    659 
        # Fold the result into the context and continue with the next block.
    660 $code.=<<___;
    661 	add	0($ctx),$A			# update context
    662 	add	4($ctx),@T[0]
    663 	add	8($ctx),$C
    664 	add	12($ctx),$D
    665 	mov	$A,0($ctx)
    666 	add	16($ctx),$E
    667 	mov	@T[0],4($ctx)
    668 	mov	@T[0],$B			# magic seed
    669 	mov	$C,8($ctx)
    670 	mov	$D,12($ctx)
    671 	mov	$E,16($ctx)
    672 	jmp	.Loop_ssse3
    673 
    674 .align	16
    675 .Ldone_ssse3:
    676 ___
    677 				$j=$saved_j; @V=@saved_V;
    678 
        # Last 12 rounds of the final block, with no SIMD work interleaved.
    679 	&Xtail_ssse3(\&body_20_39);
    680 	&Xtail_ssse3(\&body_20_39);
    681 	&Xtail_ssse3(\&body_20_39);
    682 
        # Final context update, then the epilogue: restore %xmm6-%xmm10 on
        # Win64, restore %r12/%rbp/%rbx, release the frame and return.
    683 $code.=<<___;
    684 	add	0($ctx),$A			# update context
    685 	add	4($ctx),@T[0]
    686 	add	8($ctx),$C
    687 	mov	$A,0($ctx)
    688 	add	12($ctx),$D
    689 	mov	@T[0],4($ctx)
    690 	add	16($ctx),$E
    691 	mov	$C,8($ctx)
    692 	mov	$D,12($ctx)
    693 	mov	$E,16($ctx)
    694 ___
    695 $code.=<<___ if ($win64);
    696 	movaps	64+0(%rsp),%xmm6
    697 	movaps	64+16(%rsp),%xmm7
    698 	movaps	64+32(%rsp),%xmm8
    699 	movaps	64+48(%rsp),%xmm9
    700 	movaps	64+64(%rsp),%xmm10
    701 ___
    702 $code.=<<___;
    703 	lea	`64+($win64?5*16:0)`(%rsp),%rsi
    704 	mov	0(%rsi),%r12
    705 	mov	8(%rsi),%rbp
    706 	mov	16(%rsi),%rbx
    707 	lea	24(%rsi),%rsp
    708 .Lepilogue_ssse3:
    709 	ret
    710 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
    711 ___
    712 
    # AVX code path, generated only when AVX support was detected in the
    # assembler. It declares its own copies of the generator state, so the
    # round-body strings eval'ed by the *_avx helpers operate on these.
    713 if ($avx) {
    714 my $Xi=4;				# index of the X[] vector produced next
    715 my @X=map("%xmm$_",(4..7,0..3));	# ring of message-schedule vectors
    716 my @Tx=map("%xmm$_",(8..10));		# SIMD temporaries / round constant
    717 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    718 my @T=("%esi","%edi");			# integer temporaries
    719 my $j=0;				# integer round counter
    720 my $K_XX_XX="%r11";			# pointer to the K_XX_XX table
    721 
        # Rotate helpers; this path emits shld/shrd with the same register
        # given as both operands instead of rol/ror.
    722 my $_rol=sub { &shld(@_[0],@_) };
    723 my $_ror=sub { &shrd(@_[0],@_) };
    724 
        # Function prologue: save %rbx/%rbp/%r12 and reserve 64 bytes of stack
        # (plus room for %xmm6-%xmm10 on Win64).
    725 $code.=<<___;
    726 .type	sha1_block_data_order_avx,\@function,3
    727 .align	16
    728 sha1_block_data_order_avx:
    729 _avx_shortcut:
    730 	push	%rbx
    731 	push	%rbp
    732 	push	%r12
    733 	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
    734 ___
    735 $code.=<<___ if ($win64);
    736 	movaps	%xmm6,64+0(%rsp)
    737 	movaps	%xmm7,64+16(%rsp)
    738 	movaps	%xmm8,64+32(%rsp)
    739 	movaps	%xmm9,64+48(%rsp)
    740 	movaps	%xmm10,64+64(%rsp)
    741 .Lprologue_avx:
    742 ___
        # Turn $num into an end-of-input pointer, load the context, then load
        # and byte-swap the first block and put the first three X[]+K vectors
        # on the stack. The three-operand forms write the sums to separate
        # registers, so no subtraction is needed to recover X[].
    743 $code.=<<___;
    744 	mov	%rdi,$ctx	# reassigned argument
    745 	mov	%rsi,$inp	# reassigned argument
    746 	mov	%rdx,$num	# reassigned argument
    747 	vzeroall
    748 
    749 	shl	\$6,$num
    750 	add	$inp,$num
    751 	lea	K_XX_XX(%rip),$K_XX_XX
    752 
    753 	mov	0($ctx),$A		# load context
    754 	mov	4($ctx),$B
    755 	mov	8($ctx),$C
    756 	mov	12($ctx),$D
    757 	mov	$B,@T[0]		# magic seed
    758 	mov	16($ctx),$E
    759 
    760 	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
    761 	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
    762 	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    763 	vmovdqu	16($inp),@X[-3&7]
    764 	vmovdqu	32($inp),@X[-2&7]
    765 	vmovdqu	48($inp),@X[-1&7]
    766 	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
    767 	add	\$64,$inp
    768 	vpshufb	@X[2],@X[-3&7],@X[-3&7]
    769 	vpshufb	@X[2],@X[-2&7],@X[-2&7]
    770 	vpshufb	@X[2],@X[-1&7],@X[-1&7]
    771 	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
    772 	vpaddd	@Tx[1],@X[-3&7],@X[1]
    773 	vpaddd	@Tx[1],@X[-2&7],@X[2]
    774 	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
    775 	vmovdqa	@X[1],16(%rsp)
    776 	vmovdqa	@X[2],32(%rsp)
    777 	jmp	.Loop_avx
    778 ___
    779 
    # AVX counterpart of Xupdate_ssse3_16_31: emit four integer rounds
    # interleaved with the three-operand AVX instructions that compute the
    # next message-schedule vector in @X[0].
    #   $body - code ref returning one round's instruction strings
    # Also emitted: adding the round constant to the previous vector, storing
    # that sum to the stack slot chosen by ($Xi-1)&3, and loading the constant
    # for the following step. Afterwards $Xi is incremented and @X and @Tx
    # are rotated by one position.
    780 sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
    781 { use integer;
    782   my $body = shift;
    783   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    784   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    785 
    786 	 eval(shift(@insns));
    787 	 eval(shift(@insns));
    788 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    789 	 eval(shift(@insns));
    790 	 eval(shift(@insns));
    791 
    792 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    793 	 eval(shift(@insns));
    794 	 eval(shift(@insns));
    795 	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
    796 	 eval(shift(@insns));
    797 	 eval(shift(@insns));
    798 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
    799 	 eval(shift(@insns));
    800 	 eval(shift(@insns));
    801 
    802 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    803 	 eval(shift(@insns));
    804 	 eval(shift(@insns));
    805 	 eval(shift(@insns));
    806 	 eval(shift(@insns));
    807 
    808 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    809 	 eval(shift(@insns));
    810 	 eval(shift(@insns));
    811 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    812 	 eval(shift(@insns));
    813 	 eval(shift(@insns));
    814 
    815 	&vpsrld	(@Tx[0],@X[0],31);
    816 	 eval(shift(@insns));
    817 	 eval(shift(@insns));
    818 	 eval(shift(@insns));
    819 	 eval(shift(@insns));
    820 
    821 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
    822 	&vpaddd	(@X[0],@X[0],@X[0]);
    823 	 eval(shift(@insns));
    824 	 eval(shift(@insns));
    825 	 eval(shift(@insns));
    826 	 eval(shift(@insns));
    827 
    828 	&vpsrld	(@Tx[1],@Tx[2],30);
    829 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
    830 	 eval(shift(@insns));
    831 	 eval(shift(@insns));
    832 	 eval(shift(@insns));
    833 	 eval(shift(@insns));
    834 
    835 	&vpslld	(@Tx[2],@Tx[2],2);
    836 	&vpxor	(@X[0],@X[0],@Tx[1]);
    837 	 eval(shift(@insns));
    838 	 eval(shift(@insns));
    839 	 eval(shift(@insns));
    840 	 eval(shift(@insns));
    841 
    842 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
    843 	 eval(shift(@insns));
    844 	 eval(shift(@insns));
    845 	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    846 	 eval(shift(@insns));
    847 	 eval(shift(@insns));
    848 
    849 
    850 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    851 
    852   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    853 		push(@Tx,shift(@Tx));
    854 }
    855 
    # AVX counterpart of Xupdate_ssse3_32_79: emit four integer rounds
    # interleaved with the AVX instructions that compute the next
    # message-schedule vector in @X[0] (XOR of four earlier vectors, rotated
    # left by 2).
    #   $body - code ref returning one round's instruction strings
    # The round constant is either carried over or, when $Xi is a multiple of
    # 5, reloaded from the table at $K_XX_XX. Afterwards $Xi is incremented
    # and @X and @Tx are rotated.
    856 sub Xupdate_avx_32_79()
    857 { use integer;
    858   my $body = shift;
    859   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    860   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    861 
    862 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
    863 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
    864 	 eval(shift(@insns));		# body_20_39
    865 	 eval(shift(@insns));
    866 	 eval(shift(@insns));
    867 	 eval(shift(@insns));		# rol
    868 
    869 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
    870 	 eval(shift(@insns));
        	# NOTE(review): the round bodies in this file spell rotates as
        	# '&$_rol'/'&$_ror', which /&ro[rl]/ does not match, so this
        	# guard always passes here.
    871 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
    872 	if ($Xi%5) {
    873 	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    874 	} else {			# ... or load next one
    875 	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    876 	}
    877 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    878 	 eval(shift(@insns));		# ror
    879 	 eval(shift(@insns));
    880 
    881 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    882 	 eval(shift(@insns));		# body_20_39
    883 	 eval(shift(@insns));
    884 	 eval(shift(@insns));
    885 	 eval(shift(@insns));		# rol
    886 
    887 	&vpsrld	(@Tx[0],@X[0],30);
    888 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    889 	 eval(shift(@insns));
    890 	 eval(shift(@insns));
    891 	 eval(shift(@insns));		# ror
    892 	 eval(shift(@insns));
    893 
    894 	&vpslld	(@X[0],@X[0],2);
    895 	 eval(shift(@insns));		# body_20_39
    896 	 eval(shift(@insns));
    897 	 eval(shift(@insns));
    898 	 eval(shift(@insns));		# rol
    899 	 eval(shift(@insns));
    900 	 eval(shift(@insns));
    901 	 eval(shift(@insns));		# ror
    902 	 eval(shift(@insns));
    903 
    904 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
    905 	 eval(shift(@insns));		# body_20_39
    906 	 eval(shift(@insns));
    907 	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
    908 	 eval(shift(@insns));
    909 	 eval(shift(@insns));		# rol
    910 	 eval(shift(@insns));
    911 	 eval(shift(@insns));
    912 	 eval(shift(@insns));		# rol
    913 	 eval(shift(@insns));
    914 
    915 	 foreach (@insns) { eval; }	# remaining instructions
    916 
    917   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    918 		push(@Tx,shift(@Tx));
    919 }
    920 
    # Emit four integer rounds together with the final X[]+K store, then the
    # end-of-input test: if $inp has reached $num the emitted code jumps to
    # .Ldone_avx. Otherwise it reloads the byte-swap mask and K_00_19, loads
    # the next 64-byte block and starts byte-swapping it.
    #   $body - code ref returning one round's instruction strings
    # Resets $Xi to 0 for the Xloop_avx calls that follow.
    921 sub Xuplast_avx_80()
    922 { use integer;
    923   my $body = shift;
    924   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    925   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    926 
    927 	 eval(shift(@insns));
    928 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    929 	 eval(shift(@insns));
    930 	 eval(shift(@insns));
    931 	 eval(shift(@insns));
    932 	 eval(shift(@insns));
    933 
        	# Use the VEX-encoded store like every other X[]+K transfer on the
        	# AVX path, rather than a legacy-SSE movdqa in the middle of AVX code.
    934 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    935 
    936 	 foreach (@insns) { eval; }		# remaining instructions
    937 
    938 	&cmp	($inp,$num);
    939 	&je	(".Ldone_avx");
    940 
    941 	unshift(@Tx,pop(@Tx));		# rotate @Tx back by one position
    942 
    943 	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
    944 	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
    945 	&vmovdqu(@X[-4&7],"0($inp)");		# load input
    946 	&vmovdqu(@X[-3&7],"16($inp)");
    947 	&vmovdqu(@X[-2&7],"32($inp)");
    948 	&vmovdqu(@X[-1&7],"48($inp)");
    949 	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
    950 	&add	($inp,64);
    951 
    952   $Xi=0;
    953 }
    954 
    # Emit four integer rounds interleaved with preparation of the next input
    # block: byte-swap one vector, add the round constant to another into a
    # separate destination register, and store that X[]+K to stack slot
    # 16*$Xi.
    #   $body - code ref returning one round's instruction strings
    # Increments $Xi; @X is not rotated.
    955 sub Xloop_avx()
    956 { use integer;
    957   my $body = shift;
    958   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    959   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    960 
    961 	 eval(shift(@insns));
    962 	 eval(shift(@insns));
    963 	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
    964 	 eval(shift(@insns));
    965 	 eval(shift(@insns));
    966 	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
    967 	 eval(shift(@insns));
    968 	 eval(shift(@insns));
    969 	 eval(shift(@insns));
    970 	 eval(shift(@insns));
    971 	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
    972 	 eval(shift(@insns));
    973 	 eval(shift(@insns));
    974 
    975 	foreach (@insns) { eval; }
    976   $Xi++;
    977 }
    978 
    # Emit four integer rounds with no SIMD work interleaved; used after the
    # last input block, when there is nothing left to prepare.
    #   $body - code ref returning one round's instruction strings
    979 sub Xtail_avx()
    980 { use integer;
    981   my $body = shift;
    982   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    983   my ($a,$b,$c,$d,$e);			# assigned by the eval'ed strings
    984 
    985 	foreach (@insns) { eval; }
    986 }
    987 
    # Generate the AVX main loop: 16 schedule-update steps plus the final step
    # cover rounds 0..67 of the current block, each step emitting four rounds.
    988 $code.=<<___;
    989 .align	16
    990 .Loop_avx:
    991 ___
    992 	&Xupdate_avx_16_31(\&body_00_19);
    993 	&Xupdate_avx_16_31(\&body_00_19);
    994 	&Xupdate_avx_16_31(\&body_00_19);
    995 	&Xupdate_avx_16_31(\&body_00_19);
    996 	&Xupdate_avx_32_79(\&body_00_19);
    997 	&Xupdate_avx_32_79(\&body_20_39);
    998 	&Xupdate_avx_32_79(\&body_20_39);
    999 	&Xupdate_avx_32_79(\&body_20_39);
   1000 	&Xupdate_avx_32_79(\&body_20_39);
   1001 	&Xupdate_avx_32_79(\&body_20_39);
   1002 	&Xupdate_avx_32_79(\&body_40_59);
   1003 	&Xupdate_avx_32_79(\&body_40_59);
   1004 	&Xupdate_avx_32_79(\&body_40_59);
   1005 	&Xupdate_avx_32_79(\&body_40_59);
   1006 	&Xupdate_avx_32_79(\&body_40_59);
   1007 	&Xupdate_avx_32_79(\&body_20_39);
   1008 	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
   1009 
        				# Remember the generator state at the point
        				# where the emitted code may branch to
        				# .Ldone_avx; it is restored before the tail
        				# rounds are generated.
   1010 				$saved_j=$j; @saved_V=@V;
   1011 
        # Last 12 rounds, overlapped with preparation of the next block.
   1012 	&Xloop_avx(\&body_20_39);
   1013 	&Xloop_avx(\&body_20_39);
   1014 	&Xloop_avx(\&body_20_39);
   1015 
        # Fold the result into the context and continue with the next block.
   1016 $code.=<<___;
   1017 	add	0($ctx),$A			# update context
   1018 	add	4($ctx),@T[0]
   1019 	add	8($ctx),$C
   1020 	add	12($ctx),$D
   1021 	mov	$A,0($ctx)
   1022 	add	16($ctx),$E
   1023 	mov	@T[0],4($ctx)
   1024 	mov	@T[0],$B			# magic seed
   1025 	mov	$C,8($ctx)
   1026 	mov	$D,12($ctx)
   1027 	mov	$E,16($ctx)
   1028 	jmp	.Loop_avx
   1029 
   1030 .align	16
   1031 .Ldone_avx:
   1032 ___
   1033 				$j=$saved_j; @V=@saved_V;
   1034 
        # Last 12 rounds of the final block, with no SIMD work interleaved.
   1035 	&Xtail_avx(\&body_20_39);
   1036 	&Xtail_avx(\&body_20_39);
   1037 	&Xtail_avx(\&body_20_39);
   1038 
        # Clear the vector registers, write back the final context and, on
        # Win64, restore %xmm6-%xmm10.
   1039 $code.=<<___;
   1040 	vzeroall
   1041 
   1042 	add	0($ctx),$A			# update context
   1043 	add	4($ctx),@T[0]
   1044 	add	8($ctx),$C
   1045 	mov	$A,0($ctx)
   1046 	add	12($ctx),$D
   1047 	mov	@T[0],4($ctx)
   1048 	add	16($ctx),$E
   1049 	mov	$C,8($ctx)
   1050 	mov	$D,12($ctx)
   1051 	mov	$E,16($ctx)
   1052 ___
   1053 $code.=<<___ if ($win64);
   1054 	movaps	64+0(%rsp),%xmm6
   1055 	movaps	64+16(%rsp),%xmm7
   1056 	movaps	64+32(%rsp),%xmm8
   1057 	movaps	64+48(%rsp),%xmm9
   1058 	movaps	64+64(%rsp),%xmm10
   1059 ___
   1059 ___
   1060 $code.=<<___;
   1061 	lea	`64+($win64?5*16:0)`(%rsp),%rsi
   1062 	mov	0(%rsi),%r12
   1063 	mov	8(%rsi),%rbp
   1064 	mov	16(%rsi),%rbx
   1065 	lea	24(%rsi),%rsp
   1066 .Lepilogue_avx:
   1067 	ret
   1068 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
   1069 ___
   1070 }
    # Constant pool: four 16-byte rows, each holding one round constant
    # replicated across all four 32-bit lanes, followed at byte offset 64 by
    # the shuffle mask that the vector code loads as its "pbswap mask".
    # NOTE(review): presumably the register named by $K_XX_XX is pointed at
    # this label by code earlier in the file - confirm.
    1071 $code.=<<___;
    1072 .align	64
    1073 K_XX_XX:
    1074 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
    1075 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
    1076 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
    1077 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
    1078 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
    1079 ___
   1080 }}}
    # Identification string embedded in the generated object, followed by
    # padding to a 64-byte boundary.  The backslash in "\@" stops Perl from
    # interpolating an array inside the heredoc, so a literal "@" reaches the
    # assembler.
    1081 $code.=<<___;
    1082 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    1083 .align	64
    1084 ___
   1085 
    # Win64 only: structured-exception unwind support for the routines in
    # this file.  The block emits
    #   - se_handler, registered for sha1_block_data_order: when Rip lies
    #     between .Lprologue and .Lepilogue it fetches the saved stack pointer
    #     from the frame and writes the saved rbx/rbp/r12/r13 back into the
    #     CONTEXT record;
    #   - ssse3_handler, registered for both the SSSE3 and the AVX routine:
    #     it takes its prologue/epilogue bounds from HandlerData[], copies the
    #     saved vector registers into the CONTEXT record and restores
    #     rbx/rbp/r12;
    #   - .Lcommon_seh_tail, shared by both: it restores Rsp/Rsi/Rdi in the
    #     CONTEXT record, copies the record to disp->ContextRecord, calls
    #     RtlVirtualUnwind and returns ExceptionContinueSearch;
    #   - the .pdata/.xdata records that attach the handlers to the routines.
    #
    1086 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
    1087 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
    1088 if ($win64) {
    # The four handler arguments, named after the registers they arrive in.
    # Only $context and $disp are referenced by the assembly that follows.
    1089 $rec="%rcx";
    1090 $frame="%rdx";
    1091 $context="%r8";
    1092 $disp="%r9";
    1093 
   1094 $code.=<<___;
   1095 .extern	__imp_RtlVirtualUnwind
   1096 .type	se_handler,\@abi-omnipotent
   1097 .align	16
   1098 se_handler:
   1099 	push	%rsi
   1100 	push	%rdi
   1101 	push	%rbx
   1102 	push	%rbp
   1103 	push	%r12
   1104 	push	%r13
   1105 	push	%r14
   1106 	push	%r15
   1107 	pushfq
   1108 	sub	\$64,%rsp
   1109 
   1110 	mov	120($context),%rax	# pull context->Rax
   1111 	mov	248($context),%rbx	# pull context->Rip
   1112 
   1113 	lea	.Lprologue(%rip),%r10
   1114 	cmp	%r10,%rbx		# context->Rip<.Lprologue
   1115 	jb	.Lcommon_seh_tail
   1116 
   1117 	mov	152($context),%rax	# pull context->Rsp
   1118 
   1119 	lea	.Lepilogue(%rip),%r10
   1120 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   1121 	jae	.Lcommon_seh_tail
   1122 
   1123 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
   1124 	lea	32(%rax),%rax
   1125 
   1126 	mov	-8(%rax),%rbx
   1127 	mov	-16(%rax),%rbp
   1128 	mov	-24(%rax),%r12
   1129 	mov	-32(%rax),%r13
   1130 	mov	%rbx,144($context)	# restore context->Rbx
   1131 	mov	%rbp,160($context)	# restore context->Rbp
   1132 	mov	%r12,216($context)	# restore context->R12
   1133 	mov	%r13,224($context)	# restore context->R13
   1134 
   1135 	jmp	.Lcommon_seh_tail
   1136 .size	se_handler,.-se_handler
   1137 
   1138 .type	ssse3_handler,\@abi-omnipotent
   1139 .align	16
   1140 ssse3_handler:
   1141 	push	%rsi
   1142 	push	%rdi
   1143 	push	%rbx
   1144 	push	%rbp
   1145 	push	%r12
   1146 	push	%r13
   1147 	push	%r14
   1148 	push	%r15
   1149 	pushfq
   1150 	sub	\$64,%rsp
   1151 
   1152 	mov	120($context),%rax	# pull context->Rax
   1153 	mov	248($context),%rbx	# pull context->Rip
   1154 
   1155 	mov	8($disp),%rsi		# disp->ImageBase
   1156 	mov	56($disp),%r11		# disp->HandlerData
   1157 
   1158 	mov	0(%r11),%r10d		# HandlerData[0]
   1159 	lea	(%rsi,%r10),%r10	# prologue label
   1160 	cmp	%r10,%rbx		# context->Rip<prologue label
   1161 	jb	.Lcommon_seh_tail
   1162 
   1163 	mov	152($context),%rax	# pull context->Rsp
   1164 
   1165 	mov	4(%r11),%r10d		# HandlerData[1]
   1166 	lea	(%rsi,%r10),%r10	# epilogue label
   1167 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1168 	jae	.Lcommon_seh_tail
   1169 
   1170 	lea	64(%rax),%rsi
   1171 	lea	512($context),%rdi	# &context.Xmm6
   1172 	mov	\$10,%ecx
   1173 	.long	0xa548f3fc		# cld; rep movsq
   1174 	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
   1175 
   1176 	mov	-8(%rax),%rbx
   1177 	mov	-16(%rax),%rbp
   1178 	mov	-24(%rax),%r12
   1179 	mov	%rbx,144($context)	# restore context->Rbx
   1180 	mov	%rbp,160($context)	# restore context->Rbp
    1181 	mov	%r12,216($context)	# restore context->R12
   1182 
   1183 .Lcommon_seh_tail:
   1184 	mov	8(%rax),%rdi
   1185 	mov	16(%rax),%rsi
   1186 	mov	%rax,152($context)	# restore context->Rsp
   1187 	mov	%rsi,168($context)	# restore context->Rsi
   1188 	mov	%rdi,176($context)	# restore context->Rdi
   1189 
   1190 	mov	40($disp),%rdi		# disp->ContextRecord
   1191 	mov	$context,%rsi		# context
   1192 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1193 	.long	0xa548f3fc		# cld; rep movsq
   1194 
   1195 	mov	$disp,%rsi
   1196 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1197 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1198 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1199 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1200 	mov	40(%rsi),%r10		# disp->ContextRecord
   1201 	lea	56(%rsi),%r11		# &disp->HandlerData
   1202 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1203 	mov	%r10,32(%rsp)		# arg5
   1204 	mov	%r11,40(%rsp)		# arg6
   1205 	mov	%r12,48(%rsp)		# arg7
   1206 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1207 	call	*__imp_RtlVirtualUnwind(%rip)
   1208 
   1209 	mov	\$1,%eax		# ExceptionContinueSearch
   1210 	add	\$64,%rsp
   1211 	popfq
   1212 	pop	%r15
   1213 	pop	%r14
   1214 	pop	%r13
   1215 	pop	%r12
   1216 	pop	%rbp
   1217 	pop	%rbx
   1218 	pop	%rdi
   1219 	pop	%rsi
   1220 	ret
   1221 .size	ssse3_handler,.-ssse3_handler
   1222 
   1223 .section	.pdata
   1224 .align	4
   1225 	.rva	.LSEH_begin_sha1_block_data_order
   1226 	.rva	.LSEH_end_sha1_block_data_order
   1227 	.rva	.LSEH_info_sha1_block_data_order
   1228 	.rva	.LSEH_begin_sha1_block_data_order_ssse3
   1229 	.rva	.LSEH_end_sha1_block_data_order_ssse3
   1230 	.rva	.LSEH_info_sha1_block_data_order_ssse3
   1231 ___
   1232 $code.=<<___ if ($avx);
   1233 	.rva	.LSEH_begin_sha1_block_data_order_avx
   1234 	.rva	.LSEH_end_sha1_block_data_order_avx
   1235 	.rva	.LSEH_info_sha1_block_data_order_avx
   1236 ___
   1237 $code.=<<___;
   1238 .section	.xdata
   1239 .align	8
   1240 .LSEH_info_sha1_block_data_order:
   1241 	.byte	9,0,0,0
   1242 	.rva	se_handler
   1243 .LSEH_info_sha1_block_data_order_ssse3:
   1244 	.byte	9,0,0,0
   1245 	.rva	ssse3_handler
   1246 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   1247 ___
   1248 $code.=<<___ if ($avx);
   1249 .LSEH_info_sha1_block_data_order_avx:
   1250 	.byte	9,0,0,0
   1251 	.rva	ssse3_handler
   1252 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   1253 ___
   1254 }
   1255 
   1256 ####################################################################
   1257 
   1258 $code =~ s/\`([^\`]*)\`/eval $1/gem;
   1259 print $code;
   1260 close STDOUT;
   1261