Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # June 2011
     11 #
     12 # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
     13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
     14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
     15 # parallelism, interleaving it with another algorithm allows the
     16 # processor's resources to be utilized better and improves performance.
     17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
     18 # AESNI code is weaved into it. Below are performance numbers in
     19 # cycles per processed byte, less is better, for standalone AESNI-CBC
     20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
     21 # subroutine:
     22 #
     23 #		AES-128-CBC	+SHA1		stitch      gain
     24 # Westmere	3.77[+5.6]	9.37		6.65	    +41%
     25 # Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
     26 #
     27 #		AES-192-CBC
     28 # Westmere	4.51		10.11		6.97	    +45%
     29 # Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
     30 #
     31 #		AES-256-CBC
     32 # Westmere	5.25		10.85		7.25	    +50%
     33 # Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
     34 #
     35 # (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
     36 #	background information. Above numbers in parentheses are SSSE3
     37 #	results collected on AVX-capable CPU, i.e. apply on OSes that
     38 #	don't support AVX.
     39 #
     40 # Needless to mention that it makes no sense to implement "stitched"
     41 # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
     42 # fully utilize parallelism, so stitching would not give any gain
     43 # anyway. Well, there might be some, e.g. because of better cache
     44 # locality... For reference, here are performance results for
     45 # standalone AESNI-CBC decrypt:
     46 #
     47 #		AES-128-CBC	AES-192-CBC	AES-256-CBC
     48 # Westmere	1.31		1.55		1.80
     49 # Sandy Bridge	0.93		1.06		1.22
     50 
     51 $flavour = shift;
     52 $output  = shift;
     53 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     54 
     55 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     56 
     57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     58 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     59 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     60 die "can't locate x86_64-xlate.pl";
     61 
     62 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     63 		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
     64 	   $1>=2.19);
     65 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
     66 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
     67 	   $1>=2.09);
     68 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
     69 	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
     70 	   $1>=10);
     71 
     72 open OUT,"| \"$^X\" $xlate $flavour $output";
     73 *STDOUT=*OUT;
     74 
# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);
#
# NOTE(review): the Perl variables used by the code paths below name the
# first argument $in0 and the seventh (stack-passed) argument $inp, i.e. the
# other way round from the parameter names in this prototype.

# Public entry point: a small dispatcher that reads the two capability words
# at OPENSSL_ia32cap_P and tail-jumps to one of the implementations.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	16
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
# Emitted only when the assembler can encode AVX: the AVX path is taken when
# both bit 28 of the second word and bit 30 of the first word are set.
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
# Default: SSSE3 path.  The jmp is unconditional, so the ret that follows it
# is never reached.
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
    107 
# Incoming arguments as seen by the SSSE3 code: six in registers, while $inp
# names the scratch register that receives the seventh, stack-passed one.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

# $Xi counts the X[] update steps; @X is the ring of XMM registers holding
# the SHA1 message schedule and @Tx the XMM temporaries.  Both rings are
# rotated by the Xupdate_* subs.
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
# SHA1 working variables (also exported as $A..$E) and integer temporaries;
# the body_* instruction strings rotate @V and @T after every round.
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
# Generator state: $j = SHA1 round counter used to index the X[]+K stack
# slots, $jj = number of body_* invocations (drives aesenc placement),
# $r = AES step counter used by $aesenc, $sn = serial for .Laesenclast labels.
my $j=0; my $jj=0; my $r=0; my $sn=0;
# Register that holds the address of the K_XX_XX constant table.
my $K_XX_XX="%r11";
# AES state: $iv is the running CBC block, $in the current input block,
# $rndkey0 caches round key 0 and @rndkey is a two-register round-key ring.
my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
    119 
    120 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    121 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    122   my $arg = pop;
    123     $arg = "\$$arg" if ($arg*1 eq $arg);
    124     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    125 }
    126 
# Rotate emitters invoked from the body_* instruction strings as &$_rol and
# &$_ror.  In this (SSSE3) flavour they forward their arguments unchanged to
# rol/ror, which are presumably emitted through the AUTOLOAD thunk since no
# such subs are defined in the visible part of the file.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
    129 
# SSSE3 code path, function prologue: fetch the stack-passed 7th argument,
# push the callee-saved general-purpose registers and reserve the stack
# frame (104 bytes, plus 10*16 bytes for the XMM save area on Win64).
$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	16
aesni_cbc_sha1_enc_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
# Win64 only: spill %xmm6-%xmm15 at 96(%rsp) and up; the epilogue reloads
# them from the same slots.
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
# Copy the first four arguments into %r12-%r15, load the IV and park the IV
# pointer at 88(%rsp) so that its register can be reused below.
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
# From here on the Perl names refer to the copies in %r12-%r15, and $rounds
# is the 32-bit view of the register that used to hold $ivp.
my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
# Set-up before the main loop: $len becomes the end-of-input pointer for
# $inp (block count * 64 + $inp), $out becomes an offset relative to $in0,
# the five SHA1 state words are loaded from $ctx, the first 64 input bytes
# are byte-swapped into X[], and X[]+K for the first three groups is stored
# at 0/16/32(%rsp) for the integer rounds.  The first two AES round keys are
# preloaded as well.
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___
    208 
# Emits one step of AES-CBC encryption per call; the body_* subs splice
# calls to it between SHA1 instructions.  $r counts the steps: $n=$r/10 is
# the index of the 16-byte block being encrypted and $k=$r%10 the step
# within that block.
#   $k==0   load the next input block, xor it with round key 0 and into $iv,
#           store the previous ciphertext block (when $n>0), do the first
#           aesenc and preload the next round key;
#   $k==9   compare $rounds with 11 at run time: below 11 goes straight to
#           aesenclast, equal to 11 performs two more aesenc first, anything
#           larger performs four more; then reload round key 1 for the next
#           block;
#   else    one aesenc plus preload of the following round key.
# After every step the two-entry @rndkey ring is rotated.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;	# unique suffix for this block's .Laesenclast label
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
    251 
    252 sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
    253 { use integer;
    254   my $body = shift;
    255   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    256   my ($a,$b,$c,$d,$e);
    257 
    258 	&movdqa	(@X[0],@X[-3&7]);
    259 	 eval(shift(@insns));
    260 	 eval(shift(@insns));
    261 	&movdqa	(@Tx[0],@X[-1&7]);
    262 	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    263 	 eval(shift(@insns));
    264 	 eval(shift(@insns));
    265 
    266 	  &paddd	(@Tx[1],@X[-1&7]);
    267 	 eval(shift(@insns));
    268 	 eval(shift(@insns));
    269 	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
    270 	 eval(shift(@insns));
    271 	 eval(shift(@insns));
    272 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
    273 	 eval(shift(@insns));
    274 	 eval(shift(@insns));
    275 
    276 	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    277 	 eval(shift(@insns));
    278 	 eval(shift(@insns));
    279 	 eval(shift(@insns));
    280 	 eval(shift(@insns));
    281 
    282 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    283 	 eval(shift(@insns));
    284 	 eval(shift(@insns));
    285 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    286 	 eval(shift(@insns));
    287 	 eval(shift(@insns));
    288 
    289 	&movdqa	(@Tx[2],@X[0]);
    290 	&movdqa	(@Tx[0],@X[0]);
    291 	 eval(shift(@insns));
    292 	 eval(shift(@insns));
    293 	 eval(shift(@insns));
    294 	 eval(shift(@insns));
    295 
    296 	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
    297 	&paddd	(@X[0],@X[0]);
    298 	 eval(shift(@insns));
    299 	 eval(shift(@insns));
    300 	 eval(shift(@insns));
    301 	 eval(shift(@insns));
    302 
    303 	&psrld	(@Tx[0],31);
    304 	 eval(shift(@insns));
    305 	 eval(shift(@insns));
    306 	&movdqa	(@Tx[1],@Tx[2]);
    307 	 eval(shift(@insns));
    308 	 eval(shift(@insns));
    309 
    310 	&psrld	(@Tx[2],30);
    311 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
    312 	 eval(shift(@insns));
    313 	 eval(shift(@insns));
    314 	 eval(shift(@insns));
    315 	 eval(shift(@insns));
    316 
    317 	&pslld	(@Tx[1],2);
    318 	&pxor	(@X[0],@Tx[2]);
    319 	 eval(shift(@insns));
    320 	 eval(shift(@insns));
    321 	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    322 	 eval(shift(@insns));
    323 	 eval(shift(@insns));
    324 
    325 	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
    326 
    327 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    328 
    329   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    330 		push(@Tx,shift(@Tx));
    331 }
    332 
    333 sub Xupdate_ssse3_32_79()
    334 { use integer;
    335   my $body = shift;
    336   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    337   my ($a,$b,$c,$d,$e);
    338 
    339 	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
    340 	 eval(shift(@insns));		# body_20_39
    341 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
    342 	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
    343 	 eval(shift(@insns));
    344 	 eval(shift(@insns));
    345 	 eval(shift(@insns));		# rol
    346 
    347 	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
    348 	 eval(shift(@insns));
    349 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
    350 	if ($Xi%5) {
    351 	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    352 	} else {			# ... or load next one
    353 	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    354 	}
    355 	  &paddd	(@Tx[1],@X[-1&7]);
    356 	 eval(shift(@insns));		# ror
    357 	 eval(shift(@insns));
    358 
    359 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    360 	 eval(shift(@insns));		# body_20_39
    361 	 eval(shift(@insns));
    362 	 eval(shift(@insns));
    363 	 eval(shift(@insns));		# rol
    364 
    365 	&movdqa	(@Tx[0],@X[0]);
    366 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    367 	 eval(shift(@insns));
    368 	 eval(shift(@insns));
    369 	 eval(shift(@insns));		# ror
    370 	 eval(shift(@insns));
    371 
    372 	&pslld	(@X[0],2);
    373 	 eval(shift(@insns));		# body_20_39
    374 	 eval(shift(@insns));
    375 	&psrld	(@Tx[0],30);
    376 	 eval(shift(@insns));
    377 	 eval(shift(@insns));		# rol
    378 	 eval(shift(@insns));
    379 	 eval(shift(@insns));
    380 	 eval(shift(@insns));		# ror
    381 	 eval(shift(@insns));
    382 
    383 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
    384 	 eval(shift(@insns));		# body_20_39
    385 	 eval(shift(@insns));
    386 	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
    387 	 eval(shift(@insns));
    388 	 eval(shift(@insns));		# rol
    389 	 eval(shift(@insns));
    390 	 eval(shift(@insns));
    391 	 eval(shift(@insns));		# rol
    392 	 eval(shift(@insns));
    393 
    394 	 foreach (@insns) { eval; }	# remaining instructions
    395 
    396   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    397 		push(@Tx,shift(@Tx));
    398 }
    399 
    400 sub Xuplast_ssse3_80()
    401 { use integer;
    402   my $body = shift;
    403   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    404   my ($a,$b,$c,$d,$e);
    405 
    406 	 eval(shift(@insns));
    407 	  &paddd	(@Tx[1],@X[-1&7]);
    408 	 eval(shift(@insns));
    409 	 eval(shift(@insns));
    410 	 eval(shift(@insns));
    411 	 eval(shift(@insns));
    412 
    413 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    414 
    415 	 foreach (@insns) { eval; }		# remaining instructions
    416 
    417 	&cmp	($inp,$len);
    418 	&je	(".Ldone_ssse3");
    419 
    420 	unshift(@Tx,pop(@Tx));
    421 
    422 	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
    423 	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
    424 	&movdqu	(@X[-4&7],"0($inp)");		# load input
    425 	&movdqu	(@X[-3&7],"16($inp)");
    426 	&movdqu	(@X[-2&7],"32($inp)");
    427 	&movdqu	(@X[-1&7],"48($inp)");
    428 	&pshufb	(@X[-4&7],@X[2]);		# byte swap
    429 	&add	($inp,64);
    430 
    431   $Xi=0;
    432 }
    433 
    434 sub Xloop_ssse3()
    435 { use integer;
    436   my $body = shift;
    437   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    438   my ($a,$b,$c,$d,$e);
    439 
    440 	 eval(shift(@insns));
    441 	 eval(shift(@insns));
    442 	&pshufb	(@X[($Xi-3)&7],@X[2]);
    443 	 eval(shift(@insns));
    444 	 eval(shift(@insns));
    445 	&paddd	(@X[($Xi-4)&7],@Tx[1]);
    446 	 eval(shift(@insns));
    447 	 eval(shift(@insns));
    448 	 eval(shift(@insns));
    449 	 eval(shift(@insns));
    450 	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
    451 	 eval(shift(@insns));
    452 	 eval(shift(@insns));
    453 	&psubd	(@X[($Xi-4)&7],@Tx[1]);
    454 
    455 	foreach (@insns) { eval; }
    456   $Xi++;
    457 }
    458 
    459 sub Xtail_ssse3()
    460 { use integer;
    461   my $body = shift;
    462   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    463   my ($a,$b,$c,$d,$e);
    464 
    465 	foreach (@insns) { eval; }
    466 }
    467 
    468 sub body_00_19 () {
    469   use integer;
    470   my ($k,$n);
    471   my @r=(
    472 	'($a,$b,$c,$d,$e)=@V;'.
    473 	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
    474 	'&xor	($c,$d);',
    475 	'&mov	(@T[1],$a);',	# $b in next round
    476 	'&$_rol	($a,5);',
    477 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    478 	'&xor	($c,$d);',	# restore $c
    479 	'&xor	(@T[0],$d);',
    480 	'&add	($e,$a);',
    481 	'&$_ror	($b,$j?7:2);',	# $b>>>2
    482 	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    483 	);
    484 	$n = scalar(@r);
    485 	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
    486 	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
    487 	$jj++;
    488     return @r;
    489 }
    490 
    491 sub body_20_39 () {
    492   use integer;
    493   my ($k,$n);
    494   my @r=(
    495 	'($a,$b,$c,$d,$e)=@V;'.
    496 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    497 	'&xor	(@T[0],$d);',	# ($b^$d)
    498 	'&mov	(@T[1],$a);',	# $b in next round
    499 	'&$_rol	($a,5);',
    500 	'&xor	(@T[0],$c);',	# ($b^$d^$c)
    501 	'&add	($e,$a);',
    502 	'&$_ror	($b,7);',	# $b>>>2
    503 	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    504 	);
    505 	$n = scalar(@r);
    506 	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
    507 	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
    508 	$jj++;
    509     return @r;
    510 }
    511 
    512 sub body_40_59 () {
    513   use integer;
    514   my ($k,$n);
    515   my @r=(
    516 	'($a,$b,$c,$d,$e)=@V;'.
    517 	'&mov	(@T[1],$c);',
    518 	'&xor	($c,$d);',
    519 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    520 	'&and	(@T[1],$d);',
    521 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    522 	'&$_ror	($b,7);',	# $b>>>2
    523 	'&add	($e,@T[1]);',
    524 	'&mov	(@T[1],$a);',	# $b in next round
    525 	'&$_rol	($a,5);',
    526 	'&add	($e,@T[0]);',
    527 	'&xor	($c,$d);',	# restore $c
    528 	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    529 	);
    530 	$n = scalar(@r);
    531 	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
    532 	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
    533 	$jj++;
    534     return @r;
    535 }
# Main loop of the SSSE3 path: 17 generator calls of four rounds each cover
# rounds 0..67 of one 64-byte block.  The round bodies also emit the
# interleaved AES-CBC steps via $aesenc.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				# The last 12 rounds are generated twice: once here
				# for the "more input" case and once after
				# .Ldone_ssse3.  Snapshot the generator state so
				# that both copies start from the same point.
				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

# More input follows: store the last ciphertext block of this chunk, advance
# $in0 by 64, fold the working variables into the SHA1 context and loop.
$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				# Rewind the generator to the snapshot taken above
				# and emit the final rounds without input look-ahead.
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

# End of input: store the last ciphertext block, update the SHA1 context and
# write the final CBC block back through the IV pointer saved at 88(%rsp).
$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
# Win64 only: reload %xmm6-%xmm15 from the slots filled by the prologue.
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
# Epilogue: reload the six pushed registers in reverse push order and
# release the frame.
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

# Reset the generator counters before the next code path is generated.
$j=$jj=$r=$sn=0;
    635 
    636 if ($avx) {
# AVX code path: fresh lexical copies of the argument names and register
# rings, so that generation starts from the same initial state as the SSSE3
# path did.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");

# In this flavour the rotates used by the body_* strings are emitted as
# double shifts with the same register passed as both register operands.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

# Function prologue, same frame layout as the SSSE3 path: fetch the
# stack-passed 7th argument, push the callee-saved general-purpose registers
# and reserve 104 bytes (plus 10*16 bytes on Win64).
$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	16
aesni_cbc_sha1_enc_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
# Win64 only: spill %xmm6-%xmm15 at 96(%rsp) and up.
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
# Clear the vector registers, copy the first four arguments into %r12-%r15,
# load the IV and park the IV pointer at 88(%rsp).
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
# From here on the Perl names refer to the copies in %r12-%r15, and $rounds
# is the 32-bit view of the register that used to hold $ivp.
my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
# Set-up before the main loop.  Unlike the SSSE3 path, $key is biased by
# +112 here, which is why every round-key offset in the AVX code carries a
# matching -112.  X[]+K for the first three groups is computed into separate
# registers, so no subtraction is needed to restore X[].
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	\$112,$key		# size optimization
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___
    725 
# AVX flavour of the AES-CBC step emitter; the body_* subs splice calls to
# it between SHA1 instructions.  $r counts the steps: $n=$r/10 is the index
# of the 16-byte block being encrypted and $k=$r%10 the step within that
# block.  All round-key offsets carry -112 because $key was advanced by 112
# in the prologue.
#   $k==0   load the next input block, xor it with round key 0 and into $iv,
#           store the previous ciphertext block (when $n>0), do the first
#           vaesenc and preload the next round key;
#   $k==9   compare $rounds with 11 at run time: below 11 goes straight to
#           vaesenclast, equal to 11 performs two more vaesenc first,
#           anything larger performs four more; then reload round key 1 for
#           the next block;
#   else    one vaesenc plus preload of the following round key.
# After every step the two-entry @rndkey ring is rotated.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovups		`16*$n`($in0),$in		# load input
	vxorps		$rndkey0,$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vxorps		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;	# unique suffix for this block's .Lvaesenclast label
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
    768 
    769 sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
    770 { use integer;
    771   my $body = shift;
    772   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    773   my ($a,$b,$c,$d,$e);
    774 
    775 	 eval(shift(@insns));
    776 	 eval(shift(@insns));
    777 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    778 	 eval(shift(@insns));
    779 	 eval(shift(@insns));
    780 
    781 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    782 	 eval(shift(@insns));
    783 	 eval(shift(@insns));
    784 	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
    785 	 eval(shift(@insns));
    786 	 eval(shift(@insns));
    787 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
    788 	 eval(shift(@insns));
    789 	 eval(shift(@insns));
    790 
    791 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    792 	 eval(shift(@insns));
    793 	 eval(shift(@insns));
    794 	 eval(shift(@insns));
    795 	 eval(shift(@insns));
    796 
    797 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    798 	 eval(shift(@insns));
    799 	 eval(shift(@insns));
    800 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    801 	 eval(shift(@insns));
    802 	 eval(shift(@insns));
    803 
    804 	&vpsrld	(@Tx[0],@X[0],31);
    805 	 eval(shift(@insns));
    806 	 eval(shift(@insns));
    807 	 eval(shift(@insns));
    808 	 eval(shift(@insns));
    809 
    810 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
    811 	&vpaddd	(@X[0],@X[0],@X[0]);
    812 	 eval(shift(@insns));
    813 	 eval(shift(@insns));
    814 	 eval(shift(@insns));
    815 	 eval(shift(@insns));
    816 
    817 	&vpsrld	(@Tx[1],@Tx[2],30);
    818 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
    819 	 eval(shift(@insns));
    820 	 eval(shift(@insns));
    821 	 eval(shift(@insns));
    822 	 eval(shift(@insns));
    823 
    824 	&vpslld	(@Tx[2],@Tx[2],2);
    825 	&vpxor	(@X[0],@X[0],@Tx[1]);
    826 	 eval(shift(@insns));
    827 	 eval(shift(@insns));
    828 	 eval(shift(@insns));
    829 	 eval(shift(@insns));
    830 
    831 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
    832 	 eval(shift(@insns));
    833 	 eval(shift(@insns));
    834 	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    835 	 eval(shift(@insns));
    836 	 eval(shift(@insns));
    837 
    838 
    839 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    840 
    841   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    842 		push(@Tx,shift(@Tx));
    843 }
    844 
    845 sub Xupdate_avx_32_79()
    846 { use integer;
    847   my $body = shift;
    848   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    849   my ($a,$b,$c,$d,$e);
    850 
    851 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
    852 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
    853 	 eval(shift(@insns));		# body_20_39
    854 	 eval(shift(@insns));
    855 	 eval(shift(@insns));
    856 	 eval(shift(@insns));		# rol
    857 
    858 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
    859 	 eval(shift(@insns));
    860 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
    861 	if ($Xi%5) {
    862 	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    863 	} else {			# ... or load next one
    864 	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    865 	}
    866 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    867 	 eval(shift(@insns));		# ror
    868 	 eval(shift(@insns));
    869 
    870 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    871 	 eval(shift(@insns));		# body_20_39
    872 	 eval(shift(@insns));
    873 	 eval(shift(@insns));
    874 	 eval(shift(@insns));		# rol
    875 
    876 	&vpsrld	(@Tx[0],@X[0],30);
    877 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    878 	 eval(shift(@insns));
    879 	 eval(shift(@insns));
    880 	 eval(shift(@insns));		# ror
    881 	 eval(shift(@insns));
    882 
    883 	&vpslld	(@X[0],@X[0],2);
    884 	 eval(shift(@insns));		# body_20_39
    885 	 eval(shift(@insns));
    886 	 eval(shift(@insns));
    887 	 eval(shift(@insns));		# rol
    888 	 eval(shift(@insns));
    889 	 eval(shift(@insns));
    890 	 eval(shift(@insns));		# ror
    891 	 eval(shift(@insns));
    892 
    893 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
    894 	 eval(shift(@insns));		# body_20_39
    895 	 eval(shift(@insns));
    896 	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
    897 	 eval(shift(@insns));
    898 	 eval(shift(@insns));		# rol
    899 	 eval(shift(@insns));
    900 	 eval(shift(@insns));
    901 	 eval(shift(@insns));		# rol
    902 	 eval(shift(@insns));
    903 
    904 	 foreach (@insns) { eval; }	# remaining instructions
    905 
    906   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    907 		push(@Tx,shift(@Tx));
    908 }
    909 
    910 sub Xuplast_avx_80()
    911 { use integer;
    912   my $body = shift;
    913   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    914   my ($a,$b,$c,$d,$e);
    915 
    916 	 eval(shift(@insns));
    917 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    918 	 eval(shift(@insns));
    919 	 eval(shift(@insns));
    920 	 eval(shift(@insns));
    921 	 eval(shift(@insns));
    922 
    923 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    924 
    925 	 foreach (@insns) { eval; }		# remaining instructions
    926 
    927 	&cmp	($inp,$len);
    928 	&je	(".Ldone_avx");
    929 
    930 	unshift(@Tx,pop(@Tx));
    931 
    932 	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
    933 	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
    934 	&vmovdqu(@X[-4&7],"0($inp)");		# load input
    935 	&vmovdqu(@X[-3&7],"16($inp)");
    936 	&vmovdqu(@X[-2&7],"32($inp)");
    937 	&vmovdqu(@X[-1&7],"48($inp)");
    938 	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
    939 	&add	($inp,64);
    940 
    941   $Xi=0;
    942 }
    943 
    # Xloop_avx($body)
    #
    # Emits four invocations' worth of instruction strings from the code
    # reference $body, interleaved with three SIMD instructions that prepare
    # input words for the next pass through the loop:
    #   - vpshufb: byte-swap @X[($Xi-3)&7] using the mask held in @X[2]
    #     (Xuplast_avx_80 loads the pbswap mask into @X[2]),
    #   - vpaddd:  @X[$Xi&7] = @X[($Xi-4)&7] + @Tx[1]
    #     (Xuplast_avx_80 loads K_00_19 into @Tx[1]),
    #   - vmovdqa: store that sum to the stack slot 16*$Xi(%rsp).
    # Increments the global $Xi on exit.
    944 sub Xloop_avx()
    945 { use integer;
    946   my $body = shift;
    947   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    # NOTE(review): presumably these are the lexicals the eval'ed strings
    # refer to - confirm against the body_* subs.
    948   my ($a,$b,$c,$d,$e);
    949 
    950 	 eval(shift(@insns));
    951 	 eval(shift(@insns));
    952 	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
    953 	 eval(shift(@insns));
    954 	 eval(shift(@insns));
    955 	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
    956 	 eval(shift(@insns));
    957 	 eval(shift(@insns));
    958 	 eval(shift(@insns));
    959 	 eval(shift(@insns));
    960 	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
    961 	 eval(shift(@insns));
    962 	 eval(shift(@insns));
    963 
    # Flush whatever instruction strings were not interleaved above.
    964 	foreach (@insns) { eval; }
    965   $Xi++;
    966 }
    967 
    # Xtail_avx($body)
    #
    # Calls the code reference $body four times and eval's every returned
    # instruction string in order, with no SIMD instructions interleaved.
    # It is used after .Ldone_avx, in place of the Xloop_avx calls that
    # follow Xuplast_avx_80 in the main loop.
    968 sub Xtail_avx()
    969 { use integer;
    970   my $body = shift;
    971   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    # NOTE(review): presumably these are the lexicals the eval'ed strings
    # refer to - confirm against the body_* subs.
    972   my ($a,$b,$c,$d,$e);
    973 
    974 	foreach (@insns) { eval; }
    975 }
    976 
    # Generate the body of .Loop_avx.  The helper calls below are made with
    # the round bodies in the order body_00_19, body_20_39, body_40_59 and
    # body_20_39 again; Xuplast_avx_80 additionally emits the branch to
    # .Ldone_avx that is taken when $inp equals $len.
    977 $code.=<<___;
    978 .align	16
    979 .Loop_avx:
    980 ___
    981 	&Xupdate_avx_16_31(\&body_00_19);
    982 	&Xupdate_avx_16_31(\&body_00_19);
    983 	&Xupdate_avx_16_31(\&body_00_19);
    984 	&Xupdate_avx_16_31(\&body_00_19);
    985 	&Xupdate_avx_32_79(\&body_00_19);
    986 	&Xupdate_avx_32_79(\&body_20_39);
    987 	&Xupdate_avx_32_79(\&body_20_39);
    988 	&Xupdate_avx_32_79(\&body_20_39);
    989 	&Xupdate_avx_32_79(\&body_20_39);
    990 	&Xupdate_avx_32_79(\&body_20_39);
    991 	&Xupdate_avx_32_79(\&body_40_59);
    992 	&Xupdate_avx_32_79(\&body_40_59);
    993 	&Xupdate_avx_32_79(\&body_40_59);
    994 	&Xupdate_avx_32_79(\&body_40_59);
    995 	&Xupdate_avx_32_79(\&body_40_59);
    996 	&Xupdate_avx_32_79(\&body_20_39);
    997 	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
    998 
    # Snapshot the generator state here: the code after .Ldone_avx is entered
    # from the branch emitted by Xuplast_avx_80, so it is generated starting
    # from this same state (restored further down).
    999 				$saved_j=$j; @saved_V=@V;
   1000 				$saved_r=$r; @saved_rndkey=@rndkey;
   1001 
    # Fall-through path: remaining round bodies interleaved with preparation
    # of the input block that was just loaded.
   1002 	&Xloop_avx(\&body_20_39);
   1003 	&Xloop_avx(\&body_20_39);
   1004 	&Xloop_avx(\&body_20_39);
   1005 
    # End of one loop iteration: write output, advance $in0 by 64, add the
    # working variables into the context at $ctx and jump back to .Loop_avx.
   1006 $code.=<<___;
   1007 	vmovups	$iv,48($out,$in0)		# write output
   1008 	lea	64($in0),$in0
   1009 
   1010 	add	0($ctx),$A			# update context
   1011 	add	4($ctx),@T[0]
   1012 	add	8($ctx),$C
   1013 	add	12($ctx),$D
   1014 	mov	$A,0($ctx)
   1015 	add	16($ctx),$E
   1016 	mov	@T[0],4($ctx)
   1017 	mov	@T[0],$B			# magic seed
   1018 	mov	$C,8($ctx)
   1019 	mov	$D,12($ctx)
   1020 	mov	$E,16($ctx)
   1021 	jmp	.Loop_avx
   1022 
   1023 .align	16
   1024 .Ldone_avx:
   1025 ___
    # Rewind the generator to the snapshot taken after Xuplast_avx_80.
   1026 				$jj=$j=$saved_j; @V=@saved_V;
   1027 				$r=$saved_r;     @rndkey=@saved_rndkey;
   1028 
    # Same round bodies as the fall-through path, without any next-block
    # preparation.
   1029 	&Xtail_avx(\&body_20_39);
   1030 	&Xtail_avx(\&body_20_39);
   1031 	&Xtail_avx(\&body_20_39);
   1032 
    # Final iteration: write output, reload $ivp from 88(%rsp), update the
    # context, store the IV through $ivp and clear the vector registers.
   1033 $code.=<<___;
   1034 	vmovups	$iv,48($out,$in0)		# write output
   1035 	mov	88(%rsp),$ivp			# restore $ivp
   1036 
   1037 	add	0($ctx),$A			# update context
   1038 	add	4($ctx),@T[0]
   1039 	add	8($ctx),$C
   1040 	mov	$A,0($ctx)
   1041 	add	12($ctx),$D
   1042 	mov	@T[0],4($ctx)
   1043 	add	16($ctx),$E
   1044 	mov	$C,8($ctx)
   1045 	mov	$D,12($ctx)
   1046 	mov	$E,16($ctx)
   1047 	vmovups	$iv,($ivp)			# write IV
   1048 	vzeroall
   1049 ___
    # Win64 only: reload xmm6-xmm15 from the save area starting at 96(%rsp).
   1050 $code.=<<___ if ($win64);
   1051 	movaps	96+0(%rsp),%xmm6
   1052 	movaps	96+16(%rsp),%xmm7
   1053 	movaps	96+32(%rsp),%xmm8
   1054 	movaps	96+48(%rsp),%xmm9
   1055 	movaps	96+64(%rsp),%xmm10
   1056 	movaps	96+80(%rsp),%xmm11
   1057 	movaps	96+96(%rsp),%xmm12
   1058 	movaps	96+112(%rsp),%xmm13
   1059 	movaps	96+128(%rsp),%xmm14
   1060 	movaps	96+144(%rsp),%xmm15
   1061 ___
    # Epilogue: reload %r15, %r14, %r13, %r12, %rbp and %rbx from the frame
    # (which sits 10*16 bytes higher when the Win64 xmm save area is present),
    # release the frame and return.
   1062 $code.=<<___;
   1063 	lea	`104+($win64?10*16:0)`(%rsp),%rsi
   1064 	mov	0(%rsi),%r15
   1065 	mov	8(%rsi),%r14
   1066 	mov	16(%rsi),%r13
   1067 	mov	24(%rsi),%r12
   1068 	mov	32(%rsi),%rbp
   1069 	mov	40(%rsi),%rbx
   1070 	lea	48(%rsi),%rsp
   1071 .Lepilogue_avx:
   1072 	ret
   1073 .size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
   1074 ___
   1075 }
   # Constant table addressed through $K_XX_XX by the code above: four rows
   # of round constants, each value replicated into all four 32-bit lanes,
   # followed by the byte-swap shuffle mask (read from offset 64) and an
   # identification string.
   1076 $code.=<<___;
   1077 .align	64
   1078 K_XX_XX:
   1079 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1080 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1081 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1082 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1083 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1084 
   1085 .asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1086 .align	64
   1087 ___
   1088 
   1089 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1090 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   #
   # Win64-only structured-exception support.  ssse3_handler is referenced
   # by the unwind records of both entry points (see the .xdata section
   # below).  It compares context->Rip with the prologue and epilogue labels
   # supplied in HandlerData[]; only when Rip lies between them does it copy
   # the saved xmm6-xmm15 area (20 quadwords starting at 96 bytes above the
   # function's stack pointer) and the saved %rbx, %rbp, %r12-%r15 back into
   # the CONTEXT record.  In every case it then restores Rsp/Rsi/Rdi in the
   # CONTEXT, copies the CONTEXT to disp->ContextRecord, calls
   # RtlVirtualUnwind and returns ExceptionContinueSearch.
   1091 if ($win64) {
   1092 $rec="%rcx";
   1093 $frame="%rdx";
   1094 $context="%r8";
   1095 $disp="%r9";
   1096 
   1097 $code.=<<___;
   1098 .extern	__imp_RtlVirtualUnwind
   1099 .type	ssse3_handler,\@abi-omnipotent
   1100 .align	16
   1101 ssse3_handler:
   1102 	push	%rsi
   1103 	push	%rdi
   1104 	push	%rbx
   1105 	push	%rbp
   1106 	push	%r12
   1107 	push	%r13
   1108 	push	%r14
   1109 	push	%r15
   1110 	pushfq
   1111 	sub	\$64,%rsp
   1112 
   1113 	mov	120($context),%rax	# pull context->Rax
   1114 	mov	248($context),%rbx	# pull context->Rip
   1115 
   1116 	mov	8($disp),%rsi		# disp->ImageBase
   1117 	mov	56($disp),%r11		# disp->HandlerData
   1118 
   1119 	mov	0(%r11),%r10d		# HandlerData[0]
   1120 	lea	(%rsi,%r10),%r10	# prologue label
   1121 	cmp	%r10,%rbx		# context->Rip<prologue label
   1122 	jb	.Lcommon_seh_tail
   1123 
   1124 	mov	152($context),%rax	# pull context->Rsp
   1125 
   1126 	mov	4(%r11),%r10d		# HandlerData[1]
   1127 	lea	(%rsi,%r10),%r10	# epilogue label
   1128 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1129 	jae	.Lcommon_seh_tail
   1130 
   1131 	lea	96(%rax),%rsi
   1132 	lea	512($context),%rdi	# &context.Xmm6
   1133 	mov	\$20,%ecx
   1134 	.long	0xa548f3fc		# cld; rep movsq
   1135 	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
   1136 
   1137 	mov	0(%rax),%r15
   1138 	mov	8(%rax),%r14
   1139 	mov	16(%rax),%r13
   1140 	mov	24(%rax),%r12
   1141 	mov	32(%rax),%rbp
   1142 	mov	40(%rax),%rbx
   1143 	lea	48(%rax),%rax
   1144 	mov	%rbx,144($context)	# restore context->Rbx
   1145 	mov	%rbp,160($context)	# restore context->Rbp
   1146 	mov	%r12,216($context)	# restore context->R12
   1147 	mov	%r13,224($context)	# restore context->R13
   1148 	mov	%r14,232($context)	# restore context->R14
   1149 	mov	%r15,240($context)	# restore context->R15
   1150 
   1151 .Lcommon_seh_tail:
   1152 	mov	8(%rax),%rdi
   1153 	mov	16(%rax),%rsi
   1154 	mov	%rax,152($context)	# restore context->Rsp
   1155 	mov	%rsi,168($context)	# restore context->Rsi
   1156 	mov	%rdi,176($context)	# restore context->Rdi
   1157 
   1158 	mov	40($disp),%rdi		# disp->ContextRecord
   1159 	mov	$context,%rsi		# context
   1160 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1161 	.long	0xa548f3fc		# cld; rep movsq
   1162 
   1163 	mov	$disp,%rsi
   1164 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1165 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1166 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1167 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1168 	mov	40(%rsi),%r10		# disp->ContextRecord
   1169 	lea	56(%rsi),%r11		# &disp->HandlerData
   1170 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1171 	mov	%r10,32(%rsp)		# arg5
   1172 	mov	%r11,40(%rsp)		# arg6
   1173 	mov	%r12,48(%rsp)		# arg7
   1174 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1175 	call	*__imp_RtlVirtualUnwind(%rip)
   1176 
   1177 	mov	\$1,%eax		# ExceptionContinueSearch
   1178 	add	\$64,%rsp
   1179 	popfq
   1180 	pop	%r15
   1181 	pop	%r14
   1182 	pop	%r13
   1183 	pop	%r12
   1184 	pop	%rbp
   1185 	pop	%rbx
   1186 	pop	%rdi
   1187 	pop	%rsi
   1188 	ret
   1189 .size	ssse3_handler,.-ssse3_handler
   1190 
   1191 .section	.pdata
   1192 .align	4
   1193 	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
   1194 	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
   1195 	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
   1196 ___
   # The AVX entry point gets a .pdata record only when $avx is set.
   1197 $code.=<<___ if ($avx);
   1198 	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
   1199 	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
   1200 	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
   1201 ___
   # Unwind info records: each names ssse3_handler and passes that entry
   # point's prologue/epilogue labels as HandlerData[].
   1202 $code.=<<___;
   1203 .section	.xdata
   1204 .align	8
   1205 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
   1206 	.byte	9,0,0,0
   1207 	.rva	ssse3_handler
   1208 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   1209 ___
   # The AVX record reuses the same handler with the AVX labels.
   1210 $code.=<<___ if ($avx);
   1211 .LSEH_info_aesni_cbc_sha1_enc_avx:
   1212 	.byte	9,0,0,0
   1213 	.rva	ssse3_handler
   1214 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   1215 ___
   1216 }
   1217 
   1218 ####################################################################
   1219 sub rex {
   1220   local *opcode=shift;
   1221   my ($dst,$src)=@_;
   1222   my $rex=0;
   1223 
   1224     $rex|=0x04			if($dst>=8);
   1225     $rex|=0x01			if($src>=8);
   1226     push @opcode,$rex|0x40	if($rex);
   1227 }
   1228 
   1229 sub aesni {
   1230   my $line=shift;
   1231   my @opcode=(0x66);
   1232 
   1233     if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   1234 	my %opcodelet = (
   1235 		"aesenc" => 0xdc,	"aesenclast" => 0xdd
   1236 	);
   1237 	return undef if (!defined($opcodelet{$1}));
   1238 	rex(\@opcode,$3,$2);
   1239 	push @opcode,0x0f,0x38,$opcodelet{$1};
   1240 	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
   1241 	return ".byte\t".join(',',@opcode);
   1242     }
   1243     return $line;
   1244 }
   1245 
   1246 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   1247 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
   1248 
   1249 print $code;
   1250 close STDOUT;
   1251