Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # June 2011
     11 #
     12 # This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
     13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
     14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
     15 # parallelism, interleaving it with another algorithm allows processor
     16 # resources to be utilized better and so achieves better performance.
     17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
     18 # AESNI code is weaved into it. Below are performance numbers in
     19 # cycles per processed byte, less is better, for standalone AESNI-CBC
     20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
     21 # subroutine:
     22 #
     23 #		AES-128-CBC	+SHA1		stitch      gain
     24 # Westmere	3.77[+5.6]	9.37		6.65	    +41%
     25 # Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
     26 #
     27 #		AES-192-CBC
     28 # Westmere	4.51		10.11		6.97	    +45%
     29 # Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
     30 #
     31 #		AES-256-CBC
     32 # Westmere	5.25		10.85		7.25	    +50%
     33 # Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
     34 #
     35 # (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
     36 #	background information. Above numbers in parentheses are SSSE3
     37 #	results collected on AVX-capable CPU, i.e. apply on OSes that
     38 #	don't support AVX.
     39 #
     40 # Needless to say, it makes no sense to implement a "stitched"
     41 # *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
     42 # fully utilize parallelism, so stitching would not give any gain
     43 # anyway. Well, there might be some, e.g. because of better cache
     44 # locality... For reference, here are performance results for
     45 # standalone AESNI-CBC decrypt:
     46 #
     47 #		AES-128-CBC	AES-192-CBC	AES-256-CBC
     48 # Westmere	1.31		1.55		1.80
     49 # Sandy Bridge	0.93		1.06		1.22
     50 
     51 $flavour = shift;
     52 $output  = shift;
     53 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     54 
     55 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     56 
     57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     58 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     59 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     60 die "can't locate x86_64-xlate.pl";
     61 
     62 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     63 		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
     64 	   $1>=2.19);
     65 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
     66 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
     67 	   $1>=2.09);
     68 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
     69 	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
     70 	   $1>=10);
     71 
     72 open STDOUT,"| $^X $xlate $flavour $output";
     73 
     74 # void aesni_cbc_sha1_enc(const void *inp,
     75 #			void *out,
     76 #			size_t length,
     77 #			const AES_KEY *key,
     78 #			unsigned char *iv,
     79 #			SHA_CTX *ctx,
     80 #			const void *in0);
     81 
     82 $code.=<<___;	# public entry point: reads OPENSSL_ia32cap_P and jumps to a code path
     83 .text
     84 .extern	OPENSSL_ia32cap_P
     85 
     86 .globl	aesni_cbc_sha1_enc
     87 .type	aesni_cbc_sha1_enc,\@abi-omnipotent
     88 .align	16
     89 aesni_cbc_sha1_enc:
     90 	# caller should check for SSSE3 and AES-NI bits
     91 	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
     92 	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
     93 ___
        # Emitted only when $avx is set: jump to the AVX routine if both bit 28
        # of the second capability word and bit 30 of the first one are set.
     94 $code.=<<___ if ($avx);
     95 	and	\$`1<<28`,%r11d		# mask AVX bit
     96 	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
     97 	or	%r11d,%r10d
     98 	cmp	\$`1<<28|1<<30`,%r10d
     99 	je	aesni_cbc_sha1_enc_avx
    100 ___
        # Otherwise continue in the SSSE3 routine.
        # NOTE(review): the "ret" after the unconditional jmp looks unreachable.
    101 $code.=<<___;
    102 	jmp	aesni_cbc_sha1_enc_ssse3
    103 	ret
    104 .size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
    105 ___
    106 
    107 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
        # The names above are the argument registers on entry; $inp (%r10) is
        # filled from the stack by the prologue ("load 7th argument").
    108 
        # $Xi is the message-schedule step counter, @X the rotating set of eight
        # XMM registers holding schedule words and @Tx three XMM temporaries.
    109 my $Xi=4;
    110 my @X=map("%xmm$_",(4..7,0..3));
    111 my @Tx=map("%xmm$_",(8..10));
        # @V holds the five 32-bit working registers ($A..$E are also set as
        # globals); @T holds two integer scratch registers.
    112 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    113 my @T=("%esi","%edi");
        # Generator state: $j and $jj are advanced by the body_* generators,
        # $r counts calls of $aesenc and $sn numbers the .Laesenclast labels.
    114 my $j=0; my $jj=0; my $r=0; my $sn=0;
        # Register that holds the address of the K_XX_XX constant table.
    115 my $K_XX_XX="%r11";
        # AES registers: chaining value, input block, round key 0, and two
        # round-key registers that $aesenc alternates between.
    116 my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
    117 my @rndkey=("%xmm14","%xmm15");
    118 
    119 sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
    120 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    121   my $arg = pop;
    122     $arg = "\$$arg" if ($arg*1 eq $arg);
    123     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    124 }
    125 
    126 my $_rol=sub { &rol(@_) };	# rotate helpers called from the body_* snippets;
    127 my $_ror=sub { &ror(@_) };	# here they forward to rol/ror (emitted via AUTOLOAD)
    128 
    129 $code.=<<___;	# SSSE3 prologue: fetch 7th argument, push %rbx/%rbp/%r12-%r15, reserve the frame
    130 .type	aesni_cbc_sha1_enc_ssse3,\@function,6
    131 .align	16
    132 aesni_cbc_sha1_enc_ssse3:
    133 	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
    134 	#shr	\$6,$len			# debugging artefact
    135 	#jz	.Lepilogue_ssse3		# debugging artefact
    136 	push	%rbx
    137 	push	%rbp
    138 	push	%r12
    139 	push	%r13
    140 	push	%r14
    141 	push	%r15
    142 	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
    143 	#mov	$in0,$inp			# debugging artefact
    144 	#lea	64(%rsp),$ctx			# debugging artefact
    145 ___
        # Win64 only: store xmm6-xmm15 in the frame starting at offset 96.
    146 $code.=<<___ if ($win64);
    147 	movaps	%xmm6,96+0(%rsp)
    148 	movaps	%xmm7,96+16(%rsp)
    149 	movaps	%xmm8,96+32(%rsp)
    150 	movaps	%xmm9,96+48(%rsp)
    151 	movaps	%xmm10,96+64(%rsp)
    152 	movaps	%xmm11,96+80(%rsp)
    153 	movaps	%xmm12,96+96(%rsp)
    154 	movaps	%xmm13,96+112(%rsp)
    155 	movaps	%xmm14,96+128(%rsp)
    156 	movaps	%xmm15,96+144(%rsp)
    157 .Lprologue_ssse3:
    158 ___
        # Copy the first four arguments to %r12-%r15, load the IV and keep the
        # IV pointer at 88(%rsp) so it can be written back at the end.
    159 $code.=<<___;
    160 	mov	$in0,%r12			# reassign arguments
    161 	mov	$out,%r13
    162 	mov	$len,%r14
    163 	mov	$key,%r15
    164 	movdqu	($ivp),$iv			# load IV
    165 	mov	$ivp,88(%rsp)			# save $ivp
    166 ___
        # From here on the Perl names denote the new registers; the register
        # that held $ivp is reused, in its 32-bit form, for the AES round count.
    167 my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
    168 my $rounds="${ivp}d";
        # Turn $len into an end-of-input pointer, make $out an offset relative
        # to $in0, load the five state words, byte-swap the first 64 input
        # bytes and put X[]+K for the first three vectors on the stack.
    169 $code.=<<___;
    170 	shl	\$6,$len
    171 	sub	$in0,$out
    172 	mov	240($key),$rounds
    173 	add	$inp,$len		# end of input
    174 
    175 	lea	K_XX_XX(%rip),$K_XX_XX
    176 	mov	0($ctx),$A		# load context
    177 	mov	4($ctx),$B
    178 	mov	8($ctx),$C
    179 	mov	12($ctx),$D
    180 	mov	$B,@T[0]		# magic seed
    181 	mov	16($ctx),$E
    182 
    183 	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
    184 	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
    185 	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    186 	movdqu	16($inp),@X[-3&7]
    187 	movdqu	32($inp),@X[-2&7]
    188 	movdqu	48($inp),@X[-1&7]
    189 	pshufb	@X[2],@X[-4&7]		# byte swap
    190 	add	\$64,$inp
    191 	pshufb	@X[2],@X[-3&7]
    192 	pshufb	@X[2],@X[-2&7]
    193 	pshufb	@X[2],@X[-1&7]
    194 	paddd	@Tx[1],@X[-4&7]		# add K_00_19
    195 	paddd	@Tx[1],@X[-3&7]
    196 	paddd	@Tx[1],@X[-2&7]
    197 	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
    198 	psubd	@Tx[1],@X[-4&7]		# restore X[]
    199 	movdqa	@X[-3&7],16(%rsp)
    200 	psubd	@Tx[1],@X[-3&7]
    201 	movdqa	@X[-2&7],32(%rsp)
    202 	psubd	@Tx[1],@X[-2&7]
    203 	movups	($key),$rndkey0		# $key[0]
    204 	movups	16($key),$rndkey[0]	# forward reference
    205 	jmp	.Loop_ssse3
    206 ___
    207 
    208 my $aesenc=sub {
          # Emits one step of AES-CBC encryption per call.  $r counts the calls:
          # $n=$r/10 selects the 16-byte block within the current 64 bytes and
          # $k=$r%10 the step within that block.  Each step also loads the round
          # key for the following step into the other @rndkey register, and the
          # two registers are swapped at the end of every call.
    209   use integer;
    210   my ($n,$k)=($r/10,$r%10);
          # Step 0: load the next input block, xor in round key 0 and the
          # chaining value; for blocks after the first, store the previous
          # block's result before $iv is overwritten.
    211     if ($k==0) {
    212       $code.=<<___;
    213 	movups		`16*$n`($in0),$in		# load input
    214 	xorps		$rndkey0,$in
    215 ___
    216       $code.=<<___ if ($n);
    217 	movups		$iv,`16*($n-1)`($out,$in0)	# write output
    218 ___
    219       $code.=<<___;
    220 	xorps		$in,$iv
    221 	aesenc		$rndkey[0],$iv
    222 	movups		`32+16*$k`($key),$rndkey[1]
    223 ___
          # Step 9: compare $rounds with 11 and emit zero, two or four extra
          # aesenc rounds before aesenclast; each occurrence gets its own
          # numbered label.  Afterwards reload the key at 16($key) for the
          # next block.
    224     } elsif ($k==9) {
    225       $sn++;
    226       $code.=<<___;
    227 	cmp		\$11,$rounds
    228 	jb		.Laesenclast$sn
    229 	movups		`32+16*($k+0)`($key),$rndkey[1]
    230 	aesenc		$rndkey[0],$iv
    231 	movups		`32+16*($k+1)`($key),$rndkey[0]
    232 	aesenc		$rndkey[1],$iv
    233 	je		.Laesenclast$sn
    234 	movups		`32+16*($k+2)`($key),$rndkey[1]
    235 	aesenc		$rndkey[0],$iv
    236 	movups		`32+16*($k+3)`($key),$rndkey[0]
    237 	aesenc		$rndkey[1],$iv
    238 .Laesenclast$sn:
    239 	aesenclast	$rndkey[0],$iv
    240 	movups		16($key),$rndkey[1]		# forward reference
    241 ___
          # Steps 1..8: one aesenc round plus the load of the next round key.
    242     } else {
    243       $code.=<<___;
    244 	aesenc		$rndkey[0],$iv
    245 	movups		`32+16*$k`($key),$rndkey[1]
    246 ___
    247     }
    248     $r++;	unshift(@rndkey,pop(@rndkey));
    249 };
    250 
    251 sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
    252 { use integer;
          # Emits one SSSE3 message-schedule update (four new schedule words in
          # @X[0]) interleaved with four integer round bodies.  The single
          # argument is a reference to a body_* generator; it is called four
          # times and the returned Perl snippets are eval'ed one by one between
          # the SIMD instructions.  On exit $Xi is incremented and @X and @Tx
          # are rotated by one position.
    253   my $body = shift;
    254   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    255   my ($a,$b,$c,$d,$e);	# set by the '($a,$b,$c,$d,$e)=@V;' snippets
    256 
    257 	&movdqa	(@X[0],@X[-3&7]);
    258 	 eval(shift(@insns));
    259 	 eval(shift(@insns));
    260 	&movdqa	(@Tx[0],@X[-1&7]);
    261 	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    262 	 eval(shift(@insns));
    263 	 eval(shift(@insns));
    264 
    265 	  &paddd	(@Tx[1],@X[-1&7]);
    266 	 eval(shift(@insns));
    267 	 eval(shift(@insns));
    268 	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
    269 	 eval(shift(@insns));
    270 	 eval(shift(@insns));
    271 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
    272 	 eval(shift(@insns));
    273 	 eval(shift(@insns));
    274 
    275 	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    276 	 eval(shift(@insns));
    277 	 eval(shift(@insns));
    278 	 eval(shift(@insns));
    279 	 eval(shift(@insns));
    280 
    281 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    282 	 eval(shift(@insns));
    283 	 eval(shift(@insns));
    284 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    285 	 eval(shift(@insns));
    286 	 eval(shift(@insns));
    287 
    288 	&movdqa	(@Tx[2],@X[0]);
    289 	&movdqa	(@Tx[0],@X[0]);
    290 	 eval(shift(@insns));
    291 	 eval(shift(@insns));
    292 	 eval(shift(@insns));
    293 	 eval(shift(@insns));
    294 
    295 	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
    296 	&paddd	(@X[0],@X[0]);
    297 	 eval(shift(@insns));
    298 	 eval(shift(@insns));
    299 	 eval(shift(@insns));
    300 	 eval(shift(@insns));
    301 
    302 	&psrld	(@Tx[0],31);
    303 	 eval(shift(@insns));
    304 	 eval(shift(@insns));
    305 	&movdqa	(@Tx[1],@Tx[2]);
    306 	 eval(shift(@insns));
    307 	 eval(shift(@insns));
    308 
    309 	&psrld	(@Tx[2],30);
    310 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
    311 	 eval(shift(@insns));
    312 	 eval(shift(@insns));
    313 	 eval(shift(@insns));
    314 	 eval(shift(@insns));
    315 
    316 	&pslld	(@Tx[1],2);
    317 	&pxor	(@X[0],@Tx[2]);
    318 	 eval(shift(@insns));
    319 	 eval(shift(@insns));
    320 	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    321 	 eval(shift(@insns));
    322 	 eval(shift(@insns));
    323 
    324 	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
    325 
    326 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    327 
    328   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    329 		push(@Tx,shift(@Tx));
    330 }
    331 
    332 sub Xupdate_ssse3_32_79()
    333 { use integer;
          # Emits one SSSE3 message-schedule update for the later schedule words
          # (32..79 per the sub name), interleaved with four integer round
          # bodies obtained from the generator passed as the only argument.
          # Depending on $Xi it either copies the current K_XX_XX vector or
          # loads the next one from the table.  On exit $Xi is incremented and
          # @X and @Tx are rotated by one position.
    334   my $body = shift;
    335   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    336   my ($a,$b,$c,$d,$e);	# set by the '($a,$b,$c,$d,$e)=@V;' snippets
    337 
    338 	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
    339 	 eval(shift(@insns));		# body_20_39
    340 	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
    341 	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
    342 	 eval(shift(@insns));
    343 	 eval(shift(@insns));
    344 	 eval(shift(@insns));		# rol
    345 
    346 	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
    347 	 eval(shift(@insns));
    348 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);	# skipped when the next snippet is a rotate
    349 	if ($Xi%5) {
    350 	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    351 	} else {			# ... or load next one
    352 	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    353 	}
    354 	  &paddd	(@Tx[1],@X[-1&7]);
    355 	 eval(shift(@insns));		# ror
    356 	 eval(shift(@insns));
    357 
    358 	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    359 	 eval(shift(@insns));		# body_20_39
    360 	 eval(shift(@insns));
    361 	 eval(shift(@insns));
    362 	 eval(shift(@insns));		# rol
    363 
    364 	&movdqa	(@Tx[0],@X[0]);
    365 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    366 	 eval(shift(@insns));
    367 	 eval(shift(@insns));
    368 	 eval(shift(@insns));		# ror
    369 	 eval(shift(@insns));
    370 
    371 	&pslld	(@X[0],2);
    372 	 eval(shift(@insns));		# body_20_39
    373 	 eval(shift(@insns));
    374 	&psrld	(@Tx[0],30);
    375 	 eval(shift(@insns));
    376 	 eval(shift(@insns));		# rol
    377 	 eval(shift(@insns));
    378 	 eval(shift(@insns));
    379 	 eval(shift(@insns));		# ror
    380 	 eval(shift(@insns));
    381 
    382 	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
    383 	 eval(shift(@insns));		# body_20_39
    384 	 eval(shift(@insns));
    385 	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
    386 	 eval(shift(@insns));
    387 	 eval(shift(@insns));		# rol
    388 	 eval(shift(@insns));
    389 	 eval(shift(@insns));
    390 	 eval(shift(@insns));		# rol
    391 	 eval(shift(@insns));
    392 
    393 	 foreach (@insns) { eval; }	# remaining instructions
    394 
    395   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    396 		push(@Tx,shift(@Tx));
    397 }
    398 
    399 sub Xuplast_ssse3_80()
    400 { use integer;
          # Emits the final X[]+K transfer of a 64-byte block together with
          # four integer round bodies, then the end-of-input test: when $inp
          # equals $len (the end pointer) the generated code jumps to
          # .Ldone_ssse3.  Otherwise it loads the next 64 input bytes, byte-swaps
          # the first vector and advances $inp.  Resets $Xi to 0 and rotates @Tx
          # back by one position.
    401   my $body = shift;
    402   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    403   my ($a,$b,$c,$d,$e);	# set by the '($a,$b,$c,$d,$e)=@V;' snippets
    404 
    405 	 eval(shift(@insns));
    406 	  &paddd	(@Tx[1],@X[-1&7]);
    407 	 eval(shift(@insns));
    408 	 eval(shift(@insns));
    409 	 eval(shift(@insns));
    410 	 eval(shift(@insns));
    411 
    412 	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
    413 
    414 	 foreach (@insns) { eval; }		# remaining instructions
    415 
    416 	&cmp	($inp,$len);
    417 	&je	(".Ldone_ssse3");
    418 
    419 	unshift(@Tx,pop(@Tx));
    420 
    421 	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
    422 	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
    423 	&movdqu	(@X[-4&7],"0($inp)");		# load input
    424 	&movdqu	(@X[-3&7],"16($inp)");
    425 	&movdqu	(@X[-2&7],"32($inp)");
    426 	&movdqu	(@X[-1&7],"48($inp)");
    427 	&pshufb	(@X[-4&7],@X[2]);		# byte swap
    428 	&add	($inp,64);
    429 
    430   $Xi=0;
    431 }
    432 
    433 sub Xloop_ssse3()
    434 { use integer;
          # Emits four integer round bodies interleaved with the preparation of
          # the next block's input: byte-swap one freshly loaded vector, add K
          # to another, store that X[]+K at stack slot 16*$Xi and subtract K
          # again.  The only argument is a body_* generator reference.
          # Increments $Xi; @X and @Tx are not rotated here.
    435   my $body = shift;
    436   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    437   my ($a,$b,$c,$d,$e);	# set by the '($a,$b,$c,$d,$e)=@V;' snippets
    438 
    439 	 eval(shift(@insns));
    440 	 eval(shift(@insns));
    441 	&pshufb	(@X[($Xi-3)&7],@X[2]);
    442 	 eval(shift(@insns));
    443 	 eval(shift(@insns));
    444 	&paddd	(@X[($Xi-4)&7],@Tx[1]);
    445 	 eval(shift(@insns));
    446 	 eval(shift(@insns));
    447 	 eval(shift(@insns));
    448 	 eval(shift(@insns));
    449 	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
    450 	 eval(shift(@insns));
    451 	 eval(shift(@insns));
    452 	&psubd	(@X[($Xi-4)&7],@Tx[1]);
    453 
    454 	foreach (@insns) { eval; }
    455   $Xi++;
    456 }
    457 
    458 sub Xtail_ssse3()
    459 { use integer;
    460   my $body = shift;
    461   my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
    462   my ($a,$b,$c,$d,$e);
    463 
    464 	foreach (@insns) { eval; }
    465 }
    466 
    467 sub body_00_19 () {
          # Returns the Perl snippets that emit one integer round of the first
          # round type (the name indicates rounds 0..19).  The snippets are
          # meant to be eval'ed by the X* subs: the first one binds
          # $a..$e to the current @V, the last one also advances $j and rotates
          # @V and @T.  Each call increments $jj, and according to the integer
          # schedule below may append a call to $aesenc to one of the snippets.
    468   use integer;
    469   my ($k,$n);
    470   my @r=(
    471 	'($a,$b,$c,$d,$e)=@V;'.
    472 	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
    473 	'&xor	($c,$d);',
    474 	'&mov	(@T[1],$a);',	# $b in next round
    475 	'&$_rol	($a,5);',
    476 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    477 	'&xor	($c,$d);',	# restore $c
    478 	'&xor	(@T[0],$d);',
    479 	'&add	($e,$a);',
    480 	'&$_ror	($b,$j?7:2);',	# $b>>>2
    481 	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    482 	);
    483 	$n = scalar(@r);
    484 	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
    485 	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
    486 	$jj++;
    487     return @r;
    488 }
    489 
    490 sub body_20_39 () {
          # Returns the Perl snippets that emit one integer round of the
          # xor-only round type (the name indicates rounds 20..39; the driver
          # also uses it for the last 20 rounds).  The first snippet binds
          # $a..$e to @V and post-increments $j, the last one rotates @V and @T.
          # Each call increments $jj, and according to the integer schedule
          # below may append a call to $aesenc to one of the snippets.
    491   use integer;
    492   my ($k,$n);
    493   my @r=(
    494 	'($a,$b,$c,$d,$e)=@V;'.
    495 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    496 	'&xor	(@T[0],$d);',	# ($b^$d)
    497 	'&mov	(@T[1],$a);',	# $b in next round
    498 	'&$_rol	($a,5);',
    499 	'&xor	(@T[0],$c);',	# ($b^$d^$c)
    500 	'&add	($e,$a);',
    501 	'&$_ror	($b,7);',	# $b>>>2
    502 	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    503 	);
    504 	$n = scalar(@r);
    505 	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
    506 	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
    507 	$jj++;
    508     return @r;
    509 }
    510 
    511 sub body_40_59 () {
          # Returns the Perl snippets that emit one integer round of the third
          # round type (the name indicates rounds 40..59).  The first snippet
          # binds $a..$e to @V, the third post-increments $j, and the last one
          # rotates @V and @T.  Each call increments $jj, and according to the
          # integer schedule below may append a call to $aesenc to one of the
          # snippets.
    512   use integer;
    513   my ($k,$n);
    514   my @r=(
    515 	'($a,$b,$c,$d,$e)=@V;'.
    516 	'&mov	(@T[1],$c);',
    517 	'&xor	($c,$d);',
    518 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
    519 	'&and	(@T[1],$d);',
    520 	'&and	(@T[0],$c);',	# ($b&($c^$d))
    521 	'&$_ror	($b,7);',	# $b>>>2
    522 	'&add	($e,@T[1]);',
    523 	'&mov	(@T[1],$a);',	# $b in next round
    524 	'&$_rol	($a,5);',
    525 	'&add	($e,@T[0]);',
    526 	'&xor	($c,$d);',	# restore $c
    527 	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    528 	);
    529 	$n = scalar(@r);
    530 	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
    531 	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
    532 	$jj++;
    533     return @r;
    534 }
    535 $code.=<<___;	# top of the SSSE3 main loop
    536 .align	16
    537 .Loop_ssse3:
    538 ___
        # Twenty generator calls of four rounds each follow: 4 x Xupdate_16_31,
        # 12 x Xupdate_32_79, Xuplast and 3 x Xloop.
    539 	&Xupdate_ssse3_16_31(\&body_00_19);
    540 	&Xupdate_ssse3_16_31(\&body_00_19);
    541 	&Xupdate_ssse3_16_31(\&body_00_19);
    542 	&Xupdate_ssse3_16_31(\&body_00_19);
    543 	&Xupdate_ssse3_32_79(\&body_00_19);
    544 	&Xupdate_ssse3_32_79(\&body_20_39);
    545 	&Xupdate_ssse3_32_79(\&body_20_39);
    546 	&Xupdate_ssse3_32_79(\&body_20_39);
    547 	&Xupdate_ssse3_32_79(\&body_20_39);
    548 	&Xupdate_ssse3_32_79(\&body_20_39);
    549 	&Xupdate_ssse3_32_79(\&body_40_59);
    550 	&Xupdate_ssse3_32_79(\&body_40_59);
    551 	&Xupdate_ssse3_32_79(\&body_40_59);
    552 	&Xupdate_ssse3_32_79(\&body_40_59);
    553 	&Xupdate_ssse3_32_79(\&body_40_59);
    554 	&Xupdate_ssse3_32_79(\&body_20_39);
    555 	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
    556 
        # Snapshot the generator state here: the code emitted at .Ldone_ssse3
        # is entered from this point, so the tail rounds are generated again
        # from the same state further down.
    557 				$saved_j=$j; @saved_V=@V;
    558 				$saved_r=$r; @saved_rndkey=@rndkey;
    559 
    560 	&Xloop_ssse3(\&body_20_39);
    561 	&Xloop_ssse3(\&body_20_39);
    562 	&Xloop_ssse3(\&body_20_39);
    563 
        # Loop path: store the fourth output block, advance $in0 by 64, add the
        # working variables into the context and go round again.
    564 $code.=<<___;
    565 	movups	$iv,48($out,$in0)		# write output
    566 	lea	64($in0),$in0
    567 
    568 	add	0($ctx),$A			# update context
    569 	add	4($ctx),@T[0]
    570 	add	8($ctx),$C
    571 	add	12($ctx),$D
    572 	mov	$A,0($ctx)
    573 	add	16($ctx),$E
    574 	mov	@T[0],4($ctx)
    575 	mov	@T[0],$B			# magic seed
    576 	mov	$C,8($ctx)
    577 	mov	$D,12($ctx)
    578 	mov	$E,16($ctx)
    579 	jmp	.Loop_ssse3
    580 
    581 .align	16
    582 .Ldone_ssse3:
    583 ___
        # Restore the snapshot taken after Xuplast (note that $jj is set to the
        # saved $j as well) and emit the last twelve rounds without any
        # input preparation.
    584 				$jj=$j=$saved_j; @V=@saved_V;
    585 				$r=$saved_r;     @rndkey=@saved_rndkey;
    586 
    587 	&Xtail_ssse3(\&body_20_39);
    588 	&Xtail_ssse3(\&body_20_39);
    589 	&Xtail_ssse3(\&body_20_39);
    590 
        # Exit path: store the last output block, update the context and write
        # the final chaining value back through the saved IV pointer.
    591 $code.=<<___;
    592 	movups	$iv,48($out,$in0)		# write output
    593 	mov	88(%rsp),$ivp			# restore $ivp
    594 
    595 	add	0($ctx),$A			# update context
    596 	add	4($ctx),@T[0]
    597 	add	8($ctx),$C
    598 	mov	$A,0($ctx)
    599 	add	12($ctx),$D
    600 	mov	@T[0],4($ctx)
    601 	add	16($ctx),$E
    602 	mov	$C,8($ctx)
    603 	mov	$D,12($ctx)
    604 	mov	$E,16($ctx)
    605 	movups	$iv,($ivp)			# write IV
    606 ___
        # Win64 only: reload xmm6-xmm15 from the frame.
    607 $code.=<<___ if ($win64);
    608 	movaps	96+0(%rsp),%xmm6
    609 	movaps	96+16(%rsp),%xmm7
    610 	movaps	96+32(%rsp),%xmm8
    611 	movaps	96+48(%rsp),%xmm9
    612 	movaps	96+64(%rsp),%xmm10
    613 	movaps	96+80(%rsp),%xmm11
    614 	movaps	96+96(%rsp),%xmm12
    615 	movaps	96+112(%rsp),%xmm13
    616 	movaps	96+128(%rsp),%xmm14
    617 	movaps	96+144(%rsp),%xmm15
    618 ___
        # Reload the six pushed registers, release the frame and return.
    619 $code.=<<___;
    620 	lea	`104+($win64?10*16:0)`(%rsp),%rsi
    621 	mov	0(%rsi),%r15
    622 	mov	8(%rsi),%r14
    623 	mov	16(%rsi),%r13
    624 	mov	24(%rsi),%r12
    625 	mov	32(%rsi),%rbp
    626 	mov	40(%rsi),%rbx
    627 	lea	48(%rsi),%rsp
    628 .Lepilogue_ssse3:
    629 	ret
    630 .size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
    631 ___
    632 
        # Reset the generator counters before the next code path is generated.
    633 $j=$jj=$r=$sn=0;
    634 
    635 if ($avx) {
    636 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
    637 
    638 my $Xi=4;
    639 my @X=map("%xmm$_",(4..7,0..3));
    640 my @Tx=map("%xmm$_",(8..10));
    641 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
    642 my @T=("%esi","%edi");
    643 
    644 my $_rol=sub { &shld(@_[0],@_) };
    645 my $_ror=sub { &shrd(@_[0],@_) };
    646 
    647 $code.=<<___;
    648 .type	aesni_cbc_sha1_enc_avx,\@function,6
    649 .align	16
    650 aesni_cbc_sha1_enc_avx:
    651 	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
    652 	#shr	\$6,$len			# debugging artefact
    653 	#jz	.Lepilogue_avx			# debugging artefact
    654 	push	%rbx
    655 	push	%rbp
    656 	push	%r12
    657 	push	%r13
    658 	push	%r14
    659 	push	%r15
    660 	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
    661 	#mov	$in0,$inp			# debugging artefact
    662 	#lea	64(%rsp),$ctx			# debugging artefact
    663 ___
    664 $code.=<<___ if ($win64);
    665 	movaps	%xmm6,96+0(%rsp)
    666 	movaps	%xmm7,96+16(%rsp)
    667 	movaps	%xmm8,96+32(%rsp)
    668 	movaps	%xmm9,96+48(%rsp)
    669 	movaps	%xmm10,96+64(%rsp)
    670 	movaps	%xmm11,96+80(%rsp)
    671 	movaps	%xmm12,96+96(%rsp)
    672 	movaps	%xmm13,96+112(%rsp)
    673 	movaps	%xmm14,96+128(%rsp)
    674 	movaps	%xmm15,96+144(%rsp)
    675 .Lprologue_avx:
    676 ___
    677 $code.=<<___;
    678 	vzeroall
    679 	mov	$in0,%r12			# reassign arguments
    680 	mov	$out,%r13
    681 	mov	$len,%r14
    682 	mov	$key,%r15
    683 	vmovdqu	($ivp),$iv			# load IV
    684 	mov	$ivp,88(%rsp)			# save $ivp
    685 ___
    686 my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
    687 my $rounds="${ivp}d";
    688 $code.=<<___;
    689 	shl	\$6,$len
    690 	sub	$in0,$out
    691 	mov	240($key),$rounds
    692 	add	\$112,$key		# size optimization
    693 	add	$inp,$len		# end of input
    694 
    695 	lea	K_XX_XX(%rip),$K_XX_XX
    696 	mov	0($ctx),$A		# load context
    697 	mov	4($ctx),$B
    698 	mov	8($ctx),$C
    699 	mov	12($ctx),$D
    700 	mov	$B,@T[0]		# magic seed
    701 	mov	16($ctx),$E
    702 
    703 	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
    704 	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
    705 	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
    706 	vmovdqu	16($inp),@X[-3&7]
    707 	vmovdqu	32($inp),@X[-2&7]
    708 	vmovdqu	48($inp),@X[-1&7]
    709 	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
    710 	add	\$64,$inp
    711 	vpshufb	@X[2],@X[-3&7],@X[-3&7]
    712 	vpshufb	@X[2],@X[-2&7],@X[-2&7]
    713 	vpshufb	@X[2],@X[-1&7],@X[-1&7]
    714 	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
    715 	vpaddd	@Tx[1],@X[-3&7],@X[1]
    716 	vpaddd	@Tx[1],@X[-2&7],@X[2]
    717 	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
    718 	vmovdqa	@X[1],16(%rsp)
    719 	vmovdqa	@X[2],32(%rsp)
    720 	vmovups	-112($key),$rndkey0	# $key[0]
    721 	vmovups	16-112($key),$rndkey[0]	# forward reference
    722 	jmp	.Loop_avx
    723 ___
    724 
    725 my $aesenc=sub {
          # AVX flavour of the AES-CBC step emitter.  $r counts the calls:
          # $n=$r/10 selects the 16-byte block within the current 64 bytes and
          # $k=$r%10 the step within that block.  All key-schedule offsets carry
          # a -112 bias because the AVX prologue adds 112 to $key.  The two
          # @rndkey registers are swapped at the end of every call.
    726   use integer;
    727   my ($n,$k)=($r/10,$r%10);
          # Step 0: load the next input block, xor in round key 0 and the
          # chaining value; for blocks after the first, store the previous
          # block's result before $iv is overwritten.
    728     if ($k==0) {
    729       $code.=<<___;
    730 	vmovups		`16*$n`($in0),$in		# load input
    731 	vxorps		$rndkey0,$in,$in
    732 ___
    733       $code.=<<___ if ($n);
    734 	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
    735 ___
    736       $code.=<<___;
    737 	vxorps		$in,$iv,$iv
    738 	vaesenc		$rndkey[0],$iv,$iv
    739 	vmovups		`32+16*$k-112`($key),$rndkey[1]
    740 ___
          # Step 9: compare $rounds with 11 and emit zero, two or four extra
          # vaesenc rounds before vaesenclast; each occurrence gets its own
          # numbered label.  Afterwards reload the key at 16-112($key) for the
          # next block.
    741     } elsif ($k==9) {
    742       $sn++;
    743       $code.=<<___;
    744 	cmp		\$11,$rounds
    745 	jb		.Lvaesenclast$sn
    746 	vaesenc		$rndkey[0],$iv,$iv
    747 	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
    748 	vaesenc		$rndkey[1],$iv,$iv
    749 	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
    750 	je		.Lvaesenclast$sn
    751 	vaesenc		$rndkey[0],$iv,$iv
    752 	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
    753 	vaesenc		$rndkey[1],$iv,$iv
    754 	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
    755 .Lvaesenclast$sn:
    756 	vaesenclast	$rndkey[0],$iv,$iv
    757 	vmovups		16-112($key),$rndkey[1]		# forward reference
    758 ___
          # Steps 1..8: one vaesenc round plus the load of the next round key.
    759     } else {
    760       $code.=<<___;
    761 	vaesenc		$rndkey[0],$iv,$iv
    762 	vmovups		`32+16*$k-112`($key),$rndkey[1]
    763 ___
    764     }
    765     $r++;	unshift(@rndkey,pop(@rndkey));
    766 };
    767 
    768 sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
    769 { use integer;
          # AVX counterpart of the 16..31 message-schedule update: emits the
          # three-operand SIMD instructions that produce four new schedule words
          # in @X[0], interleaved with four integer round bodies obtained from
          # the generator passed as the only argument.  On exit $Xi is
          # incremented and @X and @Tx are rotated by one position.
    770   my $body = shift;
    771   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
    772   my ($a,$b,$c,$d,$e);	# set by the '($a,$b,$c,$d,$e)=@V;' snippets
    773 
    774 	 eval(shift(@insns));
    775 	 eval(shift(@insns));
    776 	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
    777 	 eval(shift(@insns));
    778 	 eval(shift(@insns));
    779 
    780 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    781 	 eval(shift(@insns));
    782 	 eval(shift(@insns));
    783 	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
    784 	 eval(shift(@insns));
    785 	 eval(shift(@insns));
    786 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
    787 	 eval(shift(@insns));
    788 	 eval(shift(@insns));
    789 
    790 	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
    791 	 eval(shift(@insns));
    792 	 eval(shift(@insns));
    793 	 eval(shift(@insns));
    794 	 eval(shift(@insns));
    795 
    796 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
    797 	 eval(shift(@insns));
    798 	 eval(shift(@insns));
    799 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    800 	 eval(shift(@insns));
    801 	 eval(shift(@insns));
    802 
    803 	&vpsrld	(@Tx[0],@X[0],31);
    804 	 eval(shift(@insns));
    805 	 eval(shift(@insns));
    806 	 eval(shift(@insns));
    807 	 eval(shift(@insns));
    808 
    809 	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
    810 	&vpaddd	(@X[0],@X[0],@X[0]);
    811 	 eval(shift(@insns));
    812 	 eval(shift(@insns));
    813 	 eval(shift(@insns));
    814 	 eval(shift(@insns));
    815 
    816 	&vpsrld	(@Tx[1],@Tx[2],30);
    817 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
    818 	 eval(shift(@insns));
    819 	 eval(shift(@insns));
    820 	 eval(shift(@insns));
    821 	 eval(shift(@insns));
    822 
    823 	&vpslld	(@Tx[2],@Tx[2],2);
    824 	&vpxor	(@X[0],@X[0],@Tx[1]);
    825 	 eval(shift(@insns));
    826 	 eval(shift(@insns));
    827 	 eval(shift(@insns));
    828 	 eval(shift(@insns));
    829 
    830 	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
    831 	 eval(shift(@insns));
    832 	 eval(shift(@insns));
    833 	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    834 	 eval(shift(@insns));
    835 	 eval(shift(@insns));
    836 
    837 
    838 	 foreach (@insns) { eval; }	# remaining instructions [if any]
    839 
    840   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    841 		push(@Tx,shift(@Tx));
    842 }
    843 
    844 sub Xupdate_avx_32_79()
    845 { use integer;
          # AVX counterpart of the 32..79 message-schedule update, interleaved
          # with four integer round bodies obtained from the generator passed as
          # the only argument.  Depending on $Xi it either copies the current
          # K_XX_XX vector or loads the next one from the table.  On exit $Xi is
          # incremented and @X and @Tx are rotated by one position.
    846   my $body = shift;
    847   my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
    848   my ($a,$b,$c,$d,$e);	# set by the '($a,$b,$c,$d,$e)=@V;' snippets
    849 
    850 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
    851 	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
    852 	 eval(shift(@insns));		# body_20_39
    853 	 eval(shift(@insns));
    854 	 eval(shift(@insns));
    855 	 eval(shift(@insns));		# rol
    856 
    857 	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
    858 	 eval(shift(@insns));
    859 	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);	# skipped when the next snippet is a rotate
    860 	if ($Xi%5) {
    861 	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    862 	} else {			# ... or load next one
    863 	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    864 	}
    865 	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    866 	 eval(shift(@insns));		# ror
    867 	 eval(shift(@insns));
    868 
    869 	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
    870 	 eval(shift(@insns));		# body_20_39
    871 	 eval(shift(@insns));
    872 	 eval(shift(@insns));
    873 	 eval(shift(@insns));		# rol
    874 
    875 	&vpsrld	(@Tx[0],@X[0],30);
    876 	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
    877 	 eval(shift(@insns));
    878 	 eval(shift(@insns));
    879 	 eval(shift(@insns));		# ror
    880 	 eval(shift(@insns));
    881 
    882 	&vpslld	(@X[0],@X[0],2);
    883 	 eval(shift(@insns));		# body_20_39
    884 	 eval(shift(@insns));
    885 	 eval(shift(@insns));
    886 	 eval(shift(@insns));		# rol
    887 	 eval(shift(@insns));
    888 	 eval(shift(@insns));
    889 	 eval(shift(@insns));		# ror
    890 	 eval(shift(@insns));
    891 
    892 	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
    893 	 eval(shift(@insns));		# body_20_39
    894 	 eval(shift(@insns));
    895 	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
    896 	 eval(shift(@insns));
    897 	 eval(shift(@insns));		# rol
    898 	 eval(shift(@insns));
    899 	 eval(shift(@insns));
    900 	 eval(shift(@insns));		# rol
    901 	 eval(shift(@insns));
    902 
    903 	 foreach (@insns) { eval; }	# remaining instructions
    904 
    905   $Xi++;	push(@X,shift(@X));	# "rotate" X[]
    906 		push(@Tx,shift(@Tx));
    907 }
    908 
    # Xuplast_avx_80(\&body)
    #
    # Final scheduling step of a 64-byte block on the AVX code path.  $body is
    # a round generator returning a list of Perl snippets; it is expanded four
    # times and the snippets are string-eval'ed in order, with the vector
    # instructions below slotted in between them.
    #
    # Emitted, in order:
    #   - the last X[]+K addition and its store to the stack slot
    #     16*(($Xi-1)&3)(%rsp);
    #   - "cmp $inp,$len" / "je .Ldone_avx", i.e. the generated code may leave
    #     the loop here;
    #   - on the fall-through path: reload of the pbswap mask and K_00_19 from
    #     $K_XX_XX, unaligned loads of the next 64 input bytes, the byte swap
    #     of the first 16 of them, and "$inp += 64".
    #
    # Side effects on generator state: @Tx is rotated back by one element and
    # $Xi is reset to 0.
    sub Xuplast_avx_80()
    { use integer;
      my $body = shift;
      my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
      # Declared here so they are in lexical scope of every string-eval below
      # (presumably the snippets returned by $body assign and read them).
      my ($a,$b,$c,$d,$e);

    	 eval(shift(@insns));
    	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
    	 eval(shift(@insns));
    	 eval(shift(@insns));
    	 eval(shift(@insns));
    	 eval(shift(@insns));

    	  # VEX-encoded store, matching the X[]+K store in Xloop_avx and the
    	  # rest of this AVX-only path (was the legacy-SSE "movdqa").
    	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

    	 foreach (@insns) { eval; }		# remaining instructions

    	&cmp	($inp,$len);
    	&je	(".Ldone_avx");

    	unshift(@Tx,pop(@Tx));

    	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
    	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
    	&vmovdqu(@X[-4&7],"0($inp)");		# load input
    	&vmovdqu(@X[-3&7],"16($inp)");
    	&vmovdqu(@X[-2&7],"32($inp)");
    	&vmovdqu(@X[-1&7],"48($inp)");
    	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
    	&add	($inp,64);

      $Xi=0;
    }
    942 
    # Xloop_avx(\&body)
    #
    # Expands the round generator $body four times and string-evals the
    # resulting snippets in order.  Three vector instructions are slotted in
    # between them:
    #   vpshufb  X[($Xi-3)&7] through the mask in X[2] (the same shuffle that
    #            Xuplast_avx_80 labels "byte swap"),
    #   vpaddd   X[$Xi&7] = X[($Xi-4)&7] + Tx[1],
    #   vmovdqa  X[$Xi&7] -> 16*$Xi(%rsp).
    # $Xi is incremented on the way out.
    sub Xloop_avx()
    { use integer;
      my $body = shift;
      my @insns = (&$body, &$body, &$body, &$body);	# 32 instructions
      # In scope for the string-evals below (presumably used by the snippets).
      my ($a,$b,$c,$d,$e);

        eval shift @insns;
        eval shift @insns;
        &vpshufb($X[($Xi-3)&7], $X[($Xi-3)&7], $X[2]);
        eval shift @insns;
        eval shift @insns;
        &vpaddd($X[$Xi&7], $X[($Xi-4)&7], $Tx[1]);
        eval shift @insns;
        eval shift @insns;
        eval shift @insns;
        eval shift @insns;
        &vmovdqa((16*$Xi)."(%rsp)", $X[$Xi&7]);	# X[]+K xfer to IALU
        eval shift @insns;
        eval shift @insns;

        eval($_) foreach @insns;	# whatever is left, in order
        $Xi++;
    }
    966 
    # Xtail_avx(\&body)
    #
    # Expands the round generator $body four times and string-evals every
    # returned snippet in order, with nothing interleaved.
    sub Xtail_avx()
    { use integer;
      my $body = shift;
      my @insns = (&$body, &$body, &$body, &$body);	# 32 instructions
      # In scope for the string-evals below (presumably used by the snippets).
      my ($a,$b,$c,$d,$e);

        eval($_) foreach @insns;
    }
    975 
    # Emit the AVX main loop (.Loop_avx) and its exit path (.Ldone_avx).
    976 $code.=<<___;
    977 .align	16
    978 .Loop_avx:
    979 ___
    # Twenty scheduling calls follow: 4 + 12 + 1 here, then 3 more on each of
    # the two continuations below.  The round generator handed in changes every
    # five calls: body_00_19, body_20_39, body_40_59, body_20_39.  The helpers
    # whose bodies are visible in this part of the file expand the generator
    # four times per call; if Xupdate_avx_16_31 and Xupdate_avx_32_79 do the
    # same (TODO confirm), that is 80 expansions per pass.
    980 	&Xupdate_avx_16_31(\&body_00_19);
    981 	&Xupdate_avx_16_31(\&body_00_19);
    982 	&Xupdate_avx_16_31(\&body_00_19);
    983 	&Xupdate_avx_16_31(\&body_00_19);
    984 	&Xupdate_avx_32_79(\&body_00_19);
    985 	&Xupdate_avx_32_79(\&body_20_39);
    986 	&Xupdate_avx_32_79(\&body_20_39);
    987 	&Xupdate_avx_32_79(\&body_20_39);
    988 	&Xupdate_avx_32_79(\&body_20_39);
    989 	&Xupdate_avx_32_79(\&body_20_39);
    990 	&Xupdate_avx_32_79(\&body_40_59);
    991 	&Xupdate_avx_32_79(\&body_40_59);
    992 	&Xupdate_avx_32_79(\&body_40_59);
    993 	&Xupdate_avx_32_79(\&body_40_59);
    994 	&Xupdate_avx_32_79(\&body_40_59);
    995 	&Xupdate_avx_32_79(\&body_20_39);
    996 	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
    997 
    # Xuplast_avx_80 has just emitted "je .Ldone_avx", so the generated code
    # forks here.  Snapshot the generator state; it is reinstated before the
    # .Ldone_avx tail is emitted so both continuations start from the same
    # state.
    998 				$saved_j=$j; @saved_V=@V;
    999 				$saved_r=$r; @saved_rndkey=@rndkey;
   1000 
    # Fall-through continuation: the last three generator expansions, with
    # Xloop_avx interleaving the byte swap / X[]+K work on the input that
    # Xuplast_avx_80 loaded.
   1001 	&Xloop_avx(\&body_20_39);
   1002 	&Xloop_avx(\&body_20_39);
   1003 	&Xloop_avx(\&body_20_39);
   1004 
    # Store the last 16 output bytes of this block, advance $in0 by 64, fold
    # the working variables into the context at $ctx and loop.
   1005 $code.=<<___;
   1006 	vmovups	$iv,48($out,$in0)		# write output
   1007 	lea	64($in0),$in0
   1008 
   1009 	add	0($ctx),$A			# update context
   1010 	add	4($ctx),@T[0]
   1011 	add	8($ctx),$C
   1012 	add	12($ctx),$D
   1013 	mov	$A,0($ctx)
   1014 	add	16($ctx),$E
   1015 	mov	@T[0],4($ctx)
   1016 	mov	@T[0],$B			# magic seed
   1017 	mov	$C,8($ctx)
   1018 	mov	$D,12($ctx)
   1019 	mov	$E,16($ctx)
   1020 	jmp	.Loop_avx
   1021 
   1022 .align	16
   1023 .Ldone_avx:
   1024 ___
    # Exit continuation: rewind the generator state to the snapshot taken
    # after Xuplast_avx_80 ($jj is reset to the saved $j as well).
   1025 				$jj=$j=$saved_j; @V=@saved_V;
   1026 				$r=$saved_r;     @rndkey=@saved_rndkey;
   1027 
    # Same three generator expansions as the Xloop_avx calls above, but with
    # no vector work interleaved (see Xtail_avx).
   1028 	&Xtail_avx(\&body_20_39);
   1029 	&Xtail_avx(\&body_20_39);
   1030 	&Xtail_avx(\&body_20_39);
   1031 
    # Store the last output block, reload $ivp from 88(%rsp), fold the working
    # variables into the context, write the IV back and clear the vector
    # registers with vzeroall.
   1032 $code.=<<___;
   1033 	vmovups	$iv,48($out,$in0)		# write output
   1034 	mov	88(%rsp),$ivp			# restore $ivp
   1035 
   1036 	add	0($ctx),$A			# update context
   1037 	add	4($ctx),@T[0]
   1038 	add	8($ctx),$C
   1039 	mov	$A,0($ctx)
   1040 	add	12($ctx),$D
   1041 	mov	@T[0],4($ctx)
   1042 	add	16($ctx),$E
   1043 	mov	$C,8($ctx)
   1044 	mov	$D,12($ctx)
   1045 	mov	$E,16($ctx)
   1046 	vmovups	$iv,($ivp)			# write IV
   1047 	vzeroall
   1048 ___
    # Win64 only: reload %xmm6-%xmm15 from the frame, 96(%rsp) .. 96+144(%rsp).
   1049 $code.=<<___ if ($win64);
   1050 	movaps	96+0(%rsp),%xmm6
   1051 	movaps	96+16(%rsp),%xmm7
   1052 	movaps	96+32(%rsp),%xmm8
   1053 	movaps	96+48(%rsp),%xmm9
   1054 	movaps	96+64(%rsp),%xmm10
   1055 	movaps	96+80(%rsp),%xmm11
   1056 	movaps	96+96(%rsp),%xmm12
   1057 	movaps	96+112(%rsp),%xmm13
   1058 	movaps	96+128(%rsp),%xmm14
   1059 	movaps	96+144(%rsp),%xmm15
   1060 ___
    # Reload %r15, %r14, %r13, %r12, %rbp and %rbx from the area starting at
    # 104(%rsp) (104+160 when the Win64 XMM save area is present), release the
    # frame and return.  Presumably these are the registers pushed by the
    # prologue, which is not part of this section.
   1061 $code.=<<___;
   1062 	lea	`104+($win64?10*16:0)`(%rsp),%rsi
   1063 	mov	0(%rsi),%r15
   1064 	mov	8(%rsi),%r14
   1065 	mov	16(%rsi),%r13
   1066 	mov	24(%rsi),%r12
   1067 	mov	32(%rsi),%rbp
   1068 	mov	40(%rsi),%rbx
   1069 	lea	48(%rsi),%rsp
   1070 .Lepilogue_avx:
   1071 	ret
   1072 .size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
   1073 ___
   1074 }
   # Constant pool emitted after the code: four 16-byte rows, each holding one
   # constant repeated in all four 32-bit lanes (labelled K_00_19 .. K_60_79),
   # followed by the pbswap byte-shuffle mask, then the identification string.
   # The AVX path addresses the pool through $K_XX_XX: K_00_19 at offset 0 and
   # the pbswap mask at offset 64.
   1075 $code.=<<___;
   1076 .align	64
   1077 K_XX_XX:
   1078 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
   1079 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
   1080 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
   1081 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
   1082 .long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
   1083 
   1084 .asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   1085 .align	64
   1086 ___
   1087 
   # Win64 structured-exception-handling support, emitted only when $win64 is
   # set: the language-specific handler ssse3_handler, the .pdata function
   # table entries and the .xdata unwind records that point at the handler.
   1088 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   1089 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   1090 if ($win64) {
   # Registers that carry the four handler arguments named in the prototype
   # above; interpolated into the handler text below.
   1091 $rec="%rcx";
   1092 $frame="%rdx";
   1093 $context="%r8";
   1094 $disp="%r9";
   1095 
   # Handler flow, as emitted below:
   #  - context->Rip below HandlerData[0] (prologue label): nothing has been
   #    set up yet, go straight to the common tail with context->Rax;
   #  - context->Rip at or past HandlerData[1] (epilogue label): frame already
   #    released, go to the common tail with context->Rsp;
   #  - otherwise: copy 20 quadwords (%xmm6-%xmm15) from 96 bytes above the
   #    frame's Rsp into context->Xmm6.., step over the 104+160-byte frame and
   #    put the six saved general-purpose registers back into the CONTEXT;
   #  - common tail: restore Rsp/Rsi/Rdi in the CONTEXT, copy it to
   #    disp->ContextRecord, call RtlVirtualUnwind and return
   #    ExceptionContinueSearch.
   1096 $code.=<<___;
   1097 .extern	__imp_RtlVirtualUnwind
   1098 .type	ssse3_handler,\@abi-omnipotent
   1099 .align	16
   1100 ssse3_handler:
   1101 	push	%rsi
   1102 	push	%rdi
   1103 	push	%rbx
   1104 	push	%rbp
   1105 	push	%r12
   1106 	push	%r13
   1107 	push	%r14
   1108 	push	%r15
   1109 	pushfq
   1110 	sub	\$64,%rsp
   1111 
   1112 	mov	120($context),%rax	# pull context->Rax
   1113 	mov	248($context),%rbx	# pull context->Rip
   1114 
   1115 	mov	8($disp),%rsi		# disp->ImageBase
   1116 	mov	56($disp),%r11		# disp->HandlerData
   1117 
   1118 	mov	0(%r11),%r10d		# HandlerData[0]
   1119 	lea	(%rsi,%r10),%r10	# prologue label
   1120 	cmp	%r10,%rbx		# context->Rip<prologue label
   1121 	jb	.Lcommon_seh_tail
   1122 
   1123 	mov	152($context),%rax	# pull context->Rsp
   1124 
   1125 	mov	4(%r11),%r10d		# HandlerData[1]
   1126 	lea	(%rsi,%r10),%r10	# epilogue label
   1127 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   1128 	jae	.Lcommon_seh_tail
   1129 
   1130 	lea	96(%rax),%rsi
   1131 	lea	512($context),%rdi	# &context.Xmm6
   1132 	mov	\$20,%ecx
   1133 	.long	0xa548f3fc		# cld; rep movsq
   1134 	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
   1135 
   1136 	mov	0(%rax),%r15
   1137 	mov	8(%rax),%r14
   1138 	mov	16(%rax),%r13
   1139 	mov	24(%rax),%r12
   1140 	mov	32(%rax),%rbp
   1141 	mov	40(%rax),%rbx
   1142 	lea	48(%rax),%rax
   1143 	mov	%rbx,144($context)	# restore context->Rbx
   1144 	mov	%rbp,160($context)	# restore context->Rbp
   1145 	mov	%r12,216($context)	# restore context->R12
   1146 	mov	%r13,224($context)	# restore context->R13
   1147 	mov	%r14,232($context)	# restore context->R14
   1148 	mov	%r15,240($context)	# restore context->R15
   1149 
   1150 .Lcommon_seh_tail:
   1151 	mov	8(%rax),%rdi
   1152 	mov	16(%rax),%rsi
   1153 	mov	%rax,152($context)	# restore context->Rsp
   1154 	mov	%rsi,168($context)	# restore context->Rsi
   1155 	mov	%rdi,176($context)	# restore context->Rdi
   1156 
   1157 	mov	40($disp),%rdi		# disp->ContextRecord
   1158 	mov	$context,%rsi		# context
   1159 	mov	\$154,%ecx		# sizeof(CONTEXT)
   1160 	.long	0xa548f3fc		# cld; rep movsq
   1161 
   1162 	mov	$disp,%rsi
   1163 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   1164 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   1165 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   1166 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   1167 	mov	40(%rsi),%r10		# disp->ContextRecord
   1168 	lea	56(%rsi),%r11		# &disp->HandlerData
   1169 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   1170 	mov	%r10,32(%rsp)		# arg5
   1171 	mov	%r11,40(%rsp)		# arg6
   1172 	mov	%r12,48(%rsp)		# arg7
   1173 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   1174 	call	*__imp_RtlVirtualUnwind(%rip)
   1175 
   1176 	mov	\$1,%eax		# ExceptionContinueSearch
   1177 	add	\$64,%rsp
   1178 	popfq
   1179 	pop	%r15
   1180 	pop	%r14
   1181 	pop	%r13
   1182 	pop	%r12
   1183 	pop	%rbp
   1184 	pop	%rbx
   1185 	pop	%rdi
   1186 	pop	%rsi
   1187 	ret
   1188 .size	ssse3_handler,.-ssse3_handler
   1189 
   1190 .section	.pdata
   1191 .align	4
   1192 	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
   1193 	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
   1194 	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
   1195 ___
   # Second .pdata entry, for the AVX routine, only when $avx is set.
   1196 $code.=<<___ if ($avx);
   1197 	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
   1198 	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
   1199 	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
   1200 ___
   # .xdata records.  Each one names ssse3_handler and carries the pair of
   # labels the handler reads as HandlerData[0] and HandlerData[1].
   1201 $code.=<<___;
   1202 .section	.xdata
   1203 .align	8
   1204 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
   1205 	.byte	9,0,0,0
   1206 	.rva	ssse3_handler
   1207 	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
   1208 ___
   # The AVX record reuses ssse3_handler, with the AVX prologue/epilogue labels.
   1209 $code.=<<___ if ($avx);
   1210 .LSEH_info_aesni_cbc_sha1_enc_avx:
   1211 	.byte	9,0,0,0
   1212 	.rva	ssse3_handler
   1213 	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
   1214 ___
   1215 }
   1216 
   1217 ####################################################################
   # rex(\@opcode, $dst, $src)
   #
   # Appends a REX prefix byte to the opcode list when either register number
   # is 8 or above: bit 0x04 (REX.R) for $dst, bit 0x01 (REX.B) for $src, both
   # OR-ed into 0x40.  Nothing is appended when both numbers are below 8.
   sub rex {
     my ($opcode, $dst, $src) = @_;
     my $rex = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

     push @{$opcode}, 0x40 | $rex	if ($rex);
   }
   1227 
   # aesni($line)
   #
   # Hand-assembles "aesenc" and "aesenclast" with two %xmm register operands
   # into a ".byte" directive, so the output does not depend on the assembler
   # knowing the mnemonics.
   #
   # Returns the ".byte\t..." string (decimal byte values joined by commas)
   # for those two mnemonics, and $line unchanged for everything else.  That
   # includes other aes* mnemonics, which used to yield undef and so were
   # silently erased by the s///e pass that calls this function.
   sub aesni {
     my $line=shift;

       if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
   	# Copy the captures at once; $1..$3 do not survive another match.
   	my ($mnemonic,$src,$dst)=($1,$2,$3);
   	my %opcodelet = (
   		"aesenc" => 0xdc,	"aesenclast" => 0xdd
   	);
   	# Not one we encode: leave the instruction for the assembler.
   	return $line if (!defined($opcodelet{$mnemonic}));

   	my @opcode=(0x66);
   	rex(\@opcode,$dst,$src);
   	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
   	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
   	return ".byte\t".join(',',@opcode);
       }
       return $line;
   }
   1244 
   # Final pass over the generated text, then output.
   #  1. Every `expr` span is replaced by the result of evaluating expr as
   #     Perl (e.g. the frame-offset arithmetic in the epilogue).
   #  2. aes* instructions with %xmm operands are run through aesni();
   #     anything after the last %xmm operand on such a line is dropped.
   $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

   print $code;
   # Buffered write errors only surface when the handle is closed; fail loudly
   # instead of leaving a truncated assembly file behind.
   close STDOUT or die "error closing STDOUT: $!";
   1250