      1 #! /usr/bin/env perl
      2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 #
     10 # ====================================================================
     11 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
     12 # project. The module is, however, dual licensed under OpenSSL and
     13 # CRYPTOGAMS licenses depending on where you obtain it. For further
     14 # details see http://www.openssl.org/~appro/cryptogams/.
     15 # ====================================================================
     16 #
     17 # November 2014
     18 #
     19 # ChaCha20 for x86_64.
     20 #
     21 # December 2016
     22 #
     23 # Add AVX512F code path.
     24 #
     25 # Performance in cycles per byte out of large buffer.
     26 #
     27 #		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
     28 #
     29 # P4		9.48/+99%	-/22.7(ii)	-
     30 # Core2		7.83/+55%	7.90/8.08	4.35
     31 # Westmere	7.19/+50%	5.60/6.70	3.00
     32 # Sandy Bridge	8.31/+42%	5.45/6.76	2.72
     33 # Ivy Bridge	6.71/+46%	5.40/6.49	2.41
     34 # Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
     35 # Skylake	5.87/+39%	4.70/-		2.31	    1.19
     36 # Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
     37 # Goldmont	10.6/+17%	5.10/-		3.28
     38 # Sledgehammer	7.28/+52%	-/14.2(ii)	-
     39 # Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
     40 # VIA Nano	10.5/+46%	6.72/8.60	6.05
     41 #
      42 # (i)	compared to older gcc 3.x, one can observe a >2x improvement on
      43 #	most platforms;
      44 # (ii)	as can be seen, SSE2 performance is too low on legacy
      45 #	processors; NxSSE2 results are naturally better, but not
      46 #	impressively better than the IALU ones, which is why you won't
      47 #	find SSE2 code below;
      48 # (iii)	this is not an optimal result for Atom because of MSROM
      49 #	limitations; SSE2 can do better, but the gain is considered too
      50 #	low to justify the [maintenance] effort;
      51 # (iv)	Bulldozer actually executes the 4xXOP code path, which delivers 2.20;
     52 #
     53 # Modified from upstream OpenSSL to remove the XOP code.
     54 
     55 $flavour = shift;
     56 $output  = shift;
     57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     58 
     59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     60 
     61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     64 die "can't locate x86_64-xlate.pl";
     65 
     66 $avx = 2;
     67 
     68 open OUT,"| \"$^X\" $xlate $flavour $output";
     69 *STDOUT=*OUT;
     70 
     71 # input parameter block
     72 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
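# The generated ChaCha20_ctr32 entry point is expected to match the usual C
# prototype (stated here as an assumption for reference, not enforced by
# this script):
#
#	void ChaCha20_ctr32(uint8_t *out, const uint8_t *inp, size_t len,
#			    const uint32_t key[8], const uint32_t counter[4]);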
     73 
     74 $code.=<<___;
     75 .text
     76 
     77 .extern OPENSSL_ia32cap_P
     78 
     79 .align	64
     80 .Lzero:
     81 .long	0,0,0,0
     82 .Lone:
     83 .long	1,0,0,0
     84 .Linc:
     85 .long	0,1,2,3
     86 .Lfour:
     87 .long	4,4,4,4
     88 .Lincy:
     89 .long	0,2,4,6,1,3,5,7
     90 .Leight:
     91 .long	8,8,8,8,8,8,8,8
     92 .Lrot16:
     93 .byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
     94 .Lrot24:
     95 .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
     96 .Lsigma:
     97 .asciz	"expand 32-byte k"
     98 .align	64
     99 .Lzeroz:
    100 .long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
    101 .Lfourz:
    102 .long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
    103 .Lincz:
    104 .long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    105 .Lsixteen:
    106 .long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
    107 .asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    108 ___
    109 
    110 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
    111 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    112   my $arg = pop;
    113     $arg = "\$$arg" if ($arg*1 eq $arg);
    114     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    115 }
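# For example, &rol(@x[0],16) falls through to AUTOLOAD and is emitted as
# "rol	$16,%eax": numeric arguments get the '$' prefix and the operand list
# is reversed into AT&T source,destination order.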
    116 
    117 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    118     "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
    119 @t=("%esi","%edi");
    120 
    121 sub ROUND {			# critical path is 24 cycles per round
    122 my ($a0,$b0,$c0,$d0)=@_;
    123 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
    124 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
    125 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
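	# The map expression ($_&~3)+(($_+1)&3) steps each index to the next
	# column within its group of four, so from (0,4,8,12) the derived
	# quarter-rounds are (1,5,9,13), (2,6,10,14) and (3,7,11,15), and
	# from (0,5,10,15) they are (1,6,11,12), (2,7,8,13) and (3,4,9,14),
	# matching the table below.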
    126 my ($xc,$xc_)=map("\"$_\"",@t);
    127 my @x=map("\"$_\"",@x);
    128 
    129 	# Consider order in which variables are addressed by their
    130 	# index:
    131 	#
    132 	#	a   b   c   d
    133 	#
    134 	#	0   4   8  12 < even round
    135 	#	1   5   9  13
    136 	#	2   6  10  14
    137 	#	3   7  11  15
    138 	#	0   5  10  15 < odd round
    139 	#	1   6  11  12
    140 	#	2   7   8  13
    141 	#	3   4   9  14
    142 	#
     143 	# The 'a', 'b' and 'd' values are permanently allocated in
     144 	# registers, @x[0..7,12..15], while the 'c' values are kept in
     145 	# memory. If you look at the 'c' column, you'll notice that the
     146 	# pair of 'c's in use carries over unchanged across each round
     147 	# boundary, so they only have to be reloaded once per round, in
     148 	# the middle. This is why you'll see a bunch of 'c' stores and
     149 	# loads in the middle, but none at the beginning or end.
    150 
     151 	# Normally instructions would be interleaved to favour in-order
     152 	# execution. Out-of-order cores generally manage it gracefully,
     153 	# but not this time for some reason. Since in-order cores are a
     154 	# dying breed and the old Atom is the only one still around, the
     155 	# instructions are left uninterleaved. Besides, Atom is better
     156 	# off executing the 1xSSSE3 code anyway...
    157 
    158 	(
    159 	"&add	(@x[$a0],@x[$b0])",	# Q1
    160 	"&xor	(@x[$d0],@x[$a0])",
    161 	"&rol	(@x[$d0],16)",
    162 	 "&add	(@x[$a1],@x[$b1])",	# Q2
    163 	 "&xor	(@x[$d1],@x[$a1])",
    164 	 "&rol	(@x[$d1],16)",
    165 
    166 	"&add	($xc,@x[$d0])",
    167 	"&xor	(@x[$b0],$xc)",
    168 	"&rol	(@x[$b0],12)",
    169 	 "&add	($xc_,@x[$d1])",
    170 	 "&xor	(@x[$b1],$xc_)",
    171 	 "&rol	(@x[$b1],12)",
    172 
    173 	"&add	(@x[$a0],@x[$b0])",
    174 	"&xor	(@x[$d0],@x[$a0])",
    175 	"&rol	(@x[$d0],8)",
    176 	 "&add	(@x[$a1],@x[$b1])",
    177 	 "&xor	(@x[$d1],@x[$a1])",
    178 	 "&rol	(@x[$d1],8)",
    179 
    180 	"&add	($xc,@x[$d0])",
    181 	"&xor	(@x[$b0],$xc)",
    182 	"&rol	(@x[$b0],7)",
    183 	 "&add	($xc_,@x[$d1])",
    184 	 "&xor	(@x[$b1],$xc_)",
    185 	 "&rol	(@x[$b1],7)",
    186 
    187 	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
    188 	 "&mov	(\"4*$c1(%rsp)\",$xc_)",
    189 	"&mov	($xc,\"4*$c2(%rsp)\")",
    190 	 "&mov	($xc_,\"4*$c3(%rsp)\")",
    191 
    192 	"&add	(@x[$a2],@x[$b2])",	# Q3
    193 	"&xor	(@x[$d2],@x[$a2])",
    194 	"&rol	(@x[$d2],16)",
    195 	 "&add	(@x[$a3],@x[$b3])",	# Q4
    196 	 "&xor	(@x[$d3],@x[$a3])",
    197 	 "&rol	(@x[$d3],16)",
    198 
    199 	"&add	($xc,@x[$d2])",
    200 	"&xor	(@x[$b2],$xc)",
    201 	"&rol	(@x[$b2],12)",
    202 	 "&add	($xc_,@x[$d3])",
    203 	 "&xor	(@x[$b3],$xc_)",
    204 	 "&rol	(@x[$b3],12)",
    205 
    206 	"&add	(@x[$a2],@x[$b2])",
    207 	"&xor	(@x[$d2],@x[$a2])",
    208 	"&rol	(@x[$d2],8)",
    209 	 "&add	(@x[$a3],@x[$b3])",
    210 	 "&xor	(@x[$d3],@x[$a3])",
    211 	 "&rol	(@x[$d3],8)",
    212 
    213 	"&add	($xc,@x[$d2])",
    214 	"&xor	(@x[$b2],$xc)",
    215 	"&rol	(@x[$b2],7)",
    216 	 "&add	($xc_,@x[$d3])",
    217 	 "&xor	(@x[$b3],$xc_)",
    218 	 "&rol	(@x[$b3],7)"
    219 	);
    220 }
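# For reference, the sequence generated above is the standard ChaCha20
# quarter-round (add, xor, rotate by 16/12/8/7). A plain-Perl model of it,
# included purely as an illustration and never called by this script, with
# the 32-bit wrap-around made explicit since Perl integers are wider:
sub __chacha_ref_quarterround {
my ($x,$a,$b,$c,$d)=@_;			# $x references 16 32-bit state words
my $rotl=sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n)))&0xffffffff };

	$x->[$a]=($x->[$a]+$x->[$b])&0xffffffff; $x->[$d]=$rotl->($x->[$d]^$x->[$a],16);
	$x->[$c]=($x->[$c]+$x->[$d])&0xffffffff; $x->[$b]=$rotl->($x->[$b]^$x->[$c],12);
	$x->[$a]=($x->[$a]+$x->[$b])&0xffffffff; $x->[$d]=$rotl->($x->[$d]^$x->[$a],8);
	$x->[$c]=($x->[$c]+$x->[$d])&0xffffffff; $x->[$b]=$rotl->($x->[$b]^$x->[$c],7);
}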
    221 
    222 ########################################################################
    223 # Generic code path that handles all lengths on pre-SSSE3 processors.
    224 $code.=<<___;
    225 .globl	ChaCha20_ctr32
    226 .type	ChaCha20_ctr32,\@function,5
    227 .align	64
    228 ChaCha20_ctr32:
    229 	cmp	\$0,$len
    230 	je	.Lno_data
    231 	mov	OPENSSL_ia32cap_P+4(%rip),%r10
    232 ___
    233 $code.=<<___	if ($avx>2);
    234 	bt	\$48,%r10		# check for AVX512F
    235 	jc	.LChaCha20_avx512
    236 ___
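# The feature tests assume the usual OPENSSL_ia32cap_P layout: the 64-bit
# load from OPENSSL_ia32cap_P+4 picks up ECX of CPUID leaf 1 in its low half
# and EBX of leaf 7 in its high half, so bit 48 is AVX512F and bit 41-32=9
# of the low dword is SSSE3.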
    237 $code.=<<___;
    238 	test	\$`1<<(41-32)`,%r10d
    239 	jnz	.LChaCha20_ssse3
    240 
    241 	push	%rbx
    242 	push	%rbp
    243 	push	%r12
    244 	push	%r13
    245 	push	%r14
    246 	push	%r15
    247 	sub	\$64+24,%rsp
    248 .Lctr32_body:
    249 
    250 	#movdqa	.Lsigma(%rip),%xmm0
    251 	movdqu	($key),%xmm1
    252 	movdqu	16($key),%xmm2
    253 	movdqu	($counter),%xmm3
    254 	movdqa	.Lone(%rip),%xmm4
    255 
    256 	#movdqa	%xmm0,4*0(%rsp)		# key[0]
    257 	movdqa	%xmm1,4*4(%rsp)		# key[1]
    258 	movdqa	%xmm2,4*8(%rsp)		# key[2]
    259 	movdqa	%xmm3,4*12(%rsp)	# key[3]
    260 	mov	$len,%rbp		# reassign $len
    261 	jmp	.Loop_outer
    262 
    263 .align	32
    264 .Loop_outer:
    265 	mov	\$0x61707865,@x[0]      # 'expa'
    266 	mov	\$0x3320646e,@x[1]      # 'nd 3'
    267 	mov	\$0x79622d32,@x[2]      # '2-by'
    268 	mov	\$0x6b206574,@x[3]      # 'te k'
    269 	mov	4*4(%rsp),@x[4]
    270 	mov	4*5(%rsp),@x[5]
    271 	mov	4*6(%rsp),@x[6]
    272 	mov	4*7(%rsp),@x[7]
    273 	movd	%xmm3,@x[12]
    274 	mov	4*13(%rsp),@x[13]
    275 	mov	4*14(%rsp),@x[14]
    276 	mov	4*15(%rsp),@x[15]
    277 
    278 	mov	%rbp,64+0(%rsp)		# save len
    279 	mov	\$10,%ebp
    280 	mov	$inp,64+8(%rsp)		# save inp
    281 	movq	%xmm2,%rsi		# "@x[8]"
    282 	mov	$out,64+16(%rsp)	# save out
    283 	mov	%rsi,%rdi
    284 	shr	\$32,%rdi		# "@x[9]"
    285 	jmp	.Loop
    286 
    287 .align	32
    288 .Loop:
    289 ___
    290 	foreach (&ROUND (0, 4, 8,12)) { eval; }
    291 	foreach (&ROUND	(0, 5,10,15)) { eval; }
    292 	&dec	("%ebp");
    293 	&jnz	(".Loop");
    294 
    295 $code.=<<___;
    296 	mov	@t[1],4*9(%rsp)		# modulo-scheduled
    297 	mov	@t[0],4*8(%rsp)
    298 	mov	64(%rsp),%rbp		# load len
    299 	movdqa	%xmm2,%xmm1
    300 	mov	64+8(%rsp),$inp		# load inp
    301 	paddd	%xmm4,%xmm3		# increment counter
    302 	mov	64+16(%rsp),$out	# load out
    303 
    304 	add	\$0x61707865,@x[0]      # 'expa'
    305 	add	\$0x3320646e,@x[1]      # 'nd 3'
    306 	add	\$0x79622d32,@x[2]      # '2-by'
    307 	add	\$0x6b206574,@x[3]      # 'te k'
    308 	add	4*4(%rsp),@x[4]
    309 	add	4*5(%rsp),@x[5]
    310 	add	4*6(%rsp),@x[6]
    311 	add	4*7(%rsp),@x[7]
    312 	add	4*12(%rsp),@x[12]
    313 	add	4*13(%rsp),@x[13]
    314 	add	4*14(%rsp),@x[14]
    315 	add	4*15(%rsp),@x[15]
    316 	paddd	4*8(%rsp),%xmm1
    317 
    318 	cmp	\$64,%rbp
    319 	jb	.Ltail
    320 
    321 	xor	4*0($inp),@x[0]		# xor with input
    322 	xor	4*1($inp),@x[1]
    323 	xor	4*2($inp),@x[2]
    324 	xor	4*3($inp),@x[3]
    325 	xor	4*4($inp),@x[4]
    326 	xor	4*5($inp),@x[5]
    327 	xor	4*6($inp),@x[6]
    328 	xor	4*7($inp),@x[7]
    329 	movdqu	4*8($inp),%xmm0
    330 	xor	4*12($inp),@x[12]
    331 	xor	4*13($inp),@x[13]
    332 	xor	4*14($inp),@x[14]
    333 	xor	4*15($inp),@x[15]
    334 	lea	4*16($inp),$inp		# inp+=64
    335 	pxor	%xmm1,%xmm0
    336 
    337 	movdqa	%xmm2,4*8(%rsp)
    338 	movd	%xmm3,4*12(%rsp)
    339 
    340 	mov	@x[0],4*0($out)		# write output
    341 	mov	@x[1],4*1($out)
    342 	mov	@x[2],4*2($out)
    343 	mov	@x[3],4*3($out)
    344 	mov	@x[4],4*4($out)
    345 	mov	@x[5],4*5($out)
    346 	mov	@x[6],4*6($out)
    347 	mov	@x[7],4*7($out)
    348 	movdqu	%xmm0,4*8($out)
    349 	mov	@x[12],4*12($out)
    350 	mov	@x[13],4*13($out)
    351 	mov	@x[14],4*14($out)
    352 	mov	@x[15],4*15($out)
    353 	lea	4*16($out),$out		# out+=64
    354 
    355 	sub	\$64,%rbp
    356 	jnz	.Loop_outer
    357 
    358 	jmp	.Ldone
    359 
    360 .align	16
    361 .Ltail:
    362 	mov	@x[0],4*0(%rsp)
    363 	mov	@x[1],4*1(%rsp)
    364 	xor	%rbx,%rbx
    365 	mov	@x[2],4*2(%rsp)
    366 	mov	@x[3],4*3(%rsp)
    367 	mov	@x[4],4*4(%rsp)
    368 	mov	@x[5],4*5(%rsp)
    369 	mov	@x[6],4*6(%rsp)
    370 	mov	@x[7],4*7(%rsp)
    371 	movdqa	%xmm1,4*8(%rsp)
    372 	mov	@x[12],4*12(%rsp)
    373 	mov	@x[13],4*13(%rsp)
    374 	mov	@x[14],4*14(%rsp)
    375 	mov	@x[15],4*15(%rsp)
    376 
    377 .Loop_tail:
    378 	movzb	($inp,%rbx),%eax
    379 	movzb	(%rsp,%rbx),%edx
    380 	lea	1(%rbx),%rbx
    381 	xor	%edx,%eax
    382 	mov	%al,-1($out,%rbx)
    383 	dec	%rbp
    384 	jnz	.Loop_tail
    385 
    386 .Ldone:
    387 	lea	64+24+48(%rsp),%rsi
    388 	mov	-48(%rsi),%r15
    389 	mov	-40(%rsi),%r14
    390 	mov	-32(%rsi),%r13
    391 	mov	-24(%rsi),%r12
    392 	mov	-16(%rsi),%rbp
    393 	mov	-8(%rsi),%rbx
    394 	lea	(%rsi),%rsp
    395 .Lno_data:
    396 	ret
    397 .size	ChaCha20_ctr32,.-ChaCha20_ctr32
    398 ___
    399 
    400 ########################################################################
    401 # SSSE3 code path that handles shorter lengths
    402 {
    403 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
    404 
    405 sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
    406 	&paddd	($a,$b);
    407 	&pxor	($d,$a);
    408 	&pshufb	($d,$rot16);
    409 
    410 	&paddd	($c,$d);
    411 	&pxor	($b,$c);
    412 	&movdqa	($t,$b);
    413 	&psrld	($b,20);
    414 	&pslld	($t,12);
    415 	&por	($b,$t);
    416 
    417 	&paddd	($a,$b);
    418 	&pxor	($d,$a);
    419 	&pshufb	($d,$rot24);
    420 
    421 	&paddd	($c,$d);
    422 	&pxor	($b,$c);
    423 	&movdqa	($t,$b);
    424 	&psrld	($b,25);
    425 	&pslld	($t,7);
    426 	&por	($b,$t);
    427 }
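# The 16- and 8-bit rotations move whole bytes, so a single pshufb against
# the .Lrot16/.Lrot24 masks does them (left-rotate by 16 and by 8, i.e.
# right by 24, respectively), while the 12- and 7-bit rotations have to be
# composed from pslld/psrld/por, as SSSE3 has no packed dword rotate.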
    428 
    429 my $xframe = $win64 ? 32+8 : 8;
    430 
    431 $code.=<<___;
    432 .type	ChaCha20_ssse3,\@function,5
    433 .align	32
    434 ChaCha20_ssse3:
    435 .LChaCha20_ssse3:
    436 	mov	%rsp,%r9		# frame pointer
    437 ___
    438 $code.=<<___;
    439 	cmp	\$128,$len		# we might throw away some data,
    440 	ja	.LChaCha20_4x		# but overall it won't be slower
    441 
    442 .Ldo_sse3_after_all:
    443 	sub	\$64+$xframe,%rsp
    444 ___
    445 $code.=<<___	if ($win64);
    446 	movaps	%xmm6,-0x28(%r9)
    447 	movaps	%xmm7,-0x18(%r9)
    448 .Lssse3_body:
    449 ___
    450 $code.=<<___;
    451 	movdqa	.Lsigma(%rip),$a
    452 	movdqu	($key),$b
    453 	movdqu	16($key),$c
    454 	movdqu	($counter),$d
    455 	movdqa	.Lrot16(%rip),$rot16
    456 	movdqa	.Lrot24(%rip),$rot24
    457 
    458 	movdqa	$a,0x00(%rsp)
    459 	movdqa	$b,0x10(%rsp)
    460 	movdqa	$c,0x20(%rsp)
    461 	movdqa	$d,0x30(%rsp)
    462 	mov	\$10,$counter		# reuse $counter
    463 	jmp	.Loop_ssse3
    464 
    465 .align	32
    466 .Loop_outer_ssse3:
    467 	movdqa	.Lone(%rip),$d
    468 	movdqa	0x00(%rsp),$a
    469 	movdqa	0x10(%rsp),$b
    470 	movdqa	0x20(%rsp),$c
    471 	paddd	0x30(%rsp),$d
    472 	mov	\$10,$counter
    473 	movdqa	$d,0x30(%rsp)
    474 	jmp	.Loop_ssse3
    475 
    476 .align	32
    477 .Loop_ssse3:
    478 ___
    479 	&SSSE3ROUND();
    480 	&pshufd	($c,$c,0b01001110);
    481 	&pshufd	($b,$b,0b00111001);
    482 	&pshufd	($d,$d,0b10010011);
    483 	&nop	();
    484 
    485 	&SSSE3ROUND();
    486 	&pshufd	($c,$c,0b01001110);
    487 	&pshufd	($b,$b,0b10010011);
    488 	&pshufd	($d,$d,0b00111001);
    489 
    490 	&dec	($counter);
    491 	&jnz	(".Loop_ssse3");
    492 
    493 $code.=<<___;
    494 	paddd	0x00(%rsp),$a
    495 	paddd	0x10(%rsp),$b
    496 	paddd	0x20(%rsp),$c
    497 	paddd	0x30(%rsp),$d
    498 
    499 	cmp	\$64,$len
    500 	jb	.Ltail_ssse3
    501 
    502 	movdqu	0x00($inp),$t
    503 	movdqu	0x10($inp),$t1
    504 	pxor	$t,$a			# xor with input
    505 	movdqu	0x20($inp),$t
    506 	pxor	$t1,$b
    507 	movdqu	0x30($inp),$t1
    508 	lea	0x40($inp),$inp		# inp+=64
    509 	pxor	$t,$c
    510 	pxor	$t1,$d
    511 
    512 	movdqu	$a,0x00($out)		# write output
    513 	movdqu	$b,0x10($out)
    514 	movdqu	$c,0x20($out)
    515 	movdqu	$d,0x30($out)
    516 	lea	0x40($out),$out		# out+=64
    517 
    518 	sub	\$64,$len
    519 	jnz	.Loop_outer_ssse3
    520 
    521 	jmp	.Ldone_ssse3
    522 
    523 .align	16
    524 .Ltail_ssse3:
    525 	movdqa	$a,0x00(%rsp)
    526 	movdqa	$b,0x10(%rsp)
    527 	movdqa	$c,0x20(%rsp)
    528 	movdqa	$d,0x30(%rsp)
    529 	xor	$counter,$counter
    530 
    531 .Loop_tail_ssse3:
    532 	movzb	($inp,$counter),%eax
    533 	movzb	(%rsp,$counter),%ecx
    534 	lea	1($counter),$counter
    535 	xor	%ecx,%eax
    536 	mov	%al,-1($out,$counter)
    537 	dec	$len
    538 	jnz	.Loop_tail_ssse3
    539 
    540 .Ldone_ssse3:
    541 ___
    542 $code.=<<___	if ($win64);
    543 	movaps	-0x28(%r9),%xmm6
    544 	movaps	-0x18(%r9),%xmm7
    545 ___
    546 $code.=<<___;
    547 	lea	(%r9),%rsp
    548 .Lssse3_epilogue:
    549 	ret
    550 .size	ChaCha20_ssse3,.-ChaCha20_ssse3
    551 ___
    552 }
    553 
    554 ########################################################################
    555 # SSSE3 code path that handles longer messages.
    556 {
    557 # assign variables to favor Atom front-end
    558 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    559     $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
    560 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    561 	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
    562 
    563 sub SSSE3_lane_ROUND {
    564 my ($a0,$b0,$c0,$d0)=@_;
    565 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
    566 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
    567 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
    568 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
    569 my @x=map("\"$_\"",@xx);
    570 
    571 	# Consider order in which variables are addressed by their
    572 	# index:
    573 	#
    574 	#	a   b   c   d
    575 	#
    576 	#	0   4   8  12 < even round
    577 	#	1   5   9  13
    578 	#	2   6  10  14
    579 	#	3   7  11  15
    580 	#	0   5  10  15 < odd round
    581 	#	1   6  11  12
    582 	#	2   7   8  13
    583 	#	3   4   9  14
    584 	#
     585 	# The 'a', 'b' and 'd' values are permanently allocated in
     586 	# registers, @x[0..7,12..15], while the 'c' values are kept in
     587 	# memory. If you look at the 'c' column, you'll notice that the
     588 	# pair of 'c's in use carries over unchanged across each round
     589 	# boundary, so they only have to be reloaded once per round, in
     590 	# the middle. This is why you'll see a bunch of 'c' stores and
     591 	# loads in the middle, but none at the beginning or end.
    592 
    593 	(
    594 	"&paddd		(@x[$a0],@x[$b0])",	# Q1
    595 	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
    596 	"&pxor		(@x[$d0],@x[$a0])",
    597 	 "&pxor		(@x[$d1],@x[$a1])",
    598 	"&pshufb	(@x[$d0],$t1)",
    599 	 "&pshufb	(@x[$d1],$t1)",
    600 
    601 	"&paddd		($xc,@x[$d0])",
    602 	 "&paddd	($xc_,@x[$d1])",
    603 	"&pxor		(@x[$b0],$xc)",
    604 	 "&pxor		(@x[$b1],$xc_)",
    605 	"&movdqa	($t0,@x[$b0])",
    606 	"&pslld		(@x[$b0],12)",
    607 	"&psrld		($t0,20)",
    608 	 "&movdqa	($t1,@x[$b1])",
    609 	 "&pslld	(@x[$b1],12)",
    610 	"&por		(@x[$b0],$t0)",
    611 	 "&psrld	($t1,20)",
    612 	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
    613 	 "&por		(@x[$b1],$t1)",
    614 
    615 	"&paddd		(@x[$a0],@x[$b0])",
    616 	 "&paddd	(@x[$a1],@x[$b1])",
    617 	"&pxor		(@x[$d0],@x[$a0])",
    618 	 "&pxor		(@x[$d1],@x[$a1])",
    619 	"&pshufb	(@x[$d0],$t0)",
    620 	 "&pshufb	(@x[$d1],$t0)",
    621 
    622 	"&paddd		($xc,@x[$d0])",
    623 	 "&paddd	($xc_,@x[$d1])",
    624 	"&pxor		(@x[$b0],$xc)",
    625 	 "&pxor		(@x[$b1],$xc_)",
    626 	"&movdqa	($t1,@x[$b0])",
    627 	"&pslld		(@x[$b0],7)",
    628 	"&psrld		($t1,25)",
    629 	 "&movdqa	($t0,@x[$b1])",
    630 	 "&pslld	(@x[$b1],7)",
    631 	"&por		(@x[$b0],$t1)",
    632 	 "&psrld	($t0,25)",
    633 	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
    634 	 "&por		(@x[$b1],$t0)",
    635 
    636 	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
    637 	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
    638 	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
    639 	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",
    640 
    641 	"&paddd		(@x[$a2],@x[$b2])",	# Q3
    642 	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
    643 	"&pxor		(@x[$d2],@x[$a2])",
    644 	 "&pxor		(@x[$d3],@x[$a3])",
    645 	"&pshufb	(@x[$d2],$t1)",
    646 	 "&pshufb	(@x[$d3],$t1)",
    647 
    648 	"&paddd		($xc,@x[$d2])",
    649 	 "&paddd	($xc_,@x[$d3])",
    650 	"&pxor		(@x[$b2],$xc)",
    651 	 "&pxor		(@x[$b3],$xc_)",
    652 	"&movdqa	($t0,@x[$b2])",
    653 	"&pslld		(@x[$b2],12)",
    654 	"&psrld		($t0,20)",
    655 	 "&movdqa	($t1,@x[$b3])",
    656 	 "&pslld	(@x[$b3],12)",
    657 	"&por		(@x[$b2],$t0)",
    658 	 "&psrld	($t1,20)",
    659 	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
    660 	 "&por		(@x[$b3],$t1)",
    661 
    662 	"&paddd		(@x[$a2],@x[$b2])",
    663 	 "&paddd	(@x[$a3],@x[$b3])",
    664 	"&pxor		(@x[$d2],@x[$a2])",
    665 	 "&pxor		(@x[$d3],@x[$a3])",
    666 	"&pshufb	(@x[$d2],$t0)",
    667 	 "&pshufb	(@x[$d3],$t0)",
    668 
    669 	"&paddd		($xc,@x[$d2])",
    670 	 "&paddd	($xc_,@x[$d3])",
    671 	"&pxor		(@x[$b2],$xc)",
    672 	 "&pxor		(@x[$b3],$xc_)",
    673 	"&movdqa	($t1,@x[$b2])",
    674 	"&pslld		(@x[$b2],7)",
    675 	"&psrld		($t1,25)",
    676 	 "&movdqa	($t0,@x[$b3])",
    677 	 "&pslld	(@x[$b3],7)",
    678 	"&por		(@x[$b2],$t1)",
    679 	 "&psrld	($t0,25)",
    680 	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
    681 	 "&por		(@x[$b3],$t0)"
    682 	);
    683 }
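# Note that $t0/$t1 double as scratch for the shift-and-or rotations, so the
# .Lrot16/.Lrot24 masks cannot stay resident in registers; their addresses
# are kept in %r10/%r11 (set up by the caller) and the masks are re-fetched
# with a plain movdqa whenever they are needed again.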
    684 
    685 my $xframe = $win64 ? 0xa8 : 8;
    686 
    687 $code.=<<___;
    688 .type	ChaCha20_4x,\@function,5
    689 .align	32
    690 ChaCha20_4x:
    691 .LChaCha20_4x:
    692 	mov		%rsp,%r9		# frame pointer
    693 	mov		%r10,%r11
    694 ___
    695 $code.=<<___	if ($avx>1);
    696 	shr		\$32,%r10		# OPENSSL_ia32cap_P+8
    697 	test		\$`1<<5`,%r10		# test AVX2
    698 	jnz		.LChaCha20_8x
    699 ___
    700 $code.=<<___;
    701 	cmp		\$192,$len
    702 	ja		.Lproceed4x
    703 
    704 	and		\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
    705 	cmp		\$`1<<22`,%r11		# check for MOVBE without XSAVE
    706 	je		.Ldo_sse3_after_all	# to detect Atom
    707 
    708 .Lproceed4x:
    709 	sub		\$0x140+$xframe,%rsp
    710 ___
    711 	################ stack layout
    712 	# +0x00		SIMD equivalent of @x[8-12]
    713 	# ...
    714 	# +0x40		constant copy of key[0-2] smashed by lanes
    715 	# ...
    716 	# +0x100	SIMD counters (with nonce smashed by lanes)
    717 	# ...
    718 	# +0x140
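	# The %rcx = %rsp+0x100 bias ("size optimization" below) lets offsets
	# such as 0x80-0x100(%rcx) be encoded with one-byte displacements,
	# which plain 0x80..0x130(%rsp) would not allow.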
    719 $code.=<<___	if ($win64);
    720 	movaps		%xmm6,-0xa8(%r9)
    721 	movaps		%xmm7,-0x98(%r9)
    722 	movaps		%xmm8,-0x88(%r9)
    723 	movaps		%xmm9,-0x78(%r9)
    724 	movaps		%xmm10,-0x68(%r9)
    725 	movaps		%xmm11,-0x58(%r9)
    726 	movaps		%xmm12,-0x48(%r9)
    727 	movaps		%xmm13,-0x38(%r9)
    728 	movaps		%xmm14,-0x28(%r9)
    729 	movaps		%xmm15,-0x18(%r9)
    730 .L4x_body:
    731 ___
    732 $code.=<<___;
    733 	movdqa		.Lsigma(%rip),$xa3	# key[0]
    734 	movdqu		($key),$xb3		# key[1]
    735 	movdqu		16($key),$xt3		# key[2]
    736 	movdqu		($counter),$xd3		# key[3]
    737 	lea		0x100(%rsp),%rcx	# size optimization
    738 	lea		.Lrot16(%rip),%r10
    739 	lea		.Lrot24(%rip),%r11
    740 
    741 	pshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
    742 	pshufd		\$0x55,$xa3,$xa1
    743 	movdqa		$xa0,0x40(%rsp)		# ... and offload
    744 	pshufd		\$0xaa,$xa3,$xa2
    745 	movdqa		$xa1,0x50(%rsp)
    746 	pshufd		\$0xff,$xa3,$xa3
    747 	movdqa		$xa2,0x60(%rsp)
    748 	movdqa		$xa3,0x70(%rsp)
    749 
    750 	pshufd		\$0x00,$xb3,$xb0
    751 	pshufd		\$0x55,$xb3,$xb1
    752 	movdqa		$xb0,0x80-0x100(%rcx)
    753 	pshufd		\$0xaa,$xb3,$xb2
    754 	movdqa		$xb1,0x90-0x100(%rcx)
    755 	pshufd		\$0xff,$xb3,$xb3
    756 	movdqa		$xb2,0xa0-0x100(%rcx)
    757 	movdqa		$xb3,0xb0-0x100(%rcx)
    758 
    759 	pshufd		\$0x00,$xt3,$xt0	# "$xc0"
    760 	pshufd		\$0x55,$xt3,$xt1	# "$xc1"
    761 	movdqa		$xt0,0xc0-0x100(%rcx)
    762 	pshufd		\$0xaa,$xt3,$xt2	# "$xc2"
    763 	movdqa		$xt1,0xd0-0x100(%rcx)
    764 	pshufd		\$0xff,$xt3,$xt3	# "$xc3"
    765 	movdqa		$xt2,0xe0-0x100(%rcx)
    766 	movdqa		$xt3,0xf0-0x100(%rcx)
    767 
    768 	pshufd		\$0x00,$xd3,$xd0
    769 	pshufd		\$0x55,$xd3,$xd1
    770 	paddd		.Linc(%rip),$xd0	# don't save counters yet
    771 	pshufd		\$0xaa,$xd3,$xd2
    772 	movdqa		$xd1,0x110-0x100(%rcx)
    773 	pshufd		\$0xff,$xd3,$xd3
    774 	movdqa		$xd2,0x120-0x100(%rcx)
    775 	movdqa		$xd3,0x130-0x100(%rcx)
    776 
    777 	jmp		.Loop_enter4x
    778 
    779 .align	32
    780 .Loop_outer4x:
    781 	movdqa		0x40(%rsp),$xa0		# re-load smashed key
    782 	movdqa		0x50(%rsp),$xa1
    783 	movdqa		0x60(%rsp),$xa2
    784 	movdqa		0x70(%rsp),$xa3
    785 	movdqa		0x80-0x100(%rcx),$xb0
    786 	movdqa		0x90-0x100(%rcx),$xb1
    787 	movdqa		0xa0-0x100(%rcx),$xb2
    788 	movdqa		0xb0-0x100(%rcx),$xb3
    789 	movdqa		0xc0-0x100(%rcx),$xt0	# "$xc0"
    790 	movdqa		0xd0-0x100(%rcx),$xt1	# "$xc1"
    791 	movdqa		0xe0-0x100(%rcx),$xt2	# "$xc2"
    792 	movdqa		0xf0-0x100(%rcx),$xt3	# "$xc3"
    793 	movdqa		0x100-0x100(%rcx),$xd0
    794 	movdqa		0x110-0x100(%rcx),$xd1
    795 	movdqa		0x120-0x100(%rcx),$xd2
    796 	movdqa		0x130-0x100(%rcx),$xd3
    797 	paddd		.Lfour(%rip),$xd0	# next SIMD counters
    798 
    799 .Loop_enter4x:
    800 	movdqa		$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
    801 	movdqa		$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
    802 	movdqa		(%r10),$xt3		# .Lrot16(%rip)
    803 	mov		\$10,%eax
    804 	movdqa		$xd0,0x100-0x100(%rcx)	# save SIMD counters
    805 	jmp		.Loop4x
    806 
    807 .align	32
    808 .Loop4x:
    809 ___
    810 	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
    811 	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
    812 $code.=<<___;
    813 	dec		%eax
    814 	jnz		.Loop4x
    815 
    816 	paddd		0x40(%rsp),$xa0		# accumulate key material
    817 	paddd		0x50(%rsp),$xa1
    818 	paddd		0x60(%rsp),$xa2
    819 	paddd		0x70(%rsp),$xa3
    820 
    821 	movdqa		$xa0,$xt2		# "de-interlace" data
    822 	punpckldq	$xa1,$xa0
    823 	movdqa		$xa2,$xt3
    824 	punpckldq	$xa3,$xa2
    825 	punpckhdq	$xa1,$xt2
    826 	punpckhdq	$xa3,$xt3
    827 	movdqa		$xa0,$xa1
    828 	punpcklqdq	$xa2,$xa0		# "a0"
    829 	movdqa		$xt2,$xa3
    830 	punpcklqdq	$xt3,$xt2		# "a2"
    831 	punpckhqdq	$xa2,$xa1		# "a1"
    832 	punpckhqdq	$xt3,$xa3		# "a3"
    833 ___
    834 	($xa2,$xt2)=($xt2,$xa2);
    835 $code.=<<___;
    836 	paddd		0x80-0x100(%rcx),$xb0
    837 	paddd		0x90-0x100(%rcx),$xb1
    838 	paddd		0xa0-0x100(%rcx),$xb2
    839 	paddd		0xb0-0x100(%rcx),$xb3
    840 
    841 	movdqa		$xa0,0x00(%rsp)		# offload $xaN
    842 	movdqa		$xa1,0x10(%rsp)
    843 	movdqa		0x20(%rsp),$xa0		# "xc2"
    844 	movdqa		0x30(%rsp),$xa1		# "xc3"
    845 
    846 	movdqa		$xb0,$xt2
    847 	punpckldq	$xb1,$xb0
    848 	movdqa		$xb2,$xt3
    849 	punpckldq	$xb3,$xb2
    850 	punpckhdq	$xb1,$xt2
    851 	punpckhdq	$xb3,$xt3
    852 	movdqa		$xb0,$xb1
    853 	punpcklqdq	$xb2,$xb0		# "b0"
    854 	movdqa		$xt2,$xb3
    855 	punpcklqdq	$xt3,$xt2		# "b2"
    856 	punpckhqdq	$xb2,$xb1		# "b1"
    857 	punpckhqdq	$xt3,$xb3		# "b3"
    858 ___
    859 	($xb2,$xt2)=($xt2,$xb2);
    860 	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
    861 $code.=<<___;
    862 	paddd		0xc0-0x100(%rcx),$xc0
    863 	paddd		0xd0-0x100(%rcx),$xc1
    864 	paddd		0xe0-0x100(%rcx),$xc2
    865 	paddd		0xf0-0x100(%rcx),$xc3
    866 
    867 	movdqa		$xa2,0x20(%rsp)		# keep offloading $xaN
    868 	movdqa		$xa3,0x30(%rsp)
    869 
    870 	movdqa		$xc0,$xt2
    871 	punpckldq	$xc1,$xc0
    872 	movdqa		$xc2,$xt3
    873 	punpckldq	$xc3,$xc2
    874 	punpckhdq	$xc1,$xt2
    875 	punpckhdq	$xc3,$xt3
    876 	movdqa		$xc0,$xc1
    877 	punpcklqdq	$xc2,$xc0		# "c0"
    878 	movdqa		$xt2,$xc3
    879 	punpcklqdq	$xt3,$xt2		# "c2"
    880 	punpckhqdq	$xc2,$xc1		# "c1"
    881 	punpckhqdq	$xt3,$xc3		# "c3"
    882 ___
    883 	($xc2,$xt2)=($xt2,$xc2);
    884 	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
    885 $code.=<<___;
    886 	paddd		0x100-0x100(%rcx),$xd0
    887 	paddd		0x110-0x100(%rcx),$xd1
    888 	paddd		0x120-0x100(%rcx),$xd2
    889 	paddd		0x130-0x100(%rcx),$xd3
    890 
    891 	movdqa		$xd0,$xt2
    892 	punpckldq	$xd1,$xd0
    893 	movdqa		$xd2,$xt3
    894 	punpckldq	$xd3,$xd2
    895 	punpckhdq	$xd1,$xt2
    896 	punpckhdq	$xd3,$xt3
    897 	movdqa		$xd0,$xd1
    898 	punpcklqdq	$xd2,$xd0		# "d0"
    899 	movdqa		$xt2,$xd3
    900 	punpcklqdq	$xt3,$xt2		# "d2"
    901 	punpckhqdq	$xd2,$xd1		# "d1"
    902 	punpckhqdq	$xt3,$xd3		# "d3"
    903 ___
    904 	($xd2,$xt2)=($xt2,$xd2);
    905 $code.=<<___;
    906 	cmp		\$64*4,$len
    907 	jb		.Ltail4x
    908 
    909 	movdqu		0x00($inp),$xt0		# xor with input
    910 	movdqu		0x10($inp),$xt1
    911 	movdqu		0x20($inp),$xt2
    912 	movdqu		0x30($inp),$xt3
    913 	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
    914 	pxor		$xb0,$xt1
    915 	pxor		$xc0,$xt2
    916 	pxor		$xd0,$xt3
    917 
    918 	 movdqu		$xt0,0x00($out)
    919 	movdqu		0x40($inp),$xt0
    920 	 movdqu		$xt1,0x10($out)
    921 	movdqu		0x50($inp),$xt1
    922 	 movdqu		$xt2,0x20($out)
    923 	movdqu		0x60($inp),$xt2
    924 	 movdqu		$xt3,0x30($out)
    925 	movdqu		0x70($inp),$xt3
    926 	lea		0x80($inp),$inp		# size optimization
    927 	pxor		0x10(%rsp),$xt0
    928 	pxor		$xb1,$xt1
    929 	pxor		$xc1,$xt2
    930 	pxor		$xd1,$xt3
    931 
    932 	 movdqu		$xt0,0x40($out)
    933 	movdqu		0x00($inp),$xt0
    934 	 movdqu		$xt1,0x50($out)
    935 	movdqu		0x10($inp),$xt1
    936 	 movdqu		$xt2,0x60($out)
    937 	movdqu		0x20($inp),$xt2
    938 	 movdqu		$xt3,0x70($out)
    939 	 lea		0x80($out),$out		# size optimization
    940 	movdqu		0x30($inp),$xt3
    941 	pxor		0x20(%rsp),$xt0
    942 	pxor		$xb2,$xt1
    943 	pxor		$xc2,$xt2
    944 	pxor		$xd2,$xt3
    945 
    946 	 movdqu		$xt0,0x00($out)
    947 	movdqu		0x40($inp),$xt0
    948 	 movdqu		$xt1,0x10($out)
    949 	movdqu		0x50($inp),$xt1
    950 	 movdqu		$xt2,0x20($out)
    951 	movdqu		0x60($inp),$xt2
    952 	 movdqu		$xt3,0x30($out)
    953 	movdqu		0x70($inp),$xt3
    954 	lea		0x80($inp),$inp		# inp+=64*4
    955 	pxor		0x30(%rsp),$xt0
    956 	pxor		$xb3,$xt1
    957 	pxor		$xc3,$xt2
    958 	pxor		$xd3,$xt3
    959 	movdqu		$xt0,0x40($out)
    960 	movdqu		$xt1,0x50($out)
    961 	movdqu		$xt2,0x60($out)
    962 	movdqu		$xt3,0x70($out)
    963 	lea		0x80($out),$out		# out+=64*4
    964 
    965 	sub		\$64*4,$len
    966 	jnz		.Loop_outer4x
    967 
    968 	jmp		.Ldone4x
    969 
    970 .Ltail4x:
    971 	cmp		\$192,$len
    972 	jae		.L192_or_more4x
    973 	cmp		\$128,$len
    974 	jae		.L128_or_more4x
    975 	cmp		\$64,$len
    976 	jae		.L64_or_more4x
    977 
    978 	#movdqa		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
    979 	xor		%r10,%r10
    980 	#movdqa		$xt0,0x00(%rsp)
    981 	movdqa		$xb0,0x10(%rsp)
    982 	movdqa		$xc0,0x20(%rsp)
    983 	movdqa		$xd0,0x30(%rsp)
    984 	jmp		.Loop_tail4x
    985 
    986 .align	32
    987 .L64_or_more4x:
    988 	movdqu		0x00($inp),$xt0		# xor with input
    989 	movdqu		0x10($inp),$xt1
    990 	movdqu		0x20($inp),$xt2
    991 	movdqu		0x30($inp),$xt3
     992 	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
    993 	pxor		$xb0,$xt1
    994 	pxor		$xc0,$xt2
    995 	pxor		$xd0,$xt3
    996 	movdqu		$xt0,0x00($out)
    997 	movdqu		$xt1,0x10($out)
    998 	movdqu		$xt2,0x20($out)
    999 	movdqu		$xt3,0x30($out)
   1000 	je		.Ldone4x
   1001 
   1002 	movdqa		0x10(%rsp),$xt0		# $xaN is offloaded, remember?
   1003 	lea		0x40($inp),$inp		# inp+=64*1
   1004 	xor		%r10,%r10
   1005 	movdqa		$xt0,0x00(%rsp)
   1006 	movdqa		$xb1,0x10(%rsp)
   1007 	lea		0x40($out),$out		# out+=64*1
   1008 	movdqa		$xc1,0x20(%rsp)
   1009 	sub		\$64,$len		# len-=64*1
   1010 	movdqa		$xd1,0x30(%rsp)
   1011 	jmp		.Loop_tail4x
   1012 
   1013 .align	32
   1014 .L128_or_more4x:
   1015 	movdqu		0x00($inp),$xt0		# xor with input
   1016 	movdqu		0x10($inp),$xt1
   1017 	movdqu		0x20($inp),$xt2
   1018 	movdqu		0x30($inp),$xt3
   1019 	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
   1020 	pxor		$xb0,$xt1
   1021 	pxor		$xc0,$xt2
   1022 	pxor		$xd0,$xt3
   1023 
   1024 	 movdqu		$xt0,0x00($out)
   1025 	movdqu		0x40($inp),$xt0
   1026 	 movdqu		$xt1,0x10($out)
   1027 	movdqu		0x50($inp),$xt1
   1028 	 movdqu		$xt2,0x20($out)
   1029 	movdqu		0x60($inp),$xt2
   1030 	 movdqu		$xt3,0x30($out)
   1031 	movdqu		0x70($inp),$xt3
   1032 	pxor		0x10(%rsp),$xt0
   1033 	pxor		$xb1,$xt1
   1034 	pxor		$xc1,$xt2
   1035 	pxor		$xd1,$xt3
   1036 	movdqu		$xt0,0x40($out)
   1037 	movdqu		$xt1,0x50($out)
   1038 	movdqu		$xt2,0x60($out)
   1039 	movdqu		$xt3,0x70($out)
   1040 	je		.Ldone4x
   1041 
   1042 	movdqa		0x20(%rsp),$xt0		# $xaN is offloaded, remember?
   1043 	lea		0x80($inp),$inp		# inp+=64*2
   1044 	xor		%r10,%r10
   1045 	movdqa		$xt0,0x00(%rsp)
   1046 	movdqa		$xb2,0x10(%rsp)
   1047 	lea		0x80($out),$out		# out+=64*2
   1048 	movdqa		$xc2,0x20(%rsp)
   1049 	sub		\$128,$len		# len-=64*2
   1050 	movdqa		$xd2,0x30(%rsp)
   1051 	jmp		.Loop_tail4x
   1052 
   1053 .align	32
   1054 .L192_or_more4x:
   1055 	movdqu		0x00($inp),$xt0		# xor with input
   1056 	movdqu		0x10($inp),$xt1
   1057 	movdqu		0x20($inp),$xt2
   1058 	movdqu		0x30($inp),$xt3
   1059 	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
   1060 	pxor		$xb0,$xt1
   1061 	pxor		$xc0,$xt2
   1062 	pxor		$xd0,$xt3
   1063 
   1064 	 movdqu		$xt0,0x00($out)
   1065 	movdqu		0x40($inp),$xt0
   1066 	 movdqu		$xt1,0x10($out)
   1067 	movdqu		0x50($inp),$xt1
   1068 	 movdqu		$xt2,0x20($out)
   1069 	movdqu		0x60($inp),$xt2
   1070 	 movdqu		$xt3,0x30($out)
   1071 	movdqu		0x70($inp),$xt3
   1072 	lea		0x80($inp),$inp		# size optimization
   1073 	pxor		0x10(%rsp),$xt0
   1074 	pxor		$xb1,$xt1
   1075 	pxor		$xc1,$xt2
   1076 	pxor		$xd1,$xt3
   1077 
   1078 	 movdqu		$xt0,0x40($out)
   1079 	movdqu		0x00($inp),$xt0
   1080 	 movdqu		$xt1,0x50($out)
   1081 	movdqu		0x10($inp),$xt1
   1082 	 movdqu		$xt2,0x60($out)
   1083 	movdqu		0x20($inp),$xt2
   1084 	 movdqu		$xt3,0x70($out)
   1085 	 lea		0x80($out),$out		# size optimization
   1086 	movdqu		0x30($inp),$xt3
   1087 	pxor		0x20(%rsp),$xt0
   1088 	pxor		$xb2,$xt1
   1089 	pxor		$xc2,$xt2
   1090 	pxor		$xd2,$xt3
   1091 	movdqu		$xt0,0x00($out)
   1092 	movdqu		$xt1,0x10($out)
   1093 	movdqu		$xt2,0x20($out)
   1094 	movdqu		$xt3,0x30($out)
   1095 	je		.Ldone4x
   1096 
   1097 	movdqa		0x30(%rsp),$xt0		# $xaN is offloaded, remember?
   1098 	lea		0x40($inp),$inp		# inp+=64*3
   1099 	xor		%r10,%r10
   1100 	movdqa		$xt0,0x00(%rsp)
   1101 	movdqa		$xb3,0x10(%rsp)
   1102 	lea		0x40($out),$out		# out+=64*3
   1103 	movdqa		$xc3,0x20(%rsp)
   1104 	sub		\$192,$len		# len-=64*3
   1105 	movdqa		$xd3,0x30(%rsp)
   1106 
   1107 .Loop_tail4x:
   1108 	movzb		($inp,%r10),%eax
   1109 	movzb		(%rsp,%r10),%ecx
   1110 	lea		1(%r10),%r10
   1111 	xor		%ecx,%eax
   1112 	mov		%al,-1($out,%r10)
   1113 	dec		$len
   1114 	jnz		.Loop_tail4x
   1115 
   1116 .Ldone4x:
   1117 ___
   1118 $code.=<<___	if ($win64);
   1119 	movaps		-0xa8(%r9),%xmm6
   1120 	movaps		-0x98(%r9),%xmm7
   1121 	movaps		-0x88(%r9),%xmm8
   1122 	movaps		-0x78(%r9),%xmm9
   1123 	movaps		-0x68(%r9),%xmm10
   1124 	movaps		-0x58(%r9),%xmm11
   1125 	movaps		-0x48(%r9),%xmm12
   1126 	movaps		-0x38(%r9),%xmm13
   1127 	movaps		-0x28(%r9),%xmm14
   1128 	movaps		-0x18(%r9),%xmm15
   1129 ___
   1130 $code.=<<___;
   1131 	lea		(%r9),%rsp
   1132 .L4x_epilogue:
   1133 	ret
   1134 .size	ChaCha20_4x,.-ChaCha20_4x
   1135 ___
   1136 }
   1137 
   1138 ########################################################################
   1139 # AVX2 code path
   1140 if ($avx>1) {
   1141 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
   1142     $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
   1143 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
   1144 	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
   1145 
   1146 sub AVX2_lane_ROUND {
   1147 my ($a0,$b0,$c0,$d0)=@_;
   1148 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
   1149 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
   1150 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
   1151 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
   1152 my @x=map("\"$_\"",@xx);
   1153 
   1154 	# Consider order in which variables are addressed by their
   1155 	# index:
   1156 	#
   1157 	#	a   b   c   d
   1158 	#
   1159 	#	0   4   8  12 < even round
   1160 	#	1   5   9  13
   1161 	#	2   6  10  14
   1162 	#	3   7  11  15
   1163 	#	0   5  10  15 < odd round
   1164 	#	1   6  11  12
   1165 	#	2   7   8  13
   1166 	#	3   4   9  14
   1167 	#
    1168 	# The 'a', 'b' and 'd' values are permanently allocated in
    1169 	# registers, @x[0..7,12..15], while the 'c' values are kept in
    1170 	# memory. If you look at the 'c' column, you'll notice that the
    1171 	# pair of 'c's in use carries over unchanged across each round
    1172 	# boundary, so they only have to be reloaded once per round, in
    1173 	# the middle. This is why you'll see a bunch of 'c' stores and
    1174 	# loads in the middle, but none at the beginning or end.
   1175 
   1176 	(
   1177 	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
   1178 	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
   1179 	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
   1180 	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
   1181 	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
   1182 	 "&vpshufb	(@x[$d1],@x[$d1],$t1)",
   1183 
   1184 	"&vpaddd	($xc,$xc,@x[$d0])",
   1185 	"&vpxor		(@x[$b0],$xc,@x[$b0])",
   1186 	"&vpslld	($t0,@x[$b0],12)",
   1187 	"&vpsrld	(@x[$b0],@x[$b0],20)",
   1188 	"&vpor		(@x[$b0],$t0,@x[$b0])",
   1189 	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
   1190 	 "&vpaddd	($xc_,$xc_,@x[$d1])",
   1191 	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
   1192 	 "&vpslld	($t1,@x[$b1],12)",
   1193 	 "&vpsrld	(@x[$b1],@x[$b1],20)",
   1194 	 "&vpor		(@x[$b1],$t1,@x[$b1])",
   1195 
   1196 	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
   1197 	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
   1198 	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
   1199 	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
   1200 	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
   1201 	 "&vpshufb	(@x[$d1],@x[$d1],$t0)",
   1202 
   1203 	"&vpaddd	($xc,$xc,@x[$d0])",
   1204 	"&vpxor		(@x[$b0],$xc,@x[$b0])",
   1205 	"&vpslld	($t1,@x[$b0],7)",
   1206 	"&vpsrld	(@x[$b0],@x[$b0],25)",
   1207 	"&vpor		(@x[$b0],$t1,@x[$b0])",
   1208 	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
   1209 	 "&vpaddd	($xc_,$xc_,@x[$d1])",
   1210 	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
   1211 	 "&vpslld	($t0,@x[$b1],7)",
   1212 	 "&vpsrld	(@x[$b1],@x[$b1],25)",
   1213 	 "&vpor		(@x[$b1],$t0,@x[$b1])",
   1214 
   1215 	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
   1216 	 "&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
   1217 	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
   1218 	 "&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",
   1219 
   1220 	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
   1221 	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
   1222 	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
   1223 	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
   1224 	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
   1225 	 "&vpshufb	(@x[$d3],@x[$d3],$t1)",
   1226 
   1227 	"&vpaddd	($xc,$xc,@x[$d2])",
   1228 	"&vpxor		(@x[$b2],$xc,@x[$b2])",
   1229 	"&vpslld	($t0,@x[$b2],12)",
   1230 	"&vpsrld	(@x[$b2],@x[$b2],20)",
   1231 	"&vpor		(@x[$b2],$t0,@x[$b2])",
   1232 	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
   1233 	 "&vpaddd	($xc_,$xc_,@x[$d3])",
   1234 	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
   1235 	 "&vpslld	($t1,@x[$b3],12)",
   1236 	 "&vpsrld	(@x[$b3],@x[$b3],20)",
   1237 	 "&vpor		(@x[$b3],$t1,@x[$b3])",
   1238 
   1239 	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
   1240 	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
   1241 	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
   1242 	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
   1243 	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
   1244 	 "&vpshufb	(@x[$d3],@x[$d3],$t0)",
   1245 
   1246 	"&vpaddd	($xc,$xc,@x[$d2])",
   1247 	"&vpxor		(@x[$b2],$xc,@x[$b2])",
   1248 	"&vpslld	($t1,@x[$b2],7)",
   1249 	"&vpsrld	(@x[$b2],@x[$b2],25)",
   1250 	"&vpor		(@x[$b2],$t1,@x[$b2])",
   1251 	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
   1252 	 "&vpaddd	($xc_,$xc_,@x[$d3])",
   1253 	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
   1254 	 "&vpslld	($t0,@x[$b3],7)",
   1255 	 "&vpsrld	(@x[$b3],@x[$b3],25)",
   1256 	 "&vpor		(@x[$b3],$t0,@x[$b3])"
   1257 	);
   1258 }
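# As in the 4x path, the byte-granular rotations use the shuffle masks and
# the 12- and 7-bit ones are built from shifts; here the 128-bit .Lrot16/
# .Lrot24 constants are re-materialized across both ymm lanes with
# vbroadcasti128 whenever $t0/$t1 have been recycled as shift temporaries.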
   1259 
   1260 my $xframe = $win64 ? 0xa8 : 8;
   1261 
   1262 $code.=<<___;
   1263 .type	ChaCha20_8x,\@function,5
   1264 .align	32
   1265 ChaCha20_8x:
   1266 .LChaCha20_8x:
   1267 	mov		%rsp,%r9		# frame register
   1268 	sub		\$0x280+$xframe,%rsp
   1269 	and		\$-32,%rsp
   1270 ___
   1271 $code.=<<___	if ($win64);
   1272 	movaps		%xmm6,-0xa8(%r9)
   1273 	movaps		%xmm7,-0x98(%r9)
   1274 	movaps		%xmm8,-0x88(%r9)
   1275 	movaps		%xmm9,-0x78(%r9)
   1276 	movaps		%xmm10,-0x68(%r9)
   1277 	movaps		%xmm11,-0x58(%r9)
   1278 	movaps		%xmm12,-0x48(%r9)
   1279 	movaps		%xmm13,-0x38(%r9)
   1280 	movaps		%xmm14,-0x28(%r9)
   1281 	movaps		%xmm15,-0x18(%r9)
   1282 .L8x_body:
   1283 ___
   1284 $code.=<<___;
   1285 	vzeroupper
   1286 
   1287 	################ stack layout
   1288 	# +0x00		SIMD equivalent of @x[8-12]
   1289 	# ...
   1290 	# +0x80		constant copy of key[0-2] smashed by lanes
   1291 	# ...
   1292 	# +0x200	SIMD counters (with nonce smashed by lanes)
   1293 	# ...
   1294 	# +0x280
   1295 
   1296 	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
   1297 	vbroadcasti128	($key),$xb3		# key[1]
   1298 	vbroadcasti128	16($key),$xt3		# key[2]
   1299 	vbroadcasti128	($counter),$xd3		# key[3]
   1300 	lea		0x100(%rsp),%rcx	# size optimization
   1301 	lea		0x200(%rsp),%rax	# size optimization
   1302 	lea		.Lrot16(%rip),%r10
   1303 	lea		.Lrot24(%rip),%r11
   1304 
   1305 	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
   1306 	vpshufd		\$0x55,$xa3,$xa1
   1307 	vmovdqa		$xa0,0x80-0x100(%rcx)	# ... and offload
   1308 	vpshufd		\$0xaa,$xa3,$xa2
   1309 	vmovdqa		$xa1,0xa0-0x100(%rcx)
   1310 	vpshufd		\$0xff,$xa3,$xa3
   1311 	vmovdqa		$xa2,0xc0-0x100(%rcx)
   1312 	vmovdqa		$xa3,0xe0-0x100(%rcx)
   1313 
   1314 	vpshufd		\$0x00,$xb3,$xb0
   1315 	vpshufd		\$0x55,$xb3,$xb1
   1316 	vmovdqa		$xb0,0x100-0x100(%rcx)
   1317 	vpshufd		\$0xaa,$xb3,$xb2
   1318 	vmovdqa		$xb1,0x120-0x100(%rcx)
   1319 	vpshufd		\$0xff,$xb3,$xb3
   1320 	vmovdqa		$xb2,0x140-0x100(%rcx)
   1321 	vmovdqa		$xb3,0x160-0x100(%rcx)
   1322 
   1323 	vpshufd		\$0x00,$xt3,$xt0	# "xc0"
   1324 	vpshufd		\$0x55,$xt3,$xt1	# "xc1"
   1325 	vmovdqa		$xt0,0x180-0x200(%rax)
   1326 	vpshufd		\$0xaa,$xt3,$xt2	# "xc2"
   1327 	vmovdqa		$xt1,0x1a0-0x200(%rax)
   1328 	vpshufd		\$0xff,$xt3,$xt3	# "xc3"
   1329 	vmovdqa		$xt2,0x1c0-0x200(%rax)
   1330 	vmovdqa		$xt3,0x1e0-0x200(%rax)
   1331 
   1332 	vpshufd		\$0x00,$xd3,$xd0
   1333 	vpshufd		\$0x55,$xd3,$xd1
   1334 	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
   1335 	vpshufd		\$0xaa,$xd3,$xd2
   1336 	vmovdqa		$xd1,0x220-0x200(%rax)
   1337 	vpshufd		\$0xff,$xd3,$xd3
   1338 	vmovdqa		$xd2,0x240-0x200(%rax)
   1339 	vmovdqa		$xd3,0x260-0x200(%rax)
   1340 
   1341 	jmp		.Loop_enter8x
   1342 
   1343 .align	32
   1344 .Loop_outer8x:
   1345 	vmovdqa		0x80-0x100(%rcx),$xa0	# re-load smashed key
   1346 	vmovdqa		0xa0-0x100(%rcx),$xa1
   1347 	vmovdqa		0xc0-0x100(%rcx),$xa2
   1348 	vmovdqa		0xe0-0x100(%rcx),$xa3
   1349 	vmovdqa		0x100-0x100(%rcx),$xb0
   1350 	vmovdqa		0x120-0x100(%rcx),$xb1
   1351 	vmovdqa		0x140-0x100(%rcx),$xb2
   1352 	vmovdqa		0x160-0x100(%rcx),$xb3
   1353 	vmovdqa		0x180-0x200(%rax),$xt0	# "xc0"
   1354 	vmovdqa		0x1a0-0x200(%rax),$xt1	# "xc1"
   1355 	vmovdqa		0x1c0-0x200(%rax),$xt2	# "xc2"
   1356 	vmovdqa		0x1e0-0x200(%rax),$xt3	# "xc3"
   1357 	vmovdqa		0x200-0x200(%rax),$xd0
   1358 	vmovdqa		0x220-0x200(%rax),$xd1
   1359 	vmovdqa		0x240-0x200(%rax),$xd2
   1360 	vmovdqa		0x260-0x200(%rax),$xd3
   1361 	vpaddd		.Leight(%rip),$xd0,$xd0	# next SIMD counters
   1362 
   1363 .Loop_enter8x:
   1364 	vmovdqa		$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
   1365 	vmovdqa		$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
   1366 	vbroadcasti128	(%r10),$xt3
   1367 	vmovdqa		$xd0,0x200-0x200(%rax)	# save SIMD counters
   1368 	mov		\$10,%eax
   1369 	jmp		.Loop8x
   1370 
   1371 .align	32
   1372 .Loop8x:
   1373 ___
   1374 	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
   1375 	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
   1376 $code.=<<___;
   1377 	dec		%eax
   1378 	jnz		.Loop8x
   1379 
   1380 	lea		0x200(%rsp),%rax	# size optimization
   1381 	vpaddd		0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
   1382 	vpaddd		0xa0-0x100(%rcx),$xa1,$xa1
   1383 	vpaddd		0xc0-0x100(%rcx),$xa2,$xa2
   1384 	vpaddd		0xe0-0x100(%rcx),$xa3,$xa3
   1385 
   1386 	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
   1387 	vpunpckldq	$xa3,$xa2,$xt3
   1388 	vpunpckhdq	$xa1,$xa0,$xa0
   1389 	vpunpckhdq	$xa3,$xa2,$xa2
   1390 	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
   1391 	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
   1392 	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
   1393 	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
   1394 ___
   1395 	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
   1396 $code.=<<___;
   1397 	vpaddd		0x100-0x100(%rcx),$xb0,$xb0
   1398 	vpaddd		0x120-0x100(%rcx),$xb1,$xb1
   1399 	vpaddd		0x140-0x100(%rcx),$xb2,$xb2
   1400 	vpaddd		0x160-0x100(%rcx),$xb3,$xb3
   1401 
   1402 	vpunpckldq	$xb1,$xb0,$xt2
   1403 	vpunpckldq	$xb3,$xb2,$xt3
   1404 	vpunpckhdq	$xb1,$xb0,$xb0
   1405 	vpunpckhdq	$xb3,$xb2,$xb2
   1406 	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
   1407 	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
   1408 	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
   1409 	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
   1410 ___
   1411 	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
   1412 $code.=<<___;
   1413 	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
   1414 	vperm2i128	\$0x31,$xb0,$xa0,$xb0
   1415 	vperm2i128	\$0x20,$xb1,$xa1,$xa0
   1416 	vperm2i128	\$0x31,$xb1,$xa1,$xb1
   1417 	vperm2i128	\$0x20,$xb2,$xa2,$xa1
   1418 	vperm2i128	\$0x31,$xb2,$xa2,$xb2
   1419 	vperm2i128	\$0x20,$xb3,$xa3,$xa2
   1420 	vperm2i128	\$0x31,$xb3,$xa3,$xb3
   1421 ___
   1422 	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
   1423 	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
   1424 $code.=<<___;
   1425 	vmovdqa		$xa0,0x00(%rsp)		# offload $xaN
   1426 	vmovdqa		$xa1,0x20(%rsp)
   1427 	vmovdqa		0x40(%rsp),$xc2		# $xa0
   1428 	vmovdqa		0x60(%rsp),$xc3		# $xa1
   1429 
   1430 	vpaddd		0x180-0x200(%rax),$xc0,$xc0
   1431 	vpaddd		0x1a0-0x200(%rax),$xc1,$xc1
   1432 	vpaddd		0x1c0-0x200(%rax),$xc2,$xc2
   1433 	vpaddd		0x1e0-0x200(%rax),$xc3,$xc3
   1434 
   1435 	vpunpckldq	$xc1,$xc0,$xt2
   1436 	vpunpckldq	$xc3,$xc2,$xt3
   1437 	vpunpckhdq	$xc1,$xc0,$xc0
   1438 	vpunpckhdq	$xc3,$xc2,$xc2
   1439 	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
   1440 	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
   1441 	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
   1442 	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
   1443 ___
   1444 	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
   1445 $code.=<<___;
   1446 	vpaddd		0x200-0x200(%rax),$xd0,$xd0
   1447 	vpaddd		0x220-0x200(%rax),$xd1,$xd1
   1448 	vpaddd		0x240-0x200(%rax),$xd2,$xd2
   1449 	vpaddd		0x260-0x200(%rax),$xd3,$xd3
   1450 
   1451 	vpunpckldq	$xd1,$xd0,$xt2
   1452 	vpunpckldq	$xd3,$xd2,$xt3
   1453 	vpunpckhdq	$xd1,$xd0,$xd0
   1454 	vpunpckhdq	$xd3,$xd2,$xd2
   1455 	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
   1456 	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
   1457 	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
   1458 	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
   1459 ___
   1460 	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
   1461 $code.=<<___;
   1462 	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
   1463 	vperm2i128	\$0x31,$xd0,$xc0,$xd0
   1464 	vperm2i128	\$0x20,$xd1,$xc1,$xc0
   1465 	vperm2i128	\$0x31,$xd1,$xc1,$xd1
   1466 	vperm2i128	\$0x20,$xd2,$xc2,$xc1
   1467 	vperm2i128	\$0x31,$xd2,$xc2,$xd2
   1468 	vperm2i128	\$0x20,$xd3,$xc3,$xc2
   1469 	vperm2i128	\$0x31,$xd3,$xc3,$xd3
   1470 ___
   1471 	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
   1472 	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
   1473 	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
   1474 	($xa0,$xa1)=($xt2,$xt3);
   1475 $code.=<<___;
   1476 	vmovdqa		0x00(%rsp),$xa0		# $xaN was offloaded, remember?
   1477 	vmovdqa		0x20(%rsp),$xa1
   1478 
   1479 	cmp		\$64*8,$len
   1480 	jb		.Ltail8x
   1481 
   1482 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1483 	vpxor		0x20($inp),$xb0,$xb0
   1484 	vpxor		0x40($inp),$xc0,$xc0
   1485 	vpxor		0x60($inp),$xd0,$xd0
   1486 	lea		0x80($inp),$inp		# size optimization
   1487 	vmovdqu		$xa0,0x00($out)
   1488 	vmovdqu		$xb0,0x20($out)
   1489 	vmovdqu		$xc0,0x40($out)
   1490 	vmovdqu		$xd0,0x60($out)
   1491 	lea		0x80($out),$out		# size optimization
   1492 
   1493 	vpxor		0x00($inp),$xa1,$xa1
   1494 	vpxor		0x20($inp),$xb1,$xb1
   1495 	vpxor		0x40($inp),$xc1,$xc1
   1496 	vpxor		0x60($inp),$xd1,$xd1
   1497 	lea		0x80($inp),$inp		# size optimization
   1498 	vmovdqu		$xa1,0x00($out)
   1499 	vmovdqu		$xb1,0x20($out)
   1500 	vmovdqu		$xc1,0x40($out)
   1501 	vmovdqu		$xd1,0x60($out)
   1502 	lea		0x80($out),$out		# size optimization
   1503 
   1504 	vpxor		0x00($inp),$xa2,$xa2
   1505 	vpxor		0x20($inp),$xb2,$xb2
   1506 	vpxor		0x40($inp),$xc2,$xc2
   1507 	vpxor		0x60($inp),$xd2,$xd2
   1508 	lea		0x80($inp),$inp		# size optimization
   1509 	vmovdqu		$xa2,0x00($out)
   1510 	vmovdqu		$xb2,0x20($out)
   1511 	vmovdqu		$xc2,0x40($out)
   1512 	vmovdqu		$xd2,0x60($out)
   1513 	lea		0x80($out),$out		# size optimization
   1514 
   1515 	vpxor		0x00($inp),$xa3,$xa3
   1516 	vpxor		0x20($inp),$xb3,$xb3
   1517 	vpxor		0x40($inp),$xc3,$xc3
   1518 	vpxor		0x60($inp),$xd3,$xd3
   1519 	lea		0x80($inp),$inp		# size optimization
   1520 	vmovdqu		$xa3,0x00($out)
   1521 	vmovdqu		$xb3,0x20($out)
   1522 	vmovdqu		$xc3,0x40($out)
   1523 	vmovdqu		$xd3,0x60($out)
   1524 	lea		0x80($out),$out		# size optimization
   1525 
   1526 	sub		\$64*8,$len
   1527 	jnz		.Loop_outer8x
   1528 
   1529 	jmp		.Ldone8x
   1530 
   1531 .Ltail8x:
   1532 	cmp		\$448,$len
   1533 	jae		.L448_or_more8x
   1534 	cmp		\$384,$len
   1535 	jae		.L384_or_more8x
   1536 	cmp		\$320,$len
   1537 	jae		.L320_or_more8x
   1538 	cmp		\$256,$len
   1539 	jae		.L256_or_more8x
   1540 	cmp		\$192,$len
   1541 	jae		.L192_or_more8x
   1542 	cmp		\$128,$len
   1543 	jae		.L128_or_more8x
   1544 	cmp		\$64,$len
   1545 	jae		.L64_or_more8x
   1546 
   1547 	xor		%r10,%r10
   1548 	vmovdqa		$xa0,0x00(%rsp)
   1549 	vmovdqa		$xb0,0x20(%rsp)
   1550 	jmp		.Loop_tail8x
   1551 
   1552 .align	32
   1553 .L64_or_more8x:
   1554 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1555 	vpxor		0x20($inp),$xb0,$xb0
   1556 	vmovdqu		$xa0,0x00($out)
   1557 	vmovdqu		$xb0,0x20($out)
   1558 	je		.Ldone8x
   1559 
   1560 	lea		0x40($inp),$inp		# inp+=64*1
   1561 	xor		%r10,%r10
   1562 	vmovdqa		$xc0,0x00(%rsp)
   1563 	lea		0x40($out),$out		# out+=64*1
   1564 	sub		\$64,$len		# len-=64*1
   1565 	vmovdqa		$xd0,0x20(%rsp)
   1566 	jmp		.Loop_tail8x
   1567 
   1568 .align	32
   1569 .L128_or_more8x:
   1570 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1571 	vpxor		0x20($inp),$xb0,$xb0
   1572 	vpxor		0x40($inp),$xc0,$xc0
   1573 	vpxor		0x60($inp),$xd0,$xd0
   1574 	vmovdqu		$xa0,0x00($out)
   1575 	vmovdqu		$xb0,0x20($out)
   1576 	vmovdqu		$xc0,0x40($out)
   1577 	vmovdqu		$xd0,0x60($out)
   1578 	je		.Ldone8x
   1579 
   1580 	lea		0x80($inp),$inp		# inp+=64*2
   1581 	xor		%r10,%r10
   1582 	vmovdqa		$xa1,0x00(%rsp)
   1583 	lea		0x80($out),$out		# out+=64*2
   1584 	sub		\$128,$len		# len-=64*2
   1585 	vmovdqa		$xb1,0x20(%rsp)
   1586 	jmp		.Loop_tail8x
   1587 
   1588 .align	32
   1589 .L192_or_more8x:
   1590 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1591 	vpxor		0x20($inp),$xb0,$xb0
   1592 	vpxor		0x40($inp),$xc0,$xc0
   1593 	vpxor		0x60($inp),$xd0,$xd0
   1594 	vpxor		0x80($inp),$xa1,$xa1
   1595 	vpxor		0xa0($inp),$xb1,$xb1
   1596 	vmovdqu		$xa0,0x00($out)
   1597 	vmovdqu		$xb0,0x20($out)
   1598 	vmovdqu		$xc0,0x40($out)
   1599 	vmovdqu		$xd0,0x60($out)
   1600 	vmovdqu		$xa1,0x80($out)
   1601 	vmovdqu		$xb1,0xa0($out)
   1602 	je		.Ldone8x
   1603 
   1604 	lea		0xc0($inp),$inp		# inp+=64*3
   1605 	xor		%r10,%r10
   1606 	vmovdqa		$xc1,0x00(%rsp)
   1607 	lea		0xc0($out),$out		# out+=64*3
   1608 	sub		\$192,$len		# len-=64*3
   1609 	vmovdqa		$xd1,0x20(%rsp)
   1610 	jmp		.Loop_tail8x
   1611 
   1612 .align	32
   1613 .L256_or_more8x:
   1614 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1615 	vpxor		0x20($inp),$xb0,$xb0
   1616 	vpxor		0x40($inp),$xc0,$xc0
   1617 	vpxor		0x60($inp),$xd0,$xd0
   1618 	vpxor		0x80($inp),$xa1,$xa1
   1619 	vpxor		0xa0($inp),$xb1,$xb1
   1620 	vpxor		0xc0($inp),$xc1,$xc1
   1621 	vpxor		0xe0($inp),$xd1,$xd1
   1622 	vmovdqu		$xa0,0x00($out)
   1623 	vmovdqu		$xb0,0x20($out)
   1624 	vmovdqu		$xc0,0x40($out)
   1625 	vmovdqu		$xd0,0x60($out)
   1626 	vmovdqu		$xa1,0x80($out)
   1627 	vmovdqu		$xb1,0xa0($out)
   1628 	vmovdqu		$xc1,0xc0($out)
   1629 	vmovdqu		$xd1,0xe0($out)
   1630 	je		.Ldone8x
   1631 
   1632 	lea		0x100($inp),$inp	# inp+=64*4
   1633 	xor		%r10,%r10
   1634 	vmovdqa		$xa2,0x00(%rsp)
   1635 	lea		0x100($out),$out	# out+=64*4
   1636 	sub		\$256,$len		# len-=64*4
   1637 	vmovdqa		$xb2,0x20(%rsp)
   1638 	jmp		.Loop_tail8x
   1639 
   1640 .align	32
   1641 .L320_or_more8x:
   1642 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1643 	vpxor		0x20($inp),$xb0,$xb0
   1644 	vpxor		0x40($inp),$xc0,$xc0
   1645 	vpxor		0x60($inp),$xd0,$xd0
   1646 	vpxor		0x80($inp),$xa1,$xa1
   1647 	vpxor		0xa0($inp),$xb1,$xb1
   1648 	vpxor		0xc0($inp),$xc1,$xc1
   1649 	vpxor		0xe0($inp),$xd1,$xd1
   1650 	vpxor		0x100($inp),$xa2,$xa2
   1651 	vpxor		0x120($inp),$xb2,$xb2
   1652 	vmovdqu		$xa0,0x00($out)
   1653 	vmovdqu		$xb0,0x20($out)
   1654 	vmovdqu		$xc0,0x40($out)
   1655 	vmovdqu		$xd0,0x60($out)
   1656 	vmovdqu		$xa1,0x80($out)
   1657 	vmovdqu		$xb1,0xa0($out)
   1658 	vmovdqu		$xc1,0xc0($out)
   1659 	vmovdqu		$xd1,0xe0($out)
   1660 	vmovdqu		$xa2,0x100($out)
   1661 	vmovdqu		$xb2,0x120($out)
   1662 	je		.Ldone8x
   1663 
   1664 	lea		0x140($inp),$inp	# inp+=64*5
   1665 	xor		%r10,%r10
   1666 	vmovdqa		$xc2,0x00(%rsp)
   1667 	lea		0x140($out),$out	# out+=64*5
   1668 	sub		\$320,$len		# len-=64*5
   1669 	vmovdqa		$xd2,0x20(%rsp)
   1670 	jmp		.Loop_tail8x
   1671 
   1672 .align	32
   1673 .L384_or_more8x:
   1674 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1675 	vpxor		0x20($inp),$xb0,$xb0
   1676 	vpxor		0x40($inp),$xc0,$xc0
   1677 	vpxor		0x60($inp),$xd0,$xd0
   1678 	vpxor		0x80($inp),$xa1,$xa1
   1679 	vpxor		0xa0($inp),$xb1,$xb1
   1680 	vpxor		0xc0($inp),$xc1,$xc1
   1681 	vpxor		0xe0($inp),$xd1,$xd1
   1682 	vpxor		0x100($inp),$xa2,$xa2
   1683 	vpxor		0x120($inp),$xb2,$xb2
   1684 	vpxor		0x140($inp),$xc2,$xc2
   1685 	vpxor		0x160($inp),$xd2,$xd2
   1686 	vmovdqu		$xa0,0x00($out)
   1687 	vmovdqu		$xb0,0x20($out)
   1688 	vmovdqu		$xc0,0x40($out)
   1689 	vmovdqu		$xd0,0x60($out)
   1690 	vmovdqu		$xa1,0x80($out)
   1691 	vmovdqu		$xb1,0xa0($out)
   1692 	vmovdqu		$xc1,0xc0($out)
   1693 	vmovdqu		$xd1,0xe0($out)
   1694 	vmovdqu		$xa2,0x100($out)
   1695 	vmovdqu		$xb2,0x120($out)
   1696 	vmovdqu		$xc2,0x140($out)
   1697 	vmovdqu		$xd2,0x160($out)
   1698 	je		.Ldone8x
   1699 
   1700 	lea		0x180($inp),$inp	# inp+=64*6
   1701 	xor		%r10,%r10
   1702 	vmovdqa		$xa3,0x00(%rsp)
   1703 	lea		0x180($out),$out	# out+=64*6
   1704 	sub		\$384,$len		# len-=64*6
   1705 	vmovdqa		$xb3,0x20(%rsp)
   1706 	jmp		.Loop_tail8x
   1707 
   1708 .align	32
   1709 .L448_or_more8x:
   1710 	vpxor		0x00($inp),$xa0,$xa0	# xor with input
   1711 	vpxor		0x20($inp),$xb0,$xb0
   1712 	vpxor		0x40($inp),$xc0,$xc0
   1713 	vpxor		0x60($inp),$xd0,$xd0
   1714 	vpxor		0x80($inp),$xa1,$xa1
   1715 	vpxor		0xa0($inp),$xb1,$xb1
   1716 	vpxor		0xc0($inp),$xc1,$xc1
   1717 	vpxor		0xe0($inp),$xd1,$xd1
   1718 	vpxor		0x100($inp),$xa2,$xa2
   1719 	vpxor		0x120($inp),$xb2,$xb2
   1720 	vpxor		0x140($inp),$xc2,$xc2
   1721 	vpxor		0x160($inp),$xd2,$xd2
   1722 	vpxor		0x180($inp),$xa3,$xa3
   1723 	vpxor		0x1a0($inp),$xb3,$xb3
   1724 	vmovdqu		$xa0,0x00($out)
   1725 	vmovdqu		$xb0,0x20($out)
   1726 	vmovdqu		$xc0,0x40($out)
   1727 	vmovdqu		$xd0,0x60($out)
   1728 	vmovdqu		$xa1,0x80($out)
   1729 	vmovdqu		$xb1,0xa0($out)
   1730 	vmovdqu		$xc1,0xc0($out)
   1731 	vmovdqu		$xd1,0xe0($out)
   1732 	vmovdqu		$xa2,0x100($out)
   1733 	vmovdqu		$xb2,0x120($out)
   1734 	vmovdqu		$xc2,0x140($out)
   1735 	vmovdqu		$xd2,0x160($out)
   1736 	vmovdqu		$xa3,0x180($out)
   1737 	vmovdqu		$xb3,0x1a0($out)
   1738 	je		.Ldone8x
   1739 
   1740 	lea		0x1c0($inp),$inp	# inp+=64*7
   1741 	xor		%r10,%r10
   1742 	vmovdqa		$xc3,0x00(%rsp)
   1743 	lea		0x1c0($out),$out	# out+=64*7
   1744 	sub		\$448,$len		# len-=64*7
   1745 	vmovdqa		$xd3,0x20(%rsp)
   1746 
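	# Byte-wise tail: whatever remains (at most 63 bytes) is xored with
	# the 64 bytes of key-stream staged at (%rsp) by the branches above.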
   1747 .Loop_tail8x:
   1748 	movzb		($inp,%r10),%eax
   1749 	movzb		(%rsp,%r10),%ecx
   1750 	lea		1(%r10),%r10
   1751 	xor		%ecx,%eax
   1752 	mov		%al,-1($out,%r10)
   1753 	dec		$len
   1754 	jnz		.Loop_tail8x
   1755 
   1756 .Ldone8x:
   1757 	vzeroall
   1758 ___
   1759 $code.=<<___	if ($win64);
   1760 	movaps		-0xa8(%r9),%xmm6
   1761 	movaps		-0x98(%r9),%xmm7
   1762 	movaps		-0x88(%r9),%xmm8
   1763 	movaps		-0x78(%r9),%xmm9
   1764 	movaps		-0x68(%r9),%xmm10
   1765 	movaps		-0x58(%r9),%xmm11
   1766 	movaps		-0x48(%r9),%xmm12
   1767 	movaps		-0x38(%r9),%xmm13
   1768 	movaps		-0x28(%r9),%xmm14
   1769 	movaps		-0x18(%r9),%xmm15
   1770 ___
   1771 $code.=<<___;
   1772 	lea		(%r9),%rsp
   1773 .L8x_epilogue:
   1774 	ret
   1775 .size	ChaCha20_8x,.-ChaCha20_8x
   1776 ___
   1777 }
   1778 
   1779 ########################################################################
   1780 # AVX512 code paths
   1781 if ($avx>2) {
   1782 # This one handles shorter inputs...
   1783 
   1784 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
   1785 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
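# Register layout for this path: each of $a..$d holds four copies of one
# 16-byte row of the ChaCha state, loaded with vbroadcasti32x4; presumably
# .Lzeroz staggers the counter word of the four 128-bit lanes by 0..3 and
# .Lfourz advances it by four, so one pass through the rounds yields four
# consecutive 64-byte blocks that are peeled off lane by lane with
# vextracti32x4 below.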
   1786 
   1787 sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
   1788 	&vpaddd	($a,$a,$b);
   1789 	&vpxord	($d,$d,$a);
   1790 	&vprold	($d,$d,16);
   1791 
   1792 	&vpaddd	($c,$c,$d);
   1793 	&vpxord	($b,$b,$c);
   1794 	&vprold	($b,$b,12);
   1795 
   1796 	&vpaddd	($a,$a,$b);
   1797 	&vpxord	($d,$d,$a);
   1798 	&vprold	($d,$d,8);
   1799 
   1800 	&vpaddd	($c,$c,$d);
   1801 	&vpxord	($b,$b,$c);
   1802 	&vprold	($b,$b,7);
   1803 }
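# For reference, each AVX512ROUND invocation performs a ChaCha quarter-round
# simultaneously on all sixteen 32-bit columns of $a..$d.  A minimal scalar
# sketch of the same operation (the helper name quarter_round is illustrative
# only; nothing below uses it):
#
#	sub quarter_round {
#	my ($a,$b,$c,$d)=@_;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
#	return ($a,$b,$c,$d);
#	}
#
# vprold rotates in a single instruction, so this path needs neither a
# shift/shift/or sequence nor the .Lrot16-style byte-shuffle masks used by
# the pre-AVX512 code paths.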
   1804 
   1805 my $xframe = $win64 ? 32+8 : 8;
   1806 
   1807 $code.=<<___;
   1808 .type	ChaCha20_avx512,\@function,5
   1809 .align	32
   1810 ChaCha20_avx512:
   1811 .LChaCha20_avx512:
   1812 	mov	%rsp,%r9		# frame pointer
   1813 	cmp	\$512,$len
   1814 	ja	.LChaCha20_16x		# inputs larger than 512 bytes take the 16x path
   1815 
   1816 	sub	\$64+$xframe,%rsp
   1817 ___
   1818 $code.=<<___	if ($win64);
   1819 	movaps	%xmm6,-0x28(%r9)
   1820 	movaps	%xmm7,-0x18(%r9)
   1821 .Lavx512_body:
   1822 ___
   1823 $code.=<<___;
   1824 	vbroadcasti32x4	.Lsigma(%rip),$a
   1825 	vbroadcasti32x4	($key),$b
   1826 	vbroadcasti32x4	16($key),$c
   1827 	vbroadcasti32x4	($counter),$d
   1828 
   1829 	vmovdqa32	$a,$a_
   1830 	vmovdqa32	$b,$b_
   1831 	vmovdqa32	$c,$c_
   1832 	vpaddd		.Lzeroz(%rip),$d,$d
   1833 	vmovdqa32	.Lfourz(%rip),$fourz
   1834 	mov		\$10,$counter	# reuse $counter
   1835 	vmovdqa32	$d,$d_
   1836 	jmp		.Loop_avx512
   1837 
   1838 .align	16
   1839 .Loop_outer_avx512:
   1840 	vmovdqa32	$a_,$a
   1841 	vmovdqa32	$b_,$b
   1842 	vmovdqa32	$c_,$c
   1843 	vpaddd		$fourz,$d_,$d
   1844 	mov		\$10,$counter
   1845 	vmovdqa32	$d,$d_
   1846 	jmp		.Loop_avx512
   1847 
   1848 .align	32
   1849 .Loop_avx512:
   1850 ___
   1851 	&AVX512ROUND();
   1852 	&vpshufd	($c,$c,0b01001110);
   1853 	&vpshufd	($b,$b,0b00111001);
   1854 	&vpshufd	($d,$d,0b10010011);
   1855 
   1856 	&AVX512ROUND();
   1857 	&vpshufd	($c,$c,0b01001110);
   1858 	&vpshufd	($b,$b,0b10010011);
   1859 	&vpshufd	($d,$d,0b00111001);
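	# The vpshufd immediates rotate the four 32-bit words within each
	# 128-bit lane: 0b00111001 by one position, 0b01001110 by two and
	# 0b10010011 by three.  The first group above moves the state from
	# column into diagonal order for the second round; the second group
	# rotates it back into column order.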
   1860 
   1861 	&dec		($counter);
   1862 	&jnz		(".Loop_avx512");
   1863 
   1864 $code.=<<___;
   1865 	vpaddd		$a_,$a,$a
   1866 	vpaddd		$b_,$b,$b
   1867 	vpaddd		$c_,$c,$c
   1868 	vpaddd		$d_,$d,$d
   1869 
   1870 	sub		\$64,$len
   1871 	jb		.Ltail64_avx512
   1872 
   1873 	vpxor		0x00($inp),%x#$a,$t0	# xor with input
   1874 	vpxor		0x10($inp),%x#$b,$t1
   1875 	vpxor		0x20($inp),%x#$c,$t2
   1876 	vpxor		0x30($inp),%x#$d,$t3
   1877 	lea		0x40($inp),$inp		# inp+=64
   1878 
   1879 	vmovdqu		$t0,0x00($out)		# write output
   1880 	vmovdqu		$t1,0x10($out)
   1881 	vmovdqu		$t2,0x20($out)
   1882 	vmovdqu		$t3,0x30($out)
   1883 	lea		0x40($out),$out		# out+=64
   1884 
   1885 	jz		.Ldone_avx512
   1886 
   1887 	vextracti32x4	\$1,$a,$t0
   1888 	vextracti32x4	\$1,$b,$t1
   1889 	vextracti32x4	\$1,$c,$t2
   1890 	vextracti32x4	\$1,$d,$t3
   1891 
   1892 	sub		\$64,$len
   1893 	jb		.Ltail_avx512
   1894 
   1895 	vpxor		0x00($inp),$t0,$t0	# xor with input
   1896 	vpxor		0x10($inp),$t1,$t1
   1897 	vpxor		0x20($inp),$t2,$t2
   1898 	vpxor		0x30($inp),$t3,$t3
   1899 	lea		0x40($inp),$inp		# inp+=64
   1900 
   1901 	vmovdqu		$t0,0x00($out)		# write output
   1902 	vmovdqu		$t1,0x10($out)
   1903 	vmovdqu		$t2,0x20($out)
   1904 	vmovdqu		$t3,0x30($out)
   1905 	lea		0x40($out),$out		# out+=64
   1906 
   1907 	jz		.Ldone_avx512
   1908 
   1909 	vextracti32x4	\$2,$a,$t0
   1910 	vextracti32x4	\$2,$b,$t1
   1911 	vextracti32x4	\$2,$c,$t2
   1912 	vextracti32x4	\$2,$d,$t3
   1913 
   1914 	sub		\$64,$len
   1915 	jb		.Ltail_avx512
   1916 
   1917 	vpxor		0x00($inp),$t0,$t0	# xor with input
   1918 	vpxor		0x10($inp),$t1,$t1
   1919 	vpxor		0x20($inp),$t2,$t2
   1920 	vpxor		0x30($inp),$t3,$t3
   1921 	lea		0x40($inp),$inp		# inp+=64
   1922 
   1923 	vmovdqu		$t0,0x00($out)		# write output
   1924 	vmovdqu		$t1,0x10($out)
   1925 	vmovdqu		$t2,0x20($out)
   1926 	vmovdqu		$t3,0x30($out)
   1927 	lea		0x40($out),$out		# out+=64
   1928 
   1929 	jz		.Ldone_avx512
   1930 
   1931 	vextracti32x4	\$3,$a,$t0
   1932 	vextracti32x4	\$3,$b,$t1
   1933 	vextracti32x4	\$3,$c,$t2
   1934 	vextracti32x4	\$3,$d,$t3
   1935 
   1936 	sub		\$64,$len
   1937 	jb		.Ltail_avx512
   1938 
   1939 	vpxor		0x00($inp),$t0,$t0	# xor with input
   1940 	vpxor		0x10($inp),$t1,$t1
   1941 	vpxor		0x20($inp),$t2,$t2
   1942 	vpxor		0x30($inp),$t3,$t3
   1943 	lea		0x40($inp),$inp		# inp+=64
   1944 
   1945 	vmovdqu		$t0,0x00($out)		# write output
   1946 	vmovdqu		$t1,0x10($out)
   1947 	vmovdqu		$t2,0x20($out)
   1948 	vmovdqu		$t3,0x30($out)
   1949 	lea		0x40($out),$out		# out+=64
   1950 
   1951 	jnz		.Loop_outer_avx512
   1952 
   1953 	jmp		.Ldone_avx512
   1954 
   1955 .align	16
   1956 .Ltail64_avx512:
   1957 	vmovdqa		%x#$a,0x00(%rsp)
   1958 	vmovdqa		%x#$b,0x10(%rsp)
   1959 	vmovdqa		%x#$c,0x20(%rsp)
   1960 	vmovdqa		%x#$d,0x30(%rsp)
   1961 	add		\$64,$len
   1962 	jmp		.Loop_tail_avx512
   1963 
   1964 .align	16
   1965 .Ltail_avx512:
   1966 	vmovdqa		$t0,0x00(%rsp)
   1967 	vmovdqa		$t1,0x10(%rsp)
   1968 	vmovdqa		$t2,0x20(%rsp)
   1969 	vmovdqa		$t3,0x30(%rsp)
   1970 	add		\$64,$len
   1971 
   1972 .Loop_tail_avx512:
   1973 	movzb		($inp,$counter),%eax
   1974 	movzb		(%rsp,$counter),%ecx
   1975 	lea		1($counter),$counter
   1976 	xor		%ecx,%eax
   1977 	mov		%al,-1($out,$counter)
   1978 	dec		$len
   1979 	jnz		.Loop_tail_avx512
   1980 
   1981 	vmovdqu32	$a_,0x00(%rsp)		# overwrite staged key-stream; %rsp is not 64-byte aligned here
   1982 
   1983 .Ldone_avx512:
   1984 	vzeroall
   1985 ___
   1986 $code.=<<___	if ($win64);
   1987 	movaps	-0x28(%r9),%xmm6
   1988 	movaps	-0x18(%r9),%xmm7
   1989 ___
   1990 $code.=<<___;
   1991 	lea	(%r9),%rsp
   1992 .Lavx512_epilogue:
   1993 	ret
   1994 .size	ChaCha20_avx512,.-ChaCha20_avx512
   1995 ___
   1996 }
   1997 if ($avx>2) {
   1998 # This one handles longer inputs...
   1999 
   2000 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
   2001     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
   2002 my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
   2003 	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
   2004 my @key=map("%zmm$_",(16..31));
   2005 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
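# Register layout for this path: $xa0..$xa3 hold state words 0..3, $xb0..$xb3
# words 4..7 and so on, each word replicated across sixteen independent
# blocks; @key[0..15] keeps the pre-round input for the final feed-forward
# addition, and the counter word in @key[12] is presumably staggered by 0..15
# via .Lincz, so one pass of the rounds yields sixteen consecutive 64-byte
# blocks (1024 bytes) per outer iteration.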
   2006 
   2007 sub AVX512_lane_ROUND {
   2008 my ($a0,$b0,$c0,$d0)=@_;
   2009 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
   2010 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
   2011 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
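# A worked example of the index rotation above: a call with (0,4,8,12)
# interleaves the four column quarter-rounds on word indices
#	(0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15)
# while a call with (0,5,10,15) interleaves the four diagonal quarter-rounds on
#	(0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14)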
   2012 my @x=map("\"$_\"",@xx);
   2013 
   2014 	(
   2015 	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
   2016 	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
   2017 	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
   2018 	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
   2019 	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
   2020 	 "&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
   2021 	  "&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
   2022 	   "&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
   2023 	"&vprold	(@x[$d0],@x[$d0],16)",
   2024 	 "&vprold	(@x[$d1],@x[$d1],16)",
   2025 	  "&vprold	(@x[$d2],@x[$d2],16)",
   2026 	   "&vprold	(@x[$d3],@x[$d3],16)",
   2027 
   2028 	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
   2029 	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
   2030 	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
   2031 	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
   2032 	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
   2033 	 "&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
   2034 	  "&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
   2035 	   "&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
   2036 	"&vprold	(@x[$b0],@x[$b0],12)",
   2037 	 "&vprold	(@x[$b1],@x[$b1],12)",
   2038 	  "&vprold	(@x[$b2],@x[$b2],12)",
   2039 	   "&vprold	(@x[$b3],@x[$b3],12)",
   2040 
   2041 	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
   2042 	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
   2043 	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
   2044 	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
   2045 	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
   2046 	 "&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
   2047 	  "&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
   2048 	   "&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
   2049 	"&vprold	(@x[$d0],@x[$d0],8)",
   2050 	 "&vprold	(@x[$d1],@x[$d1],8)",
   2051 	  "&vprold	(@x[$d2],@x[$d2],8)",
   2052 	   "&vprold	(@x[$d3],@x[$d3],8)",
   2053 
   2054 	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
   2055 	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
   2056 	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
   2057 	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
   2058 	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
   2059 	 "&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
   2060 	  "&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
   2061 	   "&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
   2062 	"&vprold	(@x[$b0],@x[$b0],7)",
   2063 	 "&vprold	(@x[$b1],@x[$b1],7)",
   2064 	  "&vprold	(@x[$b2],@x[$b2],7)",
   2065 	   "&vprold	(@x[$b3],@x[$b3],7)"
   2066 	);
   2067 }
   2068 
   2069 my $xframe = $win64 ? 0xa8 : 8;
   2070 
   2071 $code.=<<___;
   2072 .type	ChaCha20_16x,\@function,5
   2073 .align	32
   2074 ChaCha20_16x:
   2075 .LChaCha20_16x:
   2076 	mov		%rsp,%r9		# frame register
   2077 	sub		\$64+$xframe,%rsp
   2078 	and		\$-64,%rsp
   2079 ___
   2080 $code.=<<___	if ($win64);
   2081 	movaps		%xmm6,-0xa8(%r9)
   2082 	movaps		%xmm7,-0x98(%r9)
   2083 	movaps		%xmm8,-0x88(%r9)
   2084 	movaps		%xmm9,-0x78(%r9)
   2085 	movaps		%xmm10,-0x68(%r9)
   2086 	movaps		%xmm11,-0x58(%r9)
   2087 	movaps		%xmm12,-0x48(%r9)
   2088 	movaps		%xmm13,-0x38(%r9)
   2089 	movaps		%xmm14,-0x28(%r9)
   2090 	movaps		%xmm15,-0x18(%r9)
   2091 .L16x_body:
   2092 ___
   2093 $code.=<<___;
   2094 	vzeroupper
   2095 
   2096 	lea		.Lsigma(%rip),%r10
   2097 	vbroadcasti32x4	(%r10),$xa3		# key[0]
   2098 	vbroadcasti32x4	($key),$xb3		# key[1]
   2099 	vbroadcasti32x4	16($key),$xc3		# key[2]
   2100 	vbroadcasti32x4	($counter),$xd3		# key[3]
   2101 
   2102 	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
   2103 	vpshufd		\$0x55,$xa3,$xa1
   2104 	vpshufd		\$0xaa,$xa3,$xa2
   2105 	vpshufd		\$0xff,$xa3,$xa3
   2106 	vmovdqa64	$xa0,@key[0]
   2107 	vmovdqa64	$xa1,@key[1]
   2108 	vmovdqa64	$xa2,@key[2]
   2109 	vmovdqa64	$xa3,@key[3]
   2110 
   2111 	vpshufd		\$0x00,$xb3,$xb0
   2112 	vpshufd		\$0x55,$xb3,$xb1
   2113 	vpshufd		\$0xaa,$xb3,$xb2
   2114 	vpshufd		\$0xff,$xb3,$xb3
   2115 	vmovdqa64	$xb0,@key[4]
   2116 	vmovdqa64	$xb1,@key[5]
   2117 	vmovdqa64	$xb2,@key[6]
   2118 	vmovdqa64	$xb3,@key[7]
   2119 
   2120 	vpshufd		\$0x00,$xc3,$xc0
   2121 	vpshufd		\$0x55,$xc3,$xc1
   2122 	vpshufd		\$0xaa,$xc3,$xc2
   2123 	vpshufd		\$0xff,$xc3,$xc3
   2124 	vmovdqa64	$xc0,@key[8]
   2125 	vmovdqa64	$xc1,@key[9]
   2126 	vmovdqa64	$xc2,@key[10]
   2127 	vmovdqa64	$xc3,@key[11]
   2128 
   2129 	vpshufd		\$0x00,$xd3,$xd0
   2130 	vpshufd		\$0x55,$xd3,$xd1
   2131 	vpshufd		\$0xaa,$xd3,$xd2
   2132 	vpshufd		\$0xff,$xd3,$xd3
   2133 	vpaddd		.Lincz(%rip),$xd0,$xd0	# don't save counters yet
   2134 	vmovdqa64	$xd0,@key[12]
   2135 	vmovdqa64	$xd1,@key[13]
   2136 	vmovdqa64	$xd2,@key[14]
   2137 	vmovdqa64	$xd3,@key[15]
   2138 
   2139 	mov		\$10,%eax
   2140 	jmp		.Loop16x
   2141 
   2142 .align	32
   2143 .Loop_outer16x:
   2144 	vpbroadcastd	0(%r10),$xa0		# reload key
   2145 	vpbroadcastd	4(%r10),$xa1
   2146 	vpbroadcastd	8(%r10),$xa2
   2147 	vpbroadcastd	12(%r10),$xa3
   2148 	vpaddd		.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
   2149 	vmovdqa64	@key[4],$xb0
   2150 	vmovdqa64	@key[5],$xb1
   2151 	vmovdqa64	@key[6],$xb2
   2152 	vmovdqa64	@key[7],$xb3
   2153 	vmovdqa64	@key[8],$xc0
   2154 	vmovdqa64	@key[9],$xc1
   2155 	vmovdqa64	@key[10],$xc2
   2156 	vmovdqa64	@key[11],$xc3
   2157 	vmovdqa64	@key[12],$xd0
   2158 	vmovdqa64	@key[13],$xd1
   2159 	vmovdqa64	@key[14],$xd2
   2160 	vmovdqa64	@key[15],$xd3
   2161 
   2162 	vmovdqa64	$xa0,@key[0]
   2163 	vmovdqa64	$xa1,@key[1]
   2164 	vmovdqa64	$xa2,@key[2]
   2165 	vmovdqa64	$xa3,@key[3]
   2166 
   2167 	mov		\$10,%eax
   2168 	jmp		.Loop16x
   2169 
   2170 .align	32
   2171 .Loop16x:
   2172 ___
   2173 	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }	# column round
   2174 	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }	# diagonal round
   2175 $code.=<<___;
   2176 	dec		%eax
   2177 	jnz		.Loop16x
   2178 
   2179 	vpaddd		@key[0],$xa0,$xa0	# accumulate key
   2180 	vpaddd		@key[1],$xa1,$xa1
   2181 	vpaddd		@key[2],$xa2,$xa2
   2182 	vpaddd		@key[3],$xa3,$xa3
   2183 
   2184 	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
   2185 	vpunpckldq	$xa3,$xa2,$xt3
   2186 	vpunpckhdq	$xa1,$xa0,$xa0
   2187 	vpunpckhdq	$xa3,$xa2,$xa2
   2188 	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
   2189 	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
   2190 	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
   2191 	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
   2192 ___
   2193 	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
   2194 $code.=<<___;
   2195 	vpaddd		@key[4],$xb0,$xb0
   2196 	vpaddd		@key[5],$xb1,$xb1
   2197 	vpaddd		@key[6],$xb2,$xb2
   2198 	vpaddd		@key[7],$xb3,$xb3
   2199 
   2200 	vpunpckldq	$xb1,$xb0,$xt2
   2201 	vpunpckldq	$xb3,$xb2,$xt3
   2202 	vpunpckhdq	$xb1,$xb0,$xb0
   2203 	vpunpckhdq	$xb3,$xb2,$xb2
   2204 	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
   2205 	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
   2206 	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
   2207 	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
   2208 ___
   2209 	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
   2210 $code.=<<___;
   2211 	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
   2212 	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
   2213 	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
   2214 	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
   2215 	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
   2216 	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
   2217 	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
   2218 	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
   2219 ___
   2220 	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
   2221 $code.=<<___;
   2222 	vpaddd		@key[8],$xc0,$xc0
   2223 	vpaddd		@key[9],$xc1,$xc1
   2224 	vpaddd		@key[10],$xc2,$xc2
   2225 	vpaddd		@key[11],$xc3,$xc3
   2226 
   2227 	vpunpckldq	$xc1,$xc0,$xt2
   2228 	vpunpckldq	$xc3,$xc2,$xt3
   2229 	vpunpckhdq	$xc1,$xc0,$xc0
   2230 	vpunpckhdq	$xc3,$xc2,$xc2
   2231 	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
   2232 	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
   2233 	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
   2234 	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
   2235 ___
   2236 	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
   2237 $code.=<<___;
   2238 	vpaddd		@key[12],$xd0,$xd0
   2239 	vpaddd		@key[13],$xd1,$xd1
   2240 	vpaddd		@key[14],$xd2,$xd2
   2241 	vpaddd		@key[15],$xd3,$xd3
   2242 
   2243 	vpunpckldq	$xd1,$xd0,$xt2
   2244 	vpunpckldq	$xd3,$xd2,$xt3
   2245 	vpunpckhdq	$xd1,$xd0,$xd0
   2246 	vpunpckhdq	$xd3,$xd2,$xd2
   2247 	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
   2248 	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
   2249 	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
   2250 	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
   2251 ___
   2252 	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
   2253 $code.=<<___;
   2254 	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
   2255 	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
   2256 	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
   2257 	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
   2258 	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
   2259 	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
   2260 	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
   2261 	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
   2262 ___
   2263 	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
   2264 $code.=<<___;
   2265 	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
   2266 	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
   2267 	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
   2268 	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
   2269 	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
   2270 	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
   2271 	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
   2272 	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
   2273 	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
   2274 	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
   2275 	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
   2276 	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
   2277 	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
   2278 	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
   2279 	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
   2280 	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
   2281 ___
   2282 	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
   2283 	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
   2284 
   2285 	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
   2286 	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
   2287 	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
   2288 	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
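# At this point the vpunpck{l,h}{d,qd}q and vshufi32x4 passes have transposed
# the 16x16 matrix of 32-bit state words: each zmm register now carries one
# complete 64-byte key-stream block, and the renames above merely relabel the
# Perl register variables so that the stores below write the sixteen blocks
# in order.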
   2289 $code.=<<___;
   2290 	cmp		\$64*16,$len
   2291 	jb		.Ltail16x
   2292 
   2293 	vpxord		0x00($inp),$xa0,$xa0	# xor with input
   2294 	vpxord		0x40($inp),$xb0,$xb0
   2295 	vpxord		0x80($inp),$xc0,$xc0
   2296 	vpxord		0xc0($inp),$xd0,$xd0
   2297 	vmovdqu32	$xa0,0x00($out)
   2298 	vmovdqu32	$xb0,0x40($out)
   2299 	vmovdqu32	$xc0,0x80($out)
   2300 	vmovdqu32	$xd0,0xc0($out)
   2301 
   2302 	vpxord		0x100($inp),$xa1,$xa1
   2303 	vpxord		0x140($inp),$xb1,$xb1
   2304 	vpxord		0x180($inp),$xc1,$xc1
   2305 	vpxord		0x1c0($inp),$xd1,$xd1
   2306 	vmovdqu32	$xa1,0x100($out)
   2307 	vmovdqu32	$xb1,0x140($out)
   2308 	vmovdqu32	$xc1,0x180($out)
   2309 	vmovdqu32	$xd1,0x1c0($out)
   2310 
   2311 	vpxord		0x200($inp),$xa2,$xa2
   2312 	vpxord		0x240($inp),$xb2,$xb2
   2313 	vpxord		0x280($inp),$xc2,$xc2
   2314 	vpxord		0x2c0($inp),$xd2,$xd2
   2315 	vmovdqu32	$xa2,0x200($out)
   2316 	vmovdqu32	$xb2,0x240($out)
   2317 	vmovdqu32	$xc2,0x280($out)
   2318 	vmovdqu32	$xd2,0x2c0($out)
   2319 
   2320 	vpxord		0x300($inp),$xa3,$xa3
   2321 	vpxord		0x340($inp),$xb3,$xb3
   2322 	vpxord		0x380($inp),$xc3,$xc3
   2323 	vpxord		0x3c0($inp),$xd3,$xd3
   2324 	lea		0x400($inp),$inp
   2325 	vmovdqu32	$xa3,0x300($out)
   2326 	vmovdqu32	$xb3,0x340($out)
   2327 	vmovdqu32	$xc3,0x380($out)
   2328 	vmovdqu32	$xd3,0x3c0($out)
   2329 	lea		0x400($out),$out
   2330 
   2331 	sub		\$64*16,$len
   2332 	jnz		.Loop_outer16x
   2333 
   2334 	jmp		.Ldone16x
   2335 
   2336 .align	32
   2337 .Ltail16x:
   2338 	xor		%r10,%r10
   2339 	sub		$inp,$out
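	# From here on out holds the distance out-inp, so that (out,inp)
	# addresses the output while inp alone walks the input; full 64-byte
	# blocks are consumed one register at a time until fewer than 64
	# bytes remain, which are then handled byte by byte below.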
   2340 	cmp		\$64*1,$len
   2341 	jb		.Less_than_64_16x
   2342 	vpxord		($inp),$xa0,$xa0	# xor with input
   2343 	vmovdqu32	$xa0,($out,$inp)
   2344 	je		.Ldone16x
   2345 	vmovdqa32	$xb0,$xa0
   2346 	lea		64($inp),$inp
   2347 
   2348 	cmp		\$64*2,$len
   2349 	jb		.Less_than_64_16x
   2350 	vpxord		($inp),$xb0,$xb0
   2351 	vmovdqu32	$xb0,($out,$inp)
   2352 	je		.Ldone16x
   2353 	vmovdqa32	$xc0,$xa0
   2354 	lea		64($inp),$inp
   2355 
   2356 	cmp		\$64*3,$len
   2357 	jb		.Less_than_64_16x
   2358 	vpxord		($inp),$xc0,$xc0
   2359 	vmovdqu32	$xc0,($out,$inp)
   2360 	je		.Ldone16x
   2361 	vmovdqa32	$xd0,$xa0
   2362 	lea		64($inp),$inp
   2363 
   2364 	cmp		\$64*4,$len
   2365 	jb		.Less_than_64_16x
   2366 	vpxord		($inp),$xd0,$xd0
   2367 	vmovdqu32	$xd0,($out,$inp)
   2368 	je		.Ldone16x
   2369 	vmovdqa32	$xa1,$xa0
   2370 	lea		64($inp),$inp
   2371 
   2372 	cmp		\$64*5,$len
   2373 	jb		.Less_than_64_16x
   2374 	vpxord		($inp),$xa1,$xa1
   2375 	vmovdqu32	$xa1,($out,$inp)
   2376 	je		.Ldone16x
   2377 	vmovdqa32	$xb1,$xa0
   2378 	lea		64($inp),$inp
   2379 
   2380 	cmp		\$64*6,$len
   2381 	jb		.Less_than_64_16x
   2382 	vpxord		($inp),$xb1,$xb1
   2383 	vmovdqu32	$xb1,($out,$inp)
   2384 	je		.Ldone16x
   2385 	vmovdqa32	$xc1,$xa0
   2386 	lea		64($inp),$inp
   2387 
   2388 	cmp		\$64*7,$len
   2389 	jb		.Less_than_64_16x
   2390 	vpxord		($inp),$xc1,$xc1
   2391 	vmovdqu32	$xc1,($out,$inp)
   2392 	je		.Ldone16x
   2393 	vmovdqa32	$xd1,$xa0
   2394 	lea		64($inp),$inp
   2395 
   2396 	cmp		\$64*8,$len
   2397 	jb		.Less_than_64_16x
   2398 	vpxord		($inp),$xd1,$xd1
   2399 	vmovdqu32	$xd1,($out,$inp)
   2400 	je		.Ldone16x
   2401 	vmovdqa32	$xa2,$xa0
   2402 	lea		64($inp),$inp
   2403 
   2404 	cmp		\$64*9,$len
   2405 	jb		.Less_than_64_16x
   2406 	vpxord		($inp),$xa2,$xa2
   2407 	vmovdqu32	$xa2,($out,$inp)
   2408 	je		.Ldone16x
   2409 	vmovdqa32	$xb2,$xa0
   2410 	lea		64($inp),$inp
   2411 
   2412 	cmp		\$64*10,$len
   2413 	jb		.Less_than_64_16x
   2414 	vpxord		($inp),$xb2,$xb2
   2415 	vmovdqu32	$xb2,($out,$inp)
   2416 	je		.Ldone16x
   2417 	vmovdqa32	$xc2,$xa0
   2418 	lea		64($inp),$inp
   2419 
   2420 	cmp		\$64*11,$len
   2421 	jb		.Less_than_64_16x
   2422 	vpxord		($inp),$xc2,$xc2
   2423 	vmovdqu32	$xc2,($out,$inp)
   2424 	je		.Ldone16x
   2425 	vmovdqa32	$xd2,$xa0
   2426 	lea		64($inp),$inp
   2427 
   2428 	cmp		\$64*12,$len
   2429 	jb		.Less_than_64_16x
   2430 	vpxord		($inp),$xd2,$xd2
   2431 	vmovdqu32	$xd2,($out,$inp)
   2432 	je		.Ldone16x
   2433 	vmovdqa32	$xa3,$xa0
   2434 	lea		64($inp),$inp
   2435 
   2436 	cmp		\$64*13,$len
   2437 	jb		.Less_than_64_16x
   2438 	vpxord		($inp),$xa3,$xa3
   2439 	vmovdqu32	$xa3,($out,$inp)
   2440 	je		.Ldone16x
   2441 	vmovdqa32	$xb3,$xa0
   2442 	lea		64($inp),$inp
   2443 
   2444 	cmp		\$64*14,$len
   2445 	jb		.Less_than_64_16x
   2446 	vpxord		($inp),$xb3,$xb3
   2447 	vmovdqu32	$xb3,($out,$inp)
   2448 	je		.Ldone16x
   2449 	vmovdqa32	$xc3,$xa0
   2450 	lea		64($inp),$inp
   2451 
   2452 	cmp		\$64*15,$len
   2453 	jb		.Less_than_64_16x
   2454 	vpxord		($inp),$xc3,$xc3
   2455 	vmovdqu32	$xc3,($out,$inp)
   2456 	je		.Ldone16x
   2457 	vmovdqa32	$xd3,$xa0
   2458 	lea		64($inp),$inp
   2459 
   2460 .Less_than_64_16x:
   2461 	vmovdqa32	$xa0,0x00(%rsp)
   2462 	lea		($out,$inp),$out
   2463 	and		\$63,$len
   2464 
   2465 .Loop_tail16x:
   2466 	movzb		($inp,%r10),%eax
   2467 	movzb		(%rsp,%r10),%ecx
   2468 	lea		1(%r10),%r10
   2469 	xor		%ecx,%eax
   2470 	mov		%al,-1($out,%r10)
   2471 	dec		$len
   2472 	jnz		.Loop_tail16x
   2473 
   2474 	vpxord		$xa0,$xa0,$xa0
   2475 	vmovdqa32	$xa0,0(%rsp)		# clear the key-stream block staged on the stack
   2476 
   2477 .Ldone16x:
   2478 	vzeroall
   2479 ___
   2480 $code.=<<___	if ($win64);
   2481 	movaps		-0xa8(%r9),%xmm6
   2482 	movaps		-0x98(%r9),%xmm7
   2483 	movaps		-0x88(%r9),%xmm8
   2484 	movaps		-0x78(%r9),%xmm9
   2485 	movaps		-0x68(%r9),%xmm10
   2486 	movaps		-0x58(%r9),%xmm11
   2487 	movaps		-0x48(%r9),%xmm12
   2488 	movaps		-0x38(%r9),%xmm13
   2489 	movaps		-0x28(%r9),%xmm14
   2490 	movaps		-0x18(%r9),%xmm15
   2491 ___
   2492 $code.=<<___;
   2493 	lea		(%r9),%rsp
   2494 .L16x_epilogue:
   2495 	ret
   2496 .size	ChaCha20_16x,.-ChaCha20_16x
   2497 ___
   2498 }
   2499 
   2500 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2501 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   2502 if ($win64) {
   2503 $rec="%rcx";
   2504 $frame="%rdx";
   2505 $context="%r8";
   2506 $disp="%r9";
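# se_handler services the integer ChaCha20_ctr32 code: when an exception is
# dispatched with Rip between .Lctr32_body and .Lno_data it recovers the
# callee-saved %rbx/%rbp/%r12-%r15 from the stack frame.  ssse3_handler and
# full_handler service the SIMD paths, which keep the caller's stack pointer
# in %r9: they reload it from context->R9 and copy the two (ssse3/avx512) or
# ten (4x/8x/16x) saved XMM registers back into the CONTEXT record.  All
# three then fall through to .Lcommon_seh_tail, which restores Rsp/Rsi/Rdi,
# calls RtlVirtualUnwind and reports ExceptionContinueSearch.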
   2507 
   2508 $code.=<<___;
   2509 .extern	__imp_RtlVirtualUnwind
   2510 .type	se_handler,\@abi-omnipotent
   2511 .align	16
   2512 se_handler:
   2513 	push	%rsi
   2514 	push	%rdi
   2515 	push	%rbx
   2516 	push	%rbp
   2517 	push	%r12
   2518 	push	%r13
   2519 	push	%r14
   2520 	push	%r15
   2521 	pushfq
   2522 	sub	\$64,%rsp
   2523 
   2524 	mov	120($context),%rax	# pull context->Rax
   2525 	mov	248($context),%rbx	# pull context->Rip
   2526 
   2527 	mov	8($disp),%rsi		# disp->ImageBase
   2528 	mov	56($disp),%r11		# disp->HandlerData
   2529 
   2530 	lea	.Lctr32_body(%rip),%r10
   2531 	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
   2532 	jb	.Lcommon_seh_tail
   2533 
   2534 	mov	152($context),%rax	# pull context->Rsp
   2535 
   2536 	lea	.Lno_data(%rip),%r10	# epilogue label
   2537 	cmp	%r10,%rbx		# context->Rip>=.Lno_data
   2538 	jae	.Lcommon_seh_tail
   2539 
   2540 	lea	64+24+48(%rax),%rax
   2541 
   2542 	mov	-8(%rax),%rbx
   2543 	mov	-16(%rax),%rbp
   2544 	mov	-24(%rax),%r12
   2545 	mov	-32(%rax),%r13
   2546 	mov	-40(%rax),%r14
   2547 	mov	-48(%rax),%r15
   2548 	mov	%rbx,144($context)	# restore context->Rbx
   2549 	mov	%rbp,160($context)	# restore context->Rbp
   2550 	mov	%r12,216($context)	# restore context->R12
   2551 	mov	%r13,224($context)	# restore context->R13
   2552 	mov	%r14,232($context)	# restore context->R14
   2553 	mov	%r15,240($context)	# restore context->R15
   2554 
   2555 .Lcommon_seh_tail:
   2556 	mov	8(%rax),%rdi
   2557 	mov	16(%rax),%rsi
   2558 	mov	%rax,152($context)	# restore context->Rsp
   2559 	mov	%rsi,168($context)	# restore context->Rsi
   2560 	mov	%rdi,176($context)	# restore context->Rdi
   2561 
   2562 	mov	40($disp),%rdi		# disp->ContextRecord
   2563 	mov	$context,%rsi		# context
   2564 	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
   2565 	.long	0xa548f3fc		# cld; rep movsq
   2566 
   2567 	mov	$disp,%rsi
   2568 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2569 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2570 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2571 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2572 	mov	40(%rsi),%r10		# disp->ContextRecord
   2573 	lea	56(%rsi),%r11		# &disp->HandlerData
   2574 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2575 	mov	%r10,32(%rsp)		# arg5
   2576 	mov	%r11,40(%rsp)		# arg6
   2577 	mov	%r12,48(%rsp)		# arg7
   2578 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2579 	call	*__imp_RtlVirtualUnwind(%rip)
   2580 
   2581 	mov	\$1,%eax		# ExceptionContinueSearch
   2582 	add	\$64,%rsp
   2583 	popfq
   2584 	pop	%r15
   2585 	pop	%r14
   2586 	pop	%r13
   2587 	pop	%r12
   2588 	pop	%rbp
   2589 	pop	%rbx
   2590 	pop	%rdi
   2591 	pop	%rsi
   2592 	ret
   2593 .size	se_handler,.-se_handler
   2594 
   2595 .type	ssse3_handler,\@abi-omnipotent
   2596 .align	16
   2597 ssse3_handler:
   2598 	push	%rsi
   2599 	push	%rdi
   2600 	push	%rbx
   2601 	push	%rbp
   2602 	push	%r12
   2603 	push	%r13
   2604 	push	%r14
   2605 	push	%r15
   2606 	pushfq
   2607 	sub	\$64,%rsp
   2608 
   2609 	mov	120($context),%rax	# pull context->Rax
   2610 	mov	248($context),%rbx	# pull context->Rip
   2611 
   2612 	mov	8($disp),%rsi		# disp->ImageBase
   2613 	mov	56($disp),%r11		# disp->HandlerData
   2614 
   2615 	mov	0(%r11),%r10d		# HandlerData[0]
   2616 	lea	(%rsi,%r10),%r10	# prologue label
   2617 	cmp	%r10,%rbx		# context->Rip<prologue label
   2618 	jb	.Lcommon_seh_tail
   2619 
   2620 	mov	192($context),%rax	# pull context->R9
   2621 
   2622 	mov	4(%r11),%r10d		# HandlerData[1]
   2623 	lea	(%rsi,%r10),%r10	# epilogue label
   2624 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2625 	jae	.Lcommon_seh_tail
   2626 
   2627 	lea	-0x28(%rax),%rsi
   2628 	lea	512($context),%rdi	# &context.Xmm6
   2629 	mov	\$4,%ecx		# 2 saved XMM registers, in quadwords
   2630 	.long	0xa548f3fc		# cld; rep movsq
   2631 
   2632 	jmp	.Lcommon_seh_tail
   2633 .size	ssse3_handler,.-ssse3_handler
   2634 
   2635 .type	full_handler,\@abi-omnipotent
   2636 .align	16
   2637 full_handler:
   2638 	push	%rsi
   2639 	push	%rdi
   2640 	push	%rbx
   2641 	push	%rbp
   2642 	push	%r12
   2643 	push	%r13
   2644 	push	%r14
   2645 	push	%r15
   2646 	pushfq
   2647 	sub	\$64,%rsp
   2648 
   2649 	mov	120($context),%rax	# pull context->Rax
   2650 	mov	248($context),%rbx	# pull context->Rip
   2651 
   2652 	mov	8($disp),%rsi		# disp->ImageBase
   2653 	mov	56($disp),%r11		# disp->HandlerData
   2654 
   2655 	mov	0(%r11),%r10d		# HandlerData[0]
   2656 	lea	(%rsi,%r10),%r10	# prologue label
   2657 	cmp	%r10,%rbx		# context->Rip<prologue label
   2658 	jb	.Lcommon_seh_tail
   2659 
   2660 	mov	192($context),%rax	# pull context->R9
   2661 
   2662 	mov	4(%r11),%r10d		# HandlerData[1]
   2663 	lea	(%rsi,%r10),%r10	# epilogue label
   2664 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2665 	jae	.Lcommon_seh_tail
   2666 
   2667 	lea	-0xa8(%rax),%rsi
   2668 	lea	512($context),%rdi	# &context.Xmm6
   2669 	mov	\$20,%ecx		# 10 saved XMM registers, in quadwords
   2670 	.long	0xa548f3fc		# cld; rep movsq
   2671 
   2672 	jmp	.Lcommon_seh_tail
   2673 .size	full_handler,.-full_handler
   2674 
   2675 .section	.pdata
   2676 .align	4
   2677 	.rva	.LSEH_begin_ChaCha20_ctr32
   2678 	.rva	.LSEH_end_ChaCha20_ctr32
   2679 	.rva	.LSEH_info_ChaCha20_ctr32
   2680 
   2681 	.rva	.LSEH_begin_ChaCha20_ssse3
   2682 	.rva	.LSEH_end_ChaCha20_ssse3
   2683 	.rva	.LSEH_info_ChaCha20_ssse3
   2684 
   2685 	.rva	.LSEH_begin_ChaCha20_4x
   2686 	.rva	.LSEH_end_ChaCha20_4x
   2687 	.rva	.LSEH_info_ChaCha20_4x
   2688 ___
   2689 $code.=<<___ if ($avx>1);
   2690 	.rva	.LSEH_begin_ChaCha20_8x
   2691 	.rva	.LSEH_end_ChaCha20_8x
   2692 	.rva	.LSEH_info_ChaCha20_8x
   2693 ___
   2694 $code.=<<___ if ($avx>2);
   2695 	.rva	.LSEH_begin_ChaCha20_avx512
   2696 	.rva	.LSEH_end_ChaCha20_avx512
   2697 	.rva	.LSEH_info_ChaCha20_avx512
   2698 
   2699 	.rva	.LSEH_begin_ChaCha20_16x
   2700 	.rva	.LSEH_end_ChaCha20_16x
   2701 	.rva	.LSEH_info_ChaCha20_16x
   2702 ___
   2703 $code.=<<___;
   2704 .section	.xdata
   2705 .align	8
   2706 .LSEH_info_ChaCha20_ctr32:
   2707 	.byte	9,0,0,0
   2708 	.rva	se_handler
   2709 
   2710 .LSEH_info_ChaCha20_ssse3:
   2711 	.byte	9,0,0,0
   2712 	.rva	ssse3_handler
   2713 	.rva	.Lssse3_body,.Lssse3_epilogue
   2714 
   2715 .LSEH_info_ChaCha20_4x:
   2716 	.byte	9,0,0,0
   2717 	.rva	full_handler
   2718 	.rva	.L4x_body,.L4x_epilogue
   2719 ___
   2720 $code.=<<___ if ($avx>1);
   2721 .LSEH_info_ChaCha20_8x:
   2722 	.byte	9,0,0,0
   2723 	.rva	full_handler
   2724 	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
   2725 ___
   2726 $code.=<<___ if ($avx>2);
   2727 .LSEH_info_ChaCha20_avx512:
   2728 	.byte	9,0,0,0
   2729 	.rva	ssse3_handler
   2730 	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
   2731 
   2732 .LSEH_info_ChaCha20_16x:
   2733 	.byte	9,0,0,0
   2734 	.rva	full_handler
   2735 	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
   2736 ___
   2737 }
   2738 
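# Final pass over the accumulated code: evaluate any `...` expressions left
# in the text and rewrite "%x#%ymmN"/"%x#%zmmN" register references to their
# plain "%xmmN" form before each line is printed.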
   2739 foreach (split("\n",$code)) {
   2740 	s/\`([^\`]*)\`/eval $1/ge;
   2741 
   2742 	s/%x#%[yz]/%x/g;	# "down-shift"
   2743 
   2744 	print $_,"\n";
   2745 }
   2746 
   2747 close STDOUT;
   2748