/* Home | History | Annotate | Download | only in chacha  (code-viewer header, kept as a comment) */
      1 #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
      2 .text
      3 
      4 .extern	OPENSSL_ia32cap_P
      5 .hidden OPENSSL_ia32cap_P
      6 
      7 .align	64
      8 .Lzero:
      9 .long	0,0,0,0
     10 .Lone:
     11 .long	1,0,0,0
     12 .Linc:
     13 .long	0,1,2,3
     14 .Lfour:
     15 .long	4,4,4,4
     16 .Lincy:
     17 .long	0,2,4,6,1,3,5,7
     18 .Leight:
     19 .long	8,8,8,8,8,8,8,8
     20 .Lrot16:
     21 .byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
     22 .Lrot24:
     23 .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
     24 .Lsigma:
     25 .byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
     26 .align	64
     27 .Lzeroz:
     28 .long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
     29 .Lfourz:
     30 .long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
     31 .Lincz:
     32 .long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
     33 .Lsixteen:
     34 .long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
     35 .byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
     36 .globl	ChaCha20_ctr32
     37 .hidden ChaCha20_ctr32
     38 .type	ChaCha20_ctr32,@function
     39 .align	64
     40 ChaCha20_ctr32:
     41 	cmpq	$0,%rdx
     42 	je	.Lno_data
     43 	movq	OPENSSL_ia32cap_P+4(%rip),%r10
     44 	testl	$512,%r10d
     45 	jnz	.LChaCha20_ssse3
     46 
     47 	pushq	%rbx
     48 	pushq	%rbp
     49 	pushq	%r12
     50 	pushq	%r13
     51 	pushq	%r14
     52 	pushq	%r15
     53 	subq	$64+24,%rsp
     54 .Lctr32_body:
     55 
     56 
     57 	movdqu	(%rcx),%xmm1
     58 	movdqu	16(%rcx),%xmm2
     59 	movdqu	(%r8),%xmm3
     60 	movdqa	.Lone(%rip),%xmm4
     61 
     62 
     63 	movdqa	%xmm1,16(%rsp)
     64 	movdqa	%xmm2,32(%rsp)
     65 	movdqa	%xmm3,48(%rsp)
     66 	movq	%rdx,%rbp
     67 	jmp	.Loop_outer
     68 
     69 .align	32
     70 .Loop_outer:
     71 	movl	$0x61707865,%eax
     72 	movl	$0x3320646e,%ebx
     73 	movl	$0x79622d32,%ecx
     74 	movl	$0x6b206574,%edx
     75 	movl	16(%rsp),%r8d
     76 	movl	20(%rsp),%r9d
     77 	movl	24(%rsp),%r10d
     78 	movl	28(%rsp),%r11d
     79 	movd	%xmm3,%r12d
     80 	movl	52(%rsp),%r13d
     81 	movl	56(%rsp),%r14d
     82 	movl	60(%rsp),%r15d
     83 
     84 	movq	%rbp,64+0(%rsp)
     85 	movl	$10,%ebp
     86 	movq	%rsi,64+8(%rsp)
     87 .byte	102,72,15,126,214
     88 	movq	%rdi,64+16(%rsp)
     89 	movq	%rsi,%rdi
     90 	shrq	$32,%rdi
     91 	jmp	.Loop
     92 
     93 .align	32
     94 .Loop:
     95 	addl	%r8d,%eax
     96 	xorl	%eax,%r12d
     97 	roll	$16,%r12d
     98 	addl	%r9d,%ebx
     99 	xorl	%ebx,%r13d
    100 	roll	$16,%r13d
    101 	addl	%r12d,%esi
    102 	xorl	%esi,%r8d
    103 	roll	$12,%r8d
    104 	addl	%r13d,%edi
    105 	xorl	%edi,%r9d
    106 	roll	$12,%r9d
    107 	addl	%r8d,%eax
    108 	xorl	%eax,%r12d
    109 	roll	$8,%r12d
    110 	addl	%r9d,%ebx
    111 	xorl	%ebx,%r13d
    112 	roll	$8,%r13d
    113 	addl	%r12d,%esi
    114 	xorl	%esi,%r8d
    115 	roll	$7,%r8d
    116 	addl	%r13d,%edi
    117 	xorl	%edi,%r9d
    118 	roll	$7,%r9d
    119 	movl	%esi,32(%rsp)
    120 	movl	%edi,36(%rsp)
    121 	movl	40(%rsp),%esi
    122 	movl	44(%rsp),%edi
    123 	addl	%r10d,%ecx
    124 	xorl	%ecx,%r14d
    125 	roll	$16,%r14d
    126 	addl	%r11d,%edx
    127 	xorl	%edx,%r15d
    128 	roll	$16,%r15d
    129 	addl	%r14d,%esi
    130 	xorl	%esi,%r10d
    131 	roll	$12,%r10d
    132 	addl	%r15d,%edi
    133 	xorl	%edi,%r11d
    134 	roll	$12,%r11d
    135 	addl	%r10d,%ecx
    136 	xorl	%ecx,%r14d
    137 	roll	$8,%r14d
    138 	addl	%r11d,%edx
    139 	xorl	%edx,%r15d
    140 	roll	$8,%r15d
    141 	addl	%r14d,%esi
    142 	xorl	%esi,%r10d
    143 	roll	$7,%r10d
    144 	addl	%r15d,%edi
    145 	xorl	%edi,%r11d
    146 	roll	$7,%r11d
    147 	addl	%r9d,%eax
    148 	xorl	%eax,%r15d
    149 	roll	$16,%r15d
    150 	addl	%r10d,%ebx
    151 	xorl	%ebx,%r12d
    152 	roll	$16,%r12d
    153 	addl	%r15d,%esi
    154 	xorl	%esi,%r9d
    155 	roll	$12,%r9d
    156 	addl	%r12d,%edi
    157 	xorl	%edi,%r10d
    158 	roll	$12,%r10d
    159 	addl	%r9d,%eax
    160 	xorl	%eax,%r15d
    161 	roll	$8,%r15d
    162 	addl	%r10d,%ebx
    163 	xorl	%ebx,%r12d
    164 	roll	$8,%r12d
    165 	addl	%r15d,%esi
    166 	xorl	%esi,%r9d
    167 	roll	$7,%r9d
    168 	addl	%r12d,%edi
    169 	xorl	%edi,%r10d
    170 	roll	$7,%r10d
    171 	movl	%esi,40(%rsp)
    172 	movl	%edi,44(%rsp)
    173 	movl	32(%rsp),%esi
    174 	movl	36(%rsp),%edi
    175 	addl	%r11d,%ecx
    176 	xorl	%ecx,%r13d
    177 	roll	$16,%r13d
    178 	addl	%r8d,%edx
    179 	xorl	%edx,%r14d
    180 	roll	$16,%r14d
    181 	addl	%r13d,%esi
    182 	xorl	%esi,%r11d
    183 	roll	$12,%r11d
    184 	addl	%r14d,%edi
    185 	xorl	%edi,%r8d
    186 	roll	$12,%r8d
    187 	addl	%r11d,%ecx
    188 	xorl	%ecx,%r13d
    189 	roll	$8,%r13d
    190 	addl	%r8d,%edx
    191 	xorl	%edx,%r14d
    192 	roll	$8,%r14d
    193 	addl	%r13d,%esi
    194 	xorl	%esi,%r11d
    195 	roll	$7,%r11d
    196 	addl	%r14d,%edi
    197 	xorl	%edi,%r8d
    198 	roll	$7,%r8d
    199 	decl	%ebp
    200 	jnz	.Loop
    201 	movl	%edi,36(%rsp)
    202 	movl	%esi,32(%rsp)
    203 	movq	64(%rsp),%rbp
    204 	movdqa	%xmm2,%xmm1
    205 	movq	64+8(%rsp),%rsi
    206 	paddd	%xmm4,%xmm3
    207 	movq	64+16(%rsp),%rdi
    208 
    209 	addl	$0x61707865,%eax
    210 	addl	$0x3320646e,%ebx
    211 	addl	$0x79622d32,%ecx
    212 	addl	$0x6b206574,%edx
    213 	addl	16(%rsp),%r8d
    214 	addl	20(%rsp),%r9d
    215 	addl	24(%rsp),%r10d
    216 	addl	28(%rsp),%r11d
    217 	addl	48(%rsp),%r12d
    218 	addl	52(%rsp),%r13d
    219 	addl	56(%rsp),%r14d
    220 	addl	60(%rsp),%r15d
    221 	paddd	32(%rsp),%xmm1
    222 
    223 	cmpq	$64,%rbp
    224 	jb	.Ltail
    225 
    226 	xorl	0(%rsi),%eax
    227 	xorl	4(%rsi),%ebx
    228 	xorl	8(%rsi),%ecx
    229 	xorl	12(%rsi),%edx
    230 	xorl	16(%rsi),%r8d
    231 	xorl	20(%rsi),%r9d
    232 	xorl	24(%rsi),%r10d
    233 	xorl	28(%rsi),%r11d
    234 	movdqu	32(%rsi),%xmm0
    235 	xorl	48(%rsi),%r12d
    236 	xorl	52(%rsi),%r13d
    237 	xorl	56(%rsi),%r14d
    238 	xorl	60(%rsi),%r15d
    239 	leaq	64(%rsi),%rsi
    240 	pxor	%xmm1,%xmm0
    241 
    242 	movdqa	%xmm2,32(%rsp)
    243 	movd	%xmm3,48(%rsp)
    244 
    245 	movl	%eax,0(%rdi)
    246 	movl	%ebx,4(%rdi)
    247 	movl	%ecx,8(%rdi)
    248 	movl	%edx,12(%rdi)
    249 	movl	%r8d,16(%rdi)
    250 	movl	%r9d,20(%rdi)
    251 	movl	%r10d,24(%rdi)
    252 	movl	%r11d,28(%rdi)
    253 	movdqu	%xmm0,32(%rdi)
    254 	movl	%r12d,48(%rdi)
    255 	movl	%r13d,52(%rdi)
    256 	movl	%r14d,56(%rdi)
    257 	movl	%r15d,60(%rdi)
    258 	leaq	64(%rdi),%rdi
    259 
    260 	subq	$64,%rbp
    261 	jnz	.Loop_outer
    262 
    263 	jmp	.Ldone
    264 
    265 .align	16
    266 .Ltail:
    267 	movl	%eax,0(%rsp)
    268 	movl	%ebx,4(%rsp)
    269 	xorq	%rbx,%rbx
    270 	movl	%ecx,8(%rsp)
    271 	movl	%edx,12(%rsp)
    272 	movl	%r8d,16(%rsp)
    273 	movl	%r9d,20(%rsp)
    274 	movl	%r10d,24(%rsp)
    275 	movl	%r11d,28(%rsp)
    276 	movdqa	%xmm1,32(%rsp)
    277 	movl	%r12d,48(%rsp)
    278 	movl	%r13d,52(%rsp)
    279 	movl	%r14d,56(%rsp)
    280 	movl	%r15d,60(%rsp)
    281 
    282 .Loop_tail:
    283 	movzbl	(%rsi,%rbx,1),%eax
    284 	movzbl	(%rsp,%rbx,1),%edx
    285 	leaq	1(%rbx),%rbx
    286 	xorl	%edx,%eax
    287 	movb	%al,-1(%rdi,%rbx,1)
    288 	decq	%rbp
    289 	jnz	.Loop_tail
    290 
    291 .Ldone:
    292 	leaq	64+24+48(%rsp),%rsi
    293 	movq	-48(%rsi),%r15
    294 	movq	-40(%rsi),%r14
    295 	movq	-32(%rsi),%r13
    296 	movq	-24(%rsi),%r12
    297 	movq	-16(%rsi),%rbp
    298 	movq	-8(%rsi),%rbx
    299 	leaq	(%rsi),%rsp
    300 .Lno_data:
    301 	.byte	0xf3,0xc3
    302 .size	ChaCha20_ctr32,.-ChaCha20_ctr32
    303 .type	ChaCha20_ssse3,@function
    304 .align	32
    305 ChaCha20_ssse3:
    306 .LChaCha20_ssse3:
    307 	movq	%rsp,%r9
    308 	cmpq	$128,%rdx
    309 	ja	.LChaCha20_4x
    310 
    311 .Ldo_sse3_after_all:
    312 	subq	$64+8,%rsp
    313 	movdqa	.Lsigma(%rip),%xmm0
    314 	movdqu	(%rcx),%xmm1
    315 	movdqu	16(%rcx),%xmm2
    316 	movdqu	(%r8),%xmm3
    317 	movdqa	.Lrot16(%rip),%xmm6
    318 	movdqa	.Lrot24(%rip),%xmm7
    319 
    320 	movdqa	%xmm0,0(%rsp)
    321 	movdqa	%xmm1,16(%rsp)
    322 	movdqa	%xmm2,32(%rsp)
    323 	movdqa	%xmm3,48(%rsp)
    324 	movq	$10,%r8
    325 	jmp	.Loop_ssse3
    326 
    327 .align	32
    328 .Loop_outer_ssse3:
    329 	movdqa	.Lone(%rip),%xmm3
    330 	movdqa	0(%rsp),%xmm0
    331 	movdqa	16(%rsp),%xmm1
    332 	movdqa	32(%rsp),%xmm2
    333 	paddd	48(%rsp),%xmm3
    334 	movq	$10,%r8
    335 	movdqa	%xmm3,48(%rsp)
    336 	jmp	.Loop_ssse3
    337 
    338 .align	32
    339 .Loop_ssse3:
    340 	paddd	%xmm1,%xmm0
    341 	pxor	%xmm0,%xmm3
    342 .byte	102,15,56,0,222
    343 	paddd	%xmm3,%xmm2
    344 	pxor	%xmm2,%xmm1
    345 	movdqa	%xmm1,%xmm4
    346 	psrld	$20,%xmm1
    347 	pslld	$12,%xmm4
    348 	por	%xmm4,%xmm1
    349 	paddd	%xmm1,%xmm0
    350 	pxor	%xmm0,%xmm3
    351 .byte	102,15,56,0,223
    352 	paddd	%xmm3,%xmm2
    353 	pxor	%xmm2,%xmm1
    354 	movdqa	%xmm1,%xmm4
    355 	psrld	$25,%xmm1
    356 	pslld	$7,%xmm4
    357 	por	%xmm4,%xmm1
    358 	pshufd	$78,%xmm2,%xmm2
    359 	pshufd	$57,%xmm1,%xmm1
    360 	pshufd	$147,%xmm3,%xmm3
    361 	nop
    362 	paddd	%xmm1,%xmm0
    363 	pxor	%xmm0,%xmm3
    364 .byte	102,15,56,0,222
    365 	paddd	%xmm3,%xmm2
    366 	pxor	%xmm2,%xmm1
    367 	movdqa	%xmm1,%xmm4
    368 	psrld	$20,%xmm1
    369 	pslld	$12,%xmm4
    370 	por	%xmm4,%xmm1
    371 	paddd	%xmm1,%xmm0
    372 	pxor	%xmm0,%xmm3
    373 .byte	102,15,56,0,223
    374 	paddd	%xmm3,%xmm2
    375 	pxor	%xmm2,%xmm1
    376 	movdqa	%xmm1,%xmm4
    377 	psrld	$25,%xmm1
    378 	pslld	$7,%xmm4
    379 	por	%xmm4,%xmm1
    380 	pshufd	$78,%xmm2,%xmm2
    381 	pshufd	$147,%xmm1,%xmm1
    382 	pshufd	$57,%xmm3,%xmm3
    383 	decq	%r8
    384 	jnz	.Loop_ssse3
    385 	paddd	0(%rsp),%xmm0
    386 	paddd	16(%rsp),%xmm1
    387 	paddd	32(%rsp),%xmm2
    388 	paddd	48(%rsp),%xmm3
    389 
    390 	cmpq	$64,%rdx
    391 	jb	.Ltail_ssse3
    392 
    393 	movdqu	0(%rsi),%xmm4
    394 	movdqu	16(%rsi),%xmm5
    395 	pxor	%xmm4,%xmm0
    396 	movdqu	32(%rsi),%xmm4
    397 	pxor	%xmm5,%xmm1
    398 	movdqu	48(%rsi),%xmm5
    399 	leaq	64(%rsi),%rsi
    400 	pxor	%xmm4,%xmm2
    401 	pxor	%xmm5,%xmm3
    402 
    403 	movdqu	%xmm0,0(%rdi)
    404 	movdqu	%xmm1,16(%rdi)
    405 	movdqu	%xmm2,32(%rdi)
    406 	movdqu	%xmm3,48(%rdi)
    407 	leaq	64(%rdi),%rdi
    408 
    409 	subq	$64,%rdx
    410 	jnz	.Loop_outer_ssse3
    411 
    412 	jmp	.Ldone_ssse3
    413 
    414 .align	16
    415 .Ltail_ssse3:
    416 	movdqa	%xmm0,0(%rsp)
    417 	movdqa	%xmm1,16(%rsp)
    418 	movdqa	%xmm2,32(%rsp)
    419 	movdqa	%xmm3,48(%rsp)
    420 	xorq	%r8,%r8
    421 
    422 .Loop_tail_ssse3:
    423 	movzbl	(%rsi,%r8,1),%eax
    424 	movzbl	(%rsp,%r8,1),%ecx
    425 	leaq	1(%r8),%r8
    426 	xorl	%ecx,%eax
    427 	movb	%al,-1(%rdi,%r8,1)
    428 	decq	%rdx
    429 	jnz	.Loop_tail_ssse3
    430 
    431 .Ldone_ssse3:
    432 	leaq	(%r9),%rsp
    433 .Lssse3_epilogue:
    434 	.byte	0xf3,0xc3
    435 .size	ChaCha20_ssse3,.-ChaCha20_ssse3
    436 .type	ChaCha20_4x,@function
    437 .align	32
    438 ChaCha20_4x:
    439 .LChaCha20_4x:
    440 	movq	%rsp,%r9
    441 	movq	%r10,%r11
    442 	shrq	$32,%r10
    443 	testq	$32,%r10
    444 	jnz	.LChaCha20_8x
    445 	cmpq	$192,%rdx
    446 	ja	.Lproceed4x
    447 
    448 	andq	$71303168,%r11
    449 	cmpq	$4194304,%r11
    450 	je	.Ldo_sse3_after_all
    451 
    452 .Lproceed4x:
    453 	subq	$0x140+8,%rsp
    454 	movdqa	.Lsigma(%rip),%xmm11
    455 	movdqu	(%rcx),%xmm15
    456 	movdqu	16(%rcx),%xmm7
    457 	movdqu	(%r8),%xmm3
    458 	leaq	256(%rsp),%rcx
    459 	leaq	.Lrot16(%rip),%r10
    460 	leaq	.Lrot24(%rip),%r11
    461 
    462 	pshufd	$0x00,%xmm11,%xmm8
    463 	pshufd	$0x55,%xmm11,%xmm9
    464 	movdqa	%xmm8,64(%rsp)
    465 	pshufd	$0xaa,%xmm11,%xmm10
    466 	movdqa	%xmm9,80(%rsp)
    467 	pshufd	$0xff,%xmm11,%xmm11
    468 	movdqa	%xmm10,96(%rsp)
    469 	movdqa	%xmm11,112(%rsp)
    470 
    471 	pshufd	$0x00,%xmm15,%xmm12
    472 	pshufd	$0x55,%xmm15,%xmm13
    473 	movdqa	%xmm12,128-256(%rcx)
    474 	pshufd	$0xaa,%xmm15,%xmm14
    475 	movdqa	%xmm13,144-256(%rcx)
    476 	pshufd	$0xff,%xmm15,%xmm15
    477 	movdqa	%xmm14,160-256(%rcx)
    478 	movdqa	%xmm15,176-256(%rcx)
    479 
    480 	pshufd	$0x00,%xmm7,%xmm4
    481 	pshufd	$0x55,%xmm7,%xmm5
    482 	movdqa	%xmm4,192-256(%rcx)
    483 	pshufd	$0xaa,%xmm7,%xmm6
    484 	movdqa	%xmm5,208-256(%rcx)
    485 	pshufd	$0xff,%xmm7,%xmm7
    486 	movdqa	%xmm6,224-256(%rcx)
    487 	movdqa	%xmm7,240-256(%rcx)
    488 
    489 	pshufd	$0x00,%xmm3,%xmm0
    490 	pshufd	$0x55,%xmm3,%xmm1
    491 	paddd	.Linc(%rip),%xmm0
    492 	pshufd	$0xaa,%xmm3,%xmm2
    493 	movdqa	%xmm1,272-256(%rcx)
    494 	pshufd	$0xff,%xmm3,%xmm3
    495 	movdqa	%xmm2,288-256(%rcx)
    496 	movdqa	%xmm3,304-256(%rcx)
    497 
    498 	jmp	.Loop_enter4x
    499 
    500 .align	32
    501 .Loop_outer4x:
    502 	movdqa	64(%rsp),%xmm8
    503 	movdqa	80(%rsp),%xmm9
    504 	movdqa	96(%rsp),%xmm10
    505 	movdqa	112(%rsp),%xmm11
    506 	movdqa	128-256(%rcx),%xmm12
    507 	movdqa	144-256(%rcx),%xmm13
    508 	movdqa	160-256(%rcx),%xmm14
    509 	movdqa	176-256(%rcx),%xmm15
    510 	movdqa	192-256(%rcx),%xmm4
    511 	movdqa	208-256(%rcx),%xmm5
    512 	movdqa	224-256(%rcx),%xmm6
    513 	movdqa	240-256(%rcx),%xmm7
    514 	movdqa	256-256(%rcx),%xmm0
    515 	movdqa	272-256(%rcx),%xmm1
    516 	movdqa	288-256(%rcx),%xmm2
    517 	movdqa	304-256(%rcx),%xmm3
    518 	paddd	.Lfour(%rip),%xmm0
    519 
    520 .Loop_enter4x:
    521 	movdqa	%xmm6,32(%rsp)
    522 	movdqa	%xmm7,48(%rsp)
    523 	movdqa	(%r10),%xmm7
    524 	movl	$10,%eax
    525 	movdqa	%xmm0,256-256(%rcx)
    526 	jmp	.Loop4x
    527 
    528 .align	32
    529 .Loop4x:
    530 	paddd	%xmm12,%xmm8
    531 	paddd	%xmm13,%xmm9
    532 	pxor	%xmm8,%xmm0
    533 	pxor	%xmm9,%xmm1
    534 .byte	102,15,56,0,199
    535 .byte	102,15,56,0,207
    536 	paddd	%xmm0,%xmm4
    537 	paddd	%xmm1,%xmm5
    538 	pxor	%xmm4,%xmm12
    539 	pxor	%xmm5,%xmm13
    540 	movdqa	%xmm12,%xmm6
    541 	pslld	$12,%xmm12
    542 	psrld	$20,%xmm6
    543 	movdqa	%xmm13,%xmm7
    544 	pslld	$12,%xmm13
    545 	por	%xmm6,%xmm12
    546 	psrld	$20,%xmm7
    547 	movdqa	(%r11),%xmm6
    548 	por	%xmm7,%xmm13
    549 	paddd	%xmm12,%xmm8
    550 	paddd	%xmm13,%xmm9
    551 	pxor	%xmm8,%xmm0
    552 	pxor	%xmm9,%xmm1
    553 .byte	102,15,56,0,198
    554 .byte	102,15,56,0,206
    555 	paddd	%xmm0,%xmm4
    556 	paddd	%xmm1,%xmm5
    557 	pxor	%xmm4,%xmm12
    558 	pxor	%xmm5,%xmm13
    559 	movdqa	%xmm12,%xmm7
    560 	pslld	$7,%xmm12
    561 	psrld	$25,%xmm7
    562 	movdqa	%xmm13,%xmm6
    563 	pslld	$7,%xmm13
    564 	por	%xmm7,%xmm12
    565 	psrld	$25,%xmm6
    566 	movdqa	(%r10),%xmm7
    567 	por	%xmm6,%xmm13
    568 	movdqa	%xmm4,0(%rsp)
    569 	movdqa	%xmm5,16(%rsp)
    570 	movdqa	32(%rsp),%xmm4
    571 	movdqa	48(%rsp),%xmm5
    572 	paddd	%xmm14,%xmm10
    573 	paddd	%xmm15,%xmm11
    574 	pxor	%xmm10,%xmm2
    575 	pxor	%xmm11,%xmm3
    576 .byte	102,15,56,0,215
    577 .byte	102,15,56,0,223
    578 	paddd	%xmm2,%xmm4
    579 	paddd	%xmm3,%xmm5
    580 	pxor	%xmm4,%xmm14
    581 	pxor	%xmm5,%xmm15
    582 	movdqa	%xmm14,%xmm6
    583 	pslld	$12,%xmm14
    584 	psrld	$20,%xmm6
    585 	movdqa	%xmm15,%xmm7
    586 	pslld	$12,%xmm15
    587 	por	%xmm6,%xmm14
    588 	psrld	$20,%xmm7
    589 	movdqa	(%r11),%xmm6
    590 	por	%xmm7,%xmm15
    591 	paddd	%xmm14,%xmm10
    592 	paddd	%xmm15,%xmm11
    593 	pxor	%xmm10,%xmm2
    594 	pxor	%xmm11,%xmm3
    595 .byte	102,15,56,0,214
    596 .byte	102,15,56,0,222
    597 	paddd	%xmm2,%xmm4
    598 	paddd	%xmm3,%xmm5
    599 	pxor	%xmm4,%xmm14
    600 	pxor	%xmm5,%xmm15
    601 	movdqa	%xmm14,%xmm7
    602 	pslld	$7,%xmm14
    603 	psrld	$25,%xmm7
    604 	movdqa	%xmm15,%xmm6
    605 	pslld	$7,%xmm15
    606 	por	%xmm7,%xmm14
    607 	psrld	$25,%xmm6
    608 	movdqa	(%r10),%xmm7
    609 	por	%xmm6,%xmm15
    610 	paddd	%xmm13,%xmm8
    611 	paddd	%xmm14,%xmm9
    612 	pxor	%xmm8,%xmm3
    613 	pxor	%xmm9,%xmm0
    614 .byte	102,15,56,0,223
    615 .byte	102,15,56,0,199
    616 	paddd	%xmm3,%xmm4
    617 	paddd	%xmm0,%xmm5
    618 	pxor	%xmm4,%xmm13
    619 	pxor	%xmm5,%xmm14
    620 	movdqa	%xmm13,%xmm6
    621 	pslld	$12,%xmm13
    622 	psrld	$20,%xmm6
    623 	movdqa	%xmm14,%xmm7
    624 	pslld	$12,%xmm14
    625 	por	%xmm6,%xmm13
    626 	psrld	$20,%xmm7
    627 	movdqa	(%r11),%xmm6
    628 	por	%xmm7,%xmm14
    629 	paddd	%xmm13,%xmm8
    630 	paddd	%xmm14,%xmm9
    631 	pxor	%xmm8,%xmm3
    632 	pxor	%xmm9,%xmm0
    633 .byte	102,15,56,0,222
    634 .byte	102,15,56,0,198
    635 	paddd	%xmm3,%xmm4
    636 	paddd	%xmm0,%xmm5
    637 	pxor	%xmm4,%xmm13
    638 	pxor	%xmm5,%xmm14
    639 	movdqa	%xmm13,%xmm7
    640 	pslld	$7,%xmm13
    641 	psrld	$25,%xmm7
    642 	movdqa	%xmm14,%xmm6
    643 	pslld	$7,%xmm14
    644 	por	%xmm7,%xmm13
    645 	psrld	$25,%xmm6
    646 	movdqa	(%r10),%xmm7
    647 	por	%xmm6,%xmm14
    648 	movdqa	%xmm4,32(%rsp)
    649 	movdqa	%xmm5,48(%rsp)
    650 	movdqa	0(%rsp),%xmm4
    651 	movdqa	16(%rsp),%xmm5
    652 	paddd	%xmm15,%xmm10
    653 	paddd	%xmm12,%xmm11
    654 	pxor	%xmm10,%xmm1
    655 	pxor	%xmm11,%xmm2
    656 .byte	102,15,56,0,207
    657 .byte	102,15,56,0,215
    658 	paddd	%xmm1,%xmm4
    659 	paddd	%xmm2,%xmm5
    660 	pxor	%xmm4,%xmm15
    661 	pxor	%xmm5,%xmm12
    662 	movdqa	%xmm15,%xmm6
    663 	pslld	$12,%xmm15
    664 	psrld	$20,%xmm6
    665 	movdqa	%xmm12,%xmm7
    666 	pslld	$12,%xmm12
    667 	por	%xmm6,%xmm15
    668 	psrld	$20,%xmm7
    669 	movdqa	(%r11),%xmm6
    670 	por	%xmm7,%xmm12
    671 	paddd	%xmm15,%xmm10
    672 	paddd	%xmm12,%xmm11
    673 	pxor	%xmm10,%xmm1
    674 	pxor	%xmm11,%xmm2
    675 .byte	102,15,56,0,206
    676 .byte	102,15,56,0,214
    677 	paddd	%xmm1,%xmm4
    678 	paddd	%xmm2,%xmm5
    679 	pxor	%xmm4,%xmm15
    680 	pxor	%xmm5,%xmm12
    681 	movdqa	%xmm15,%xmm7
    682 	pslld	$7,%xmm15
    683 	psrld	$25,%xmm7
    684 	movdqa	%xmm12,%xmm6
    685 	pslld	$7,%xmm12
    686 	por	%xmm7,%xmm15
    687 	psrld	$25,%xmm6
    688 	movdqa	(%r10),%xmm7
    689 	por	%xmm6,%xmm12
    690 	decl	%eax
    691 	jnz	.Loop4x
    692 
    693 	paddd	64(%rsp),%xmm8
    694 	paddd	80(%rsp),%xmm9
    695 	paddd	96(%rsp),%xmm10
    696 	paddd	112(%rsp),%xmm11
    697 
    698 	movdqa	%xmm8,%xmm6
    699 	punpckldq	%xmm9,%xmm8
    700 	movdqa	%xmm10,%xmm7
    701 	punpckldq	%xmm11,%xmm10
    702 	punpckhdq	%xmm9,%xmm6
    703 	punpckhdq	%xmm11,%xmm7
    704 	movdqa	%xmm8,%xmm9
    705 	punpcklqdq	%xmm10,%xmm8
    706 	movdqa	%xmm6,%xmm11
    707 	punpcklqdq	%xmm7,%xmm6
    708 	punpckhqdq	%xmm10,%xmm9
    709 	punpckhqdq	%xmm7,%xmm11
    710 	paddd	128-256(%rcx),%xmm12
    711 	paddd	144-256(%rcx),%xmm13
    712 	paddd	160-256(%rcx),%xmm14
    713 	paddd	176-256(%rcx),%xmm15
    714 
    715 	movdqa	%xmm8,0(%rsp)
    716 	movdqa	%xmm9,16(%rsp)
    717 	movdqa	32(%rsp),%xmm8
    718 	movdqa	48(%rsp),%xmm9
    719 
    720 	movdqa	%xmm12,%xmm10
    721 	punpckldq	%xmm13,%xmm12
    722 	movdqa	%xmm14,%xmm7
    723 	punpckldq	%xmm15,%xmm14
    724 	punpckhdq	%xmm13,%xmm10
    725 	punpckhdq	%xmm15,%xmm7
    726 	movdqa	%xmm12,%xmm13
    727 	punpcklqdq	%xmm14,%xmm12
    728 	movdqa	%xmm10,%xmm15
    729 	punpcklqdq	%xmm7,%xmm10
    730 	punpckhqdq	%xmm14,%xmm13
    731 	punpckhqdq	%xmm7,%xmm15
    732 	paddd	192-256(%rcx),%xmm4
    733 	paddd	208-256(%rcx),%xmm5
    734 	paddd	224-256(%rcx),%xmm8
    735 	paddd	240-256(%rcx),%xmm9
    736 
    737 	movdqa	%xmm6,32(%rsp)
    738 	movdqa	%xmm11,48(%rsp)
    739 
    740 	movdqa	%xmm4,%xmm14
    741 	punpckldq	%xmm5,%xmm4
    742 	movdqa	%xmm8,%xmm7
    743 	punpckldq	%xmm9,%xmm8
    744 	punpckhdq	%xmm5,%xmm14
    745 	punpckhdq	%xmm9,%xmm7
    746 	movdqa	%xmm4,%xmm5
    747 	punpcklqdq	%xmm8,%xmm4
    748 	movdqa	%xmm14,%xmm9
    749 	punpcklqdq	%xmm7,%xmm14
    750 	punpckhqdq	%xmm8,%xmm5
    751 	punpckhqdq	%xmm7,%xmm9
    752 	paddd	256-256(%rcx),%xmm0
    753 	paddd	272-256(%rcx),%xmm1
    754 	paddd	288-256(%rcx),%xmm2
    755 	paddd	304-256(%rcx),%xmm3
    756 
    757 	movdqa	%xmm0,%xmm8
    758 	punpckldq	%xmm1,%xmm0
    759 	movdqa	%xmm2,%xmm7
    760 	punpckldq	%xmm3,%xmm2
    761 	punpckhdq	%xmm1,%xmm8
    762 	punpckhdq	%xmm3,%xmm7
    763 	movdqa	%xmm0,%xmm1
    764 	punpcklqdq	%xmm2,%xmm0
    765 	movdqa	%xmm8,%xmm3
    766 	punpcklqdq	%xmm7,%xmm8
    767 	punpckhqdq	%xmm2,%xmm1
    768 	punpckhqdq	%xmm7,%xmm3
    769 	cmpq	$256,%rdx
    770 	jb	.Ltail4x
    771 
    772 	movdqu	0(%rsi),%xmm6
    773 	movdqu	16(%rsi),%xmm11
    774 	movdqu	32(%rsi),%xmm2
    775 	movdqu	48(%rsi),%xmm7
    776 	pxor	0(%rsp),%xmm6
    777 	pxor	%xmm12,%xmm11
    778 	pxor	%xmm4,%xmm2
    779 	pxor	%xmm0,%xmm7
    780 
    781 	movdqu	%xmm6,0(%rdi)
    782 	movdqu	64(%rsi),%xmm6
    783 	movdqu	%xmm11,16(%rdi)
    784 	movdqu	80(%rsi),%xmm11
    785 	movdqu	%xmm2,32(%rdi)
    786 	movdqu	96(%rsi),%xmm2
    787 	movdqu	%xmm7,48(%rdi)
    788 	movdqu	112(%rsi),%xmm7
    789 	leaq	128(%rsi),%rsi
    790 	pxor	16(%rsp),%xmm6
    791 	pxor	%xmm13,%xmm11
    792 	pxor	%xmm5,%xmm2
    793 	pxor	%xmm1,%xmm7
    794 
    795 	movdqu	%xmm6,64(%rdi)
    796 	movdqu	0(%rsi),%xmm6
    797 	movdqu	%xmm11,80(%rdi)
    798 	movdqu	16(%rsi),%xmm11
    799 	movdqu	%xmm2,96(%rdi)
    800 	movdqu	32(%rsi),%xmm2
    801 	movdqu	%xmm7,112(%rdi)
    802 	leaq	128(%rdi),%rdi
    803 	movdqu	48(%rsi),%xmm7
    804 	pxor	32(%rsp),%xmm6
    805 	pxor	%xmm10,%xmm11
    806 	pxor	%xmm14,%xmm2
    807 	pxor	%xmm8,%xmm7
    808 
    809 	movdqu	%xmm6,0(%rdi)
    810 	movdqu	64(%rsi),%xmm6
    811 	movdqu	%xmm11,16(%rdi)
    812 	movdqu	80(%rsi),%xmm11
    813 	movdqu	%xmm2,32(%rdi)
    814 	movdqu	96(%rsi),%xmm2
    815 	movdqu	%xmm7,48(%rdi)
    816 	movdqu	112(%rsi),%xmm7
    817 	leaq	128(%rsi),%rsi
    818 	pxor	48(%rsp),%xmm6
    819 	pxor	%xmm15,%xmm11
    820 	pxor	%xmm9,%xmm2
    821 	pxor	%xmm3,%xmm7
    822 	movdqu	%xmm6,64(%rdi)
    823 	movdqu	%xmm11,80(%rdi)
    824 	movdqu	%xmm2,96(%rdi)
    825 	movdqu	%xmm7,112(%rdi)
    826 	leaq	128(%rdi),%rdi
    827 
    828 	subq	$256,%rdx
    829 	jnz	.Loop_outer4x
    830 
    831 	jmp	.Ldone4x
    832 
    833 .Ltail4x:
    834 	cmpq	$192,%rdx
    835 	jae	.L192_or_more4x
    836 	cmpq	$128,%rdx
    837 	jae	.L128_or_more4x
    838 	cmpq	$64,%rdx
    839 	jae	.L64_or_more4x
    840 
    841 
    842 	xorq	%r10,%r10
    843 
    844 	movdqa	%xmm12,16(%rsp)
    845 	movdqa	%xmm4,32(%rsp)
    846 	movdqa	%xmm0,48(%rsp)
    847 	jmp	.Loop_tail4x
    848 
    849 .align	32
    850 .L64_or_more4x:
    851 	movdqu	0(%rsi),%xmm6
    852 	movdqu	16(%rsi),%xmm11
    853 	movdqu	32(%rsi),%xmm2
    854 	movdqu	48(%rsi),%xmm7
    855 	pxor	0(%rsp),%xmm6
    856 	pxor	%xmm12,%xmm11
    857 	pxor	%xmm4,%xmm2
    858 	pxor	%xmm0,%xmm7
    859 	movdqu	%xmm6,0(%rdi)
    860 	movdqu	%xmm11,16(%rdi)
    861 	movdqu	%xmm2,32(%rdi)
    862 	movdqu	%xmm7,48(%rdi)
    863 	je	.Ldone4x
    864 
    865 	movdqa	16(%rsp),%xmm6
    866 	leaq	64(%rsi),%rsi
    867 	xorq	%r10,%r10
    868 	movdqa	%xmm6,0(%rsp)
    869 	movdqa	%xmm13,16(%rsp)
    870 	leaq	64(%rdi),%rdi
    871 	movdqa	%xmm5,32(%rsp)
    872 	subq	$64,%rdx
    873 	movdqa	%xmm1,48(%rsp)
    874 	jmp	.Loop_tail4x
    875 
    876 .align	32
    877 .L128_or_more4x:
    878 	movdqu	0(%rsi),%xmm6
    879 	movdqu	16(%rsi),%xmm11
    880 	movdqu	32(%rsi),%xmm2
    881 	movdqu	48(%rsi),%xmm7
    882 	pxor	0(%rsp),%xmm6
    883 	pxor	%xmm12,%xmm11
    884 	pxor	%xmm4,%xmm2
    885 	pxor	%xmm0,%xmm7
    886 
    887 	movdqu	%xmm6,0(%rdi)
    888 	movdqu	64(%rsi),%xmm6
    889 	movdqu	%xmm11,16(%rdi)
    890 	movdqu	80(%rsi),%xmm11
    891 	movdqu	%xmm2,32(%rdi)
    892 	movdqu	96(%rsi),%xmm2
    893 	movdqu	%xmm7,48(%rdi)
    894 	movdqu	112(%rsi),%xmm7
    895 	pxor	16(%rsp),%xmm6
    896 	pxor	%xmm13,%xmm11
    897 	pxor	%xmm5,%xmm2
    898 	pxor	%xmm1,%xmm7
    899 	movdqu	%xmm6,64(%rdi)
    900 	movdqu	%xmm11,80(%rdi)
    901 	movdqu	%xmm2,96(%rdi)
    902 	movdqu	%xmm7,112(%rdi)
    903 	je	.Ldone4x
    904 
    905 	movdqa	32(%rsp),%xmm6
    906 	leaq	128(%rsi),%rsi
    907 	xorq	%r10,%r10
    908 	movdqa	%xmm6,0(%rsp)
    909 	movdqa	%xmm10,16(%rsp)
    910 	leaq	128(%rdi),%rdi
    911 	movdqa	%xmm14,32(%rsp)
    912 	subq	$128,%rdx
    913 	movdqa	%xmm8,48(%rsp)
    914 	jmp	.Loop_tail4x
    915 
    916 .align	32
    917 .L192_or_more4x:
    918 	movdqu	0(%rsi),%xmm6
    919 	movdqu	16(%rsi),%xmm11
    920 	movdqu	32(%rsi),%xmm2
    921 	movdqu	48(%rsi),%xmm7
    922 	pxor	0(%rsp),%xmm6
    923 	pxor	%xmm12,%xmm11
    924 	pxor	%xmm4,%xmm2
    925 	pxor	%xmm0,%xmm7
    926 
    927 	movdqu	%xmm6,0(%rdi)
    928 	movdqu	64(%rsi),%xmm6
    929 	movdqu	%xmm11,16(%rdi)
    930 	movdqu	80(%rsi),%xmm11
    931 	movdqu	%xmm2,32(%rdi)
    932 	movdqu	96(%rsi),%xmm2
    933 	movdqu	%xmm7,48(%rdi)
    934 	movdqu	112(%rsi),%xmm7
    935 	leaq	128(%rsi),%rsi
    936 	pxor	16(%rsp),%xmm6
    937 	pxor	%xmm13,%xmm11
    938 	pxor	%xmm5,%xmm2
    939 	pxor	%xmm1,%xmm7
    940 
    941 	movdqu	%xmm6,64(%rdi)
    942 	movdqu	0(%rsi),%xmm6
    943 	movdqu	%xmm11,80(%rdi)
    944 	movdqu	16(%rsi),%xmm11
    945 	movdqu	%xmm2,96(%rdi)
    946 	movdqu	32(%rsi),%xmm2
    947 	movdqu	%xmm7,112(%rdi)
    948 	leaq	128(%rdi),%rdi
    949 	movdqu	48(%rsi),%xmm7
    950 	pxor	32(%rsp),%xmm6
    951 	pxor	%xmm10,%xmm11
    952 	pxor	%xmm14,%xmm2
    953 	pxor	%xmm8,%xmm7
    954 	movdqu	%xmm6,0(%rdi)
    955 	movdqu	%xmm11,16(%rdi)
    956 	movdqu	%xmm2,32(%rdi)
    957 	movdqu	%xmm7,48(%rdi)
    958 	je	.Ldone4x
    959 
    960 	movdqa	48(%rsp),%xmm6
    961 	leaq	64(%rsi),%rsi
    962 	xorq	%r10,%r10
    963 	movdqa	%xmm6,0(%rsp)
    964 	movdqa	%xmm15,16(%rsp)
    965 	leaq	64(%rdi),%rdi
    966 	movdqa	%xmm9,32(%rsp)
    967 	subq	$192,%rdx
    968 	movdqa	%xmm3,48(%rsp)
    969 
    970 .Loop_tail4x:
    971 	movzbl	(%rsi,%r10,1),%eax
    972 	movzbl	(%rsp,%r10,1),%ecx
    973 	leaq	1(%r10),%r10
    974 	xorl	%ecx,%eax
    975 	movb	%al,-1(%rdi,%r10,1)
    976 	decq	%rdx
    977 	jnz	.Loop_tail4x
    978 
    979 .Ldone4x:
    980 	leaq	(%r9),%rsp
    981 .L4x_epilogue:
    982 	.byte	0xf3,0xc3
    983 .size	ChaCha20_4x,.-ChaCha20_4x
    984 .type	ChaCha20_8x,@function
    985 .align	32
    986 ChaCha20_8x:
    987 .LChaCha20_8x:
    988 	movq	%rsp,%r9
    989 	subq	$0x280+8,%rsp
    990 	andq	$-32,%rsp
    991 	vzeroupper
    992 
    993 
    994 
    995 
    996 
    997 
    998 
    999 
   1000 
   1001 
   1002 	vbroadcasti128	.Lsigma(%rip),%ymm11
   1003 	vbroadcasti128	(%rcx),%ymm3
   1004 	vbroadcasti128	16(%rcx),%ymm15
   1005 	vbroadcasti128	(%r8),%ymm7
   1006 	leaq	256(%rsp),%rcx
   1007 	leaq	512(%rsp),%rax
   1008 	leaq	.Lrot16(%rip),%r10
   1009 	leaq	.Lrot24(%rip),%r11
   1010 
   1011 	vpshufd	$0x00,%ymm11,%ymm8
   1012 	vpshufd	$0x55,%ymm11,%ymm9
   1013 	vmovdqa	%ymm8,128-256(%rcx)
   1014 	vpshufd	$0xaa,%ymm11,%ymm10
   1015 	vmovdqa	%ymm9,160-256(%rcx)
   1016 	vpshufd	$0xff,%ymm11,%ymm11
   1017 	vmovdqa	%ymm10,192-256(%rcx)
   1018 	vmovdqa	%ymm11,224-256(%rcx)
   1019 
   1020 	vpshufd	$0x00,%ymm3,%ymm0
   1021 	vpshufd	$0x55,%ymm3,%ymm1
   1022 	vmovdqa	%ymm0,256-256(%rcx)
   1023 	vpshufd	$0xaa,%ymm3,%ymm2
   1024 	vmovdqa	%ymm1,288-256(%rcx)
   1025 	vpshufd	$0xff,%ymm3,%ymm3
   1026 	vmovdqa	%ymm2,320-256(%rcx)
   1027 	vmovdqa	%ymm3,352-256(%rcx)
   1028 
   1029 	vpshufd	$0x00,%ymm15,%ymm12
   1030 	vpshufd	$0x55,%ymm15,%ymm13
   1031 	vmovdqa	%ymm12,384-512(%rax)
   1032 	vpshufd	$0xaa,%ymm15,%ymm14
   1033 	vmovdqa	%ymm13,416-512(%rax)
   1034 	vpshufd	$0xff,%ymm15,%ymm15
   1035 	vmovdqa	%ymm14,448-512(%rax)
   1036 	vmovdqa	%ymm15,480-512(%rax)
   1037 
   1038 	vpshufd	$0x00,%ymm7,%ymm4
   1039 	vpshufd	$0x55,%ymm7,%ymm5
   1040 	vpaddd	.Lincy(%rip),%ymm4,%ymm4
   1041 	vpshufd	$0xaa,%ymm7,%ymm6
   1042 	vmovdqa	%ymm5,544-512(%rax)
   1043 	vpshufd	$0xff,%ymm7,%ymm7
   1044 	vmovdqa	%ymm6,576-512(%rax)
   1045 	vmovdqa	%ymm7,608-512(%rax)
   1046 
   1047 	jmp	.Loop_enter8x
   1048 
   1049 .align	32
   1050 .Loop_outer8x:
   1051 	vmovdqa	128-256(%rcx),%ymm8
   1052 	vmovdqa	160-256(%rcx),%ymm9
   1053 	vmovdqa	192-256(%rcx),%ymm10
   1054 	vmovdqa	224-256(%rcx),%ymm11
   1055 	vmovdqa	256-256(%rcx),%ymm0
   1056 	vmovdqa	288-256(%rcx),%ymm1
   1057 	vmovdqa	320-256(%rcx),%ymm2
   1058 	vmovdqa	352-256(%rcx),%ymm3
   1059 	vmovdqa	384-512(%rax),%ymm12
   1060 	vmovdqa	416-512(%rax),%ymm13
   1061 	vmovdqa	448-512(%rax),%ymm14
   1062 	vmovdqa	480-512(%rax),%ymm15
   1063 	vmovdqa	512-512(%rax),%ymm4
   1064 	vmovdqa	544-512(%rax),%ymm5
   1065 	vmovdqa	576-512(%rax),%ymm6
   1066 	vmovdqa	608-512(%rax),%ymm7
   1067 	vpaddd	.Leight(%rip),%ymm4,%ymm4
   1068 
   1069 .Loop_enter8x:
   1070 	vmovdqa	%ymm14,64(%rsp)
   1071 	vmovdqa	%ymm15,96(%rsp)
   1072 	vbroadcasti128	(%r10),%ymm15
   1073 	vmovdqa	%ymm4,512-512(%rax)
   1074 	movl	$10,%eax
   1075 	jmp	.Loop8x
   1076 
   1077 .align	32
   1078 .Loop8x:
   1079 	vpaddd	%ymm0,%ymm8,%ymm8
   1080 	vpxor	%ymm4,%ymm8,%ymm4
   1081 	vpshufb	%ymm15,%ymm4,%ymm4
   1082 	vpaddd	%ymm1,%ymm9,%ymm9
   1083 	vpxor	%ymm5,%ymm9,%ymm5
   1084 	vpshufb	%ymm15,%ymm5,%ymm5
   1085 	vpaddd	%ymm4,%ymm12,%ymm12
   1086 	vpxor	%ymm0,%ymm12,%ymm0
   1087 	vpslld	$12,%ymm0,%ymm14
   1088 	vpsrld	$20,%ymm0,%ymm0
   1089 	vpor	%ymm0,%ymm14,%ymm0
   1090 	vbroadcasti128	(%r11),%ymm14
   1091 	vpaddd	%ymm5,%ymm13,%ymm13
   1092 	vpxor	%ymm1,%ymm13,%ymm1
   1093 	vpslld	$12,%ymm1,%ymm15
   1094 	vpsrld	$20,%ymm1,%ymm1
   1095 	vpor	%ymm1,%ymm15,%ymm1
   1096 	vpaddd	%ymm0,%ymm8,%ymm8
   1097 	vpxor	%ymm4,%ymm8,%ymm4
   1098 	vpshufb	%ymm14,%ymm4,%ymm4
	// (review) Tail of the 8-way AVX2 ChaCha20 double-round loop
	// (.Loop8x; loop head is above this chunk).  Eight 64-byte blocks
	// are processed in parallel: each ymm register holds one 32-bit
	// state word across the eight lanes.  Four state rows are spilled
	// to 0/32/64/96(%rsp) because only 16 ymm registers exist, and
	// ymm14/ymm15 double as rotate scratch, so the vpshufb rotate
	// masks are re-broadcast from (%r10)/(%r11) after each clobber
	// (presumably the .Lrot16/.Lrot24 tables -- set up before this
	// chunk; TODO confirm).  Quarter-round pattern: add, xor, rotate.
	// Rotates by 16 and 8 use vpshufb byte shuffles; rotates by 12
	// and 7 use a shift/shift/or sequence.
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5		// byte-shuffle rotate
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15			// rotl32(x,7) = (x<<7)|(x>>25)
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15		// reload shuffle mask clobbered above
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	// Swap the active pair of spilled state rows: park ymm12/ymm13,
	// pull in the pair stored at 64/96(%rsp).
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14		// rotl32(x,12) = (x<<12)|(x>>20)
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14		// reload the other shuffle mask
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	// Diagonal quarter-rounds (second half of the double round).
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	// Swap spilled rows back for the next iteration.
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax				// eax = remaining double-round count
	jnz	.Loop8x

	// Rounds done.  Feed-forward: add the saved input state back into
	// the working state (ChaCha output = state + rounds(state)), then
	// transpose the lane-sliced representation (one state word per
	// register, eight lanes wide) into contiguous keystream via
	// vpunpck{l,h}{dq,qdq} dword/qword interleaves followed by
	// vperm2i128 to recombine 128-bit halves across lane pairs.
	// rax -> 512(%rsp): upper half of the saved-state area; the lower
	// half is addressed rcx-relative (both set up before this chunk).
	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15	// low 128-bit halves
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1		// high 128-bit halves
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	// Park two finished keystream registers and fetch the still-spilled
	// state rows so the second state half can be processed.
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	// Same feed-forward + transpose for state rows in the rax-relative
	// (512..608(%rsp)) save area.
	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	// Recover the two keystream registers parked above.
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	// rdx = bytes remaining.  If fewer than 512, take the tail path;
	// otherwise xor all 512 bytes of keystream with the input (rsi)
	// and store to the output (rdi), 128 bytes per unrolled group.
	// Keystream output order: ymm6,8,1,5, 12,13,10,15, 14,2,3,7, 11,9,0,4.
	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	// 512 bytes consumed; restart the outer loop (before this chunk)
	// if any length remains, else finish.
	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	// Partial-block tail: fewer than 512 bytes remain.  Dispatch on
	// the largest 64-byte multiple still to write; each target stores
	// that many keystream bytes and (unless the length matched
	// exactly) falls into the byte-granular .Loop_tail8x.  Note each
	// target's leading `je .Ldone8x` consumes the flags set by the
	// cmpq here.
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	// <64 bytes: spill the first two keystream registers so the byte
	// loop can index them from the stack; r10 = byte index.
	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	// >=64 bytes: emit the first 64 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x			// length was exactly 64 (flags from .Ltail8x cmpq)

	// Advance src/dst, drop the 64 written bytes from rdx, spill the
	// next two keystream registers for the byte loop.
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	// >=128 bytes: emit the first 128 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x			// length was exactly 128

	// Advance pointers and spill the next keystream pair for the
	// byte-granular tail.
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	// >=192 bytes: emit the first 192 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x			// length was exactly 192

	// Advance pointers and spill the next keystream pair.
	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	// >=256 bytes: emit the first 256 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x			// length was exactly 256

	// Advance pointers and spill the next keystream pair.
	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	// >=320 bytes: emit the first 320 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x			// length was exactly 320

	// Advance pointers and spill the next keystream pair.
	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	// >=384 bytes: emit the first 384 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x			// length was exactly 384

	// Advance pointers and spill the next keystream pair.
	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	// >=448 bytes: emit the first 448 bytes of keystream.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x			// length was exactly 448

	// Advance pointers, spill the final keystream pair, and fall
	// through (no jmp) into the byte-granular tail loop below.
	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	// Byte-at-a-time tail: out[i] = in[i] ^ keystream[i], where the
	// remaining (<64 bytes of) keystream was spilled to 0/32(%rsp)
	// by whichever path jumped here.  r10 = byte index (pre-zeroed),
	// rdx = bytes left.
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)		// r10 already incremented, hence -1
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall				// clear all ymm regs (per-ABI AVX cleanup; also scrubs keystream)
	leaq	(%r9),%rsp			// restore rsp from r9 (presumably saved by the prologue, outside this chunk)
.L8x_epilogue:
	.byte	0xf3,0xc3			// rep ret
.size	ChaCha20_8x,.-ChaCha20_8x
   1586 #endif
   1587