Home | History | Annotate | Download | only in chacha
      1 #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
      2 .text
      3 
      4 
      5 
      6 .p2align	6
        # Constant pool for the ChaCha20 code paths below.  The pool starts on
        # a 64-byte boundary, so each 16-byte constant that is loaded with
        # movdqa from here is 16-byte aligned.
        # L$zero is not referenced in the visible part of this file.
      7 L$zero:
      8 .long	0,0,0,0
        # Added to the counter/nonce vector to step its low dword by one
        # (scalar path and single-block SSSE3 path).
      9 L$one:
     10 .long	1,0,0,0
        # Per-lane counter offsets for the four-block path: lane i gets +i.
     11 L$inc:
     12 .long	0,1,2,3
        # Per-iteration counter step for the four-block path.
     13 L$four:
     14 .long	4,4,4,4
        # Per-lane counter offsets added by the eight-block path at setup.
     15 L$incy:
     16 .long	0,2,4,6,1,3,5,7
        # Per-iteration counter step for the eight-block path.
     17 L$eight:
     18 .long	8,8,8,8,8,8,8,8
        # pshufb mask: within each dword, result bytes come from source bytes
        # 2,3,0,1, which rotates every 32-bit lane by 16 bits.
     19 L$rot16:
     20 .byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
        # pshufb mask: within each dword, result bytes come from source bytes
        # 3,0,1,2, which rotates every 32-bit lane left by 8 (right by 24).
     21 L$rot24:
     22 .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
        # ASCII for: expand 32-byte k (plus a NUL).  Loaded as state words 0..3
        # by the vector paths; the scalar path uses the same four dwords as
        # immediates.
     23 L$sigma:
     24 .byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
     25 .p2align	6
        # The four tables below are not referenced in the visible part of
        # this file.
     26 L$zeroz:
     27 .long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
     28 L$fourz:
     29 .long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
     30 L$incz:
     31 .long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
     32 L$sixteen:
     33 .long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
        # Embedded ident string, ASCII for:
        #   ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>
     34 .byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
     35 .globl	_ChaCha20_ctr32
     36 .private_extern _ChaCha20_ctr32
     37 
        # _ChaCha20_ctr32: XOR a buffer with the ChaCha20 keystream.
        #
        # Registers on entry, as consumed by the code below:
        #   rdi = output pointer           rsi = input pointer
        #   rdx = length in bytes          rcx = pointer to 32 bytes of key
        #   r8  = pointer to the 16-byte counter/nonce block (state words
        #         12..15); only its low dword is stepped between blocks
        #
        # Returns at once when rdx is zero.  When bit 9 of the dword at
        # _OPENSSL_ia32cap_P+4 is set, control moves to the SSSE3 code path;
        # otherwise the integer-only implementation in this block runs.
        # NOTE(review): bit 9 presumably mirrors the CPUID SSSE3 flag; confirm
        # against the definition of OPENSSL_ia32cap_P.
        #
        # Scalar-path stack frame (88 bytes below the six saved registers; the
        # movdqa stores below need this area to be 16-byte aligned):
        #   rsp+0    keystream words 0..3, written on the tail path only
        #   rsp+16   key bytes 0..15   (state words 4..7)
        #   rsp+32   key bytes 16..31  (state words 8..11); doubles as the
        #            spill area for words 8..11 while the rounds run
        #   rsp+48   counter/nonce     (state words 12..15)
        #   rsp+64   bytes remaining;  rsp+72 input ptr;  rsp+80 output ptr
        #
        # Register map inside the round loop:
        #   eax ebx ecx edx     = words 0..3
        #   r8d r9d r10d r11d   = words 4..7
        #   esi edi             = two of words 8..11 (the other pair is spilled)
        #   r12d r13d r14d r15d = words 12..15
        #   ebp                 = double-round counter
     38 .p2align	6
     39 _ChaCha20_ctr32:
     40 	cmpq	$0,%rdx	# zero length: nothing to do
     41 	je	L$no_data
     42 	movq	_OPENSSL_ia32cap_P+4(%rip),%r10	# r10 = capability dwords at offsets 4 and 8
     43 	testl	$512,%r10d	# bit 9 of the dword at offset 4
     44 	jnz	L$ChaCha20_ssse3
     45 
        # Save every callee-saved integer register; all of them are used as
        # state words or counters below.
     46 	pushq	%rbx
     47 	pushq	%rbp
     48 	pushq	%r12
     49 	pushq	%r13
     50 	pushq	%r14
     51 	pushq	%r15
     52 	subq	$64+24,%rsp
     53 L$ctr32_body:
     54 
     55 
     56 	movdqu	(%rcx),%xmm1	# key bytes 0..15
     57 	movdqu	16(%rcx),%xmm2	# key bytes 16..31
     58 	movdqu	(%r8),%xmm3	# counter/nonce
     59 	movdqa	L$one(%rip),%xmm4	# per-block counter increment
     60 
     61 
     62 	movdqa	%xmm1,16(%rsp)
     63 	movdqa	%xmm2,32(%rsp)
     64 	movdqa	%xmm3,48(%rsp)
     65 	movq	%rdx,%rbp	# rbp = bytes remaining
     66 	jmp	L$oop_outer
     67 
        # One iteration per 64-byte block: reload the input state into the
        # integer registers, then run the rounds.
     68 .p2align	5
     69 L$oop_outer:
     70 	movl	$0x61707865,%eax	# words 0..3: same dwords as L$sigma
     71 	movl	$0x3320646e,%ebx
     72 	movl	$0x79622d32,%ecx
     73 	movl	$0x6b206574,%edx
     74 	movl	16(%rsp),%r8d
     75 	movl	20(%rsp),%r9d
     76 	movl	24(%rsp),%r10d
     77 	movl	28(%rsp),%r11d
     78 	movd	%xmm3,%r12d	# word 12 = current block counter
     79 	movl	52(%rsp),%r13d
     80 	movl	56(%rsp),%r14d
     81 	movl	60(%rsp),%r15d
     82 
        # rbp, rsi and rdi are needed as working registers, so park the
        # length and both pointers in the frame first.
     83 	movq	%rbp,64+0(%rsp)
     84 	movl	$10,%ebp	# ten passes through L$oop
     85 	movq	%rsi,64+8(%rsp)
     86 .byte	102,72,15,126,214	# encodes movq %xmm2,%rsi: rsi = words 8 and 9
     87 	movq	%rdi,64+16(%rsp)
     88 	movq	%rsi,%rdi
     89 	shrq	$32,%rdi	# esi = word 8, edi = word 9
     90 	jmp	L$oop
     91 
        # Each pass is one double round: four column quarter-rounds followed
        # by four diagonal quarter-rounds, processed two at a time.
     92 .p2align	5
     93 L$oop:
        # Columns (0,4,8,12) and (1,5,9,13), interleaved.
     94 	addl	%r8d,%eax
     95 	xorl	%eax,%r12d
     96 	roll	$16,%r12d
     97 	addl	%r9d,%ebx
     98 	xorl	%ebx,%r13d
     99 	roll	$16,%r13d
    100 	addl	%r12d,%esi
    101 	xorl	%esi,%r8d
    102 	roll	$12,%r8d
    103 	addl	%r13d,%edi
    104 	xorl	%edi,%r9d
    105 	roll	$12,%r9d
    106 	addl	%r8d,%eax
    107 	xorl	%eax,%r12d
    108 	roll	$8,%r12d
    109 	addl	%r9d,%ebx
    110 	xorl	%ebx,%r13d
    111 	roll	$8,%r13d
    112 	addl	%r12d,%esi
    113 	xorl	%esi,%r8d
    114 	roll	$7,%r8d
    115 	addl	%r13d,%edi
    116 	xorl	%edi,%r9d
    117 	roll	$7,%r9d
        # Swap the spilled pair: park words 8,9 and pick up words 10,11.
    118 	movl	%esi,32(%rsp)
    119 	movl	%edi,36(%rsp)
    120 	movl	40(%rsp),%esi
    121 	movl	44(%rsp),%edi
        # Columns (2,6,10,14) and (3,7,11,15).
    122 	addl	%r10d,%ecx
    123 	xorl	%ecx,%r14d
    124 	roll	$16,%r14d
    125 	addl	%r11d,%edx
    126 	xorl	%edx,%r15d
    127 	roll	$16,%r15d
    128 	addl	%r14d,%esi
    129 	xorl	%esi,%r10d
    130 	roll	$12,%r10d
    131 	addl	%r15d,%edi
    132 	xorl	%edi,%r11d
    133 	roll	$12,%r11d
    134 	addl	%r10d,%ecx
    135 	xorl	%ecx,%r14d
    136 	roll	$8,%r14d
    137 	addl	%r11d,%edx
    138 	xorl	%edx,%r15d
    139 	roll	$8,%r15d
    140 	addl	%r14d,%esi
    141 	xorl	%esi,%r10d
    142 	roll	$7,%r10d
    143 	addl	%r15d,%edi
    144 	xorl	%edi,%r11d
    145 	roll	$7,%r11d
        # Diagonals (0,5,10,15) and (1,6,11,12); esi,edi still hold 10,11.
    146 	addl	%r9d,%eax
    147 	xorl	%eax,%r15d
    148 	roll	$16,%r15d
    149 	addl	%r10d,%ebx
    150 	xorl	%ebx,%r12d
    151 	roll	$16,%r12d
    152 	addl	%r15d,%esi
    153 	xorl	%esi,%r9d
    154 	roll	$12,%r9d
    155 	addl	%r12d,%edi
    156 	xorl	%edi,%r10d
    157 	roll	$12,%r10d
    158 	addl	%r9d,%eax
    159 	xorl	%eax,%r15d
    160 	roll	$8,%r15d
    161 	addl	%r10d,%ebx
    162 	xorl	%ebx,%r12d
    163 	roll	$8,%r12d
    164 	addl	%r15d,%esi
    165 	xorl	%esi,%r9d
    166 	roll	$7,%r9d
    167 	addl	%r12d,%edi
    168 	xorl	%edi,%r10d
    169 	roll	$7,%r10d
        # Swap back: park words 10,11 and pick up words 8,9.
    170 	movl	%esi,40(%rsp)
    171 	movl	%edi,44(%rsp)
    172 	movl	32(%rsp),%esi
    173 	movl	36(%rsp),%edi
        # Diagonals (2,7,8,13) and (3,4,9,14).
    174 	addl	%r11d,%ecx
    175 	xorl	%ecx,%r13d
    176 	roll	$16,%r13d
    177 	addl	%r8d,%edx
    178 	xorl	%edx,%r14d
    179 	roll	$16,%r14d
    180 	addl	%r13d,%esi
    181 	xorl	%esi,%r11d
    182 	roll	$12,%r11d
    183 	addl	%r14d,%edi
    184 	xorl	%edi,%r8d
    185 	roll	$12,%r8d
    186 	addl	%r11d,%ecx
    187 	xorl	%ecx,%r13d
    188 	roll	$8,%r13d
    189 	addl	%r8d,%edx
    190 	xorl	%edx,%r14d
    191 	roll	$8,%r14d
    192 	addl	%r13d,%esi
    193 	xorl	%esi,%r11d
    194 	roll	$7,%r11d
    195 	addl	%r14d,%edi
    196 	xorl	%edi,%r8d
    197 	roll	$7,%r8d
    198 	decl	%ebp
    199 	jnz	L$oop
        # Rounds done.  Flush words 8,9 so rsp+32 holds all of words 8..11,
        # restore length and pointers, and step the counter held in xmm3.
    200 	movl	%edi,36(%rsp)
    201 	movl	%esi,32(%rsp)
    202 	movq	64(%rsp),%rbp
    203 	movdqa	%xmm2,%xmm1	# xmm1 = input words 8..11
    204 	movq	64+8(%rsp),%rsi
    205 	paddd	%xmm4,%xmm3	# counter for the next block
    206 	movq	64+16(%rsp),%rdi
    207 
        # Feed-forward: add the input state to the permuted state.  The load
        # from rsp+48 still sees the counter of the current block, because the
        # stepped value is stored only further down.
    208 	addl	$0x61707865,%eax
    209 	addl	$0x3320646e,%ebx
    210 	addl	$0x79622d32,%ecx
    211 	addl	$0x6b206574,%edx
    212 	addl	16(%rsp),%r8d
    213 	addl	20(%rsp),%r9d
    214 	addl	24(%rsp),%r10d
    215 	addl	28(%rsp),%r11d
    216 	addl	48(%rsp),%r12d
    217 	addl	52(%rsp),%r13d
    218 	addl	56(%rsp),%r14d
    219 	addl	60(%rsp),%r15d
    220 	paddd	32(%rsp),%xmm1	# xmm1 = keystream words 8..11
    221 
    222 	cmpq	$64,%rbp
    223 	jb	L$tail	# fewer than 64 bytes left: byte-wise tail
    224 
        # Full block: XOR 64 input bytes with the keystream.
    225 	xorl	0(%rsi),%eax
    226 	xorl	4(%rsi),%ebx
    227 	xorl	8(%rsi),%ecx
    228 	xorl	12(%rsi),%edx
    229 	xorl	16(%rsi),%r8d
    230 	xorl	20(%rsi),%r9d
    231 	xorl	24(%rsi),%r10d
    232 	xorl	28(%rsi),%r11d
    233 	movdqu	32(%rsi),%xmm0
    234 	xorl	48(%rsi),%r12d
    235 	xorl	52(%rsi),%r13d
    236 	xorl	56(%rsi),%r14d
    237 	xorl	60(%rsi),%r15d
    238 	leaq	64(%rsi),%rsi
    239 	pxor	%xmm1,%xmm0
    240 
        # Re-arm the frame for the next block: rsp+32 gets the original words
        # 8..11 back and rsp+48 receives the stepped counter dword.
    241 	movdqa	%xmm2,32(%rsp)
    242 	movd	%xmm3,48(%rsp)
    243 
    244 	movl	%eax,0(%rdi)
    245 	movl	%ebx,4(%rdi)
    246 	movl	%ecx,8(%rdi)
    247 	movl	%edx,12(%rdi)
    248 	movl	%r8d,16(%rdi)
    249 	movl	%r9d,20(%rdi)
    250 	movl	%r10d,24(%rdi)
    251 	movl	%r11d,28(%rdi)
    252 	movdqu	%xmm0,32(%rdi)
    253 	movl	%r12d,48(%rdi)
    254 	movl	%r13d,52(%rdi)
    255 	movl	%r14d,56(%rdi)
    256 	movl	%r15d,60(%rdi)
    257 	leaq	64(%rdi),%rdi
    258 
    259 	subq	$64,%rbp
    260 	jnz	L$oop_outer
    261 
    262 	jmp	L$done
    263 
        # Tail: 1..63 bytes remain.  Lay the whole 64-byte keystream block out
        # at rsp+0..63 and XOR it into the data one byte at a time.
    264 .p2align	4
    265 L$tail:
    266 	movl	%eax,0(%rsp)
    267 	movl	%ebx,4(%rsp)
    268 	xorq	%rbx,%rbx	# rbx = byte index
    269 	movl	%ecx,8(%rsp)
    270 	movl	%edx,12(%rsp)
    271 	movl	%r8d,16(%rsp)
    272 	movl	%r9d,20(%rsp)
    273 	movl	%r10d,24(%rsp)
    274 	movl	%r11d,28(%rsp)
    275 	movdqa	%xmm1,32(%rsp)
    276 	movl	%r12d,48(%rsp)
    277 	movl	%r13d,52(%rsp)
    278 	movl	%r14d,56(%rsp)
    279 	movl	%r15d,60(%rsp)
    280 
    281 L$oop_tail:
    282 	movzbl	(%rsi,%rbx,1),%eax	# input byte
    283 	movzbl	(%rsp,%rbx,1),%edx	# keystream byte
    284 	leaq	1(%rbx),%rbx	# lea keeps the flags untouched
    285 	xorl	%edx,%eax
    286 	movb	%al,-1(%rdi,%rbx,1)	# -1 compensates for the increment above
    287 	decq	%rbp
    288 	jnz	L$oop_tail
    289 
        # Epilogue: rsi = stack pointer value at entry (frame of 64+24 bytes
        # plus six 8-byte pushes); reload the saved registers relative to it.
    290 L$done:
    291 	leaq	64+24+48(%rsp),%rsi
    292 	movq	-48(%rsi),%r15
    293 	movq	-40(%rsi),%r14
    294 	movq	-32(%rsi),%r13
    295 	movq	-24(%rsi),%r12
    296 	movq	-16(%rsi),%rbp
    297 	movq	-8(%rsi),%rbx
    298 	leaq	(%rsi),%rsp
    299 L$no_data:
    300 	.byte	0xf3,0xc3	# encodes rep ret
    301 
    302 
    303 .p2align	5
        # ChaCha20_ssse3: single-block SSSE3 code path.  It is entered by a
        # jump from _ChaCha20_ctr32 with that routine's argument registers
        # intact (rdi = out, rsi = in, rdx = length, rcx = key, r8 = counter
        # block) and with the capability dwords still in r10.
        #
        # More than 128 bytes of input is handed to the four-block code; that
        # code may jump back to L$do_sse3_after_all.
        #
        # The whole state lives in four registers, one row each:
        #   xmm0 = words 0..3, xmm1 = words 4..7, xmm2 = words 8..11,
        #   xmm3 = words 12..15; xmm6/xmm7 hold the rot16/rot24 pshufb masks.
        # Stack frame (72 bytes): rsp+0, +16, +32, +48 keep the four input
        # rows; on the tail path the same slots receive the keystream block.
        # r9 keeps the entry value of rsp for the epilogue.  No callee-saved
        # general-purpose register is written here.
    304 ChaCha20_ssse3:
    305 L$ChaCha20_ssse3:
    306 	movq	%rsp,%r9
    307 	cmpq	$128,%rdx
    308 	ja	L$ChaCha20_4x
    309 
    310 L$do_sse3_after_all:
    311 	subq	$64+8,%rsp
    312 	movdqa	L$sigma(%rip),%xmm0
    313 	movdqu	(%rcx),%xmm1
    314 	movdqu	16(%rcx),%xmm2
    315 	movdqu	(%r8),%xmm3
    316 	movdqa	L$rot16(%rip),%xmm6
    317 	movdqa	L$rot24(%rip),%xmm7
    318 
    319 	movdqa	%xmm0,0(%rsp)
    320 	movdqa	%xmm1,16(%rsp)
    321 	movdqa	%xmm2,32(%rsp)
    322 	movdqa	%xmm3,48(%rsp)
    323 	movq	$10,%r8	# r8 is free now: ten double rounds
    324 	jmp	L$oop_ssse3
    325 
        # Start of every block after the first: reload the input rows and step
        # the counter dword kept at rsp+48.
    326 .p2align	5
    327 L$oop_outer_ssse3:
    328 	movdqa	L$one(%rip),%xmm3
    329 	movdqa	0(%rsp),%xmm0
    330 	movdqa	16(%rsp),%xmm1
    331 	movdqa	32(%rsp),%xmm2
    332 	paddd	48(%rsp),%xmm3	# xmm3 = stored counter row + 1
    333 	movq	$10,%r8
    334 	movdqa	%xmm3,48(%rsp)
    335 	jmp	L$oop_ssse3
    336 
        # One pass is a double round.  The four quarter-rounds of a round run
        # in parallel across the dword lanes; rotations by 16 and 8 use pshufb,
        # rotations by 12 and 7 use a shift pair merged with por.
    337 .p2align	5
    338 L$oop_ssse3:
        # Column round.
    339 	paddd	%xmm1,%xmm0
    340 	pxor	%xmm0,%xmm3
    341 .byte	102,15,56,0,222	# encodes pshufb %xmm6,%xmm3 (rotate by 16)
    342 	paddd	%xmm3,%xmm2
    343 	pxor	%xmm2,%xmm1
    344 	movdqa	%xmm1,%xmm4
    345 	psrld	$20,%xmm1
    346 	pslld	$12,%xmm4
    347 	por	%xmm4,%xmm1	# rotate left by 12
    348 	paddd	%xmm1,%xmm0
    349 	pxor	%xmm0,%xmm3
    350 .byte	102,15,56,0,223	# encodes pshufb %xmm7,%xmm3 (rotate left by 8)
    351 	paddd	%xmm3,%xmm2
    352 	pxor	%xmm2,%xmm1
    353 	movdqa	%xmm1,%xmm4
    354 	psrld	$25,%xmm1
    355 	pslld	$7,%xmm4
    356 	por	%xmm4,%xmm1	# rotate left by 7
        # Rotate rows 1..3 by one, two and three lanes so that the diagonals
        # line up as columns.
    357 	pshufd	$78,%xmm2,%xmm2
    358 	pshufd	$57,%xmm1,%xmm1
    359 	pshufd	$147,%xmm3,%xmm3
    360 	nop
        # Diagonal round (same sequence on the rotated rows).
    361 	paddd	%xmm1,%xmm0
    362 	pxor	%xmm0,%xmm3
    363 .byte	102,15,56,0,222	# encodes pshufb %xmm6,%xmm3
    364 	paddd	%xmm3,%xmm2
    365 	pxor	%xmm2,%xmm1
    366 	movdqa	%xmm1,%xmm4
    367 	psrld	$20,%xmm1
    368 	pslld	$12,%xmm4
    369 	por	%xmm4,%xmm1
    370 	paddd	%xmm1,%xmm0
    371 	pxor	%xmm0,%xmm3
    372 .byte	102,15,56,0,223	# encodes pshufb %xmm7,%xmm3
    373 	paddd	%xmm3,%xmm2
    374 	pxor	%xmm2,%xmm1
    375 	movdqa	%xmm1,%xmm4
    376 	psrld	$25,%xmm1
    377 	pslld	$7,%xmm4
    378 	por	%xmm4,%xmm1
        # Undo the lane rotation (rows 1 and 3 use the opposite shuffles).
    379 	pshufd	$78,%xmm2,%xmm2
    380 	pshufd	$147,%xmm1,%xmm1
    381 	pshufd	$57,%xmm3,%xmm3
    382 	decq	%r8
    383 	jnz	L$oop_ssse3
        # Feed-forward: add the saved input rows.
    384 	paddd	0(%rsp),%xmm0
    385 	paddd	16(%rsp),%xmm1
    386 	paddd	32(%rsp),%xmm2
    387 	paddd	48(%rsp),%xmm3
    388 
    389 	cmpq	$64,%rdx
    390 	jb	L$tail_ssse3
    391 
        # Full block: XOR 64 bytes and advance both pointers.
    392 	movdqu	0(%rsi),%xmm4
    393 	movdqu	16(%rsi),%xmm5
    394 	pxor	%xmm4,%xmm0
    395 	movdqu	32(%rsi),%xmm4
    396 	pxor	%xmm5,%xmm1
    397 	movdqu	48(%rsi),%xmm5
    398 	leaq	64(%rsi),%rsi
    399 	pxor	%xmm4,%xmm2
    400 	pxor	%xmm5,%xmm3
    401 
    402 	movdqu	%xmm0,0(%rdi)
    403 	movdqu	%xmm1,16(%rdi)
    404 	movdqu	%xmm2,32(%rdi)
    405 	movdqu	%xmm3,48(%rdi)
    406 	leaq	64(%rdi),%rdi
    407 
    408 	subq	$64,%rdx
    409 	jnz	L$oop_outer_ssse3
    410 
    411 	jmp	L$done_ssse3
    412 
        # Tail: 1..63 bytes remain.  Spill the keystream block over the saved
        # input rows (no longer needed) and XOR byte by byte.
    413 .p2align	4
    414 L$tail_ssse3:
    415 	movdqa	%xmm0,0(%rsp)
    416 	movdqa	%xmm1,16(%rsp)
    417 	movdqa	%xmm2,32(%rsp)
    418 	movdqa	%xmm3,48(%rsp)
    419 	xorq	%r8,%r8	# r8 = byte index
    420 
    421 L$oop_tail_ssse3:
    422 	movzbl	(%rsi,%r8,1),%eax	# input byte
    423 	movzbl	(%rsp,%r8,1),%ecx	# keystream byte
    424 	leaq	1(%r8),%r8
    425 	xorl	%ecx,%eax
    426 	movb	%al,-1(%rdi,%r8,1)
    427 	decq	%rdx
    428 	jnz	L$oop_tail_ssse3
    429 
    430 L$done_ssse3:
    431 	leaq	(%r9),%rsp	# drop the frame: r9 = rsp at entry
    432 L$ssse3_epilogue:
    433 	.byte	0xf3,0xc3	# encodes rep ret
    434 
    435 
    436 .p2align	5
        # ChaCha20_4x: SSE code path that produces four 64-byte blocks (256
        # bytes of keystream) per outer iteration.  It is entered by a jump
        # from the SSSE3 entry with more than 128 bytes in rdx and the
        # capability dwords (offsets 4 and 8 of _OPENSSL_ia32cap_P) in r10.
        #
        # Data layout: every xmm register holds ONE state word for FOUR
        # consecutive blocks, one block per dword lane.
        #   xmm8..xmm11  = words 0..3        xmm12..xmm15 = words 4..7
        #   xmm4, xmm5   = two of words 8..11 (the other pair is spilled)
        #   xmm0..xmm3   = words 12..15 (xmm0 = the four block counters)
        #   xmm6, xmm7   = scratch, alternating with the rot16/rot24 masks
        #   r10, r11     = addresses of L$rot16 and L$rot24
        #
        # Stack frame (0x140+8 bytes), rcx = rsp+256 is a biased base pointer:
        #   rsp+0   .. +63   spill slots for words 8..11, later staging for
        #                    the transposed output rows and for tail bytes
        #   rsp+64  .. +127  words 0..3 broadcast to all lanes
        #   rsp+128 .. +191  words 4..7 broadcast
        #   rsp+192 .. +255  words 8..11 broadcast
        #   rsp+256 .. +319  words 12..15 (rsp+256 = per-lane counters)
        # r9 keeps the entry value of rsp for the epilogue.
    437 ChaCha20_4x:
    438 L$ChaCha20_4x:
    439 	movq	%rsp,%r9
    440 	movq	%r10,%r11	# keep a copy of both capability dwords
    441 	shrq	$32,%r10	# r10 = dword at offset 8
    442 	testq	$32,%r10	# bit 5 of that dword selects the 8x path
    443 	jnz	L$ChaCha20_8x
    444 	cmpq	$192,%rdx
    445 	ja	L$proceed4x
    446 
        # Length is 129..192 here.  Mask 71303168 keeps bits 22 and 26 of the
        # dword at offset 4; the compare matches when bit 22 is set and bit 26
        # is clear, in which case the single-block SSSE3 loop is used instead.
        # NOTE(review): presumably the MOVBE-without-XSAVE (Atom-class)
        # heuristic of the upstream Perl source; confirm there.
    447 	andq	$71303168,%r11
    448 	cmpq	$4194304,%r11
    449 	je	L$do_sse3_after_all
    450 
    451 L$proceed4x:
    452 	subq	$0x140+8,%rsp
    453 	movdqa	L$sigma(%rip),%xmm11
    454 	movdqu	(%rcx),%xmm15	# key bytes 0..15
    455 	movdqu	16(%rcx),%xmm7	# key bytes 16..31
    456 	movdqu	(%r8),%xmm3	# counter/nonce
    457 	leaq	256(%rsp),%rcx	# rcx no longer needed as the key pointer
    458 	leaq	L$rot16(%rip),%r10
    459 	leaq	L$rot24(%rip),%r11
    460 
        # Broadcast each input word across all four lanes and save the
        # broadcast copies for the feed-forward and for later iterations.
    461 	pshufd	$0x00,%xmm11,%xmm8
    462 	pshufd	$0x55,%xmm11,%xmm9
    463 	movdqa	%xmm8,64(%rsp)
    464 	pshufd	$0xaa,%xmm11,%xmm10
    465 	movdqa	%xmm9,80(%rsp)
    466 	pshufd	$0xff,%xmm11,%xmm11
    467 	movdqa	%xmm10,96(%rsp)
    468 	movdqa	%xmm11,112(%rsp)
    469 
    470 	pshufd	$0x00,%xmm15,%xmm12
    471 	pshufd	$0x55,%xmm15,%xmm13
    472 	movdqa	%xmm12,128-256(%rcx)
    473 	pshufd	$0xaa,%xmm15,%xmm14
    474 	movdqa	%xmm13,144-256(%rcx)
    475 	pshufd	$0xff,%xmm15,%xmm15
    476 	movdqa	%xmm14,160-256(%rcx)
    477 	movdqa	%xmm15,176-256(%rcx)
    478 
    479 	pshufd	$0x00,%xmm7,%xmm4
    480 	pshufd	$0x55,%xmm7,%xmm5
    481 	movdqa	%xmm4,192-256(%rcx)
    482 	pshufd	$0xaa,%xmm7,%xmm6
    483 	movdqa	%xmm5,208-256(%rcx)
    484 	pshufd	$0xff,%xmm7,%xmm7
    485 	movdqa	%xmm6,224-256(%rcx)
    486 	movdqa	%xmm7,240-256(%rcx)
    487 
    488 	pshufd	$0x00,%xmm3,%xmm0
    489 	pshufd	$0x55,%xmm3,%xmm1
    490 	paddd	L$inc(%rip),%xmm0	# lane i counts block i: counter + i
    491 	pshufd	$0xaa,%xmm3,%xmm2
    492 	movdqa	%xmm1,272-256(%rcx)
    493 	pshufd	$0xff,%xmm3,%xmm3
    494 	movdqa	%xmm2,288-256(%rcx)
    495 	movdqa	%xmm3,304-256(%rcx)
    496 
    497 	jmp	L$oop_enter4x
    498 
        # Later iterations: reload all sixteen broadcast words and advance the
        # four lane counters by four.
    499 .p2align	5
    500 L$oop_outer4x:
    501 	movdqa	64(%rsp),%xmm8
    502 	movdqa	80(%rsp),%xmm9
    503 	movdqa	96(%rsp),%xmm10
    504 	movdqa	112(%rsp),%xmm11
    505 	movdqa	128-256(%rcx),%xmm12
    506 	movdqa	144-256(%rcx),%xmm13
    507 	movdqa	160-256(%rcx),%xmm14
    508 	movdqa	176-256(%rcx),%xmm15
    509 	movdqa	192-256(%rcx),%xmm4
    510 	movdqa	208-256(%rcx),%xmm5
    511 	movdqa	224-256(%rcx),%xmm6
    512 	movdqa	240-256(%rcx),%xmm7
    513 	movdqa	256-256(%rcx),%xmm0
    514 	movdqa	272-256(%rcx),%xmm1
    515 	movdqa	288-256(%rcx),%xmm2
    516 	movdqa	304-256(%rcx),%xmm3
    517 	paddd	L$four(%rip),%xmm0
    518 
        # Spill words 10,11 so xmm6/xmm7 become scratch, preload the rot16
        # mask, and record the lane counters used by this iteration.
    519 L$oop_enter4x:
    520 	movdqa	%xmm6,32(%rsp)
    521 	movdqa	%xmm7,48(%rsp)
    522 	movdqa	(%r10),%xmm7
    523 	movl	$10,%eax	# ten double rounds
    524 	movdqa	%xmm0,256-256(%rcx)
    525 	jmp	L$oop4x
    526 
        # One pass is a double round, handled as four pairs of quarter-rounds.
        # Invariant at the top of every pair: xmm7 holds the rot16 mask; the
        # rot24 mask is reloaded into xmm6 in the middle of each pair.
    527 .p2align	5
    528 L$oop4x:
        # Columns (0,4,8,12) and (1,5,9,13).
    529 	paddd	%xmm12,%xmm8
    530 	paddd	%xmm13,%xmm9
    531 	pxor	%xmm8,%xmm0
    532 	pxor	%xmm9,%xmm1
    533 .byte	102,15,56,0,199	# encodes pshufb %xmm7,%xmm0
    534 .byte	102,15,56,0,207	# encodes pshufb %xmm7,%xmm1
    535 	paddd	%xmm0,%xmm4
    536 	paddd	%xmm1,%xmm5
    537 	pxor	%xmm4,%xmm12
    538 	pxor	%xmm5,%xmm13
    539 	movdqa	%xmm12,%xmm6
    540 	pslld	$12,%xmm12
    541 	psrld	$20,%xmm6
    542 	movdqa	%xmm13,%xmm7
    543 	pslld	$12,%xmm13
    544 	por	%xmm6,%xmm12
    545 	psrld	$20,%xmm7
    546 	movdqa	(%r11),%xmm6	# rot24 mask
    547 	por	%xmm7,%xmm13
    548 	paddd	%xmm12,%xmm8
    549 	paddd	%xmm13,%xmm9
    550 	pxor	%xmm8,%xmm0
    551 	pxor	%xmm9,%xmm1
    552 .byte	102,15,56,0,198	# encodes pshufb %xmm6,%xmm0
    553 .byte	102,15,56,0,206	# encodes pshufb %xmm6,%xmm1
    554 	paddd	%xmm0,%xmm4
    555 	paddd	%xmm1,%xmm5
    556 	pxor	%xmm4,%xmm12
    557 	pxor	%xmm5,%xmm13
    558 	movdqa	%xmm12,%xmm7
    559 	pslld	$7,%xmm12
    560 	psrld	$25,%xmm7
    561 	movdqa	%xmm13,%xmm6
    562 	pslld	$7,%xmm13
    563 	por	%xmm7,%xmm12
    564 	psrld	$25,%xmm6
    565 	movdqa	(%r10),%xmm7	# rot16 mask again
    566 	por	%xmm6,%xmm13
        # Swap the spilled pair: park words 8,9 and pick up words 10,11.
    567 	movdqa	%xmm4,0(%rsp)
    568 	movdqa	%xmm5,16(%rsp)
    569 	movdqa	32(%rsp),%xmm4
    570 	movdqa	48(%rsp),%xmm5
        # Columns (2,6,10,14) and (3,7,11,15).
    571 	paddd	%xmm14,%xmm10
    572 	paddd	%xmm15,%xmm11
    573 	pxor	%xmm10,%xmm2
    574 	pxor	%xmm11,%xmm3
    575 .byte	102,15,56,0,215	# encodes pshufb %xmm7,%xmm2
    576 .byte	102,15,56,0,223	# encodes pshufb %xmm7,%xmm3
    577 	paddd	%xmm2,%xmm4
    578 	paddd	%xmm3,%xmm5
    579 	pxor	%xmm4,%xmm14
    580 	pxor	%xmm5,%xmm15
    581 	movdqa	%xmm14,%xmm6
    582 	pslld	$12,%xmm14
    583 	psrld	$20,%xmm6
    584 	movdqa	%xmm15,%xmm7
    585 	pslld	$12,%xmm15
    586 	por	%xmm6,%xmm14
    587 	psrld	$20,%xmm7
    588 	movdqa	(%r11),%xmm6
    589 	por	%xmm7,%xmm15
    590 	paddd	%xmm14,%xmm10
    591 	paddd	%xmm15,%xmm11
    592 	pxor	%xmm10,%xmm2
    593 	pxor	%xmm11,%xmm3
    594 .byte	102,15,56,0,214	# encodes pshufb %xmm6,%xmm2
    595 .byte	102,15,56,0,222	# encodes pshufb %xmm6,%xmm3
    596 	paddd	%xmm2,%xmm4
    597 	paddd	%xmm3,%xmm5
    598 	pxor	%xmm4,%xmm14
    599 	pxor	%xmm5,%xmm15
    600 	movdqa	%xmm14,%xmm7
    601 	pslld	$7,%xmm14
    602 	psrld	$25,%xmm7
    603 	movdqa	%xmm15,%xmm6
    604 	pslld	$7,%xmm15
    605 	por	%xmm7,%xmm14
    606 	psrld	$25,%xmm6
    607 	movdqa	(%r10),%xmm7
    608 	por	%xmm6,%xmm15
        # Diagonals (0,5,10,15) and (1,6,11,12); xmm4,xmm5 still hold 10,11.
    609 	paddd	%xmm13,%xmm8
    610 	paddd	%xmm14,%xmm9
    611 	pxor	%xmm8,%xmm3
    612 	pxor	%xmm9,%xmm0
    613 .byte	102,15,56,0,223	# encodes pshufb %xmm7,%xmm3
    614 .byte	102,15,56,0,199	# encodes pshufb %xmm7,%xmm0
    615 	paddd	%xmm3,%xmm4
    616 	paddd	%xmm0,%xmm5
    617 	pxor	%xmm4,%xmm13
    618 	pxor	%xmm5,%xmm14
    619 	movdqa	%xmm13,%xmm6
    620 	pslld	$12,%xmm13
    621 	psrld	$20,%xmm6
    622 	movdqa	%xmm14,%xmm7
    623 	pslld	$12,%xmm14
    624 	por	%xmm6,%xmm13
    625 	psrld	$20,%xmm7
    626 	movdqa	(%r11),%xmm6
    627 	por	%xmm7,%xmm14
    628 	paddd	%xmm13,%xmm8
    629 	paddd	%xmm14,%xmm9
    630 	pxor	%xmm8,%xmm3
    631 	pxor	%xmm9,%xmm0
    632 .byte	102,15,56,0,222	# encodes pshufb %xmm6,%xmm3
    633 .byte	102,15,56,0,198	# encodes pshufb %xmm6,%xmm0
    634 	paddd	%xmm3,%xmm4
    635 	paddd	%xmm0,%xmm5
    636 	pxor	%xmm4,%xmm13
    637 	pxor	%xmm5,%xmm14
    638 	movdqa	%xmm13,%xmm7
    639 	pslld	$7,%xmm13
    640 	psrld	$25,%xmm7
    641 	movdqa	%xmm14,%xmm6
    642 	pslld	$7,%xmm14
    643 	por	%xmm7,%xmm13
    644 	psrld	$25,%xmm6
    645 	movdqa	(%r10),%xmm7
    646 	por	%xmm6,%xmm14
        # Swap back: park words 10,11 and pick up words 8,9.
    647 	movdqa	%xmm4,32(%rsp)
    648 	movdqa	%xmm5,48(%rsp)
    649 	movdqa	0(%rsp),%xmm4
    650 	movdqa	16(%rsp),%xmm5
        # Diagonals (2,7,8,13) and (3,4,9,14).
    651 	paddd	%xmm15,%xmm10
    652 	paddd	%xmm12,%xmm11
    653 	pxor	%xmm10,%xmm1
    654 	pxor	%xmm11,%xmm2
    655 .byte	102,15,56,0,207	# encodes pshufb %xmm7,%xmm1
    656 .byte	102,15,56,0,215	# encodes pshufb %xmm7,%xmm2
    657 	paddd	%xmm1,%xmm4
    658 	paddd	%xmm2,%xmm5
    659 	pxor	%xmm4,%xmm15
    660 	pxor	%xmm5,%xmm12
    661 	movdqa	%xmm15,%xmm6
    662 	pslld	$12,%xmm15
    663 	psrld	$20,%xmm6
    664 	movdqa	%xmm12,%xmm7
    665 	pslld	$12,%xmm12
    666 	por	%xmm6,%xmm15
    667 	psrld	$20,%xmm7
    668 	movdqa	(%r11),%xmm6
    669 	por	%xmm7,%xmm12
    670 	paddd	%xmm15,%xmm10
    671 	paddd	%xmm12,%xmm11
    672 	pxor	%xmm10,%xmm1
    673 	pxor	%xmm11,%xmm2
    674 .byte	102,15,56,0,206	# encodes pshufb %xmm6,%xmm1
    675 .byte	102,15,56,0,214	# encodes pshufb %xmm6,%xmm2
    676 	paddd	%xmm1,%xmm4
    677 	paddd	%xmm2,%xmm5
    678 	pxor	%xmm4,%xmm15
    679 	pxor	%xmm5,%xmm12
    680 	movdqa	%xmm15,%xmm7
    681 	pslld	$7,%xmm15
    682 	psrld	$25,%xmm7
    683 	movdqa	%xmm12,%xmm6
    684 	pslld	$7,%xmm12
    685 	por	%xmm7,%xmm15
    686 	psrld	$25,%xmm6
    687 	movdqa	(%r10),%xmm7
    688 	por	%xmm6,%xmm12
    689 	decl	%eax
    690 	jnz	L$oop4x
    691 
        # Feed-forward plus 4x4 dword transposition, one group of four words
        # at a time.  The transposition turns four registers holding one word
        # for four blocks into four registers holding four consecutive words
        # of one block each.
    692 	paddd	64(%rsp),%xmm8
    693 	paddd	80(%rsp),%xmm9
    694 	paddd	96(%rsp),%xmm10
    695 	paddd	112(%rsp),%xmm11
    696 
        # Words 0..3: result in xmm8, xmm9, xmm6, xmm11 for blocks 0..3.
    697 	movdqa	%xmm8,%xmm6
    698 	punpckldq	%xmm9,%xmm8
    699 	movdqa	%xmm10,%xmm7
    700 	punpckldq	%xmm11,%xmm10
    701 	punpckhdq	%xmm9,%xmm6
    702 	punpckhdq	%xmm11,%xmm7
    703 	movdqa	%xmm8,%xmm9
    704 	punpcklqdq	%xmm10,%xmm8
    705 	movdqa	%xmm6,%xmm11
    706 	punpcklqdq	%xmm7,%xmm6
    707 	punpckhqdq	%xmm10,%xmm9
    708 	punpckhqdq	%xmm7,%xmm11
    709 	paddd	128-256(%rcx),%xmm12
    710 	paddd	144-256(%rcx),%xmm13
    711 	paddd	160-256(%rcx),%xmm14
    712 	paddd	176-256(%rcx),%xmm15
    713 
        # Park words 0..3 of blocks 0 and 1, which frees xmm8/xmm9 to receive
        # the spilled words 10,11.
    714 	movdqa	%xmm8,0(%rsp)
    715 	movdqa	%xmm9,16(%rsp)
    716 	movdqa	32(%rsp),%xmm8
    717 	movdqa	48(%rsp),%xmm9
    718 
        # Words 4..7: result in xmm12, xmm13, xmm10, xmm15 for blocks 0..3.
    719 	movdqa	%xmm12,%xmm10
    720 	punpckldq	%xmm13,%xmm12
    721 	movdqa	%xmm14,%xmm7
    722 	punpckldq	%xmm15,%xmm14
    723 	punpckhdq	%xmm13,%xmm10
    724 	punpckhdq	%xmm15,%xmm7
    725 	movdqa	%xmm12,%xmm13
    726 	punpcklqdq	%xmm14,%xmm12
    727 	movdqa	%xmm10,%xmm15
    728 	punpcklqdq	%xmm7,%xmm10
    729 	punpckhqdq	%xmm14,%xmm13
    730 	punpckhqdq	%xmm7,%xmm15
    731 	paddd	192-256(%rcx),%xmm4
    732 	paddd	208-256(%rcx),%xmm5
    733 	paddd	224-256(%rcx),%xmm8
    734 	paddd	240-256(%rcx),%xmm9
    735 
        # Park words 0..3 of blocks 2 and 3 as well.
    736 	movdqa	%xmm6,32(%rsp)
    737 	movdqa	%xmm11,48(%rsp)
    738 
        # Words 8..11: result in xmm4, xmm5, xmm14, xmm9 for blocks 0..3.
    739 	movdqa	%xmm4,%xmm14
    740 	punpckldq	%xmm5,%xmm4
    741 	movdqa	%xmm8,%xmm7
    742 	punpckldq	%xmm9,%xmm8
    743 	punpckhdq	%xmm5,%xmm14
    744 	punpckhdq	%xmm9,%xmm7
    745 	movdqa	%xmm4,%xmm5
    746 	punpcklqdq	%xmm8,%xmm4
    747 	movdqa	%xmm14,%xmm9
    748 	punpcklqdq	%xmm7,%xmm14
    749 	punpckhqdq	%xmm8,%xmm5
    750 	punpckhqdq	%xmm7,%xmm9
    751 	paddd	256-256(%rcx),%xmm0
    752 	paddd	272-256(%rcx),%xmm1
    753 	paddd	288-256(%rcx),%xmm2
    754 	paddd	304-256(%rcx),%xmm3
    755 
        # Words 12..15: result in xmm0, xmm1, xmm8, xmm3 for blocks 0..3.
    756 	movdqa	%xmm0,%xmm8
    757 	punpckldq	%xmm1,%xmm0
    758 	movdqa	%xmm2,%xmm7
    759 	punpckldq	%xmm3,%xmm2
    760 	punpckhdq	%xmm1,%xmm8
    761 	punpckhdq	%xmm3,%xmm7
    762 	movdqa	%xmm0,%xmm1
    763 	punpcklqdq	%xmm2,%xmm0
    764 	movdqa	%xmm8,%xmm3
    765 	punpcklqdq	%xmm7,%xmm8
    766 	punpckhqdq	%xmm2,%xmm1
    767 	punpckhqdq	%xmm7,%xmm3
        # Keystream for block k is now: (16*k)(%rsp), then
        #   block 0: xmm12, xmm4,  xmm0     block 1: xmm13, xmm5, xmm1
        #   block 2: xmm10, xmm14, xmm8     block 3: xmm15, xmm9, xmm3
    768 	cmpq	$256,%rdx
    769 	jb	L$tail4x
    770 
        # Full 256 bytes: loads of the next input chunk are interleaved with
        # stores of the previous output chunk.
    771 	movdqu	0(%rsi),%xmm6
    772 	movdqu	16(%rsi),%xmm11
    773 	movdqu	32(%rsi),%xmm2
    774 	movdqu	48(%rsi),%xmm7
    775 	pxor	0(%rsp),%xmm6
    776 	pxor	%xmm12,%xmm11
    777 	pxor	%xmm4,%xmm2
    778 	pxor	%xmm0,%xmm7
    779 
    780 	movdqu	%xmm6,0(%rdi)
    781 	movdqu	64(%rsi),%xmm6
    782 	movdqu	%xmm11,16(%rdi)
    783 	movdqu	80(%rsi),%xmm11
    784 	movdqu	%xmm2,32(%rdi)
    785 	movdqu	96(%rsi),%xmm2
    786 	movdqu	%xmm7,48(%rdi)
    787 	movdqu	112(%rsi),%xmm7
    788 	leaq	128(%rsi),%rsi
    789 	pxor	16(%rsp),%xmm6
    790 	pxor	%xmm13,%xmm11
    791 	pxor	%xmm5,%xmm2
    792 	pxor	%xmm1,%xmm7
    793 
    794 	movdqu	%xmm6,64(%rdi)
    795 	movdqu	0(%rsi),%xmm6
    796 	movdqu	%xmm11,80(%rdi)
    797 	movdqu	16(%rsi),%xmm11
    798 	movdqu	%xmm2,96(%rdi)
    799 	movdqu	32(%rsi),%xmm2
    800 	movdqu	%xmm7,112(%rdi)
    801 	leaq	128(%rdi),%rdi
    802 	movdqu	48(%rsi),%xmm7
    803 	pxor	32(%rsp),%xmm6
    804 	pxor	%xmm10,%xmm11
    805 	pxor	%xmm14,%xmm2
    806 	pxor	%xmm8,%xmm7
    807 
    808 	movdqu	%xmm6,0(%rdi)
    809 	movdqu	64(%rsi),%xmm6
    810 	movdqu	%xmm11,16(%rdi)
    811 	movdqu	80(%rsi),%xmm11
    812 	movdqu	%xmm2,32(%rdi)
    813 	movdqu	96(%rsi),%xmm2
    814 	movdqu	%xmm7,48(%rdi)
    815 	movdqu	112(%rsi),%xmm7
    816 	leaq	128(%rsi),%rsi
    817 	pxor	48(%rsp),%xmm6
    818 	pxor	%xmm15,%xmm11
    819 	pxor	%xmm9,%xmm2
    820 	pxor	%xmm3,%xmm7
    821 	movdqu	%xmm6,64(%rdi)
    822 	movdqu	%xmm11,80(%rdi)
    823 	movdqu	%xmm2,96(%rdi)
    824 	movdqu	%xmm7,112(%rdi)
    825 	leaq	128(%rdi),%rdi
    826 
    827 	subq	$256,%rdx
    828 	jnz	L$oop_outer4x
    829 
    830 	jmp	L$done4x
    831 
        # Tail: 1..255 bytes remain.  Whole 64-byte blocks are XORed with
        # vector code; the first partial block is assembled at rsp+0..63 and
        # finished by the byte loop.  The flags of the cmpq that selected a
        # branch stay live through the SSE moves and XORs (which do not write
        # flags), so the je at the end of each branch tests for an exact
        # multiple of 64.
    832 L$tail4x:
    833 	cmpq	$192,%rdx
    834 	jae	L$192_or_more4x
    835 	cmpq	$128,%rdx
    836 	jae	L$128_or_more4x
    837 	cmpq	$64,%rdx
    838 	jae	L$64_or_more4x
    839 
    840 
        # Fewer than 64 bytes: complete block 0 on the stack (its words 0..3
        # are already at rsp+0).
    841 	xorq	%r10,%r10	# r10 = byte index for the tail loop
    842 
    843 	movdqa	%xmm12,16(%rsp)
    844 	movdqa	%xmm4,32(%rsp)
    845 	movdqa	%xmm0,48(%rsp)
    846 	jmp	L$oop_tail4x
    847 
    848 .p2align	5
    849 L$64_or_more4x:
    850 	movdqu	0(%rsi),%xmm6
    851 	movdqu	16(%rsi),%xmm11
    852 	movdqu	32(%rsi),%xmm2
    853 	movdqu	48(%rsi),%xmm7
    854 	pxor	0(%rsp),%xmm6
    855 	pxor	%xmm12,%xmm11
    856 	pxor	%xmm4,%xmm2
    857 	pxor	%xmm0,%xmm7
    858 	movdqu	%xmm6,0(%rdi)
    859 	movdqu	%xmm11,16(%rdi)
    860 	movdqu	%xmm2,32(%rdi)
    861 	movdqu	%xmm7,48(%rdi)
    862 	je	L$done4x	# exactly 64 bytes
    863 
        # Stage block 1 at rsp+0..63 for the byte loop.
    864 	movdqa	16(%rsp),%xmm6
    865 	leaq	64(%rsi),%rsi
    866 	xorq	%r10,%r10
    867 	movdqa	%xmm6,0(%rsp)
    868 	movdqa	%xmm13,16(%rsp)
    869 	leaq	64(%rdi),%rdi
    870 	movdqa	%xmm5,32(%rsp)
    871 	subq	$64,%rdx
    872 	movdqa	%xmm1,48(%rsp)
    873 	jmp	L$oop_tail4x
    874 
    875 .p2align	5
    876 L$128_or_more4x:
    877 	movdqu	0(%rsi),%xmm6
    878 	movdqu	16(%rsi),%xmm11
    879 	movdqu	32(%rsi),%xmm2
    880 	movdqu	48(%rsi),%xmm7
    881 	pxor	0(%rsp),%xmm6
    882 	pxor	%xmm12,%xmm11
    883 	pxor	%xmm4,%xmm2
    884 	pxor	%xmm0,%xmm7
    885 
    886 	movdqu	%xmm6,0(%rdi)
    887 	movdqu	64(%rsi),%xmm6
    888 	movdqu	%xmm11,16(%rdi)
    889 	movdqu	80(%rsi),%xmm11
    890 	movdqu	%xmm2,32(%rdi)
    891 	movdqu	96(%rsi),%xmm2
    892 	movdqu	%xmm7,48(%rdi)
    893 	movdqu	112(%rsi),%xmm7
    894 	pxor	16(%rsp),%xmm6
    895 	pxor	%xmm13,%xmm11
    896 	pxor	%xmm5,%xmm2
    897 	pxor	%xmm1,%xmm7
    898 	movdqu	%xmm6,64(%rdi)
    899 	movdqu	%xmm11,80(%rdi)
    900 	movdqu	%xmm2,96(%rdi)
    901 	movdqu	%xmm7,112(%rdi)
    902 	je	L$done4x	# exactly 128 bytes
    903 
        # Stage block 2 at rsp+0..63 for the byte loop.
    904 	movdqa	32(%rsp),%xmm6
    905 	leaq	128(%rsi),%rsi
    906 	xorq	%r10,%r10
    907 	movdqa	%xmm6,0(%rsp)
    908 	movdqa	%xmm10,16(%rsp)
    909 	leaq	128(%rdi),%rdi
    910 	movdqa	%xmm14,32(%rsp)
    911 	subq	$128,%rdx
    912 	movdqa	%xmm8,48(%rsp)
    913 	jmp	L$oop_tail4x
    914 
    915 .p2align	5
    916 L$192_or_more4x:
    917 	movdqu	0(%rsi),%xmm6
    918 	movdqu	16(%rsi),%xmm11
    919 	movdqu	32(%rsi),%xmm2
    920 	movdqu	48(%rsi),%xmm7
    921 	pxor	0(%rsp),%xmm6
    922 	pxor	%xmm12,%xmm11
    923 	pxor	%xmm4,%xmm2
    924 	pxor	%xmm0,%xmm7
    925 
    926 	movdqu	%xmm6,0(%rdi)
    927 	movdqu	64(%rsi),%xmm6
    928 	movdqu	%xmm11,16(%rdi)
    929 	movdqu	80(%rsi),%xmm11
    930 	movdqu	%xmm2,32(%rdi)
    931 	movdqu	96(%rsi),%xmm2
    932 	movdqu	%xmm7,48(%rdi)
    933 	movdqu	112(%rsi),%xmm7
    934 	leaq	128(%rsi),%rsi
    935 	pxor	16(%rsp),%xmm6
    936 	pxor	%xmm13,%xmm11
    937 	pxor	%xmm5,%xmm2
    938 	pxor	%xmm1,%xmm7
    939 
    940 	movdqu	%xmm6,64(%rdi)
    941 	movdqu	0(%rsi),%xmm6
    942 	movdqu	%xmm11,80(%rdi)
    943 	movdqu	16(%rsi),%xmm11
    944 	movdqu	%xmm2,96(%rdi)
    945 	movdqu	32(%rsi),%xmm2
    946 	movdqu	%xmm7,112(%rdi)
    947 	leaq	128(%rdi),%rdi
    948 	movdqu	48(%rsi),%xmm7
    949 	pxor	32(%rsp),%xmm6
    950 	pxor	%xmm10,%xmm11
    951 	pxor	%xmm14,%xmm2
    952 	pxor	%xmm8,%xmm7
    953 	movdqu	%xmm6,0(%rdi)
    954 	movdqu	%xmm11,16(%rdi)
    955 	movdqu	%xmm2,32(%rdi)
    956 	movdqu	%xmm7,48(%rdi)
    957 	je	L$done4x	# exactly 192 bytes
    958 
        # Stage block 3 at rsp+0..63.  Both pointers were already advanced by
        # 128 above, so 64 more brings them to offset 192.
    959 	movdqa	48(%rsp),%xmm6
    960 	leaq	64(%rsi),%rsi
    961 	xorq	%r10,%r10
    962 	movdqa	%xmm6,0(%rsp)
    963 	movdqa	%xmm15,16(%rsp)
    964 	leaq	64(%rdi),%rdi
    965 	movdqa	%xmm9,32(%rsp)
    966 	subq	$192,%rdx
    967 	movdqa	%xmm3,48(%rsp)
    968 
        # Byte loop over the final partial block: rdx = bytes left (1..63),
        # keystream at rsp+0.
    969 L$oop_tail4x:
    970 	movzbl	(%rsi,%r10,1),%eax	# input byte
    971 	movzbl	(%rsp,%r10,1),%ecx	# keystream byte
    972 	leaq	1(%r10),%r10
    973 	xorl	%ecx,%eax
    974 	movb	%al,-1(%rdi,%r10,1)
    975 	decq	%rdx
    976 	jnz	L$oop_tail4x
    977 
    978 L$done4x:
    979 	leaq	(%r9),%rsp	# drop the frame: r9 = rsp at entry
    980 L$4x_epilogue:
    981 	.byte	0xf3,0xc3	# encodes rep ret
    982 
    983 
    984 .p2align	5
    985 ChaCha20_8x:
    986 L$ChaCha20_8x:
    987 	movq	%rsp,%r9
    988 	subq	$0x280+8,%rsp
    989 	andq	$-32,%rsp
    990 	vzeroupper
    991 
    992 
    993 
    994 
    995 
    996 
    997 
    998 
    999 
   1000 
   1001 	vbroadcasti128	L$sigma(%rip),%ymm11
   1002 	vbroadcasti128	(%rcx),%ymm3
   1003 	vbroadcasti128	16(%rcx),%ymm15
   1004 	vbroadcasti128	(%r8),%ymm7
   1005 	leaq	256(%rsp),%rcx
   1006 	leaq	512(%rsp),%rax
   1007 	leaq	L$rot16(%rip),%r10
   1008 	leaq	L$rot24(%rip),%r11
   1009 
   1010 	vpshufd	$0x00,%ymm11,%ymm8
   1011 	vpshufd	$0x55,%ymm11,%ymm9
   1012 	vmovdqa	%ymm8,128-256(%rcx)
   1013 	vpshufd	$0xaa,%ymm11,%ymm10
   1014 	vmovdqa	%ymm9,160-256(%rcx)
   1015 	vpshufd	$0xff,%ymm11,%ymm11
   1016 	vmovdqa	%ymm10,192-256(%rcx)
   1017 	vmovdqa	%ymm11,224-256(%rcx)
   1018 
   1019 	vpshufd	$0x00,%ymm3,%ymm0
   1020 	vpshufd	$0x55,%ymm3,%ymm1
   1021 	vmovdqa	%ymm0,256-256(%rcx)
   1022 	vpshufd	$0xaa,%ymm3,%ymm2
   1023 	vmovdqa	%ymm1,288-256(%rcx)
   1024 	vpshufd	$0xff,%ymm3,%ymm3
   1025 	vmovdqa	%ymm2,320-256(%rcx)
   1026 	vmovdqa	%ymm3,352-256(%rcx)
   1027 
   1028 	vpshufd	$0x00,%ymm15,%ymm12
   1029 	vpshufd	$0x55,%ymm15,%ymm13
   1030 	vmovdqa	%ymm12,384-512(%rax)
   1031 	vpshufd	$0xaa,%ymm15,%ymm14
   1032 	vmovdqa	%ymm13,416-512(%rax)
   1033 	vpshufd	$0xff,%ymm15,%ymm15
   1034 	vmovdqa	%ymm14,448-512(%rax)
   1035 	vmovdqa	%ymm15,480-512(%rax)
   1036 
   1037 	vpshufd	$0x00,%ymm7,%ymm4
   1038 	vpshufd	$0x55,%ymm7,%ymm5
   1039 	vpaddd	L$incy(%rip),%ymm4,%ymm4
   1040 	vpshufd	$0xaa,%ymm7,%ymm6
   1041 	vmovdqa	%ymm5,544-512(%rax)
   1042 	vpshufd	$0xff,%ymm7,%ymm7
   1043 	vmovdqa	%ymm6,576-512(%rax)
   1044 	vmovdqa	%ymm7,608-512(%rax)
   1045 
   1046 	jmp	L$oop_enter8x
   1047 
   1048 .p2align	5
   1049 L$oop_outer8x:
   1050 	vmovdqa	128-256(%rcx),%ymm8
   1051 	vmovdqa	160-256(%rcx),%ymm9
   1052 	vmovdqa	192-256(%rcx),%ymm10
   1053 	vmovdqa	224-256(%rcx),%ymm11
   1054 	vmovdqa	256-256(%rcx),%ymm0
   1055 	vmovdqa	288-256(%rcx),%ymm1
   1056 	vmovdqa	320-256(%rcx),%ymm2
   1057 	vmovdqa	352-256(%rcx),%ymm3
   1058 	vmovdqa	384-512(%rax),%ymm12
   1059 	vmovdqa	416-512(%rax),%ymm13
   1060 	vmovdqa	448-512(%rax),%ymm14
   1061 	vmovdqa	480-512(%rax),%ymm15
   1062 	vmovdqa	512-512(%rax),%ymm4
   1063 	vmovdqa	544-512(%rax),%ymm5
   1064 	vmovdqa	576-512(%rax),%ymm6
   1065 	vmovdqa	608-512(%rax),%ymm7
   1066 	vpaddd	L$eight(%rip),%ymm4,%ymm4
   1067 
   1068 L$oop_enter8x:
   1069 	vmovdqa	%ymm14,64(%rsp)
   1070 	vmovdqa	%ymm15,96(%rsp)
   1071 	vbroadcasti128	(%r10),%ymm15
   1072 	vmovdqa	%ymm4,512-512(%rax)
   1073 	movl	$10,%eax
   1074 	jmp	L$oop8x
   1075 
   1076 .p2align	5
   1077 L$oop8x:
   1078 	vpaddd	%ymm0,%ymm8,%ymm8
   1079 	vpxor	%ymm4,%ymm8,%ymm4
   1080 	vpshufb	%ymm15,%ymm4,%ymm4
   1081 	vpaddd	%ymm1,%ymm9,%ymm9
   1082 	vpxor	%ymm5,%ymm9,%ymm5
   1083 	vpshufb	%ymm15,%ymm5,%ymm5
   1084 	vpaddd	%ymm4,%ymm12,%ymm12
   1085 	vpxor	%ymm0,%ymm12,%ymm0
   1086 	vpslld	$12,%ymm0,%ymm14
   1087 	vpsrld	$20,%ymm0,%ymm0
   1088 	vpor	%ymm0,%ymm14,%ymm0
   1089 	vbroadcasti128	(%r11),%ymm14
   1090 	vpaddd	%ymm5,%ymm13,%ymm13
   1091 	vpxor	%ymm1,%ymm13,%ymm1
   1092 	vpslld	$12,%ymm1,%ymm15
   1093 	vpsrld	$20,%ymm1,%ymm1
   1094 	vpor	%ymm1,%ymm15,%ymm1
   1095 	vpaddd	%ymm0,%ymm8,%ymm8
   1096 	vpxor	%ymm4,%ymm8,%ymm4
   1097 	vpshufb	%ymm14,%ymm4,%ymm4
   1098 	vpaddd	%ymm1,%ymm9,%ymm9
   1099 	vpxor	%ymm5,%ymm9,%ymm5
   1100 	vpshufb	%ymm14,%ymm5,%ymm5
   1101 	vpaddd	%ymm4,%ymm12,%ymm12
   1102 	vpxor	%ymm0,%ymm12,%ymm0
   1103 	vpslld	$7,%ymm0,%ymm15
   1104 	vpsrld	$25,%ymm0,%ymm0
   1105 	vpor	%ymm0,%ymm15,%ymm0
   1106 	vbroadcasti128	(%r10),%ymm15
   1107 	vpaddd	%ymm5,%ymm13,%ymm13
   1108 	vpxor	%ymm1,%ymm13,%ymm1
   1109 	vpslld	$7,%ymm1,%ymm14
   1110 	vpsrld	$25,%ymm1,%ymm1
   1111 	vpor	%ymm1,%ymm14,%ymm1
   1112 	vmovdqa	%ymm12,0(%rsp)
   1113 	vmovdqa	%ymm13,32(%rsp)
   1114 	vmovdqa	64(%rsp),%ymm12
   1115 	vmovdqa	96(%rsp),%ymm13
   1116 	vpaddd	%ymm2,%ymm10,%ymm10
   1117 	vpxor	%ymm6,%ymm10,%ymm6
   1118 	vpshufb	%ymm15,%ymm6,%ymm6
   1119 	vpaddd	%ymm3,%ymm11,%ymm11
   1120 	vpxor	%ymm7,%ymm11,%ymm7
   1121 	vpshufb	%ymm15,%ymm7,%ymm7
   1122 	vpaddd	%ymm6,%ymm12,%ymm12
   1123 	vpxor	%ymm2,%ymm12,%ymm2
   1124 	vpslld	$12,%ymm2,%ymm14
   1125 	vpsrld	$20,%ymm2,%ymm2
   1126 	vpor	%ymm2,%ymm14,%ymm2
   1127 	vbroadcasti128	(%r11),%ymm14
   1128 	vpaddd	%ymm7,%ymm13,%ymm13
   1129 	vpxor	%ymm3,%ymm13,%ymm3
   1130 	vpslld	$12,%ymm3,%ymm15
   1131 	vpsrld	$20,%ymm3,%ymm3
   1132 	vpor	%ymm3,%ymm15,%ymm3
   1133 	vpaddd	%ymm2,%ymm10,%ymm10
   1134 	vpxor	%ymm6,%ymm10,%ymm6
   1135 	vpshufb	%ymm14,%ymm6,%ymm6
   1136 	vpaddd	%ymm3,%ymm11,%ymm11
   1137 	vpxor	%ymm7,%ymm11,%ymm7
   1138 	vpshufb	%ymm14,%ymm7,%ymm7
   1139 	vpaddd	%ymm6,%ymm12,%ymm12
   1140 	vpxor	%ymm2,%ymm12,%ymm2
   1141 	vpslld	$7,%ymm2,%ymm15
   1142 	vpsrld	$25,%ymm2,%ymm2
   1143 	vpor	%ymm2,%ymm15,%ymm2
   1144 	vbroadcasti128	(%r10),%ymm15
   1145 	vpaddd	%ymm7,%ymm13,%ymm13
   1146 	vpxor	%ymm3,%ymm13,%ymm3
   1147 	vpslld	$7,%ymm3,%ymm14
   1148 	vpsrld	$25,%ymm3,%ymm3
   1149 	vpor	%ymm3,%ymm14,%ymm3
   1150 	vpaddd	%ymm1,%ymm8,%ymm8
   1151 	vpxor	%ymm7,%ymm8,%ymm7
   1152 	vpshufb	%ymm15,%ymm7,%ymm7
   1153 	vpaddd	%ymm2,%ymm9,%ymm9
   1154 	vpxor	%ymm4,%ymm9,%ymm4
   1155 	vpshufb	%ymm15,%ymm4,%ymm4
   1156 	vpaddd	%ymm7,%ymm12,%ymm12
   1157 	vpxor	%ymm1,%ymm12,%ymm1
   1158 	vpslld	$12,%ymm1,%ymm14
   1159 	vpsrld	$20,%ymm1,%ymm1
   1160 	vpor	%ymm1,%ymm14,%ymm1
   1161 	vbroadcasti128	(%r11),%ymm14
   1162 	vpaddd	%ymm4,%ymm13,%ymm13
   1163 	vpxor	%ymm2,%ymm13,%ymm2
   1164 	vpslld	$12,%ymm2,%ymm15
   1165 	vpsrld	$20,%ymm2,%ymm2
   1166 	vpor	%ymm2,%ymm15,%ymm2
   1167 	vpaddd	%ymm1,%ymm8,%ymm8
   1168 	vpxor	%ymm7,%ymm8,%ymm7
   1169 	vpshufb	%ymm14,%ymm7,%ymm7
   1170 	vpaddd	%ymm2,%ymm9,%ymm9
   1171 	vpxor	%ymm4,%ymm9,%ymm4
   1172 	vpshufb	%ymm14,%ymm4,%ymm4
   1173 	vpaddd	%ymm7,%ymm12,%ymm12
   1174 	vpxor	%ymm1,%ymm12,%ymm1
   1175 	vpslld	$7,%ymm1,%ymm15
   1176 	vpsrld	$25,%ymm1,%ymm1
   1177 	vpor	%ymm1,%ymm15,%ymm1
   1178 	vbroadcasti128	(%r10),%ymm15
   1179 	vpaddd	%ymm4,%ymm13,%ymm13
   1180 	vpxor	%ymm2,%ymm13,%ymm2
   1181 	vpslld	$7,%ymm2,%ymm14
   1182 	vpsrld	$25,%ymm2,%ymm2
   1183 	vpor	%ymm2,%ymm14,%ymm2
   1184 	vmovdqa	%ymm12,64(%rsp)
   1185 	vmovdqa	%ymm13,96(%rsp)
   1186 	vmovdqa	0(%rsp),%ymm12
   1187 	vmovdqa	32(%rsp),%ymm13
   1188 	vpaddd	%ymm3,%ymm10,%ymm10
   1189 	vpxor	%ymm5,%ymm10,%ymm5
   1190 	vpshufb	%ymm15,%ymm5,%ymm5
   1191 	vpaddd	%ymm0,%ymm11,%ymm11
   1192 	vpxor	%ymm6,%ymm11,%ymm6
   1193 	vpshufb	%ymm15,%ymm6,%ymm6
   1194 	vpaddd	%ymm5,%ymm12,%ymm12
   1195 	vpxor	%ymm3,%ymm12,%ymm3
   1196 	vpslld	$12,%ymm3,%ymm14
   1197 	vpsrld	$20,%ymm3,%ymm3
   1198 	vpor	%ymm3,%ymm14,%ymm3
   1199 	vbroadcasti128	(%r11),%ymm14
   1200 	vpaddd	%ymm6,%ymm13,%ymm13
   1201 	vpxor	%ymm0,%ymm13,%ymm0
   1202 	vpslld	$12,%ymm0,%ymm15
   1203 	vpsrld	$20,%ymm0,%ymm0
   1204 	vpor	%ymm0,%ymm15,%ymm0
   1205 	vpaddd	%ymm3,%ymm10,%ymm10
   1206 	vpxor	%ymm5,%ymm10,%ymm5
   1207 	vpshufb	%ymm14,%ymm5,%ymm5
   1208 	vpaddd	%ymm0,%ymm11,%ymm11
   1209 	vpxor	%ymm6,%ymm11,%ymm6
   1210 	vpshufb	%ymm14,%ymm6,%ymm6
   1211 	vpaddd	%ymm5,%ymm12,%ymm12
   1212 	vpxor	%ymm3,%ymm12,%ymm3
   1213 	vpslld	$7,%ymm3,%ymm15
   1214 	vpsrld	$25,%ymm3,%ymm3
   1215 	vpor	%ymm3,%ymm15,%ymm3
   1216 	vbroadcasti128	(%r10),%ymm15
   1217 	vpaddd	%ymm6,%ymm13,%ymm13
   1218 	vpxor	%ymm0,%ymm13,%ymm0
   1219 	vpslld	$7,%ymm0,%ymm14
   1220 	vpsrld	$25,%ymm0,%ymm0
   1221 	vpor	%ymm0,%ymm14,%ymm0
   1222 	decl	%eax
   1223 	jnz	L$oop8x
   1224 
   1225 	leaq	512(%rsp),%rax	/* %rax = %rsp+512: base for the N-512(%rax) operands used below */
        /* Post-round output stage, entered once the L$oop8x counter in %eax reaches zero. */
        /* Every working register gets a stored value added from memory, the results are regrouped */
        /* with dword transposes plus 128-bit lane shuffles, and then either a full 512 bytes are */
        /* XORed from (%rsi) into (%rdi) or a shorter remainder is handed to L$tail8x. */
        /* NOTE(review): the memory addends are presumably the saved ChaCha20 input state; they are */
        /* written outside this view - confirm there. */
   1226 	vpaddd	128-256(%rcx),%ymm8,%ymm8
   1227 	vpaddd	160-256(%rcx),%ymm9,%ymm9
   1228 	vpaddd	192-256(%rcx),%ymm10,%ymm10
   1229 	vpaddd	224-256(%rcx),%ymm11,%ymm11
   1230 
        /* 4x4 dword transpose of ymm8..ymm11 inside each 128-bit lane. */
        /* Transposed rows 0..3 end up in ymm9, ymm14, ymm11, ymm8. */
   1231 	vpunpckldq	%ymm9,%ymm8,%ymm14
   1232 	vpunpckldq	%ymm11,%ymm10,%ymm15
   1233 	vpunpckhdq	%ymm9,%ymm8,%ymm8
   1234 	vpunpckhdq	%ymm11,%ymm10,%ymm10
   1235 	vpunpcklqdq	%ymm15,%ymm14,%ymm9
   1236 	vpunpckhqdq	%ymm15,%ymm14,%ymm14
   1237 	vpunpcklqdq	%ymm10,%ymm8,%ymm11
   1238 	vpunpckhqdq	%ymm10,%ymm8,%ymm8
   1239 	vpaddd	256-256(%rcx),%ymm0,%ymm0
   1240 	vpaddd	288-256(%rcx),%ymm1,%ymm1
   1241 	vpaddd	320-256(%rcx),%ymm2,%ymm2
   1242 	vpaddd	352-256(%rcx),%ymm3,%ymm3
   1243 
        /* Same transpose for ymm0..ymm3; rows 0..3 end up in ymm1, ymm10, ymm3, ymm0. */
   1244 	vpunpckldq	%ymm1,%ymm0,%ymm10
   1245 	vpunpckldq	%ymm3,%ymm2,%ymm15
   1246 	vpunpckhdq	%ymm1,%ymm0,%ymm0
   1247 	vpunpckhdq	%ymm3,%ymm2,%ymm2
   1248 	vpunpcklqdq	%ymm15,%ymm10,%ymm1
   1249 	vpunpckhqdq	%ymm15,%ymm10,%ymm10
   1250 	vpunpcklqdq	%ymm2,%ymm0,%ymm3
   1251 	vpunpckhqdq	%ymm2,%ymm0,%ymm0
        /* Lane shuffle between the two transposed groups: $0x20 joins the low 128-bit lanes */
        /* of both sources, $0x31 joins their high lanes. */
   1252 	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
   1253 	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
   1254 	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
   1255 	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
   1256 	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
   1257 	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
   1258 	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
   1259 	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
        /* Park ymm15 and ymm9 at 0/32(%rsp) so those registers can take the two values */
        /* that L$oop8x left at 64/96(%rsp). */
   1260 	vmovdqa	%ymm15,0(%rsp)
   1261 	vmovdqa	%ymm9,32(%rsp)
   1262 	vmovdqa	64(%rsp),%ymm15
   1263 	vmovdqa	96(%rsp),%ymm9
   1264 
   1265 	vpaddd	384-512(%rax),%ymm12,%ymm12
   1266 	vpaddd	416-512(%rax),%ymm13,%ymm13
   1267 	vpaddd	448-512(%rax),%ymm15,%ymm15
   1268 	vpaddd	480-512(%rax),%ymm9,%ymm9
   1269 
        /* Transpose of ymm12, ymm13, ymm15, ymm9; rows 0..3 end up in ymm13, ymm2, ymm9, ymm12. */
   1270 	vpunpckldq	%ymm13,%ymm12,%ymm2
   1271 	vpunpckldq	%ymm9,%ymm15,%ymm8
   1272 	vpunpckhdq	%ymm13,%ymm12,%ymm12
   1273 	vpunpckhdq	%ymm9,%ymm15,%ymm15
   1274 	vpunpcklqdq	%ymm8,%ymm2,%ymm13
   1275 	vpunpckhqdq	%ymm8,%ymm2,%ymm2
   1276 	vpunpcklqdq	%ymm15,%ymm12,%ymm9
   1277 	vpunpckhqdq	%ymm15,%ymm12,%ymm12
   1278 	vpaddd	512-512(%rax),%ymm4,%ymm4
   1279 	vpaddd	544-512(%rax),%ymm5,%ymm5
   1280 	vpaddd	576-512(%rax),%ymm6,%ymm6
   1281 	vpaddd	608-512(%rax),%ymm7,%ymm7
   1282 
        /* Transpose of ymm4..ymm7; rows 0..3 end up in ymm5, ymm15, ymm7, ymm4. */
   1283 	vpunpckldq	%ymm5,%ymm4,%ymm15
   1284 	vpunpckldq	%ymm7,%ymm6,%ymm8
   1285 	vpunpckhdq	%ymm5,%ymm4,%ymm4
   1286 	vpunpckhdq	%ymm7,%ymm6,%ymm6
   1287 	vpunpcklqdq	%ymm8,%ymm15,%ymm5
   1288 	vpunpckhqdq	%ymm8,%ymm15,%ymm15
   1289 	vpunpcklqdq	%ymm6,%ymm4,%ymm7
   1290 	vpunpckhqdq	%ymm6,%ymm4,%ymm4
        /* Second lane shuffle, same $0x20 (low lanes) / $0x31 (high lanes) pairing as above. */
   1291 	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
   1292 	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
   1293 	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
   1294 	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
   1295 	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
   1296 	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
   1297 	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
   1298 	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
        /* Bring back the two values parked at 0/32(%rsp); they are now ymm6 and ymm12. */
   1299 	vmovdqa	0(%rsp),%ymm6
   1300 	vmovdqa	32(%rsp),%ymm12
   1301 
        /* Unsigned test of the count in %rdx: below 512 means a partial batch, handled at L$tail8x. */
   1302 	cmpq	$512,%rdx
   1303 	jb	L$tail8x
   1304 
        /* Full batch: XOR 512 bytes at (%rsi) with the sixteen registers, in the order */
        /* ymm6,8,1,5 / ymm12,13,10,15 / ymm14,2,3,7 / ymm11,9,0,4, and store to (%rdi). */
        /* Both pointers advance by 128 after each group of four registers. */
   1305 	vpxor	0(%rsi),%ymm6,%ymm6
   1306 	vpxor	32(%rsi),%ymm8,%ymm8
   1307 	vpxor	64(%rsi),%ymm1,%ymm1
   1308 	vpxor	96(%rsi),%ymm5,%ymm5
   1309 	leaq	128(%rsi),%rsi
   1310 	vmovdqu	%ymm6,0(%rdi)
   1311 	vmovdqu	%ymm8,32(%rdi)
   1312 	vmovdqu	%ymm1,64(%rdi)
   1313 	vmovdqu	%ymm5,96(%rdi)
   1314 	leaq	128(%rdi),%rdi
   1315 
   1316 	vpxor	0(%rsi),%ymm12,%ymm12
   1317 	vpxor	32(%rsi),%ymm13,%ymm13
   1318 	vpxor	64(%rsi),%ymm10,%ymm10
   1319 	vpxor	96(%rsi),%ymm15,%ymm15
   1320 	leaq	128(%rsi),%rsi
   1321 	vmovdqu	%ymm12,0(%rdi)
   1322 	vmovdqu	%ymm13,32(%rdi)
   1323 	vmovdqu	%ymm10,64(%rdi)
   1324 	vmovdqu	%ymm15,96(%rdi)
   1325 	leaq	128(%rdi),%rdi
   1326 
   1327 	vpxor	0(%rsi),%ymm14,%ymm14
   1328 	vpxor	32(%rsi),%ymm2,%ymm2
   1329 	vpxor	64(%rsi),%ymm3,%ymm3
   1330 	vpxor	96(%rsi),%ymm7,%ymm7
   1331 	leaq	128(%rsi),%rsi
   1332 	vmovdqu	%ymm14,0(%rdi)
   1333 	vmovdqu	%ymm2,32(%rdi)
   1334 	vmovdqu	%ymm3,64(%rdi)
   1335 	vmovdqu	%ymm7,96(%rdi)
   1336 	leaq	128(%rdi),%rdi
   1337 
   1338 	vpxor	0(%rsi),%ymm11,%ymm11
   1339 	vpxor	32(%rsi),%ymm9,%ymm9
   1340 	vpxor	64(%rsi),%ymm0,%ymm0
   1341 	vpxor	96(%rsi),%ymm4,%ymm4
   1342 	leaq	128(%rsi),%rsi
   1343 	vmovdqu	%ymm11,0(%rdi)
   1344 	vmovdqu	%ymm9,32(%rdi)
   1345 	vmovdqu	%ymm0,64(%rdi)
   1346 	vmovdqu	%ymm4,96(%rdi)
   1347 	leaq	128(%rdi),%rdi
   1348 
        /* Account for the 512 bytes just written; a nonzero count starts another batch. */
   1349 	subq	$512,%rdx
   1350 	jnz	L$oop_outer8x
   1351 
   1352 	jmp	L$done8x
   1353 
   1354 L$tail8x:
        /* Partial-batch dispatch. The count in %rdx is compared, unsigned, against each multiple */
        /* of 64 from 448 down to 64, and control goes to the handler for the largest one it reaches. */
        /* The flags of the matching cmpq stay live on purpose: each handler finishes its XOR/store */
        /* run with a `je L$done8x` that tests them. */
   1355 	cmpq	$448,%rdx
   1356 	jae	L$448_or_more8x
   1357 	cmpq	$384,%rdx
   1358 	jae	L$384_or_more8x
   1359 	cmpq	$320,%rdx
   1360 	jae	L$320_or_more8x
   1361 	cmpq	$256,%rdx
   1362 	jae	L$256_or_more8x
   1363 	cmpq	$192,%rdx
   1364 	jae	L$192_or_more8x
   1365 	cmpq	$128,%rdx
   1366 	jae	L$128_or_more8x
   1367 	cmpq	$64,%rdx
   1368 	jae	L$64_or_more8x
   1369 
        /* Fewer than 64 bytes: stash ymm6:ymm8 at 0/32(%rsp) and zero the %r10 index so that */
        /* L$oop_tail8x can XOR the remainder one byte at a time. */
   1370 	xorq	%r10,%r10
   1371 	vmovdqa	%ymm6,0(%rsp)
   1372 	vmovdqa	%ymm8,32(%rsp)
   1373 	jmp	L$oop_tail8x
   1374 
   1375 .p2align	5
   1376 L$64_or_more8x:
        /* Reached from L$tail8x with 64..127 bytes left: XOR the first 64 bytes at (%rsi) */
        /* with ymm6:ymm8 and store them to (%rdi). */
   1377 	vpxor	0(%rsi),%ymm6,%ymm6
   1378 	vpxor	32(%rsi),%ymm8,%ymm8
   1379 	vmovdqu	%ymm6,0(%rdi)
   1380 	vmovdqu	%ymm8,32(%rdi)
        /* ZF still comes from `cmpq $64,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 64 bytes were left. */
   1381 	je	L$done8x
   1382 
        /* Otherwise step both pointers and the count past those 64 bytes, and stash the next */
        /* 64 bytes of XOR data (ymm1:ymm5) at 0/32(%rsp) for the byte loop; %r10 = 0 is its index. */
   1383 	leaq	64(%rsi),%rsi
   1384 	xorq	%r10,%r10
   1385 	vmovdqa	%ymm1,0(%rsp)
   1386 	leaq	64(%rdi),%rdi
   1387 	subq	$64,%rdx
   1388 	vmovdqa	%ymm5,32(%rsp)
   1389 	jmp	L$oop_tail8x
   1390 
   1391 .p2align	5
   1392 L$128_or_more8x:
        /* Reached from L$tail8x with 128..191 bytes left: XOR the first 128 bytes at (%rsi) */
        /* with ymm6, ymm8, ymm1, ymm5 and store them to (%rdi). */
   1393 	vpxor	0(%rsi),%ymm6,%ymm6
   1394 	vpxor	32(%rsi),%ymm8,%ymm8
   1395 	vpxor	64(%rsi),%ymm1,%ymm1
   1396 	vpxor	96(%rsi),%ymm5,%ymm5
   1397 	vmovdqu	%ymm6,0(%rdi)
   1398 	vmovdqu	%ymm8,32(%rdi)
   1399 	vmovdqu	%ymm1,64(%rdi)
   1400 	vmovdqu	%ymm5,96(%rdi)
        /* ZF still comes from `cmpq $128,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 128 bytes were left. */
   1401 	je	L$done8x
   1402 
        /* Otherwise skip the 128 bytes just handled and stash the next 64 bytes of XOR data */
        /* (ymm12:ymm13) at 0/32(%rsp) for the byte loop; %r10 = 0 is its index. */
   1403 	leaq	128(%rsi),%rsi
   1404 	xorq	%r10,%r10
   1405 	vmovdqa	%ymm12,0(%rsp)
   1406 	leaq	128(%rdi),%rdi
   1407 	subq	$128,%rdx
   1408 	vmovdqa	%ymm13,32(%rsp)
   1409 	jmp	L$oop_tail8x
   1410 
   1411 .p2align	5
   1412 L$192_or_more8x:
        /* Reached from L$tail8x with 192..255 bytes left: XOR the first 192 bytes at (%rsi) */
        /* with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13 and store them to (%rdi). */
   1413 	vpxor	0(%rsi),%ymm6,%ymm6
   1414 	vpxor	32(%rsi),%ymm8,%ymm8
   1415 	vpxor	64(%rsi),%ymm1,%ymm1
   1416 	vpxor	96(%rsi),%ymm5,%ymm5
   1417 	vpxor	128(%rsi),%ymm12,%ymm12
   1418 	vpxor	160(%rsi),%ymm13,%ymm13
   1419 	vmovdqu	%ymm6,0(%rdi)
   1420 	vmovdqu	%ymm8,32(%rdi)
   1421 	vmovdqu	%ymm1,64(%rdi)
   1422 	vmovdqu	%ymm5,96(%rdi)
   1423 	vmovdqu	%ymm12,128(%rdi)
   1424 	vmovdqu	%ymm13,160(%rdi)
        /* ZF still comes from `cmpq $192,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 192 bytes were left. */
   1425 	je	L$done8x
   1426 
        /* Otherwise skip the 192 bytes just handled and stash the next 64 bytes of XOR data */
        /* (ymm10:ymm15) at 0/32(%rsp) for the byte loop; %r10 = 0 is its index. */
   1427 	leaq	192(%rsi),%rsi
   1428 	xorq	%r10,%r10
   1429 	vmovdqa	%ymm10,0(%rsp)
   1430 	leaq	192(%rdi),%rdi
   1431 	subq	$192,%rdx
   1432 	vmovdqa	%ymm15,32(%rsp)
   1433 	jmp	L$oop_tail8x
   1434 
   1435 .p2align	5
   1436 L$256_or_more8x:
        /* Reached from L$tail8x with 256..319 bytes left: XOR the first 256 bytes at (%rsi) */
        /* with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10, ymm15 and store them to (%rdi). */
   1437 	vpxor	0(%rsi),%ymm6,%ymm6
   1438 	vpxor	32(%rsi),%ymm8,%ymm8
   1439 	vpxor	64(%rsi),%ymm1,%ymm1
   1440 	vpxor	96(%rsi),%ymm5,%ymm5
   1441 	vpxor	128(%rsi),%ymm12,%ymm12
   1442 	vpxor	160(%rsi),%ymm13,%ymm13
   1443 	vpxor	192(%rsi),%ymm10,%ymm10
   1444 	vpxor	224(%rsi),%ymm15,%ymm15
   1445 	vmovdqu	%ymm6,0(%rdi)
   1446 	vmovdqu	%ymm8,32(%rdi)
   1447 	vmovdqu	%ymm1,64(%rdi)
   1448 	vmovdqu	%ymm5,96(%rdi)
   1449 	vmovdqu	%ymm12,128(%rdi)
   1450 	vmovdqu	%ymm13,160(%rdi)
   1451 	vmovdqu	%ymm10,192(%rdi)
   1452 	vmovdqu	%ymm15,224(%rdi)
        /* ZF still comes from `cmpq $256,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 256 bytes were left. */
   1453 	je	L$done8x
   1454 
        /* Otherwise skip the 256 bytes just handled and stash the next 64 bytes of XOR data */
        /* (ymm14:ymm2) at 0/32(%rsp) for the byte loop; %r10 = 0 is its index. */
   1455 	leaq	256(%rsi),%rsi
   1456 	xorq	%r10,%r10
   1457 	vmovdqa	%ymm14,0(%rsp)
   1458 	leaq	256(%rdi),%rdi
   1459 	subq	$256,%rdx
   1460 	vmovdqa	%ymm2,32(%rsp)
   1461 	jmp	L$oop_tail8x
   1462 
   1463 .p2align	5
   1464 L$320_or_more8x:
        /* Reached from L$tail8x with 320..383 bytes left: XOR the first 320 bytes at (%rsi) */
        /* with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10, ymm15, ymm14, ymm2 and store to (%rdi). */
   1465 	vpxor	0(%rsi),%ymm6,%ymm6
   1466 	vpxor	32(%rsi),%ymm8,%ymm8
   1467 	vpxor	64(%rsi),%ymm1,%ymm1
   1468 	vpxor	96(%rsi),%ymm5,%ymm5
   1469 	vpxor	128(%rsi),%ymm12,%ymm12
   1470 	vpxor	160(%rsi),%ymm13,%ymm13
   1471 	vpxor	192(%rsi),%ymm10,%ymm10
   1472 	vpxor	224(%rsi),%ymm15,%ymm15
   1473 	vpxor	256(%rsi),%ymm14,%ymm14
   1474 	vpxor	288(%rsi),%ymm2,%ymm2
   1475 	vmovdqu	%ymm6,0(%rdi)
   1476 	vmovdqu	%ymm8,32(%rdi)
   1477 	vmovdqu	%ymm1,64(%rdi)
   1478 	vmovdqu	%ymm5,96(%rdi)
   1479 	vmovdqu	%ymm12,128(%rdi)
   1480 	vmovdqu	%ymm13,160(%rdi)
   1481 	vmovdqu	%ymm10,192(%rdi)
   1482 	vmovdqu	%ymm15,224(%rdi)
   1483 	vmovdqu	%ymm14,256(%rdi)
   1484 	vmovdqu	%ymm2,288(%rdi)
        /* ZF still comes from `cmpq $320,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 320 bytes were left. */
   1485 	je	L$done8x
   1486 
        /* Otherwise skip the 320 bytes just handled and stash the next 64 bytes of XOR data */
        /* (ymm3:ymm7) at 0/32(%rsp) for the byte loop; %r10 = 0 is its index. */
   1487 	leaq	320(%rsi),%rsi
   1488 	xorq	%r10,%r10
   1489 	vmovdqa	%ymm3,0(%rsp)
   1490 	leaq	320(%rdi),%rdi
   1491 	subq	$320,%rdx
   1492 	vmovdqa	%ymm7,32(%rsp)
   1493 	jmp	L$oop_tail8x
   1494 
   1495 .p2align	5
   1496 L$384_or_more8x:
        /* Reached from L$tail8x with 384..447 bytes left: XOR the first 384 bytes at (%rsi) with */
        /* ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7; store to (%rdi). */
   1497 	vpxor	0(%rsi),%ymm6,%ymm6
   1498 	vpxor	32(%rsi),%ymm8,%ymm8
   1499 	vpxor	64(%rsi),%ymm1,%ymm1
   1500 	vpxor	96(%rsi),%ymm5,%ymm5
   1501 	vpxor	128(%rsi),%ymm12,%ymm12
   1502 	vpxor	160(%rsi),%ymm13,%ymm13
   1503 	vpxor	192(%rsi),%ymm10,%ymm10
   1504 	vpxor	224(%rsi),%ymm15,%ymm15
   1505 	vpxor	256(%rsi),%ymm14,%ymm14
   1506 	vpxor	288(%rsi),%ymm2,%ymm2
   1507 	vpxor	320(%rsi),%ymm3,%ymm3
   1508 	vpxor	352(%rsi),%ymm7,%ymm7
   1509 	vmovdqu	%ymm6,0(%rdi)
   1510 	vmovdqu	%ymm8,32(%rdi)
   1511 	vmovdqu	%ymm1,64(%rdi)
   1512 	vmovdqu	%ymm5,96(%rdi)
   1513 	vmovdqu	%ymm12,128(%rdi)
   1514 	vmovdqu	%ymm13,160(%rdi)
   1515 	vmovdqu	%ymm10,192(%rdi)
   1516 	vmovdqu	%ymm15,224(%rdi)
   1517 	vmovdqu	%ymm14,256(%rdi)
   1518 	vmovdqu	%ymm2,288(%rdi)
   1519 	vmovdqu	%ymm3,320(%rdi)
   1520 	vmovdqu	%ymm7,352(%rdi)
        /* ZF still comes from `cmpq $384,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 384 bytes were left. */
   1521 	je	L$done8x
   1522 
        /* Otherwise skip the 384 bytes just handled and stash the next 64 bytes of XOR data */
        /* (ymm11:ymm9) at 0/32(%rsp) for the byte loop; %r10 = 0 is its index. */
   1523 	leaq	384(%rsi),%rsi
   1524 	xorq	%r10,%r10
   1525 	vmovdqa	%ymm11,0(%rsp)
   1526 	leaq	384(%rdi),%rdi
   1527 	subq	$384,%rdx
   1528 	vmovdqa	%ymm9,32(%rsp)
   1529 	jmp	L$oop_tail8x
   1530 
   1531 .p2align	5
   1532 L$448_or_more8x:
        /* Reached from L$tail8x with 448 or more bytes left: XOR the first 448 bytes at (%rsi) with */
        /* ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9 */
        /* and store them to (%rdi). */
   1533 	vpxor	0(%rsi),%ymm6,%ymm6
   1534 	vpxor	32(%rsi),%ymm8,%ymm8
   1535 	vpxor	64(%rsi),%ymm1,%ymm1
   1536 	vpxor	96(%rsi),%ymm5,%ymm5
   1537 	vpxor	128(%rsi),%ymm12,%ymm12
   1538 	vpxor	160(%rsi),%ymm13,%ymm13
   1539 	vpxor	192(%rsi),%ymm10,%ymm10
   1540 	vpxor	224(%rsi),%ymm15,%ymm15
   1541 	vpxor	256(%rsi),%ymm14,%ymm14
   1542 	vpxor	288(%rsi),%ymm2,%ymm2
   1543 	vpxor	320(%rsi),%ymm3,%ymm3
   1544 	vpxor	352(%rsi),%ymm7,%ymm7
   1545 	vpxor	384(%rsi),%ymm11,%ymm11
   1546 	vpxor	416(%rsi),%ymm9,%ymm9
   1547 	vmovdqu	%ymm6,0(%rdi)
   1548 	vmovdqu	%ymm8,32(%rdi)
   1549 	vmovdqu	%ymm1,64(%rdi)
   1550 	vmovdqu	%ymm5,96(%rdi)
   1551 	vmovdqu	%ymm12,128(%rdi)
   1552 	vmovdqu	%ymm13,160(%rdi)
   1553 	vmovdqu	%ymm10,192(%rdi)
   1554 	vmovdqu	%ymm15,224(%rdi)
   1555 	vmovdqu	%ymm14,256(%rdi)
   1556 	vmovdqu	%ymm2,288(%rdi)
   1557 	vmovdqu	%ymm3,320(%rdi)
   1558 	vmovdqu	%ymm7,352(%rdi)
   1559 	vmovdqu	%ymm11,384(%rdi)
   1560 	vmovdqu	%ymm9,416(%rdi)
        /* ZF still comes from `cmpq $448,%rdx` in L$tail8x (the AVX instructions above do not */
        /* write EFLAGS), so this branch is taken when exactly 448 bytes were left. */
   1561 	je	L$done8x
   1562 
        /* Otherwise skip the 448 bytes just handled and stash the last 64 bytes of XOR data */
        /* (ymm0:ymm4) at 0/32(%rsp); %r10 = 0 is the byte-loop index. */
        /* There is no jmp here: execution falls through into L$oop_tail8x, which follows directly. */
   1563 	leaq	448(%rsi),%rsi
   1564 	xorq	%r10,%r10
   1565 	vmovdqa	%ymm0,0(%rsp)
   1566 	leaq	448(%rdi),%rdi
   1567 	subq	$448,%rdx
   1568 	vmovdqa	%ymm4,32(%rsp)
   1569 
   1570 L$oop_tail8x:
        /* Byte-at-a-time finish: out[i] = in[i] ^ stash[i]. */
        /* in = (%rsi), out = (%rdi), stash = the 64 bytes parked at 0(%rsp) by the entry path, */
        /* i = %r10 (zeroed by every visible entry path), and %rdx counts the bytes still to do. */
        /* NOTE(review): a zero %rdx on entry would wrap at decq; the length checks that presumably */
        /* rule this out for the sub-64 path are outside this view - confirm there. */
   1571 	movzbl	(%rsi,%r10,1),%eax
   1572 	movzbl	(%rsp,%r10,1),%ecx
   1573 	leaq	1(%r10),%r10	/* i++ happens before the store, hence the -1 displacement below */
   1574 	xorl	%ecx,%eax
   1575 	movb	%al,-1(%rdi,%r10,1)
   1576 	decq	%rdx
   1577 	jnz	L$oop_tail8x
   1578 
   1579 L$done8x:
        /* Common exit for the full-batch path, every tail handler and the byte loop. */
   1580 	vzeroall	/* zero all ymm registers, so none of the working data stays in them */
   1581 	leaq	(%r9),%rsp	/* %rsp = %r9; presumably the stack pointer saved by the prologue, outside this view */
   1582 L$8x_epilogue:
   1583 	.byte	0xf3,0xc3	/* hand-encoded `rep ret` (two-byte return) */
   1584 
   1585 #endif
   1586