1 #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) 2 .text 3 4 5 6 .p2align 6 7 L$zero: 8 .long 0,0,0,0 9 L$one: 10 .long 1,0,0,0 11 L$inc: 12 .long 0,1,2,3 13 L$four: 14 .long 4,4,4,4 15 L$incy: 16 .long 0,2,4,6,1,3,5,7 17 L$eight: 18 .long 8,8,8,8,8,8,8,8 19 L$rot16: 20 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 21 L$rot24: 22 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 23 L$sigma: 24 .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 25 .p2align 6 26 L$zeroz: 27 .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 28 L$fourz: 29 .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 30 L$incz: 31 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 32 L$sixteen: 33 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 34 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 35 .globl _ChaCha20_ctr32 36 .private_extern _ChaCha20_ctr32 37 38 .p2align 6 39 _ChaCha20_ctr32: 40 cmpq $0,%rdx 41 je L$no_data 42 movq _OPENSSL_ia32cap_P+4(%rip),%r10 43 testl $512,%r10d 44 jnz L$ChaCha20_ssse3 45 46 pushq %rbx 47 pushq %rbp 48 pushq %r12 49 pushq %r13 50 pushq %r14 51 pushq %r15 52 subq $64+24,%rsp 53 L$ctr32_body: 54 55 56 movdqu (%rcx),%xmm1 57 movdqu 16(%rcx),%xmm2 58 movdqu (%r8),%xmm3 59 movdqa L$one(%rip),%xmm4 60 61 62 movdqa %xmm1,16(%rsp) 63 movdqa %xmm2,32(%rsp) 64 movdqa %xmm3,48(%rsp) 65 movq %rdx,%rbp 66 jmp L$oop_outer 67 68 .p2align 5 69 L$oop_outer: 70 movl $0x61707865,%eax 71 movl $0x3320646e,%ebx 72 movl $0x79622d32,%ecx 73 movl $0x6b206574,%edx 74 movl 16(%rsp),%r8d 75 movl 20(%rsp),%r9d 76 movl 24(%rsp),%r10d 77 movl 28(%rsp),%r11d 78 movd %xmm3,%r12d 79 movl 52(%rsp),%r13d 80 movl 56(%rsp),%r14d 81 movl 60(%rsp),%r15d 82 83 movq %rbp,64+0(%rsp) 84 movl $10,%ebp 85 movq %rsi,64+8(%rsp) 86 .byte 102,72,15,126,214 87 movq %rdi,64+16(%rsp) 88 movq %rsi,%rdi 89 shrq $32,%rdi 90 jmp 
L$oop 91 92 .p2align 5 93 L$oop: 94 addl %r8d,%eax 95 xorl %eax,%r12d 96 roll $16,%r12d 97 addl %r9d,%ebx 98 xorl %ebx,%r13d 99 roll $16,%r13d 100 addl %r12d,%esi 101 xorl %esi,%r8d 102 roll $12,%r8d 103 addl %r13d,%edi 104 xorl %edi,%r9d 105 roll $12,%r9d 106 addl %r8d,%eax 107 xorl %eax,%r12d 108 roll $8,%r12d 109 addl %r9d,%ebx 110 xorl %ebx,%r13d 111 roll $8,%r13d 112 addl %r12d,%esi 113 xorl %esi,%r8d 114 roll $7,%r8d 115 addl %r13d,%edi 116 xorl %edi,%r9d 117 roll $7,%r9d 118 movl %esi,32(%rsp) 119 movl %edi,36(%rsp) 120 movl 40(%rsp),%esi 121 movl 44(%rsp),%edi 122 addl %r10d,%ecx 123 xorl %ecx,%r14d 124 roll $16,%r14d 125 addl %r11d,%edx 126 xorl %edx,%r15d 127 roll $16,%r15d 128 addl %r14d,%esi 129 xorl %esi,%r10d 130 roll $12,%r10d 131 addl %r15d,%edi 132 xorl %edi,%r11d 133 roll $12,%r11d 134 addl %r10d,%ecx 135 xorl %ecx,%r14d 136 roll $8,%r14d 137 addl %r11d,%edx 138 xorl %edx,%r15d 139 roll $8,%r15d 140 addl %r14d,%esi 141 xorl %esi,%r10d 142 roll $7,%r10d 143 addl %r15d,%edi 144 xorl %edi,%r11d 145 roll $7,%r11d 146 addl %r9d,%eax 147 xorl %eax,%r15d 148 roll $16,%r15d 149 addl %r10d,%ebx 150 xorl %ebx,%r12d 151 roll $16,%r12d 152 addl %r15d,%esi 153 xorl %esi,%r9d 154 roll $12,%r9d 155 addl %r12d,%edi 156 xorl %edi,%r10d 157 roll $12,%r10d 158 addl %r9d,%eax 159 xorl %eax,%r15d 160 roll $8,%r15d 161 addl %r10d,%ebx 162 xorl %ebx,%r12d 163 roll $8,%r12d 164 addl %r15d,%esi 165 xorl %esi,%r9d 166 roll $7,%r9d 167 addl %r12d,%edi 168 xorl %edi,%r10d 169 roll $7,%r10d 170 movl %esi,40(%rsp) 171 movl %edi,44(%rsp) 172 movl 32(%rsp),%esi 173 movl 36(%rsp),%edi 174 addl %r11d,%ecx 175 xorl %ecx,%r13d 176 roll $16,%r13d 177 addl %r8d,%edx 178 xorl %edx,%r14d 179 roll $16,%r14d 180 addl %r13d,%esi 181 xorl %esi,%r11d 182 roll $12,%r11d 183 addl %r14d,%edi 184 xorl %edi,%r8d 185 roll $12,%r8d 186 addl %r11d,%ecx 187 xorl %ecx,%r13d 188 roll $8,%r13d 189 addl %r8d,%edx 190 xorl %edx,%r14d 191 roll $8,%r14d 192 addl %r13d,%esi 193 xorl %esi,%r11d 194 roll 
$7,%r11d 195 addl %r14d,%edi 196 xorl %edi,%r8d 197 roll $7,%r8d 198 decl %ebp 199 jnz L$oop 200 movl %edi,36(%rsp) 201 movl %esi,32(%rsp) 202 movq 64(%rsp),%rbp 203 movdqa %xmm2,%xmm1 204 movq 64+8(%rsp),%rsi 205 paddd %xmm4,%xmm3 206 movq 64+16(%rsp),%rdi 207 208 addl $0x61707865,%eax 209 addl $0x3320646e,%ebx 210 addl $0x79622d32,%ecx 211 addl $0x6b206574,%edx 212 addl 16(%rsp),%r8d 213 addl 20(%rsp),%r9d 214 addl 24(%rsp),%r10d 215 addl 28(%rsp),%r11d 216 addl 48(%rsp),%r12d 217 addl 52(%rsp),%r13d 218 addl 56(%rsp),%r14d 219 addl 60(%rsp),%r15d 220 paddd 32(%rsp),%xmm1 221 222 cmpq $64,%rbp 223 jb L$tail 224 225 xorl 0(%rsi),%eax 226 xorl 4(%rsi),%ebx 227 xorl 8(%rsi),%ecx 228 xorl 12(%rsi),%edx 229 xorl 16(%rsi),%r8d 230 xorl 20(%rsi),%r9d 231 xorl 24(%rsi),%r10d 232 xorl 28(%rsi),%r11d 233 movdqu 32(%rsi),%xmm0 234 xorl 48(%rsi),%r12d 235 xorl 52(%rsi),%r13d 236 xorl 56(%rsi),%r14d 237 xorl 60(%rsi),%r15d 238 leaq 64(%rsi),%rsi 239 pxor %xmm1,%xmm0 240 241 movdqa %xmm2,32(%rsp) 242 movd %xmm3,48(%rsp) 243 244 movl %eax,0(%rdi) 245 movl %ebx,4(%rdi) 246 movl %ecx,8(%rdi) 247 movl %edx,12(%rdi) 248 movl %r8d,16(%rdi) 249 movl %r9d,20(%rdi) 250 movl %r10d,24(%rdi) 251 movl %r11d,28(%rdi) 252 movdqu %xmm0,32(%rdi) 253 movl %r12d,48(%rdi) 254 movl %r13d,52(%rdi) 255 movl %r14d,56(%rdi) 256 movl %r15d,60(%rdi) 257 leaq 64(%rdi),%rdi 258 259 subq $64,%rbp 260 jnz L$oop_outer 261 262 jmp L$done 263 264 .p2align 4 265 L$tail: 266 movl %eax,0(%rsp) 267 movl %ebx,4(%rsp) 268 xorq %rbx,%rbx 269 movl %ecx,8(%rsp) 270 movl %edx,12(%rsp) 271 movl %r8d,16(%rsp) 272 movl %r9d,20(%rsp) 273 movl %r10d,24(%rsp) 274 movl %r11d,28(%rsp) 275 movdqa %xmm1,32(%rsp) 276 movl %r12d,48(%rsp) 277 movl %r13d,52(%rsp) 278 movl %r14d,56(%rsp) 279 movl %r15d,60(%rsp) 280 281 L$oop_tail: 282 movzbl (%rsi,%rbx,1),%eax 283 movzbl (%rsp,%rbx,1),%edx 284 leaq 1(%rbx),%rbx 285 xorl %edx,%eax 286 movb %al,-1(%rdi,%rbx,1) 287 decq %rbp 288 jnz L$oop_tail 289 290 L$done: 291 leaq 64+24+48(%rsp),%rsi 
292 movq -48(%rsi),%r15 293 movq -40(%rsi),%r14 294 movq -32(%rsi),%r13 295 movq -24(%rsi),%r12 296 movq -16(%rsi),%rbp 297 movq -8(%rsi),%rbx 298 leaq (%rsi),%rsp 299 L$no_data: 300 .byte 0xf3,0xc3 301 302 303 .p2align 5 304 ChaCha20_ssse3: 305 L$ChaCha20_ssse3: 306 movq %rsp,%r9 307 cmpq $128,%rdx 308 ja L$ChaCha20_4x 309 310 L$do_sse3_after_all: 311 subq $64+8,%rsp 312 movdqa L$sigma(%rip),%xmm0 313 movdqu (%rcx),%xmm1 314 movdqu 16(%rcx),%xmm2 315 movdqu (%r8),%xmm3 316 movdqa L$rot16(%rip),%xmm6 317 movdqa L$rot24(%rip),%xmm7 318 319 movdqa %xmm0,0(%rsp) 320 movdqa %xmm1,16(%rsp) 321 movdqa %xmm2,32(%rsp) 322 movdqa %xmm3,48(%rsp) 323 movq $10,%r8 324 jmp L$oop_ssse3 325 326 .p2align 5 327 L$oop_outer_ssse3: 328 movdqa L$one(%rip),%xmm3 329 movdqa 0(%rsp),%xmm0 330 movdqa 16(%rsp),%xmm1 331 movdqa 32(%rsp),%xmm2 332 paddd 48(%rsp),%xmm3 333 movq $10,%r8 334 movdqa %xmm3,48(%rsp) 335 jmp L$oop_ssse3 336 337 .p2align 5 338 L$oop_ssse3: 339 paddd %xmm1,%xmm0 340 pxor %xmm0,%xmm3 341 .byte 102,15,56,0,222 342 paddd %xmm3,%xmm2 343 pxor %xmm2,%xmm1 344 movdqa %xmm1,%xmm4 345 psrld $20,%xmm1 346 pslld $12,%xmm4 347 por %xmm4,%xmm1 348 paddd %xmm1,%xmm0 349 pxor %xmm0,%xmm3 350 .byte 102,15,56,0,223 351 paddd %xmm3,%xmm2 352 pxor %xmm2,%xmm1 353 movdqa %xmm1,%xmm4 354 psrld $25,%xmm1 355 pslld $7,%xmm4 356 por %xmm4,%xmm1 357 pshufd $78,%xmm2,%xmm2 358 pshufd $57,%xmm1,%xmm1 359 pshufd $147,%xmm3,%xmm3 360 nop 361 paddd %xmm1,%xmm0 362 pxor %xmm0,%xmm3 363 .byte 102,15,56,0,222 364 paddd %xmm3,%xmm2 365 pxor %xmm2,%xmm1 366 movdqa %xmm1,%xmm4 367 psrld $20,%xmm1 368 pslld $12,%xmm4 369 por %xmm4,%xmm1 370 paddd %xmm1,%xmm0 371 pxor %xmm0,%xmm3 372 .byte 102,15,56,0,223 373 paddd %xmm3,%xmm2 374 pxor %xmm2,%xmm1 375 movdqa %xmm1,%xmm4 376 psrld $25,%xmm1 377 pslld $7,%xmm4 378 por %xmm4,%xmm1 379 pshufd $78,%xmm2,%xmm2 380 pshufd $147,%xmm1,%xmm1 381 pshufd $57,%xmm3,%xmm3 382 decq %r8 383 jnz L$oop_ssse3 384 paddd 0(%rsp),%xmm0 385 paddd 16(%rsp),%xmm1 386 paddd 
32(%rsp),%xmm2 387 paddd 48(%rsp),%xmm3 388 389 cmpq $64,%rdx 390 jb L$tail_ssse3 391 392 movdqu 0(%rsi),%xmm4 393 movdqu 16(%rsi),%xmm5 394 pxor %xmm4,%xmm0 395 movdqu 32(%rsi),%xmm4 396 pxor %xmm5,%xmm1 397 movdqu 48(%rsi),%xmm5 398 leaq 64(%rsi),%rsi 399 pxor %xmm4,%xmm2 400 pxor %xmm5,%xmm3 401 402 movdqu %xmm0,0(%rdi) 403 movdqu %xmm1,16(%rdi) 404 movdqu %xmm2,32(%rdi) 405 movdqu %xmm3,48(%rdi) 406 leaq 64(%rdi),%rdi 407 408 subq $64,%rdx 409 jnz L$oop_outer_ssse3 410 411 jmp L$done_ssse3 412 413 .p2align 4 414 L$tail_ssse3: 415 movdqa %xmm0,0(%rsp) 416 movdqa %xmm1,16(%rsp) 417 movdqa %xmm2,32(%rsp) 418 movdqa %xmm3,48(%rsp) 419 xorq %r8,%r8 420 421 L$oop_tail_ssse3: 422 movzbl (%rsi,%r8,1),%eax 423 movzbl (%rsp,%r8,1),%ecx 424 leaq 1(%r8),%r8 425 xorl %ecx,%eax 426 movb %al,-1(%rdi,%r8,1) 427 decq %rdx 428 jnz L$oop_tail_ssse3 429 430 L$done_ssse3: 431 leaq (%r9),%rsp 432 L$ssse3_epilogue: 433 .byte 0xf3,0xc3 434 435 436 .p2align 5 437 ChaCha20_4x: 438 L$ChaCha20_4x: 439 movq %rsp,%r9 440 movq %r10,%r11 441 shrq $32,%r10 442 testq $32,%r10 443 jnz L$ChaCha20_8x 444 cmpq $192,%rdx 445 ja L$proceed4x 446 447 andq $71303168,%r11 448 cmpq $4194304,%r11 449 je L$do_sse3_after_all 450 451 L$proceed4x: 452 subq $0x140+8,%rsp 453 movdqa L$sigma(%rip),%xmm11 454 movdqu (%rcx),%xmm15 455 movdqu 16(%rcx),%xmm7 456 movdqu (%r8),%xmm3 457 leaq 256(%rsp),%rcx 458 leaq L$rot16(%rip),%r10 459 leaq L$rot24(%rip),%r11 460 461 pshufd $0x00,%xmm11,%xmm8 462 pshufd $0x55,%xmm11,%xmm9 463 movdqa %xmm8,64(%rsp) 464 pshufd $0xaa,%xmm11,%xmm10 465 movdqa %xmm9,80(%rsp) 466 pshufd $0xff,%xmm11,%xmm11 467 movdqa %xmm10,96(%rsp) 468 movdqa %xmm11,112(%rsp) 469 470 pshufd $0x00,%xmm15,%xmm12 471 pshufd $0x55,%xmm15,%xmm13 472 movdqa %xmm12,128-256(%rcx) 473 pshufd $0xaa,%xmm15,%xmm14 474 movdqa %xmm13,144-256(%rcx) 475 pshufd $0xff,%xmm15,%xmm15 476 movdqa %xmm14,160-256(%rcx) 477 movdqa %xmm15,176-256(%rcx) 478 479 pshufd $0x00,%xmm7,%xmm4 480 pshufd $0x55,%xmm7,%xmm5 481 movdqa 
%xmm4,192-256(%rcx) 482 pshufd $0xaa,%xmm7,%xmm6 483 movdqa %xmm5,208-256(%rcx) 484 pshufd $0xff,%xmm7,%xmm7 485 movdqa %xmm6,224-256(%rcx) 486 movdqa %xmm7,240-256(%rcx) 487 488 pshufd $0x00,%xmm3,%xmm0 489 pshufd $0x55,%xmm3,%xmm1 490 paddd L$inc(%rip),%xmm0 491 pshufd $0xaa,%xmm3,%xmm2 492 movdqa %xmm1,272-256(%rcx) 493 pshufd $0xff,%xmm3,%xmm3 494 movdqa %xmm2,288-256(%rcx) 495 movdqa %xmm3,304-256(%rcx) 496 497 jmp L$oop_enter4x 498 499 .p2align 5 500 L$oop_outer4x: 501 movdqa 64(%rsp),%xmm8 502 movdqa 80(%rsp),%xmm9 503 movdqa 96(%rsp),%xmm10 504 movdqa 112(%rsp),%xmm11 505 movdqa 128-256(%rcx),%xmm12 506 movdqa 144-256(%rcx),%xmm13 507 movdqa 160-256(%rcx),%xmm14 508 movdqa 176-256(%rcx),%xmm15 509 movdqa 192-256(%rcx),%xmm4 510 movdqa 208-256(%rcx),%xmm5 511 movdqa 224-256(%rcx),%xmm6 512 movdqa 240-256(%rcx),%xmm7 513 movdqa 256-256(%rcx),%xmm0 514 movdqa 272-256(%rcx),%xmm1 515 movdqa 288-256(%rcx),%xmm2 516 movdqa 304-256(%rcx),%xmm3 517 paddd L$four(%rip),%xmm0 518 519 L$oop_enter4x: 520 movdqa %xmm6,32(%rsp) 521 movdqa %xmm7,48(%rsp) 522 movdqa (%r10),%xmm7 523 movl $10,%eax 524 movdqa %xmm0,256-256(%rcx) 525 jmp L$oop4x 526 527 .p2align 5 528 L$oop4x: 529 paddd %xmm12,%xmm8 530 paddd %xmm13,%xmm9 531 pxor %xmm8,%xmm0 532 pxor %xmm9,%xmm1 533 .byte 102,15,56,0,199 534 .byte 102,15,56,0,207 535 paddd %xmm0,%xmm4 536 paddd %xmm1,%xmm5 537 pxor %xmm4,%xmm12 538 pxor %xmm5,%xmm13 539 movdqa %xmm12,%xmm6 540 pslld $12,%xmm12 541 psrld $20,%xmm6 542 movdqa %xmm13,%xmm7 543 pslld $12,%xmm13 544 por %xmm6,%xmm12 545 psrld $20,%xmm7 546 movdqa (%r11),%xmm6 547 por %xmm7,%xmm13 548 paddd %xmm12,%xmm8 549 paddd %xmm13,%xmm9 550 pxor %xmm8,%xmm0 551 pxor %xmm9,%xmm1 552 .byte 102,15,56,0,198 553 .byte 102,15,56,0,206 554 paddd %xmm0,%xmm4 555 paddd %xmm1,%xmm5 556 pxor %xmm4,%xmm12 557 pxor %xmm5,%xmm13 558 movdqa %xmm12,%xmm7 559 pslld $7,%xmm12 560 psrld $25,%xmm7 561 movdqa %xmm13,%xmm6 562 pslld $7,%xmm13 563 por %xmm7,%xmm12 564 psrld $25,%xmm6 565 movdqa 
(%r10),%xmm7 566 por %xmm6,%xmm13 567 movdqa %xmm4,0(%rsp) 568 movdqa %xmm5,16(%rsp) 569 movdqa 32(%rsp),%xmm4 570 movdqa 48(%rsp),%xmm5 571 paddd %xmm14,%xmm10 572 paddd %xmm15,%xmm11 573 pxor %xmm10,%xmm2 574 pxor %xmm11,%xmm3 575 .byte 102,15,56,0,215 576 .byte 102,15,56,0,223 577 paddd %xmm2,%xmm4 578 paddd %xmm3,%xmm5 579 pxor %xmm4,%xmm14 580 pxor %xmm5,%xmm15 581 movdqa %xmm14,%xmm6 582 pslld $12,%xmm14 583 psrld $20,%xmm6 584 movdqa %xmm15,%xmm7 585 pslld $12,%xmm15 586 por %xmm6,%xmm14 587 psrld $20,%xmm7 588 movdqa (%r11),%xmm6 589 por %xmm7,%xmm15 590 paddd %xmm14,%xmm10 591 paddd %xmm15,%xmm11 592 pxor %xmm10,%xmm2 593 pxor %xmm11,%xmm3 594 .byte 102,15,56,0,214 595 .byte 102,15,56,0,222 596 paddd %xmm2,%xmm4 597 paddd %xmm3,%xmm5 598 pxor %xmm4,%xmm14 599 pxor %xmm5,%xmm15 600 movdqa %xmm14,%xmm7 601 pslld $7,%xmm14 602 psrld $25,%xmm7 603 movdqa %xmm15,%xmm6 604 pslld $7,%xmm15 605 por %xmm7,%xmm14 606 psrld $25,%xmm6 607 movdqa (%r10),%xmm7 608 por %xmm6,%xmm15 609 paddd %xmm13,%xmm8 610 paddd %xmm14,%xmm9 611 pxor %xmm8,%xmm3 612 pxor %xmm9,%xmm0 613 .byte 102,15,56,0,223 614 .byte 102,15,56,0,199 615 paddd %xmm3,%xmm4 616 paddd %xmm0,%xmm5 617 pxor %xmm4,%xmm13 618 pxor %xmm5,%xmm14 619 movdqa %xmm13,%xmm6 620 pslld $12,%xmm13 621 psrld $20,%xmm6 622 movdqa %xmm14,%xmm7 623 pslld $12,%xmm14 624 por %xmm6,%xmm13 625 psrld $20,%xmm7 626 movdqa (%r11),%xmm6 627 por %xmm7,%xmm14 628 paddd %xmm13,%xmm8 629 paddd %xmm14,%xmm9 630 pxor %xmm8,%xmm3 631 pxor %xmm9,%xmm0 632 .byte 102,15,56,0,222 633 .byte 102,15,56,0,198 634 paddd %xmm3,%xmm4 635 paddd %xmm0,%xmm5 636 pxor %xmm4,%xmm13 637 pxor %xmm5,%xmm14 638 movdqa %xmm13,%xmm7 639 pslld $7,%xmm13 640 psrld $25,%xmm7 641 movdqa %xmm14,%xmm6 642 pslld $7,%xmm14 643 por %xmm7,%xmm13 644 psrld $25,%xmm6 645 movdqa (%r10),%xmm7 646 por %xmm6,%xmm14 647 movdqa %xmm4,32(%rsp) 648 movdqa %xmm5,48(%rsp) 649 movdqa 0(%rsp),%xmm4 650 movdqa 16(%rsp),%xmm5 651 paddd %xmm15,%xmm10 652 paddd %xmm12,%xmm11 653 pxor 
%xmm10,%xmm1 654 pxor %xmm11,%xmm2 655 .byte 102,15,56,0,207 656 .byte 102,15,56,0,215 657 paddd %xmm1,%xmm4 658 paddd %xmm2,%xmm5 659 pxor %xmm4,%xmm15 660 pxor %xmm5,%xmm12 661 movdqa %xmm15,%xmm6 662 pslld $12,%xmm15 663 psrld $20,%xmm6 664 movdqa %xmm12,%xmm7 665 pslld $12,%xmm12 666 por %xmm6,%xmm15 667 psrld $20,%xmm7 668 movdqa (%r11),%xmm6 669 por %xmm7,%xmm12 670 paddd %xmm15,%xmm10 671 paddd %xmm12,%xmm11 672 pxor %xmm10,%xmm1 673 pxor %xmm11,%xmm2 674 .byte 102,15,56,0,206 675 .byte 102,15,56,0,214 676 paddd %xmm1,%xmm4 677 paddd %xmm2,%xmm5 678 pxor %xmm4,%xmm15 679 pxor %xmm5,%xmm12 680 movdqa %xmm15,%xmm7 681 pslld $7,%xmm15 682 psrld $25,%xmm7 683 movdqa %xmm12,%xmm6 684 pslld $7,%xmm12 685 por %xmm7,%xmm15 686 psrld $25,%xmm6 687 movdqa (%r10),%xmm7 688 por %xmm6,%xmm12 689 decl %eax 690 jnz L$oop4x 691 692 paddd 64(%rsp),%xmm8 693 paddd 80(%rsp),%xmm9 694 paddd 96(%rsp),%xmm10 695 paddd 112(%rsp),%xmm11 696 697 movdqa %xmm8,%xmm6 698 punpckldq %xmm9,%xmm8 699 movdqa %xmm10,%xmm7 700 punpckldq %xmm11,%xmm10 701 punpckhdq %xmm9,%xmm6 702 punpckhdq %xmm11,%xmm7 703 movdqa %xmm8,%xmm9 704 punpcklqdq %xmm10,%xmm8 705 movdqa %xmm6,%xmm11 706 punpcklqdq %xmm7,%xmm6 707 punpckhqdq %xmm10,%xmm9 708 punpckhqdq %xmm7,%xmm11 709 paddd 128-256(%rcx),%xmm12 710 paddd 144-256(%rcx),%xmm13 711 paddd 160-256(%rcx),%xmm14 712 paddd 176-256(%rcx),%xmm15 713 714 movdqa %xmm8,0(%rsp) 715 movdqa %xmm9,16(%rsp) 716 movdqa 32(%rsp),%xmm8 717 movdqa 48(%rsp),%xmm9 718 719 movdqa %xmm12,%xmm10 720 punpckldq %xmm13,%xmm12 721 movdqa %xmm14,%xmm7 722 punpckldq %xmm15,%xmm14 723 punpckhdq %xmm13,%xmm10 724 punpckhdq %xmm15,%xmm7 725 movdqa %xmm12,%xmm13 726 punpcklqdq %xmm14,%xmm12 727 movdqa %xmm10,%xmm15 728 punpcklqdq %xmm7,%xmm10 729 punpckhqdq %xmm14,%xmm13 730 punpckhqdq %xmm7,%xmm15 731 paddd 192-256(%rcx),%xmm4 732 paddd 208-256(%rcx),%xmm5 733 paddd 224-256(%rcx),%xmm8 734 paddd 240-256(%rcx),%xmm9 735 736 movdqa %xmm6,32(%rsp) 737 movdqa %xmm11,48(%rsp) 738 739 
movdqa %xmm4,%xmm14 740 punpckldq %xmm5,%xmm4 741 movdqa %xmm8,%xmm7 742 punpckldq %xmm9,%xmm8 743 punpckhdq %xmm5,%xmm14 744 punpckhdq %xmm9,%xmm7 745 movdqa %xmm4,%xmm5 746 punpcklqdq %xmm8,%xmm4 747 movdqa %xmm14,%xmm9 748 punpcklqdq %xmm7,%xmm14 749 punpckhqdq %xmm8,%xmm5 750 punpckhqdq %xmm7,%xmm9 751 paddd 256-256(%rcx),%xmm0 752 paddd 272-256(%rcx),%xmm1 753 paddd 288-256(%rcx),%xmm2 754 paddd 304-256(%rcx),%xmm3 755 756 movdqa %xmm0,%xmm8 757 punpckldq %xmm1,%xmm0 758 movdqa %xmm2,%xmm7 759 punpckldq %xmm3,%xmm2 760 punpckhdq %xmm1,%xmm8 761 punpckhdq %xmm3,%xmm7 762 movdqa %xmm0,%xmm1 763 punpcklqdq %xmm2,%xmm0 764 movdqa %xmm8,%xmm3 765 punpcklqdq %xmm7,%xmm8 766 punpckhqdq %xmm2,%xmm1 767 punpckhqdq %xmm7,%xmm3 768 cmpq $256,%rdx 769 jb L$tail4x 770 771 movdqu 0(%rsi),%xmm6 772 movdqu 16(%rsi),%xmm11 773 movdqu 32(%rsi),%xmm2 774 movdqu 48(%rsi),%xmm7 775 pxor 0(%rsp),%xmm6 776 pxor %xmm12,%xmm11 777 pxor %xmm4,%xmm2 778 pxor %xmm0,%xmm7 779 780 movdqu %xmm6,0(%rdi) 781 movdqu 64(%rsi),%xmm6 782 movdqu %xmm11,16(%rdi) 783 movdqu 80(%rsi),%xmm11 784 movdqu %xmm2,32(%rdi) 785 movdqu 96(%rsi),%xmm2 786 movdqu %xmm7,48(%rdi) 787 movdqu 112(%rsi),%xmm7 788 leaq 128(%rsi),%rsi 789 pxor 16(%rsp),%xmm6 790 pxor %xmm13,%xmm11 791 pxor %xmm5,%xmm2 792 pxor %xmm1,%xmm7 793 794 movdqu %xmm6,64(%rdi) 795 movdqu 0(%rsi),%xmm6 796 movdqu %xmm11,80(%rdi) 797 movdqu 16(%rsi),%xmm11 798 movdqu %xmm2,96(%rdi) 799 movdqu 32(%rsi),%xmm2 800 movdqu %xmm7,112(%rdi) 801 leaq 128(%rdi),%rdi 802 movdqu 48(%rsi),%xmm7 803 pxor 32(%rsp),%xmm6 804 pxor %xmm10,%xmm11 805 pxor %xmm14,%xmm2 806 pxor %xmm8,%xmm7 807 808 movdqu %xmm6,0(%rdi) 809 movdqu 64(%rsi),%xmm6 810 movdqu %xmm11,16(%rdi) 811 movdqu 80(%rsi),%xmm11 812 movdqu %xmm2,32(%rdi) 813 movdqu 96(%rsi),%xmm2 814 movdqu %xmm7,48(%rdi) 815 movdqu 112(%rsi),%xmm7 816 leaq 128(%rsi),%rsi 817 pxor 48(%rsp),%xmm6 818 pxor %xmm15,%xmm11 819 pxor %xmm9,%xmm2 820 pxor %xmm3,%xmm7 821 movdqu %xmm6,64(%rdi) 822 movdqu %xmm11,80(%rdi) 
823 movdqu %xmm2,96(%rdi) 824 movdqu %xmm7,112(%rdi) 825 leaq 128(%rdi),%rdi 826 827 subq $256,%rdx 828 jnz L$oop_outer4x 829 830 jmp L$done4x 831 832 L$tail4x: 833 cmpq $192,%rdx 834 jae L$192_or_more4x 835 cmpq $128,%rdx 836 jae L$128_or_more4x 837 cmpq $64,%rdx 838 jae L$64_or_more4x 839 840 841 xorq %r10,%r10 842 843 movdqa %xmm12,16(%rsp) 844 movdqa %xmm4,32(%rsp) 845 movdqa %xmm0,48(%rsp) 846 jmp L$oop_tail4x 847 848 .p2align 5 849 L$64_or_more4x: 850 movdqu 0(%rsi),%xmm6 851 movdqu 16(%rsi),%xmm11 852 movdqu 32(%rsi),%xmm2 853 movdqu 48(%rsi),%xmm7 854 pxor 0(%rsp),%xmm6 855 pxor %xmm12,%xmm11 856 pxor %xmm4,%xmm2 857 pxor %xmm0,%xmm7 858 movdqu %xmm6,0(%rdi) 859 movdqu %xmm11,16(%rdi) 860 movdqu %xmm2,32(%rdi) 861 movdqu %xmm7,48(%rdi) 862 je L$done4x 863 864 movdqa 16(%rsp),%xmm6 865 leaq 64(%rsi),%rsi 866 xorq %r10,%r10 867 movdqa %xmm6,0(%rsp) 868 movdqa %xmm13,16(%rsp) 869 leaq 64(%rdi),%rdi 870 movdqa %xmm5,32(%rsp) 871 subq $64,%rdx 872 movdqa %xmm1,48(%rsp) 873 jmp L$oop_tail4x 874 875 .p2align 5 876 L$128_or_more4x: 877 movdqu 0(%rsi),%xmm6 878 movdqu 16(%rsi),%xmm11 879 movdqu 32(%rsi),%xmm2 880 movdqu 48(%rsi),%xmm7 881 pxor 0(%rsp),%xmm6 882 pxor %xmm12,%xmm11 883 pxor %xmm4,%xmm2 884 pxor %xmm0,%xmm7 885 886 movdqu %xmm6,0(%rdi) 887 movdqu 64(%rsi),%xmm6 888 movdqu %xmm11,16(%rdi) 889 movdqu 80(%rsi),%xmm11 890 movdqu %xmm2,32(%rdi) 891 movdqu 96(%rsi),%xmm2 892 movdqu %xmm7,48(%rdi) 893 movdqu 112(%rsi),%xmm7 894 pxor 16(%rsp),%xmm6 895 pxor %xmm13,%xmm11 896 pxor %xmm5,%xmm2 897 pxor %xmm1,%xmm7 898 movdqu %xmm6,64(%rdi) 899 movdqu %xmm11,80(%rdi) 900 movdqu %xmm2,96(%rdi) 901 movdqu %xmm7,112(%rdi) 902 je L$done4x 903 904 movdqa 32(%rsp),%xmm6 905 leaq 128(%rsi),%rsi 906 xorq %r10,%r10 907 movdqa %xmm6,0(%rsp) 908 movdqa %xmm10,16(%rsp) 909 leaq 128(%rdi),%rdi 910 movdqa %xmm14,32(%rsp) 911 subq $128,%rdx 912 movdqa %xmm8,48(%rsp) 913 jmp L$oop_tail4x 914 915 .p2align 5 916 L$192_or_more4x: 917 movdqu 0(%rsi),%xmm6 918 movdqu 16(%rsi),%xmm11 
919 movdqu 32(%rsi),%xmm2 920 movdqu 48(%rsi),%xmm7 921 pxor 0(%rsp),%xmm6 922 pxor %xmm12,%xmm11 923 pxor %xmm4,%xmm2 924 pxor %xmm0,%xmm7 925 926 movdqu %xmm6,0(%rdi) 927 movdqu 64(%rsi),%xmm6 928 movdqu %xmm11,16(%rdi) 929 movdqu 80(%rsi),%xmm11 930 movdqu %xmm2,32(%rdi) 931 movdqu 96(%rsi),%xmm2 932 movdqu %xmm7,48(%rdi) 933 movdqu 112(%rsi),%xmm7 934 leaq 128(%rsi),%rsi 935 pxor 16(%rsp),%xmm6 936 pxor %xmm13,%xmm11 937 pxor %xmm5,%xmm2 938 pxor %xmm1,%xmm7 939 940 movdqu %xmm6,64(%rdi) 941 movdqu 0(%rsi),%xmm6 942 movdqu %xmm11,80(%rdi) 943 movdqu 16(%rsi),%xmm11 944 movdqu %xmm2,96(%rdi) 945 movdqu 32(%rsi),%xmm2 946 movdqu %xmm7,112(%rdi) 947 leaq 128(%rdi),%rdi 948 movdqu 48(%rsi),%xmm7 949 pxor 32(%rsp),%xmm6 950 pxor %xmm10,%xmm11 951 pxor %xmm14,%xmm2 952 pxor %xmm8,%xmm7 953 movdqu %xmm6,0(%rdi) 954 movdqu %xmm11,16(%rdi) 955 movdqu %xmm2,32(%rdi) 956 movdqu %xmm7,48(%rdi) 957 je L$done4x 958 959 movdqa 48(%rsp),%xmm6 960 leaq 64(%rsi),%rsi 961 xorq %r10,%r10 962 movdqa %xmm6,0(%rsp) 963 movdqa %xmm15,16(%rsp) 964 leaq 64(%rdi),%rdi 965 movdqa %xmm9,32(%rsp) 966 subq $192,%rdx 967 movdqa %xmm3,48(%rsp) 968 969 L$oop_tail4x: 970 movzbl (%rsi,%r10,1),%eax 971 movzbl (%rsp,%r10,1),%ecx 972 leaq 1(%r10),%r10 973 xorl %ecx,%eax 974 movb %al,-1(%rdi,%r10,1) 975 decq %rdx 976 jnz L$oop_tail4x 977 978 L$done4x: 979 leaq (%r9),%rsp 980 L$4x_epilogue: 981 .byte 0xf3,0xc3 982 983 984 .p2align 5 985 ChaCha20_8x: 986 L$ChaCha20_8x: 987 movq %rsp,%r9 988 subq $0x280+8,%rsp 989 andq $-32,%rsp 990 vzeroupper 991 992 993 994 995 996 997 998 999 1000 1001 vbroadcasti128 L$sigma(%rip),%ymm11 1002 vbroadcasti128 (%rcx),%ymm3 1003 vbroadcasti128 16(%rcx),%ymm15 1004 vbroadcasti128 (%r8),%ymm7 1005 leaq 256(%rsp),%rcx 1006 leaq 512(%rsp),%rax 1007 leaq L$rot16(%rip),%r10 1008 leaq L$rot24(%rip),%r11 1009 1010 vpshufd $0x00,%ymm11,%ymm8 1011 vpshufd $0x55,%ymm11,%ymm9 1012 vmovdqa %ymm8,128-256(%rcx) 1013 vpshufd $0xaa,%ymm11,%ymm10 1014 vmovdqa %ymm9,160-256(%rcx) 1015 
vpshufd $0xff,%ymm11,%ymm11 1016 vmovdqa %ymm10,192-256(%rcx) 1017 vmovdqa %ymm11,224-256(%rcx) 1018 1019 vpshufd $0x00,%ymm3,%ymm0 1020 vpshufd $0x55,%ymm3,%ymm1 1021 vmovdqa %ymm0,256-256(%rcx) 1022 vpshufd $0xaa,%ymm3,%ymm2 1023 vmovdqa %ymm1,288-256(%rcx) 1024 vpshufd $0xff,%ymm3,%ymm3 1025 vmovdqa %ymm2,320-256(%rcx) 1026 vmovdqa %ymm3,352-256(%rcx) 1027 1028 vpshufd $0x00,%ymm15,%ymm12 1029 vpshufd $0x55,%ymm15,%ymm13 1030 vmovdqa %ymm12,384-512(%rax) 1031 vpshufd $0xaa,%ymm15,%ymm14 1032 vmovdqa %ymm13,416-512(%rax) 1033 vpshufd $0xff,%ymm15,%ymm15 1034 vmovdqa %ymm14,448-512(%rax) 1035 vmovdqa %ymm15,480-512(%rax) 1036 1037 vpshufd $0x00,%ymm7,%ymm4 1038 vpshufd $0x55,%ymm7,%ymm5 1039 vpaddd L$incy(%rip),%ymm4,%ymm4 1040 vpshufd $0xaa,%ymm7,%ymm6 1041 vmovdqa %ymm5,544-512(%rax) 1042 vpshufd $0xff,%ymm7,%ymm7 1043 vmovdqa %ymm6,576-512(%rax) 1044 vmovdqa %ymm7,608-512(%rax) 1045 1046 jmp L$oop_enter8x 1047 1048 .p2align 5 1049 L$oop_outer8x: 1050 vmovdqa 128-256(%rcx),%ymm8 1051 vmovdqa 160-256(%rcx),%ymm9 1052 vmovdqa 192-256(%rcx),%ymm10 1053 vmovdqa 224-256(%rcx),%ymm11 1054 vmovdqa 256-256(%rcx),%ymm0 1055 vmovdqa 288-256(%rcx),%ymm1 1056 vmovdqa 320-256(%rcx),%ymm2 1057 vmovdqa 352-256(%rcx),%ymm3 1058 vmovdqa 384-512(%rax),%ymm12 1059 vmovdqa 416-512(%rax),%ymm13 1060 vmovdqa 448-512(%rax),%ymm14 1061 vmovdqa 480-512(%rax),%ymm15 1062 vmovdqa 512-512(%rax),%ymm4 1063 vmovdqa 544-512(%rax),%ymm5 1064 vmovdqa 576-512(%rax),%ymm6 1065 vmovdqa 608-512(%rax),%ymm7 1066 vpaddd L$eight(%rip),%ymm4,%ymm4 1067 1068 L$oop_enter8x: 1069 vmovdqa %ymm14,64(%rsp) 1070 vmovdqa %ymm15,96(%rsp) 1071 vbroadcasti128 (%r10),%ymm15 1072 vmovdqa %ymm4,512-512(%rax) 1073 movl $10,%eax 1074 jmp L$oop8x 1075 1076 .p2align 5 1077 L$oop8x: 1078 vpaddd %ymm0,%ymm8,%ymm8 1079 vpxor %ymm4,%ymm8,%ymm4 1080 vpshufb %ymm15,%ymm4,%ymm4 1081 vpaddd %ymm1,%ymm9,%ymm9 1082 vpxor %ymm5,%ymm9,%ymm5 1083 vpshufb %ymm15,%ymm5,%ymm5 1084 vpaddd %ymm4,%ymm12,%ymm12 1085 vpxor 
%ymm0,%ymm12,%ymm0 1086 vpslld $12,%ymm0,%ymm14 1087 vpsrld $20,%ymm0,%ymm0 1088 vpor %ymm0,%ymm14,%ymm0 1089 vbroadcasti128 (%r11),%ymm14 1090 vpaddd %ymm5,%ymm13,%ymm13 1091 vpxor %ymm1,%ymm13,%ymm1 1092 vpslld $12,%ymm1,%ymm15 1093 vpsrld $20,%ymm1,%ymm1 1094 vpor %ymm1,%ymm15,%ymm1 1095 vpaddd %ymm0,%ymm8,%ymm8 1096 vpxor %ymm4,%ymm8,%ymm4 1097 vpshufb %ymm14,%ymm4,%ymm4 1098 vpaddd %ymm1,%ymm9,%ymm9 1099 vpxor %ymm5,%ymm9,%ymm5 1100 vpshufb %ymm14,%ymm5,%ymm5 1101 vpaddd %ymm4,%ymm12,%ymm12 1102 vpxor %ymm0,%ymm12,%ymm0 1103 vpslld $7,%ymm0,%ymm15 1104 vpsrld $25,%ymm0,%ymm0 1105 vpor %ymm0,%ymm15,%ymm0 1106 vbroadcasti128 (%r10),%ymm15 1107 vpaddd %ymm5,%ymm13,%ymm13 1108 vpxor %ymm1,%ymm13,%ymm1 1109 vpslld $7,%ymm1,%ymm14 1110 vpsrld $25,%ymm1,%ymm1 1111 vpor %ymm1,%ymm14,%ymm1 1112 vmovdqa %ymm12,0(%rsp) 1113 vmovdqa %ymm13,32(%rsp) 1114 vmovdqa 64(%rsp),%ymm12 1115 vmovdqa 96(%rsp),%ymm13 1116 vpaddd %ymm2,%ymm10,%ymm10 1117 vpxor %ymm6,%ymm10,%ymm6 1118 vpshufb %ymm15,%ymm6,%ymm6 1119 vpaddd %ymm3,%ymm11,%ymm11 1120 vpxor %ymm7,%ymm11,%ymm7 1121 vpshufb %ymm15,%ymm7,%ymm7 1122 vpaddd %ymm6,%ymm12,%ymm12 1123 vpxor %ymm2,%ymm12,%ymm2 1124 vpslld $12,%ymm2,%ymm14 1125 vpsrld $20,%ymm2,%ymm2 1126 vpor %ymm2,%ymm14,%ymm2 1127 vbroadcasti128 (%r11),%ymm14 1128 vpaddd %ymm7,%ymm13,%ymm13 1129 vpxor %ymm3,%ymm13,%ymm3 1130 vpslld $12,%ymm3,%ymm15 1131 vpsrld $20,%ymm3,%ymm3 1132 vpor %ymm3,%ymm15,%ymm3 1133 vpaddd %ymm2,%ymm10,%ymm10 1134 vpxor %ymm6,%ymm10,%ymm6 1135 vpshufb %ymm14,%ymm6,%ymm6 1136 vpaddd %ymm3,%ymm11,%ymm11 1137 vpxor %ymm7,%ymm11,%ymm7 1138 vpshufb %ymm14,%ymm7,%ymm7 1139 vpaddd %ymm6,%ymm12,%ymm12 1140 vpxor %ymm2,%ymm12,%ymm2 1141 vpslld $7,%ymm2,%ymm15 1142 vpsrld $25,%ymm2,%ymm2 1143 vpor %ymm2,%ymm15,%ymm2 1144 vbroadcasti128 (%r10),%ymm15 1145 vpaddd %ymm7,%ymm13,%ymm13 1146 vpxor %ymm3,%ymm13,%ymm3 1147 vpslld $7,%ymm3,%ymm14 1148 vpsrld $25,%ymm3,%ymm3 1149 vpor %ymm3,%ymm14,%ymm3 1150 vpaddd %ymm1,%ymm8,%ymm8 1151 vpxor 
%ymm7,%ymm8,%ymm7 1152 vpshufb %ymm15,%ymm7,%ymm7 1153 vpaddd %ymm2,%ymm9,%ymm9 1154 vpxor %ymm4,%ymm9,%ymm4 1155 vpshufb %ymm15,%ymm4,%ymm4 1156 vpaddd %ymm7,%ymm12,%ymm12 1157 vpxor %ymm1,%ymm12,%ymm1 1158 vpslld $12,%ymm1,%ymm14 1159 vpsrld $20,%ymm1,%ymm1 1160 vpor %ymm1,%ymm14,%ymm1 1161 vbroadcasti128 (%r11),%ymm14 1162 vpaddd %ymm4,%ymm13,%ymm13 1163 vpxor %ymm2,%ymm13,%ymm2 1164 vpslld $12,%ymm2,%ymm15 1165 vpsrld $20,%ymm2,%ymm2 1166 vpor %ymm2,%ymm15,%ymm2 1167 vpaddd %ymm1,%ymm8,%ymm8 1168 vpxor %ymm7,%ymm8,%ymm7 1169 vpshufb %ymm14,%ymm7,%ymm7 1170 vpaddd %ymm2,%ymm9,%ymm9 1171 vpxor %ymm4,%ymm9,%ymm4 1172 vpshufb %ymm14,%ymm4,%ymm4 1173 vpaddd %ymm7,%ymm12,%ymm12 1174 vpxor %ymm1,%ymm12,%ymm1 1175 vpslld $7,%ymm1,%ymm15 1176 vpsrld $25,%ymm1,%ymm1 1177 vpor %ymm1,%ymm15,%ymm1 1178 vbroadcasti128 (%r10),%ymm15 1179 vpaddd %ymm4,%ymm13,%ymm13 1180 vpxor %ymm2,%ymm13,%ymm2 1181 vpslld $7,%ymm2,%ymm14 1182 vpsrld $25,%ymm2,%ymm2 1183 vpor %ymm2,%ymm14,%ymm2 1184 vmovdqa %ymm12,64(%rsp) 1185 vmovdqa %ymm13,96(%rsp) 1186 vmovdqa 0(%rsp),%ymm12 1187 vmovdqa 32(%rsp),%ymm13 1188 vpaddd %ymm3,%ymm10,%ymm10 1189 vpxor %ymm5,%ymm10,%ymm5 1190 vpshufb %ymm15,%ymm5,%ymm5 1191 vpaddd %ymm0,%ymm11,%ymm11 1192 vpxor %ymm6,%ymm11,%ymm6 1193 vpshufb %ymm15,%ymm6,%ymm6 1194 vpaddd %ymm5,%ymm12,%ymm12 1195 vpxor %ymm3,%ymm12,%ymm3 1196 vpslld $12,%ymm3,%ymm14 1197 vpsrld $20,%ymm3,%ymm3 1198 vpor %ymm3,%ymm14,%ymm3 1199 vbroadcasti128 (%r11),%ymm14 1200 vpaddd %ymm6,%ymm13,%ymm13 1201 vpxor %ymm0,%ymm13,%ymm0 1202 vpslld $12,%ymm0,%ymm15 1203 vpsrld $20,%ymm0,%ymm0 1204 vpor %ymm0,%ymm15,%ymm0 1205 vpaddd %ymm3,%ymm10,%ymm10 1206 vpxor %ymm5,%ymm10,%ymm5 1207 vpshufb %ymm14,%ymm5,%ymm5 1208 vpaddd %ymm0,%ymm11,%ymm11 1209 vpxor %ymm6,%ymm11,%ymm6 1210 vpshufb %ymm14,%ymm6,%ymm6 1211 vpaddd %ymm5,%ymm12,%ymm12 1212 vpxor %ymm3,%ymm12,%ymm3 1213 vpslld $7,%ymm3,%ymm15 1214 vpsrld $25,%ymm3,%ymm3 1215 vpor %ymm3,%ymm15,%ymm3 1216 vbroadcasti128 (%r10),%ymm15 1217 vpaddd 
%ymm6,%ymm13,%ymm13 1218 vpxor %ymm0,%ymm13,%ymm0 1219 vpslld $7,%ymm0,%ymm14 1220 vpsrld $25,%ymm0,%ymm0 1221 vpor %ymm0,%ymm14,%ymm0 1222 decl %eax 1223 jnz L$oop8x 1224 1225 leaq 512(%rsp),%rax 1226 vpaddd 128-256(%rcx),%ymm8,%ymm8 1227 vpaddd 160-256(%rcx),%ymm9,%ymm9 1228 vpaddd 192-256(%rcx),%ymm10,%ymm10 1229 vpaddd 224-256(%rcx),%ymm11,%ymm11 1230 1231 vpunpckldq %ymm9,%ymm8,%ymm14 1232 vpunpckldq %ymm11,%ymm10,%ymm15 1233 vpunpckhdq %ymm9,%ymm8,%ymm8 1234 vpunpckhdq %ymm11,%ymm10,%ymm10 1235 vpunpcklqdq %ymm15,%ymm14,%ymm9 1236 vpunpckhqdq %ymm15,%ymm14,%ymm14 1237 vpunpcklqdq %ymm10,%ymm8,%ymm11 1238 vpunpckhqdq %ymm10,%ymm8,%ymm8 1239 vpaddd 256-256(%rcx),%ymm0,%ymm0 1240 vpaddd 288-256(%rcx),%ymm1,%ymm1 1241 vpaddd 320-256(%rcx),%ymm2,%ymm2 1242 vpaddd 352-256(%rcx),%ymm3,%ymm3 1243 1244 vpunpckldq %ymm1,%ymm0,%ymm10 1245 vpunpckldq %ymm3,%ymm2,%ymm15 1246 vpunpckhdq %ymm1,%ymm0,%ymm0 1247 vpunpckhdq %ymm3,%ymm2,%ymm2 1248 vpunpcklqdq %ymm15,%ymm10,%ymm1 1249 vpunpckhqdq %ymm15,%ymm10,%ymm10 1250 vpunpcklqdq %ymm2,%ymm0,%ymm3 1251 vpunpckhqdq %ymm2,%ymm0,%ymm0 1252 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1253 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1254 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1255 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1256 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1257 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1258 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1259 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1260 vmovdqa %ymm15,0(%rsp) 1261 vmovdqa %ymm9,32(%rsp) 1262 vmovdqa 64(%rsp),%ymm15 1263 vmovdqa 96(%rsp),%ymm9 1264 1265 vpaddd 384-512(%rax),%ymm12,%ymm12 1266 vpaddd 416-512(%rax),%ymm13,%ymm13 1267 vpaddd 448-512(%rax),%ymm15,%ymm15 1268 vpaddd 480-512(%rax),%ymm9,%ymm9 1269 1270 vpunpckldq %ymm13,%ymm12,%ymm2 1271 vpunpckldq %ymm9,%ymm15,%ymm8 1272 vpunpckhdq %ymm13,%ymm12,%ymm12 1273 vpunpckhdq %ymm9,%ymm15,%ymm15 1274 vpunpcklqdq %ymm8,%ymm2,%ymm13 1275 vpunpckhqdq %ymm8,%ymm2,%ymm2 1276 vpunpcklqdq %ymm15,%ymm12,%ymm9 1277 vpunpckhqdq %ymm15,%ymm12,%ymm12 1278 
vpaddd 512-512(%rax),%ymm4,%ymm4 1279 vpaddd 544-512(%rax),%ymm5,%ymm5 1280 vpaddd 576-512(%rax),%ymm6,%ymm6 1281 vpaddd 608-512(%rax),%ymm7,%ymm7 1282 1283 vpunpckldq %ymm5,%ymm4,%ymm15 1284 vpunpckldq %ymm7,%ymm6,%ymm8 1285 vpunpckhdq %ymm5,%ymm4,%ymm4 1286 vpunpckhdq %ymm7,%ymm6,%ymm6 1287 vpunpcklqdq %ymm8,%ymm15,%ymm5 1288 vpunpckhqdq %ymm8,%ymm15,%ymm15 1289 vpunpcklqdq %ymm6,%ymm4,%ymm7 1290 vpunpckhqdq %ymm6,%ymm4,%ymm4 1291 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1292 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1293 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1294 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1295 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1296 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1297 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1298 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1299 vmovdqa 0(%rsp),%ymm6 1300 vmovdqa 32(%rsp),%ymm12 1301 1302 cmpq $512,%rdx 1303 jb L$tail8x 1304 1305 vpxor 0(%rsi),%ymm6,%ymm6 1306 vpxor 32(%rsi),%ymm8,%ymm8 1307 vpxor 64(%rsi),%ymm1,%ymm1 1308 vpxor 96(%rsi),%ymm5,%ymm5 1309 leaq 128(%rsi),%rsi 1310 vmovdqu %ymm6,0(%rdi) 1311 vmovdqu %ymm8,32(%rdi) 1312 vmovdqu %ymm1,64(%rdi) 1313 vmovdqu %ymm5,96(%rdi) 1314 leaq 128(%rdi),%rdi 1315 1316 vpxor 0(%rsi),%ymm12,%ymm12 1317 vpxor 32(%rsi),%ymm13,%ymm13 1318 vpxor 64(%rsi),%ymm10,%ymm10 1319 vpxor 96(%rsi),%ymm15,%ymm15 1320 leaq 128(%rsi),%rsi 1321 vmovdqu %ymm12,0(%rdi) 1322 vmovdqu %ymm13,32(%rdi) 1323 vmovdqu %ymm10,64(%rdi) 1324 vmovdqu %ymm15,96(%rdi) 1325 leaq 128(%rdi),%rdi 1326 1327 vpxor 0(%rsi),%ymm14,%ymm14 1328 vpxor 32(%rsi),%ymm2,%ymm2 1329 vpxor 64(%rsi),%ymm3,%ymm3 1330 vpxor 96(%rsi),%ymm7,%ymm7 1331 leaq 128(%rsi),%rsi 1332 vmovdqu %ymm14,0(%rdi) 1333 vmovdqu %ymm2,32(%rdi) 1334 vmovdqu %ymm3,64(%rdi) 1335 vmovdqu %ymm7,96(%rdi) 1336 leaq 128(%rdi),%rdi 1337 1338 vpxor 0(%rsi),%ymm11,%ymm11 1339 vpxor 32(%rsi),%ymm9,%ymm9 1340 vpxor 64(%rsi),%ymm0,%ymm0 1341 vpxor 96(%rsi),%ymm4,%ymm4 1342 leaq 128(%rsi),%rsi 1343 vmovdqu %ymm11,0(%rdi) 1344 vmovdqu %ymm9,32(%rdi) 1345 vmovdqu %ymm0,64(%rdi) 1346 
vmovdqu %ymm4,96(%rdi) 1347 leaq 128(%rdi),%rdi 1348 1349 subq $512,%rdx 1350 jnz L$oop_outer8x 1351 1352 jmp L$done8x 1353 1354 L$tail8x: 1355 cmpq $448,%rdx 1356 jae L$448_or_more8x 1357 cmpq $384,%rdx 1358 jae L$384_or_more8x 1359 cmpq $320,%rdx 1360 jae L$320_or_more8x 1361 cmpq $256,%rdx 1362 jae L$256_or_more8x 1363 cmpq $192,%rdx 1364 jae L$192_or_more8x 1365 cmpq $128,%rdx 1366 jae L$128_or_more8x 1367 cmpq $64,%rdx 1368 jae L$64_or_more8x 1369 1370 xorq %r10,%r10 1371 vmovdqa %ymm6,0(%rsp) 1372 vmovdqa %ymm8,32(%rsp) 1373 jmp L$oop_tail8x 1374 1375 .p2align 5 1376 L$64_or_more8x: 1377 vpxor 0(%rsi),%ymm6,%ymm6 1378 vpxor 32(%rsi),%ymm8,%ymm8 1379 vmovdqu %ymm6,0(%rdi) 1380 vmovdqu %ymm8,32(%rdi) 1381 je L$done8x 1382 1383 leaq 64(%rsi),%rsi 1384 xorq %r10,%r10 1385 vmovdqa %ymm1,0(%rsp) 1386 leaq 64(%rdi),%rdi 1387 subq $64,%rdx 1388 vmovdqa %ymm5,32(%rsp) 1389 jmp L$oop_tail8x 1390 1391 .p2align 5 1392 L$128_or_more8x: 1393 vpxor 0(%rsi),%ymm6,%ymm6 1394 vpxor 32(%rsi),%ymm8,%ymm8 1395 vpxor 64(%rsi),%ymm1,%ymm1 1396 vpxor 96(%rsi),%ymm5,%ymm5 1397 vmovdqu %ymm6,0(%rdi) 1398 vmovdqu %ymm8,32(%rdi) 1399 vmovdqu %ymm1,64(%rdi) 1400 vmovdqu %ymm5,96(%rdi) 1401 je L$done8x 1402 1403 leaq 128(%rsi),%rsi 1404 xorq %r10,%r10 1405 vmovdqa %ymm12,0(%rsp) 1406 leaq 128(%rdi),%rdi 1407 subq $128,%rdx 1408 vmovdqa %ymm13,32(%rsp) 1409 jmp L$oop_tail8x 1410 1411 .p2align 5 1412 L$192_or_more8x: 1413 vpxor 0(%rsi),%ymm6,%ymm6 1414 vpxor 32(%rsi),%ymm8,%ymm8 1415 vpxor 64(%rsi),%ymm1,%ymm1 1416 vpxor 96(%rsi),%ymm5,%ymm5 1417 vpxor 128(%rsi),%ymm12,%ymm12 1418 vpxor 160(%rsi),%ymm13,%ymm13 1419 vmovdqu %ymm6,0(%rdi) 1420 vmovdqu %ymm8,32(%rdi) 1421 vmovdqu %ymm1,64(%rdi) 1422 vmovdqu %ymm5,96(%rdi) 1423 vmovdqu %ymm12,128(%rdi) 1424 vmovdqu %ymm13,160(%rdi) 1425 je L$done8x 1426 1427 leaq 192(%rsi),%rsi 1428 xorq %r10,%r10 1429 vmovdqa %ymm10,0(%rsp) 1430 leaq 192(%rdi),%rdi 1431 subq $192,%rdx 1432 vmovdqa %ymm15,32(%rsp) 1433 jmp L$oop_tail8x 1434 1435 .p2align 5 1436 
/*
 * Tail handlers for byte counts of 256 and up, followed by the shared byte
 * loop and the exit path.
 *
 * Each handler XORs a fixed number of bytes from (%rsi) with %ymm registers
 * and stores the result to (%rdi), issuing every load before the first
 * store. The je that follows consumes flags from a compare executed before
 * the handler was entered (vpxor and vmovdqu do not write EFLAGS); it leaves
 * when the count in %rdx equalled the handler's threshold. Otherwise the
 * handler steps both pointers, reduces %rdx, spills the next register pair
 * to the 64-byte scratch area at (%rsp) (vmovdqa needs it 32-byte aligned)
 * and hands the leftover bytes to L$oop_tail8x with %r10 = 0.
 *
 * The %ymm registers presumably hold keystream generated above this excerpt.
 */
L$256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	L$done8x		/* count was exactly 256 */

	leaq	256(%rsi),%rsi
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm14,0(%rsp)
	vmovdqa	%ymm2,32(%rsp)
	xorl	%r10d,%r10d		/* byte index = 0 */
	jmp	L$oop_tail8x

.p2align 5
L$320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	L$done8x		/* count was exactly 320 */

	leaq	320(%rsi),%rsi
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm3,0(%rsp)
	vmovdqa	%ymm7,32(%rsp)
	xorl	%r10d,%r10d		/* byte index = 0 */
	jmp	L$oop_tail8x

.p2align 5
L$384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	L$done8x		/* count was exactly 384 */

	leaq	384(%rsi),%rsi
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm11,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	xorl	%r10d,%r10d		/* byte index = 0 */
	jmp	L$oop_tail8x

.p2align 5
L$448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	L$done8x		/* count was exactly 448 */

	leaq	448(%rsi),%rsi
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm0,0(%rsp)
	vmovdqa	%ymm4,32(%rsp)
	xorl	%r10d,%r10d		/* byte index = 0 */
	/* no jmp: falls straight into the byte loop below */

/*
 * L$oop_tail8x: byte-at-a-time finish.
 * For each of the %rdx remaining bytes:
 *   dst[%r10] = src[%r10] ^ scratch[%r10], with scratch at (%rsp).
 * Every path into this loop has stored 64 bytes to the scratch area and
 * cleared %r10 first. The loop tests %rdx only after decrementing it, so it
 * expects a non-zero count on entry.
 */
L$oop_tail8x:
	movzbl	(%rsp,%r10,1),%ecx	/* scratch byte */
	movzbl	(%rsi,%r10,1),%eax	/* source byte */
	xorl	%ecx,%eax
	movb	%al,(%rdi,%r10,1)
	leaq	1(%r10),%r10		/* lea: bump the index without touching flags */
	decq	%rdx
	jnz	L$oop_tail8x

/*
 * Common exit: zero all %ymm registers, then reload %rsp from %r9
 * (presumably the stack pointer saved by the prologue, which is outside this
 * excerpt) and return.
 */
L$done8x:
	vzeroall
	leaq	(%r9),%rsp
L$8x_epilogue:
.byte	0xf3,0xc3			/* rep ret */

#endif