/*
 * ChaCha20 for 32-bit x86, GNU as (AT&T syntax), ELF, preprocessed by cpp.
 *
 * Entry point:
 *   ChaCha20_ctr32(out, in, len, key, counter)        cdecl, all args on stack
 *     out     - destination buffer            (arg 0)
 *     in      - source buffer                 (arg 1)
 *     len     - number of bytes to process    (arg 2); 0 returns immediately
 *     key     - pointer to 32 bytes of key    (arg 3)
 *     counter - pointer to 16 bytes; word 0 is incremented once per 64-byte
 *               block, words 1..3 are used unchanged (arg 4)
 *   Computes out[i] = in[i] XOR keystream[i]. No return value is produced.
 *   ebp/ebx/esi/edi are saved and restored; eax/ecx/edx and xmm0-xmm7 are
 *   clobbered.
 *
 * ChaCha20_ctr32 picks the SSSE3 code when two bits of OPENSSL_ia32cap_P are
 * set (bit 24 of word 0 and bit 9 of word 1 - presumably the FXSR and SSSE3
 * feature bits; confirm against the definition of OPENSSL_ia32cap_P).
 *
 * NOTE(review): ChaCha20_ssse3 is exported, but its body expects eax to hold
 * the run-time address of .Lpic_point, which is only set up on the path that
 * comes through ChaCha20_ctr32. It also has no len == 0 check of its own.
 * Treat it as an internal continuation, not as a callable function.
 */
#if defined(__i386__)
.text
.globl	ChaCha20_ctr32
.hidden	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	16
ChaCha20_ctr32:
.L_ChaCha20_ctr32_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	/* After four pushes: 20(%esp)=out 24=in 28=len 32=key 36=counter. */
	xorl	%eax,%eax
	cmpl	28(%esp),%eax			/* len == 0 ? */
	je	.L000no_data
	/* call/pop yields the run-time address of .Lpic_point in eax (PIC). */
	call	.Lpic_point
.Lpic_point:
	popl	%eax
	leal	OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
	testl	$16777216,(%ebp)		/* bit 24 of capability word 0 */
	jz	.L001x86
	testl	$512,4(%ebp)			/* bit 9 of capability word 1 */
	jz	.L001x86
	jmp	.Lssse3_shortcut		/* eax still = &.Lpic_point */

	/*
	 * Scalar path. Frame after "subl $132,%esp":
	 *     0.. 60(%esp)  working state, word i at 4*i(%esp)
	 *    80..108(%esp)  copy of the 8 key words
	 *   112..124(%esp)  copy of the 4 counter words
	 *   128(%esp)       spilled round-loop counter
	 *   152/156/160     out / in / len (the caller's argument slots)
	 */
.L001x86:
	movl	32(%esp),%esi			/* esi = key */
	movl	36(%esp),%edi			/* edi = counter */
	subl	$132,%esp
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	/* Pre-decrement: .L002entry adds 1 before every block, including the first. */
	subl	$1,%eax
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	.L002entry
.align	16
.L003outer_loop:
	/* Write the advanced in/out pointers and remaining length back. */
	movl	%ebx,156(%esp)
	movl	%eax,152(%esp)
	movl	%ecx,160(%esp)
.L002entry:
	/* Words 0..3 are the constants that spell "expand 32-byte k". */
	movl	$1634760805,%eax		/* 0x61707865; word 0 stays in eax */
	movl	$857760878,4(%esp)		/* 0x3320646e */
	movl	$2036477234,8(%esp)		/* 0x79622d32 */
	movl	$1797285236,12(%esp)		/* 0x6b206574 */
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	addl	$1,%edx				/* block counter for this block */
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)
	movl	$10,%ebx			/* 10 loop iterations */
	jmp	.L004loop
.align	16
	/*
	 * One iteration applies eight add / xor / rotate-16,12,8,7 groups to the
	 * state (presumably the four column and four diagonal quarter-rounds of
	 * a ChaCha double round). Loads and stores of neighbouring state words
	 * are interleaved with the arithmetic, so statement order matters.
	 */
.L004loop:
	addl	%ebp,%eax
	movl	%ebx,128(%esp)			/* spill loop counter; ebx is reused */
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx			/* reload loop counter */
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	.L004loop

	/*
	 * Feed-forward: add the input block to the permuted state. Words still
	 * in registers here: eax=0 ebp=4 ecx=8 esi=9 edx=12 edi=14.
	 */
	movl	160(%esp),%ebx			/* remaining length */
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	.L005tail			/* final partial block */
	movl	156(%esp),%ebx			/* ebx = in */
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)			/* park word 0; eax becomes out */
	movl	152(%esp),%eax			/* eax = out */
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx			/* in  += 64 */
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp			/* parked word 0 */
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx			/* remaining length */
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax			/* out += 64 */
	subl	$64,%ecx
	jnz	.L003outer_loop
	jmp	.L006done

	/*
	 * Fewer than 64 bytes left (ebx = remaining, non-zero): finish the
	 * feed-forward into 0..60(%esp), then XOR byte by byte.
	 */
.L005tail:
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp			/* ebp = in */
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx			/* ecx = out */
	movl	%esi,44(%esp)
	xorl	%esi,%esi			/* esi = byte index */
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
.L007tail_loop:
	movb	(%esi,%ebp,1),%al		/* input byte */
	movb	(%esp,%esi,1),%dl		/* keystream byte */
	leal	1(%esi),%esi
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)		/* -1: esi was already advanced */
	decl	%ebx
	jnz	.L007tail_loop
.L006done:
	addl	$132,%esp
.L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin

/*
 * SSSE3 code path. Normally entered at .Lssse3_shortcut from ChaCha20_ctr32
 * with the four callee-saved registers already pushed and eax = &.Lpic_point.
 *
 * Frame (esp aligned down to 64 bytes):
 *   512(%esp)  caller esp (restored at .L011done)
 *   516(%esp)  key pointer        (4-block path only)
 *   520(%esp)  counter pointer    (4-block path only)
 *   ebp = esp+384: input state, one xmm per state word, each word broadcast
 *                  to all four lanes (the counter word differs per lane)
 *   ebx = esp+128: working state in the same layout
 *   0..63(%esp):   state / keystream block of the one-block path
 */
.globl	ChaCha20_ssse3
.hidden	ChaCha20_ssse3
.type	ChaCha20_ssse3,@function
.align	16
ChaCha20_ssse3:
.L_ChaCha20_ssse3_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
.Lssse3_shortcut:
	movl	20(%esp),%edi			/* edi = out */
	movl	24(%esp),%esi			/* esi = in */
	movl	28(%esp),%ecx			/* ecx = len */
	movl	32(%esp),%edx			/* edx = key */
	movl	36(%esp),%ebx			/* ebx = counter */
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp
	movl	%ebp,512(%esp)
	/* eax must hold &.Lpic_point here; see the NOTE at the top of the file. */
	leal	.Lssse3_data-.Lpic_point(%eax),%eax
	movdqu	(%ebx),%xmm3			/* xmm3 = counter block */
	cmpl	$256,%ecx
	jb	.L0081x				/* under 256 bytes: one block at a time */

	/* ---- four blocks (256 bytes) per iteration ---- */
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	movdqu	(%edx),%xmm7			/* key words 0..3 */
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	paddd	48(%eax),%xmm0			/* lane counters +0,+1,+2,+3 */
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	psubd	64(%eax),%xmm0			/* -4: the loop adds 4 before each use */
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,64(%ebp)
	movdqa	%xmm1,80(%ebp)
	movdqa	%xmm2,96(%ebp)
	movdqa	%xmm3,112(%ebp)
	movdqu	16(%edx),%xmm3			/* key words 4..7 */
	movdqa	%xmm4,-64(%ebp)
	movdqa	%xmm5,-48(%ebp)
	movdqa	%xmm6,-32(%ebp)
	movdqa	%xmm7,-16(%ebp)
	movdqa	32(%eax),%xmm7			/* the four constant words */
	leal	128(%esp),%ebx
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,(%ebp)
	movdqa	%xmm1,16(%ebp)
	movdqa	%xmm2,32(%ebp)
	movdqa	%xmm3,48(%ebp)
	movdqa	%xmm4,-128(%ebp)
	movdqa	%xmm5,-112(%ebp)
	movdqa	%xmm6,-96(%ebp)
	movdqa	%xmm7,-80(%ebp)
	/* Bias both pointers by 128 so the four blocks sit at -128/-64/0/+64. */
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	.L009outer_loop
.align	16
.L009outer_loop:
	/* Copy the input state into the working area; some words stay in registers. */
	movdqa	-112(%ebp),%xmm1
	movdqa	-96(%ebp),%xmm2
	movdqa	-80(%ebp),%xmm3
	movdqa	-48(%ebp),%xmm5
	movdqa	-32(%ebp),%xmm6
	movdqa	-16(%ebp),%xmm7
	movdqa	%xmm1,-112(%ebx)
	movdqa	%xmm2,-96(%ebx)
	movdqa	%xmm3,-80(%ebx)
	movdqa	%xmm5,-48(%ebx)
	movdqa	%xmm6,-32(%ebx)
	movdqa	%xmm7,-16(%ebx)
	movdqa	32(%ebp),%xmm2
	movdqa	48(%ebp),%xmm3
	movdqa	64(%ebp),%xmm4
	movdqa	80(%ebp),%xmm5
	movdqa	96(%ebp),%xmm6
	movdqa	112(%ebp),%xmm7
	paddd	64(%eax),%xmm4			/* every lane counter += 4 */
	movdqa	%xmm2,32(%ebx)
	movdqa	%xmm3,48(%ebx)
	movdqa	%xmm4,64(%ebx)
	movdqa	%xmm5,80(%ebx)
	movdqa	%xmm6,96(%ebx)
	movdqa	%xmm7,112(%ebx)
	movdqa	%xmm4,64(%ebp)			/* remember the advanced counters */
	movdqa	-128(%ebp),%xmm0
	movdqa	%xmm4,%xmm6
	movdqa	-64(%ebp),%xmm3
	movdqa	(%ebp),%xmm4
	movdqa	16(%ebp),%xmm5
	movl	$10,%edx			/* 10 loop iterations */
	nop
.align	16
	/*
	 * Same add/xor/rotate schedule as the scalar loop, applied to four blocks
	 * at once. Rotates by 16 and 8 use pshufb with the byte-permutation masks
	 * at (%eax) and 16(%eax); rotates by 12 and 7 use a shift pair plus por.
	 */
.L010loop:
	paddd	%xmm3,%xmm0
	movdqa	%xmm3,%xmm2
	pxor	%xmm0,%xmm6
	pshufb	(%eax),%xmm6
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-48(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	80(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,64(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-64(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	32(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-32(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	96(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,80(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,16(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-48(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	48(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-16(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	112(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,96(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-32(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	-48(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,%xmm6
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-16(%ebx)
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-32(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	64(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,112(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,32(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-48(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-16(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	80(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,64(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,48(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-32(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	16(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-64(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	96(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,80(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-16(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	64(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,96(%ebx)
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	por	%xmm1,%xmm3
	decl	%edx
	jnz	.L010loop

	/* Flush the words still held in registers back to the working area. */
	movdqa	%xmm3,-64(%ebx)
	movdqa	%xmm4,(%ebx)
	movdqa	%xmm5,16(%ebx)
	movdqa	%xmm6,64(%ebx)
	movdqa	%xmm7,96(%ebx)
	/*
	 * Output, four state words at a time: add the input state, transpose
	 * the 4x4 dword matrix with punpck* so each register holds 16
	 * consecutive bytes of one block, XOR with the input and store. The
	 * four blocks are 64 bytes apart; esi/edi advance 16 bytes per group.
	 */
	movdqa	-112(%ebx),%xmm1
	movdqa	-96(%ebx),%xmm2
	movdqa	-80(%ebx),%xmm3
	paddd	-128(%ebp),%xmm0
	paddd	-112(%ebp),%xmm1
	paddd	-96(%ebp),%xmm2
	paddd	-80(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	-64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	-48(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	-32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	-16(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	-64(%ebp),%xmm0
	paddd	-48(%ebp),%xmm1
	paddd	-32(%ebp),%xmm2
	paddd	-16(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	16(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	48(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	(%ebp),%xmm0
	paddd	16(%ebp),%xmm1
	paddd	32(%ebp),%xmm2
	paddd	48(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	80(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	96(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	112(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	64(%ebp),%xmm0
	paddd	80(%ebp),%xmm1
	paddd	96(%ebp),%xmm2
	paddd	112(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	208(%esi),%esi			/* 3*16 + 208 = 256 bytes per iteration */
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm5
	pxor	%xmm2,%xmm6
	pxor	%xmm3,%xmm7
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	.L009outer_loop			/* no borrow: another 256 bytes remain */
	addl	$256,%ecx			/* undo the bias: ecx = bytes left (<256) */
	jz	.L011done

	/* Fewer than 256 bytes left: rebuild the counter block for the 1x path. */
	movl	520(%esp),%ebx			/* counter pointer */
	leal	-128(%esi),%esi			/* remove the +128 pointer bias */
	movl	516(%esp),%edx			/* key pointer */
	leal	-128(%edi),%edi
	movd	64(%ebp),%xmm2			/* lane-0 counter of the last batch */
	movdqu	(%ebx),%xmm3
	paddd	96(%eax),%xmm2			/* +4: first counter not yet consumed */
	pand	112(%eax),%xmm3			/* keep counter words 1..3 */
	por	%xmm2,%xmm3

	/* ---- one block (64 bytes) per iteration ---- */
.L0081x:
	movdqa	32(%eax),%xmm0			/* constants */
	movdqu	(%edx),%xmm1			/* key words 0..3 */
	movdqu	16(%edx),%xmm2			/* key words 4..7 */
	movdqa	(%eax),%xmm6			/* rotate-by-16 byte shuffle mask */
	movdqa	16(%eax),%xmm7			/* rotate-by-8 byte shuffle mask */
	movl	%ebp,48(%esp)			/* overwritten by the xmm3 store below */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	.L012loop1x
.align	16
.L013outer1x:
	movdqa	80(%eax),%xmm3			/* 1,0,0,0 */
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3			/* block counter += 1 */
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	.L012loop1x
.align	16
	/*
	 * xmm0..xmm3 hold state rows 0..3. Each half of the loop body runs the
	 * add/xor/rotate steps on whole rows, then rotates rows 1..3 with pshufd;
	 * the second half uses the inverse shuffles to restore the row order.
	 */
.L012loop1x:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm6,%xmm3			/* rotate each dword left by 16 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm7,%xmm3			/* rotate each dword left by 8 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm6,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm7,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	.L012loop1x
	/* Feed-forward: add the saved input state. */
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	.L014tail
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	.L013outer1x
	jmp	.L011done
	/* Final partial block: spill the keystream and XOR byte by byte. */
.L014tail:
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp			/* ebp = byte index */
.L015tail_loop:
	movb	(%esp,%ebp,1),%al		/* keystream byte */
	movb	(%esi,%ebp,1),%dl		/* input byte */
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)		/* -1: ebp was already advanced */
	decl	%ecx
	jnz	.L015tail_loop
.L011done:
	movl	512(%esp),%esp			/* restore the pre-alignment esp */
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin

/*
 * Constant table for the SSSE3 code, addressed relative to eax:
 *     0: pshufb mask, rotate each dword left by 16
 *    16: pshufb mask, rotate each dword left by 8
 *    32: the four constant state words
 *    48: per-lane counter offsets 0,1,2,3
 *    64: counter step for the 4-block loop
 *    80: counter step for the 1-block loop
 *    96: +4 applied to lane 0 when switching from the 4-block to the 1-block path
 *   112: mask that keeps counter words 1..3
 */
.align	64
.Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
.long	1634760805,857760878,2036477234,1797285236
.long	0,1,2,3
.long	4,4,4,4
.long	1,0,0,0
.long	4,0,0,0
.long	0,-1,-1,-1
.align	64
/* ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>" plus NUL */
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0
#endif
/*
 * Declare that this object does not need an executable stack. Without this
 * note GNU ld treats the object as requesting one.
 */
#if defined(__ELF__)
.section	.note.GNU-stack,"",@progbits
#endif