// ---------------------------------------------------------------------------
// Montgomery multiplication for x86_64 (AT&T syntax, assembled via cpp).
//
// Exported entry point: _bn_mul_mont (private extern).
// Argument registers as the code uses them:
//   rdi  result vector, num 64-bit words are written
//   rsi  first operand vector (a)
//   rdx  second operand vector (b)
//   rcx  modulus vector (n)
//   r8   pointer to one 64-bit word that is multiplied into the running
//        low word to derive the per-iteration reduction multiplier
//   r9   word count (only the low 32 bits are used)
// Returns 1 in rax. rbx, rbp and r12-r15 are saved on entry and restored on
// exit; rsp is restored from a copy stored inside the scratch frame.
// NOTE(review): this matches the conventional
//   int bn_mul_mont(rp, ap, bp, np, n0, num)
// prototype -- confirm against the C declaration.
//
// Three code paths share the entry point:
//   L$mul_enter    generic word-by-word loop
//   L$mul4x_enter  loop unrolled by four words
//   L$sqr8x_enter  squaring, delegates to _bn_sqr8x_internal (defined
//                  outside this file)
// Instruction order is significant throughout: the add/adc and sbb chains
// rely on the carry flag surviving the interleaved mov/lea/dec instructions.
// ---------------------------------------------------------------------------
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text



.globl	_bn_mul_mont
.private_extern _bn_mul_mont

.p2align	4
_bn_mul_mont:

	movl	%r9d,%r9d		// zero-extend num to 64 bits
	movq	%rsp,%rax		// rax = caller rsp, kept for the epilogue

	// Path selection:
	//   num not a multiple of 4, or num < 8   -> generic
	//   b != a                                -> 4x
	//   b == a and num a multiple of 8        -> sqr8x
	//   otherwise                             -> 4x
	testl	$3,%r9d
	jnz	L$mul_enter
	cmpl	$8,%r9d
	jb	L$mul_enter
	cmpq	%rsi,%rdx
	jne	L$mul4x_enter
	testl	$7,%r9d
	jz	L$sqr8x_enter
	jmp	L$mul4x_enter

// ---------------------------------------------------------------------------
// Generic path. Scratch vector tp[] lives at rsp:
//   tp[0 .. num-1]  running result
//   tp[num]         top carry word
//   tp[num+1]       saved caller rsp
// ---------------------------------------------------------------------------
.p2align	4
L$mul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	// r10 = (rsp - 16 - 8*num) rounded down to a 1024-byte boundary.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	// Lower rsp to r10 in steps of 4096 bytes, reading one word at every
	// step so that each page between the old and new rsp is touched.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
	jmp	L$mul_page_walk_done

.p2align	4
L$mul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
L$mul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	// tp[num+1] = caller rsp

L$mul_body:
	movq	%rdx,%r12		// r12 = b (rdx is clobbered by mulq)
	movq	(%r8),%r8		// r8 = word pointed to by the 5th argument
	movq	(%r12),%rbx		// rbx = b[0]
	movq	(%rsi),%rax		// rax = a[0]

	xorq	%r14,%r14		// r14 = outer index i
	xorq	%r15,%r15		// r15 = inner index j

	movq	%r8,%rbp
	mulq	%rbx			// rdx:rax = a[0] * b[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		// rax = n[0]

	imulq	%r10,%rbp		// rbp = low word * r8 = reduction multiplier
	movq	%rdx,%r11

	mulq	%rbp			// rdx:rax = n[0] * multiplier
	addq	%rax,%r10		// low word is discarded, only the carry is kept
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

	// First pass (i = 0): tp[] = (a * b[0] + n * multiplier) shifted down
	// by one word.
.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx			// a[j] * b[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15		// lea keeps the flags intact
	movq	%rdx,%r10

	mulq	%rbp			// n[j] * multiplier
	cmpq	%r9,%r15
	jne	L$1st

	addq	%rax,%r13
	movq	(%rsi),%rax		// preload a[0] for the next outer pass
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	// tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	// tp[num] = top carry

	leaq	1(%r14),%r14
	jmp	L$outer

	// Outer loop over b[i], i = 1 .. num-1: the same pass as above with
	// the previous tp[] contents accumulated in.
.p2align	4
L$outer:
	movq	(%r12,%r14,8),%rbx	// rbx = b[i]
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		// r10 = tp[0]
	mulq	%rbx			// a[0] * b[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// new reduction multiplier
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		// r10 = tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	// previous tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

	// Final step: rp[] = tp[] - n[] with borrow propagation. The xorq
	// clears CF; the mov/lea/jmp/dec instructions in between leave CF alone.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi		// rsi = tp
	movq	%r9,%r15
	jmp	L$sub
.p2align	4
L$sub:
	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

	// rax = tp[num] - borrow, used below as a bit mask. Branch-free select
	// of the copy source:
	//   rsi = (tp & mask) | (rp & ~mask)
	// NOTE(review): mask is presumably 0 or all-ones -- confirm the range
	// of tp[num].
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.p2align	4
L$copy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	// overwrite tp[i] with the index value
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

	movq	8(%rsp,%r9,8),%rsi	// rsi = saved caller rsp

	movq	$1,%rax			// return value
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$mul_epilogue:
.byte	0xf3,0xc3		// encodes rep ret


// ---------------------------------------------------------------------------
// 4x path: same algorithm, inner loops unrolled by four words.
// File-local symbol (no .globl); normally entered through L$mul4x_enter.
// Scratch frame:
//   tp[0 .. num-1]  running result
//   tp[num]         top carry word
//   tp[num+1]       saved caller rsp
//   tp[num+2]       saved result pointer (rdi is reused as an accumulator)
// ---------------------------------------------------------------------------
.p2align	4
bn_mul4x_mont:

	movl	%r9d,%r9d
	movq	%rsp,%rax

L$mul4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	// r10 = (rsp - 32 - 8*num) rounded down to a 1024-byte boundary.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	// Page-by-page descent to r10, one read per 4096-byte step.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
	jmp	L$mul4x_page_walk_done

L$mul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
L$mul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	// tp[num+1] = caller rsp

L$mul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	// tp[num+2] = result pointer
	movq	%rdx,%r12		// r12 = b
	movq	(%r8),%r8
	movq	(%r12),%rbx		// rbx = b[0]
	movq	(%rsi),%rax		// rax = a[0]

	xorq	%r14,%r14		// i = 0
	xorq	%r15,%r15		// j = 0

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// reduction multiplier
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		// tp[0]
	movq	%rdx,%r13
	jmp	L$1st4x

	// First pass, four words per iteration. r10/r11 alternate as the carry
	// of the a*b column, r13/rdi as the carry of the n*multiplier column.
.p2align	4
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$1st4x

	// Tail of the first pass: the last two words plus the top carry.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		// preload a[0] for the next outer pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	// tp[num] = top carry

	leaq	1(%r14),%r14

	// Outer loop over b[i], i = 1 .. num-1, accumulating into tp[].
.p2align	2
L$outer4x:
	movq	(%r12,%r14,8),%rbx	// rbx = b[i]
	xorq	%r15,%r15
	movq	(%rsp),%r10		// r10 = tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// new reduction multiplier
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		// + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$inner4x

.p2align	4
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$inner4x

	// Tail of the inner pass; the outer index is advanced here.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	// + previous tp[num]
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	L$outer4x

	// Final step: rp[] = tp[] - n[], four words per iteration.
	// r15 = (num - 4) / 4 loop iterations, the last group is handled
	// after the loop. xmm0 is zeroed for the scratch wipe further down.
	movq	16(%rsp,%r9,8),%rdi	// reload result pointer
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi		// rsi = tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax		// start of the borrow chain
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	jmp	L$sub4x
.p2align	4
L$sub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15			// dec leaves CF untouched
	jnz	L$sub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	// rax = tp[num]
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	// rax = tp[num] - borrow, used as a select mask:
	//   rsi = (tp & mask) | (rp & ~mask)
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-4(%r9),%r15
	orq	%rcx,%rsi
	shrq	$2,%r15

	// Copy the selected source to rp 16 bytes at a time and store zeros
	// over tp[] as it goes. r14 is a byte offset in this loop.
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	L$copy4x
.p2align	4
L$copy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	L$copy4x

	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi	// rsi = saved caller rsp

	movq	$1,%rax			// return value
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$mul4x_epilogue:
.byte	0xf3,0xc3		// encodes rep ret


// ---------------------------------------------------------------------------
// Squaring path (taken when both operand pointers are equal and num is a
// multiple of 8). Builds a 64-byte aligned frame, calls _bn_sqr8x_internal,
// then performs the final subtraction and a masked copy into the result.
// Frame slots written here: 32(%rsp) = word loaded from r8,
//                           40(%rsp) = saved caller rsp.
// File-local symbol (no .globl); normally entered through L$sqr8x_enter.
// ---------------------------------------------------------------------------
.p2align	5
bn_sqr8x_mont:

	movq	%rsp,%rax

L$sqr8x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

L$sqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			// r9  = 8*num (bytes)
	shlq	$3+2,%r10		// r10 = 32*num
	negq	%r9			// r9  = -8*num

	// Pick the frame base rbp from the distance, modulo 4096, between
	// the tentative frame (rsp - 64 - 16*num) and the input vector.
	// NOTE(review): presumably avoids the frame and a[] sharing page
	// offsets -- the code shows only the arithmetic, confirm the intent.
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$sqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	L$sqr8x_sp_done

.p2align	5
L$sqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10			// mov, not xor: CF from the subq feeds cmovc
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$sqr8x_sp_done:
	andq	$-64,%rbp		// 64-byte align the frame base

	// Page-by-page descent to rbp, one read per 4096-byte step.
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
	jmp	L$sqr8x_page_walk_done

.p2align	4
L$sqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
L$sqr8x_page_walk_done:

	movq	%r9,%r10		// r10 = -8*num
	negq	%r9			// r9  =  8*num

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)		// saved caller rsp

L$sqr8x_body:

	// Stash values in xmm registers across the call.
.byte	102,72,15,110,209	// encodes movq %rcx,%xmm2
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207	// encodes movq %rdi,%xmm1
.byte	102,73,15,110,218	// encodes movq %r10,%xmm3
	call	_bn_sqr8x_internal

	// NOTE(review): the values of rax, rbp, rdi and r9 read below are
	// whatever _bn_sqr8x_internal leaves behind; that routine is not in
	// this file. The loop counters only terminate if r9 is negative here
	// -- confirm against its definition.
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207	// encodes movq %xmm1,%rdi (result pointer)
	sarq	$3+2,%rcx		// count of 4-word groups; sets CF from the last bit shifted out
	jmp	L$sqr8x_sub

	// Subtract the vector at rbp from the vector at rbx, four words per
	// iteration, storing the difference at rdi. rcx counts up to zero.
.p2align	5
L$sqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx			// inc leaves CF untouched
	jnz	L$sqr8x_sub

	sbbq	$0,%rax			// rax minus final borrow = select mask
	leaq	(%rbx,%r9,1),%rbx	// rewind both pointers
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200	// encodes movq %rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		// broadcast the low dword of the mask
	movq	40(%rsp),%rsi		// rsi = saved caller rsp

	jmp	L$sqr8x_cond_copy

	// Masked copy, 32 bytes per iteration:
	//   out = (scratch & mask) | (out & ~mask)
	// The scratch words just read, and the words rdx bytes away from them,
	// are overwritten with zeros. xmm0 is turned into ~mask by comparing
	// the mask against zero and is reset to zero before the next round.
.p2align	5
L$sqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	L$sqr8x_cond_copy

	movq	$1,%rax			// return value
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$sqr8x_epilogue:
.byte	0xf3,0xc3		// encodes rep ret


// NUL-terminated ASCII ident string:
// "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	4
#endif