#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
/*
 * Montgomery multiplication for x86-64 (CRYPTOGAMS-style generated code;
 * see the identification string in the trailing .byte directive).
 *
 * AT&T/GAS syntax, System V AMD64 ABI.  Common argument layout for all
 * entry points (as used by the code below):
 *   %rdi = rp   result vector (written in the final subtract/copy phases)
 *   %rsi = ap   first input vector of 64-bit words
 *   %rdx = bp   second input vector (moved to %r12 early on)
 *   %rcx = np   vector subtracted word-by-word at the end -- presumably the
 *               modulus; verify against the C prototype of bn_mul_mont
 *   %r8  = &n0  pointer to a single 64-bit word loaded once
 *               (NOTE(review): in Montgomery code this is conventionally
 *               -N^-1 mod 2^64 -- not provable from this file alone)
 *   %r9  = num  number of 64-bit words (only the low 32 bits are used:
 *               each entry does "movl %r9d,%r9d" to zero-extend)
 * Return: %rax = 1.
 */
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

.globl	bn_mul_mont
.hidden bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		/* zero-extend num to 64 bits */
	movq	%rsp,%rax		/* %rax = original %rsp, saved across the frame */
.cfi_def_cfa_register	%rax
	/* Dispatch: generic 1x loop unless num is a multiple of 4 and >= 8;
	 * squaring path only when bp == ap and num is a multiple of 8. */
	testl	$3,%r9d
	jnz	.Lmul_enter		/* num % 4 != 0 -> generic path */
	cmpl	$8,%r9d
	jb	.Lmul_enter		/* num < 8      -> generic path */
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter		/* bp != ap     -> 4x multiply */
	testl	$7,%r9d
	jz	.Lsqr8x_enter		/* bp == ap, num % 8 == 0 -> squaring */
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	/* Save all callee-saved GPRs (SysV: rbx, rbp, r12-r15). */
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* Carve a scratch area tp[] of num(+2) words below %rsp,
	 * 1024-byte aligned.  %r10 = prospective new stack top. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		/* probe the new top page */
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	/* Touch each 4 KiB page on the way down so the guard page is
	 * never skipped (stack probing). */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* stash original %rsp above tp[num] */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		/* %r12 = bp */
	movq	(%r8),%r8		/* %r8  = n0 word */
	movq	(%r12),%rbx		/* %rbx = bp[0] */
	movq	(%rsi),%rax		/* %rax = ap[0] */

	xorq	%r14,%r14		/* i = 0 (outer index) */
	xorq	%r15,%r15		/* j = 0 (inner index) */

	/* First outer iteration: tp[] = ap[] * bp[0] + m * np[],
	 * with m = tp[0] * n0 computed on the fly. */
	movq	%r8,%rbp
	mulq	%rbx			/* ap[0] * bp[0] */
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		/* m = low(ap[0]*bp[0]) * n0 */
	movq	%rdx,%r11

	mulq	%rbp			/* np[0] * m */
	addq	%rax,%r10		/* discard tp[0]: becomes zero by construction */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	/* Inner loop, one word per pass: accumulate ap[j]*bp[0] and
	 * np[j]*m with carry chains in %r11/%r13, store tp[j-1]. */
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			/* ap[j] * bp[0] */
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			/* np[j] * m */
	cmpq	%r9,%r15
	jne	.L1st

	/* Flush the final partial products and the top carry word. */
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	/* tp[num-1] */
	movq	%rdx,(%rsp,%r9,8)	/* tp[num] = carry */

	leaq	1(%r14),%r14		/* i = 1 */
	jmp	.Louter
.align	16
.Louter:
	/* Outer loop over bp[i]: tp[] = (tp[] + ap[]*bp[i] + m*np[]) / 2^64. */
	movq	(%r12,%r14,8),%rbx	/* %rbx = bp[i] */
	xorq	%r15,%r15		/* j = 0 */
	movq	%r8,%rbp
	movq	(%rsp),%r10		/* tp[0] */
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		/* m = (tp[0] + ap[0]*bp[i]) * n0 */
	movq	%rdx,%r11

	mulq	%rbp			/* np[0] * m -- zeroes the low word */
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		/* preload tp[1] */
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		/* + tp[j] */
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			/* ap[j] * bp[i] */
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10		/* tp[j] + ap[j]*bp[i] */
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			/* np[j] * m */
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		/* + previous top carry */
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/* Final reduction: rp[] = tp[] - np[], keeping the borrow. */
	xorq	%r14,%r14
	movq	(%rsp),%rax		/* tp[0]; CF is clear after xorq */
	leaq	(%rsp),%rsi		/* %rsi = tp */
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:
	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	/* rp[j] = tp[j] - np[j] (borrow chained) */
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			/* %rax = 0 if tp >= np, else all-ones */
	xorq	%r14,%r14
	/* Branchless select between tp (borrow set) and rp (no borrow):
	 * rsi = (tp & mask) | (rp & ~mask).  Keeps the choice data-driven. */
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/* Copy the selected vector into rp[] and wipe tp[] as we go. */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	/* clobber tp[j] (scrub scratch) */
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Epilogue: recover original %rsp saved at tp[num+1], restore regs. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return 1 */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		/* rep ret (0xf3 0xc3) */
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont

/*
 * bn_mul4x_mont -- same contract as bn_mul_mont, inner loops unrolled
 * 4 words per iteration.  Reached via .Lmul4x_enter when num % 4 == 0
 * and num >= 8.  %rdi is reused as a carry temp inside the loops, so
 * the real rp is parked at 16(%rsp,%r9,8) until the subtract phase.
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* Allocate tp[] scratch (num+4 words incl. saved rsp/rp slots),
	 * 1024-byte aligned, with page-walk probing as in bn_mul_mont. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* save original %rsp */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	/* save rp; %rdi becomes a scratch reg */
	movq	%rdx,%r12		/* %r12 = bp */
	movq	(%r8),%r8		/* n0 word */
	movq	(%r12),%rbx		/* bp[0] */
	movq	(%rsi),%rax		/* ap[0] */

	xorq	%r14,%r14		/* i = 0 */
	xorq	%r15,%r15		/* j = 0 */

	/* First outer pass (no tp[] to add yet): head of the 4x pipeline. */
	movq	%r8,%rbp
	mulq	%rbx			/* ap[0] * bp[0] */
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		/* m = tp[0] * n0 */
	movq	%rdx,%r11

	mulq	%rbp			/* np[0] * m */
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			/* ap[1] * bp[0] */
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			/* np[1] * m */
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		/* j = 4 */
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* 4 words per iteration; alternating mulq %rbx (ap*bp[0]) and
	 * mulq %rbp (np*m) with carries in %r10/%r11/%r13/%rdi. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Loop tail: last two word-pairs plus the top carry word. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		/* reload ap[0] for next outer pass */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	/* tp[num] = carry */

	leaq	1(%r14),%r14		/* i = 1 */
.align	4
.Louter4x:
	/* Outer loop over bp[i], now folding the previous tp[] back in. */
	movq	(%r12,%r14,8),%rbx	/* bp[i] */
	xorq	%r15,%r15
	movq	(%rsp),%r10		/* tp[0] */
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		/* m = (tp[0] + ap[0]*bp[i]) * n0 */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		/* + tp[1] */
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	/* As .L1st4x but each ap*bp product also absorbs the matching tp[j]. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		/* i++ (early, before loop test) */
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		/* reload ap[0] */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	/* + previous top carry */
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
	/* Final phase: rp[] = tp[] - np[], 4 words per iteration. */
	movq	16(%rsp,%r9,8),%rdi	/* recover rp */
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0		/* zero, used later to scrub tp[] */
	movq	8(%rsp),%rdx
	shrq	$2,%r15			/* (num-4)/4 iterations */
	leaq	(%rsp),%rsi		/* %rsi = tp */
	xorq	%r14,%r14

	subq	0(%rcx),%rax		/* start the borrow chain */
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	jmp	.Lsub4x
.align	16
.Lsub4x:
	/* Software-pipelined: store two results, subtract the next two. */
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	/* Drain the pipeline (last 4 words). */
	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			/* %rax = borrow mask (0 or all-ones) */
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	/* Branchless source select, as in bn_mul_mont's .Lcopy. */
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-4(%r9),%r15
	orq	%rcx,%rsi
	shrq	$2,%r15

	/* Copy selected vector to rp[] 32 bytes at a time, zeroing tp[]
	 * (movdqa is safe on tp: %rsp is 1024-byte aligned here). */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	/* Epilogue: identical shape to bn_mul_mont. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
.extern	bn_sqr8x_internal
.hidden bn_sqr8x_internal

/*
 * bn_sqr8x_mont -- squaring path (bp == ap, num % 8 == 0, reached via
 * .Lsqr8x_enter).  Delegates the square itself to bn_sqr8x_internal and
 * performs the final subtract + constant-time conditional copy here.
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			/* %r9  = num * 8 (bytes) */
	shlq	$3+2,%r10		/* %r10 = num * 32 */
	negq	%r9			/* -num*8 */

	/* Choose a stack frame of 2*num words + 64 bytes, placed so that
	 * (frame - ap) mod 4096 avoids cache-bank/page aliasing with ap. */
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		/* n0 word */
	subq	%rsi,%r11
	andq	$4095,%r11		/* alias distance within a page */
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp	/* alias-adjusted frame base */
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	/* Frame smaller than the alias distance: clamp adjustment to >= 0. */
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		/* 64-byte align the frame */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe */
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	/* Stack probing, one load per 4 KiB page. */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		/* %r10 = -num*8 */
	negq	%r9			/* %r9  = +num*8 */

	movq	%r8,32(%rsp)		/* pass n0 to bn_sqr8x_internal via stack */
	movq	%rax,40(%rsp)		/* saved original %rsp */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

	/* Raw-encoded GPR<->XMM moves (perlasm idiom; REX-prefixed movq
	 * forms that some assemblers mis-handle): */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2  -- np */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1  -- rp */
.byte	102,73,15,110,218		/* movq %r10,%xmm3  -- -num*8 */
	call	bn_sqr8x_internal	/* NOTE(review): defined elsewhere;
					 * expected to leave result at
					 * (%rdi,%r9) and top word in %rax --
					 * confirm against its definition */

	leaq	(%rdi,%r9,1),%rbx	/* %rbx = upper half of result */
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi  -- recover rp */
	sarq	$3+2,%rcx		/* negative iteration count, num/4 steps */
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	/* rp[] = result - np[], 4 words per iteration, borrow chained
	 * across iterations via sbbq. */
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			/* fold final borrow into mask source */
	leaq	(%rbx,%r9,1),%rbx	/* rewind pointers by num words */
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200		/* movq %rax,%xmm1 -- borrow mask seed */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		/* broadcast mask to all lanes */
	movq	40(%rsp),%rsi		/* original %rsp (for epilogue) */
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	/* Constant-time select: keep subtracted rp[] if no borrow,
	 * else restore the unreduced value; scrub the scratch copy. */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		/* zero lower-half scratch */
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)	/* zero upper-half scratch */
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		/* %xmm0 = ~mask (lanewise) */
	pand	%xmm1,%xmm2		/* unreduced & mask */
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4		/* subtracted & ~mask */
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9			/* %r9 climbs from -num*8 toward 0 */
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			/* return 1 */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/* ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>\0" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif