#if defined(__x86_64__)
# ------------------------------------------------------------------------
# Montgomery multiplication for x86_64 (ELF, AT&T syntax), machine-generated
# CRYPTOGAMS code (see the .byte banner at the end of the file).
#
# C-equivalent entry point:
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# SysV AMD64 ABI:
#   %rdi = rp (result), %rsi = ap, %rdx = bp, %rcx = np (modulus),
#   %r8  = &n0 (Montgomery constant, dereferenced once), %r9d = num (words).
# Returns 1 in %rax.
#
# bn_mul_mont is the dispatcher: it picks the generic 1-word loop, the
# unrolled 4x loop (.Lmul4x_enter), or the squaring path (.Lsqr8x_enter,
# only when bp == ap and num is a multiple of 8, num >= 8).
# ------------------------------------------------------------------------
.text

.extern	OPENSSL_ia32cap_P
.hidden	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# %rax = original %rsp (restored in epilogue)
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter		# num not multiple of 4 -> generic loop
	cmpl	$8,%r9d
	jb	.Lmul_enter		# num < 8 -> generic loop
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter		# bp != ap -> 4x multiply
	testl	$7,%r9d
	jz	.Lsqr8x_enter		# bp == ap and 8 | num -> squaring path
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	# Save callee-saved registers (SysV: rbx, rbp, r12-r15).
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Carve out a (num+2)-word scratch tp[] on the stack, 1KB-aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10	# candidate bottom of scratch area
	negq	%r9			# restore num
	andq	$-1024,%r10		# align scratch to 1KB boundary

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe the page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	# Touch the stack one page (4096 B) at a time so guard pages are
	# grown in order rather than skipped over.
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# stash original %rsp above tp[num]
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		# %r12 = bp (bp[i] indexed by %r14)
	movq	(%r8),%r8		# %r8 = n0 value
	movq	(%r12),%rbx		# %rbx = bp[0]
	movq	(%rsi),%rax		# %rax = ap[0]

	xorq	%r14,%r14		# i = 0 (outer index)
	xorq	%r15,%r15		# j = 0 (inner index)

	# First outer iteration: tp[] = ap[] * bp[0], reduced on the fly.
	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# np[0]

	imulq	%r10,%rbp		# m = tp[0] * n0 (mod 2^64)
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# low word discarded (becomes zero)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13		# carry chain: np[j]*m limb
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15		# j++
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.L1st

	# Tail of first iteration: fold final carries into tp[num-1], tp[num].
	addq	%rax,%r13
	movq	(%rsi),%rax		# reload ap[0] for next outer pass
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = top carry

	leaq	1(%r14),%r14		# i = 1
	jmp	.Louter
.align	16
.Louter:
	# Outer loop: tp[] = (tp[] + ap[]*bp[i] + m*np[]) / 2^64.
	movq	(%r12,%r14,8),%rbx	# %rbx = bp[i]
	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx			# ap[0] * bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = (tp[0]+lo) * n0 (mod 2^64)
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# low word annihilated
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# += tp[j]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j] * bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15		# j++

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# old top carry tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# fold previous top carry
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = new top carry

	leaq	1(%r14),%r14		# i++
	cmpq	%r9,%r14
	jb	.Louter

	# Final subtraction: rp[] = tp[] - np[] (borrow-propagating).
	xorq	%r14,%r14		# clears CF so first sbbq == subq
	movq	(%rsp),%rax		# tp[0]
	leaq	(%rsp),%rsi		# %rsi = tp
	movq	%r9,%r15		# counter = num
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[j] = tp[j] - np[j] - borrow
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# subtract top carry; %rax = 0 or -1 mask
	xorq	%r14,%r14
	# Constant-time select: src = borrow ? tp : rp (no data-dependent branch).
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15		# counter = num
	orq	%rcx,%rsi
.align	16
.Lcopy:
	# Copy selected result into rp[] and wipe the scratch tp[].
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	# zap tp[j] (overwrite with index)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	# Epilogue: recover original %rsp and callee-saved registers.
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret (AMD branch-predictor friendly)
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
# ------------------------------------------------------------------------
# bn_mul4x_mont: same contract as bn_mul_mont, inner loops unrolled 4x.
# Requires 4 | num (dispatcher guarantees this).  Note %rdi is reused as a
# scratch accumulator inside the loops; the real rp is saved at
# 16(%rsp,%r9,8) and reloaded before the final subtraction.
# ------------------------------------------------------------------------
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Allocate (num+4)-word scratch, 1KB-aligned, with page-walk probing.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save original %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# save rp; %rdi becomes scratch below
	movq	%rdx,%r12		# %r12 = bp
	movq	(%r8),%r8		# n0
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0
	xorq	%r15,%r15		# j = 0

	# First outer pass head: compute m, start the 4x-unrolled chain.
	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * bp[0]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	# Four (ap[j]*bp[0] + np[j]*m) limbs per iteration.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	# Tail: last two limbs plus top carry for the first pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# tp[num] = top carry

	leaq	1(%r14),%r14		# i = 1
.align	4
.Louter4x:
	# Outer loop: tp[] = (tp[] + ap[]*bp[i] + m*np[]) / 2^64, 4x-unrolled.
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15		# j = 0
	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# += tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	# Same 4x limb chain as .L1st4x but also accumulates prior tp[j].
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	# Tail of outer pass i.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# fold previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# new top carry

	cmpq	%r9,%r14
	jb	.Louter4x
	# Final subtraction rp[] = tp[] - np[], 4 words per iteration.
	movq	16(%rsp,%r9,8),%rdi	# reload real rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0		# zero for wiping tp[] later
	movq	8(%rsp),%rdx
	shrq	$2,%r15			# (num-4)/4 iterations
	leaq	(%rsp),%rsi		# %rsi = tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	# loads top carry tp[num]
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# %rax = borrow ? -1 : 0 mask
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	# Constant-time select of copy source: tp (borrow) vs rp (no borrow).
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-4(%r9),%r15
	orq	%rcx,%rsi
	shrq	$2,%r15

	# Copy result to rp[] 16 bytes at a time and wipe tp[] with zeros.
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	movdqu	16(%rsi,%r14,1),%xmm2	# final 16-byte chunk
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	# Epilogue.
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
.extern	bn_sqr8x_internal
.hidden	bn_sqr8x_internal

# ------------------------------------------------------------------------
# bn_sqr8x_mont: squaring path (bp == ap, 8 | num, num >= 8).  Sets up a
# frame, calls the external bn_sqr8x_internal to do tp[] = ap[]^2 with
# Montgomery reduction, then performs the final subtraction and a
# constant-time conditional copy into rp[].
# ------------------------------------------------------------------------
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# %r9 = num * 8 (bytes)
	shlq	$3+2,%r10		# %r10 = num * 32
	negq	%r9			# %r9 = -num*8

	# Choose a stack frame whose distance from ap minimizes cache-bank
	# aliasing: keep (frame - ap) mod 4096 away from zero when possible.
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp	# 2*num*8-byte tp[], 64 below
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10	# alternative placement
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		# 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	# Grow the stack one page at a time (guard-page probing).
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		# %r10 = -num*8
	negq	%r9			# %r9 = num*8

	movq	%r8,32(%rsp)		# save n0 for bn_sqr8x_internal
	movq	%rax,40(%rsp)		# save original %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209	# movq %rcx,%xmm2  (np, passed in xmm)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207	# movq %rdi,%xmm1  (rp)
.byte	102,73,15,110,218	# movq %r10,%xmm3  (-num*8)
	call	bn_sqr8x_internal	# tp[] at (%rdi); leaves np in %rbp

	# Final subtraction: rp[] = tp[num..2*num-1] - np[], 4 words/iter.
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207	# movq %xmm1,%rdi  (restore rp)
	sarq	$3+2,%rcx		# iterations = num/4 (negative count)
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# %rax = borrow ? -1 : 0 mask
	leaq	(%rbx,%r9,1),%rbx	# rewind tp pointer
	leaq	(%rdi,%r9,1),%rdi	# rewind rp pointer

.byte	102,72,15,110,200	# movq %rax,%xmm1 (broadcast borrow mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# splat mask across all lanes
	movq	40(%rsp),%rsi		# original %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	# Constant-time conditional copy: rp = borrow ? tp : rp,
	# while wiping both halves of the scratch tp[].
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# wipe lower tp half
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)	# wipe upper tp half
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# %xmm0 = ~mask (all-ones iff mask==0)
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4		# select tp or rp per the mask
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9			# %r9 counts up from -num*8 to 0
	jnz	.Lsqr8x_cond_copy

	# Epilogue.
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
# ASCII banner: "Montgomery Multiplication for x86_64, CRYPTOGAMS by
# <appro@openssl.org>" (NUL-terminated).
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif