1 #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) 2 .text 3 4 .extern OPENSSL_ia32cap_P 5 .hidden OPENSSL_ia32cap_P 6 7 .globl bn_mul_mont_gather5 8 .hidden bn_mul_mont_gather5 9 .type bn_mul_mont_gather5,@function 10 .align 64 11 bn_mul_mont_gather5: 12 .cfi_startproc 13 movl %r9d,%r9d 14 movq %rsp,%rax 15 .cfi_def_cfa_register %rax 16 testl $7,%r9d 17 jnz .Lmul_enter 18 jmp .Lmul4x_enter 19 20 .align 16 21 .Lmul_enter: 22 movd 8(%rsp),%xmm5 23 pushq %rbx 24 .cfi_offset %rbx,-16 25 pushq %rbp 26 .cfi_offset %rbp,-24 27 pushq %r12 28 .cfi_offset %r12,-32 29 pushq %r13 30 .cfi_offset %r13,-40 31 pushq %r14 32 .cfi_offset %r14,-48 33 pushq %r15 34 .cfi_offset %r15,-56 35 36 negq %r9 37 movq %rsp,%r11 38 leaq -280(%rsp,%r9,8),%r10 39 negq %r9 40 andq $-1024,%r10 41 42 43 44 45 46 47 48 49 50 subq %r10,%r11 51 andq $-4096,%r11 52 leaq (%r10,%r11,1),%rsp 53 movq (%rsp),%r11 54 cmpq %r10,%rsp 55 ja .Lmul_page_walk 56 jmp .Lmul_page_walk_done 57 58 .Lmul_page_walk: 59 leaq -4096(%rsp),%rsp 60 movq (%rsp),%r11 61 cmpq %r10,%rsp 62 ja .Lmul_page_walk 63 .Lmul_page_walk_done: 64 65 leaq .Linc(%rip),%r10 66 movq %rax,8(%rsp,%r9,8) 67 .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 68 .Lmul_body: 69 70 leaq 128(%rdx),%r12 71 movdqa 0(%r10),%xmm0 72 movdqa 16(%r10),%xmm1 73 leaq 24-112(%rsp,%r9,8),%r10 74 andq $-16,%r10 75 76 pshufd $0,%xmm5,%xmm5 77 movdqa %xmm1,%xmm4 78 movdqa %xmm1,%xmm2 79 paddd %xmm0,%xmm1 80 pcmpeqd %xmm5,%xmm0 81 .byte 0x67 82 movdqa %xmm4,%xmm3 83 paddd %xmm1,%xmm2 84 pcmpeqd %xmm5,%xmm1 85 movdqa %xmm0,112(%r10) 86 movdqa %xmm4,%xmm0 87 88 paddd %xmm2,%xmm3 89 pcmpeqd %xmm5,%xmm2 90 movdqa %xmm1,128(%r10) 91 movdqa %xmm4,%xmm1 92 93 paddd %xmm3,%xmm0 94 pcmpeqd %xmm5,%xmm3 95 movdqa %xmm2,144(%r10) 96 movdqa %xmm4,%xmm2 97 98 paddd %xmm0,%xmm1 99 pcmpeqd %xmm5,%xmm0 100 movdqa %xmm3,160(%r10) 101 movdqa %xmm4,%xmm3 102 paddd %xmm1,%xmm2 103 pcmpeqd %xmm5,%xmm1 104 movdqa %xmm0,176(%r10) 105 movdqa %xmm4,%xmm0 106 107 paddd 
%xmm2,%xmm3 108 pcmpeqd %xmm5,%xmm2 109 movdqa %xmm1,192(%r10) 110 movdqa %xmm4,%xmm1 111 112 paddd %xmm3,%xmm0 113 pcmpeqd %xmm5,%xmm3 114 movdqa %xmm2,208(%r10) 115 movdqa %xmm4,%xmm2 116 117 paddd %xmm0,%xmm1 118 pcmpeqd %xmm5,%xmm0 119 movdqa %xmm3,224(%r10) 120 movdqa %xmm4,%xmm3 121 paddd %xmm1,%xmm2 122 pcmpeqd %xmm5,%xmm1 123 movdqa %xmm0,240(%r10) 124 movdqa %xmm4,%xmm0 125 126 paddd %xmm2,%xmm3 127 pcmpeqd %xmm5,%xmm2 128 movdqa %xmm1,256(%r10) 129 movdqa %xmm4,%xmm1 130 131 paddd %xmm3,%xmm0 132 pcmpeqd %xmm5,%xmm3 133 movdqa %xmm2,272(%r10) 134 movdqa %xmm4,%xmm2 135 136 paddd %xmm0,%xmm1 137 pcmpeqd %xmm5,%xmm0 138 movdqa %xmm3,288(%r10) 139 movdqa %xmm4,%xmm3 140 paddd %xmm1,%xmm2 141 pcmpeqd %xmm5,%xmm1 142 movdqa %xmm0,304(%r10) 143 144 paddd %xmm2,%xmm3 145 .byte 0x67 146 pcmpeqd %xmm5,%xmm2 147 movdqa %xmm1,320(%r10) 148 149 pcmpeqd %xmm5,%xmm3 150 movdqa %xmm2,336(%r10) 151 pand 64(%r12),%xmm0 152 153 pand 80(%r12),%xmm1 154 pand 96(%r12),%xmm2 155 movdqa %xmm3,352(%r10) 156 pand 112(%r12),%xmm3 157 por %xmm2,%xmm0 158 por %xmm3,%xmm1 159 movdqa -128(%r12),%xmm4 160 movdqa -112(%r12),%xmm5 161 movdqa -96(%r12),%xmm2 162 pand 112(%r10),%xmm4 163 movdqa -80(%r12),%xmm3 164 pand 128(%r10),%xmm5 165 por %xmm4,%xmm0 166 pand 144(%r10),%xmm2 167 por %xmm5,%xmm1 168 pand 160(%r10),%xmm3 169 por %xmm2,%xmm0 170 por %xmm3,%xmm1 171 movdqa -64(%r12),%xmm4 172 movdqa -48(%r12),%xmm5 173 movdqa -32(%r12),%xmm2 174 pand 176(%r10),%xmm4 175 movdqa -16(%r12),%xmm3 176 pand 192(%r10),%xmm5 177 por %xmm4,%xmm0 178 pand 208(%r10),%xmm2 179 por %xmm5,%xmm1 180 pand 224(%r10),%xmm3 181 por %xmm2,%xmm0 182 por %xmm3,%xmm1 183 movdqa 0(%r12),%xmm4 184 movdqa 16(%r12),%xmm5 185 movdqa 32(%r12),%xmm2 186 pand 240(%r10),%xmm4 187 movdqa 48(%r12),%xmm3 188 pand 256(%r10),%xmm5 189 por %xmm4,%xmm0 190 pand 272(%r10),%xmm2 191 por %xmm5,%xmm1 192 pand 288(%r10),%xmm3 193 por %xmm2,%xmm0 194 por %xmm3,%xmm1 195 por %xmm1,%xmm0 196 pshufd $0x4e,%xmm0,%xmm1 197 por %xmm1,%xmm0 
198 leaq 256(%r12),%r12 199 .byte 102,72,15,126,195 200 201 movq (%r8),%r8 202 movq (%rsi),%rax 203 204 xorq %r14,%r14 205 xorq %r15,%r15 206 207 movq %r8,%rbp 208 mulq %rbx 209 movq %rax,%r10 210 movq (%rcx),%rax 211 212 imulq %r10,%rbp 213 movq %rdx,%r11 214 215 mulq %rbp 216 addq %rax,%r10 217 movq 8(%rsi),%rax 218 adcq $0,%rdx 219 movq %rdx,%r13 220 221 leaq 1(%r15),%r15 222 jmp .L1st_enter 223 224 .align 16 225 .L1st: 226 addq %rax,%r13 227 movq (%rsi,%r15,8),%rax 228 adcq $0,%rdx 229 addq %r11,%r13 230 movq %r10,%r11 231 adcq $0,%rdx 232 movq %r13,-16(%rsp,%r15,8) 233 movq %rdx,%r13 234 235 .L1st_enter: 236 mulq %rbx 237 addq %rax,%r11 238 movq (%rcx,%r15,8),%rax 239 adcq $0,%rdx 240 leaq 1(%r15),%r15 241 movq %rdx,%r10 242 243 mulq %rbp 244 cmpq %r9,%r15 245 jne .L1st 246 247 248 addq %rax,%r13 249 adcq $0,%rdx 250 addq %r11,%r13 251 adcq $0,%rdx 252 movq %r13,-16(%rsp,%r9,8) 253 movq %rdx,%r13 254 movq %r10,%r11 255 256 xorq %rdx,%rdx 257 addq %r11,%r13 258 adcq $0,%rdx 259 movq %r13,-8(%rsp,%r9,8) 260 movq %rdx,(%rsp,%r9,8) 261 262 leaq 1(%r14),%r14 263 jmp .Louter 264 .align 16 265 .Louter: 266 leaq 24+128(%rsp,%r9,8),%rdx 267 andq $-16,%rdx 268 pxor %xmm4,%xmm4 269 pxor %xmm5,%xmm5 270 movdqa -128(%r12),%xmm0 271 movdqa -112(%r12),%xmm1 272 movdqa -96(%r12),%xmm2 273 movdqa -80(%r12),%xmm3 274 pand -128(%rdx),%xmm0 275 pand -112(%rdx),%xmm1 276 por %xmm0,%xmm4 277 pand -96(%rdx),%xmm2 278 por %xmm1,%xmm5 279 pand -80(%rdx),%xmm3 280 por %xmm2,%xmm4 281 por %xmm3,%xmm5 282 movdqa -64(%r12),%xmm0 283 movdqa -48(%r12),%xmm1 284 movdqa -32(%r12),%xmm2 285 movdqa -16(%r12),%xmm3 286 pand -64(%rdx),%xmm0 287 pand -48(%rdx),%xmm1 288 por %xmm0,%xmm4 289 pand -32(%rdx),%xmm2 290 por %xmm1,%xmm5 291 pand -16(%rdx),%xmm3 292 por %xmm2,%xmm4 293 por %xmm3,%xmm5 294 movdqa 0(%r12),%xmm0 295 movdqa 16(%r12),%xmm1 296 movdqa 32(%r12),%xmm2 297 movdqa 48(%r12),%xmm3 298 pand 0(%rdx),%xmm0 299 pand 16(%rdx),%xmm1 300 por %xmm0,%xmm4 301 pand 32(%rdx),%xmm2 302 por 
%xmm1,%xmm5 303 pand 48(%rdx),%xmm3 304 por %xmm2,%xmm4 305 por %xmm3,%xmm5 306 movdqa 64(%r12),%xmm0 307 movdqa 80(%r12),%xmm1 308 movdqa 96(%r12),%xmm2 309 movdqa 112(%r12),%xmm3 310 pand 64(%rdx),%xmm0 311 pand 80(%rdx),%xmm1 312 por %xmm0,%xmm4 313 pand 96(%rdx),%xmm2 314 por %xmm1,%xmm5 315 pand 112(%rdx),%xmm3 316 por %xmm2,%xmm4 317 por %xmm3,%xmm5 318 por %xmm5,%xmm4 319 pshufd $0x4e,%xmm4,%xmm0 320 por %xmm4,%xmm0 321 leaq 256(%r12),%r12 322 323 movq (%rsi),%rax 324 .byte 102,72,15,126,195 325 326 xorq %r15,%r15 327 movq %r8,%rbp 328 movq (%rsp),%r10 329 330 mulq %rbx 331 addq %rax,%r10 332 movq (%rcx),%rax 333 adcq $0,%rdx 334 335 imulq %r10,%rbp 336 movq %rdx,%r11 337 338 mulq %rbp 339 addq %rax,%r10 340 movq 8(%rsi),%rax 341 adcq $0,%rdx 342 movq 8(%rsp),%r10 343 movq %rdx,%r13 344 345 leaq 1(%r15),%r15 346 jmp .Linner_enter 347 348 .align 16 349 .Linner: 350 addq %rax,%r13 351 movq (%rsi,%r15,8),%rax 352 adcq $0,%rdx 353 addq %r10,%r13 354 movq (%rsp,%r15,8),%r10 355 adcq $0,%rdx 356 movq %r13,-16(%rsp,%r15,8) 357 movq %rdx,%r13 358 359 .Linner_enter: 360 mulq %rbx 361 addq %rax,%r11 362 movq (%rcx,%r15,8),%rax 363 adcq $0,%rdx 364 addq %r11,%r10 365 movq %rdx,%r11 366 adcq $0,%r11 367 leaq 1(%r15),%r15 368 369 mulq %rbp 370 cmpq %r9,%r15 371 jne .Linner 372 373 addq %rax,%r13 374 adcq $0,%rdx 375 addq %r10,%r13 376 movq (%rsp,%r9,8),%r10 377 adcq $0,%rdx 378 movq %r13,-16(%rsp,%r9,8) 379 movq %rdx,%r13 380 381 xorq %rdx,%rdx 382 addq %r11,%r13 383 adcq $0,%rdx 384 addq %r10,%r13 385 adcq $0,%rdx 386 movq %r13,-8(%rsp,%r9,8) 387 movq %rdx,(%rsp,%r9,8) 388 389 leaq 1(%r14),%r14 390 cmpq %r9,%r14 391 jb .Louter 392 393 xorq %r14,%r14 394 movq (%rsp),%rax 395 leaq (%rsp),%rsi 396 movq %r9,%r15 397 jmp .Lsub 398 .align 16 399 .Lsub: 400 sbbq (%rcx,%r14,8),%rax 401 movq %rax,(%rdi,%r14,8) 402 movq 8(%rsi,%r14,8),%rax 403 leaq 1(%r14),%r14 404 decq %r15 405 jnz .Lsub 406 407 sbbq $0,%rax 408 xorq %r14,%r14 409 andq %rax,%rsi 410 notq %rax 411 movq %rdi,%rcx 
412 andq %rax,%rcx 413 movq %r9,%r15 414 orq %rcx,%rsi 415 .align 16 416 .Lcopy: 417 movq (%rsi,%r14,8),%rax 418 movq %r14,(%rsp,%r14,8) 419 movq %rax,(%rdi,%r14,8) 420 leaq 1(%r14),%r14 421 subq $1,%r15 422 jnz .Lcopy 423 424 movq 8(%rsp,%r9,8),%rsi 425 .cfi_def_cfa %rsi,8 426 movq $1,%rax 427 428 movq -48(%rsi),%r15 429 .cfi_restore %r15 430 movq -40(%rsi),%r14 431 .cfi_restore %r14 432 movq -32(%rsi),%r13 433 .cfi_restore %r13 434 movq -24(%rsi),%r12 435 .cfi_restore %r12 436 movq -16(%rsi),%rbp 437 .cfi_restore %rbp 438 movq -8(%rsi),%rbx 439 .cfi_restore %rbx 440 leaq (%rsi),%rsp 441 .cfi_def_cfa_register %rsp 442 .Lmul_epilogue: 443 .byte 0xf3,0xc3 444 .cfi_endproc 445 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 446 .type bn_mul4x_mont_gather5,@function 447 .align 32 448 bn_mul4x_mont_gather5: 449 .cfi_startproc 450 .byte 0x67 451 movq %rsp,%rax 452 .cfi_def_cfa_register %rax 453 .Lmul4x_enter: 454 pushq %rbx 455 .cfi_offset %rbx,-16 456 pushq %rbp 457 .cfi_offset %rbp,-24 458 pushq %r12 459 .cfi_offset %r12,-32 460 pushq %r13 461 .cfi_offset %r13,-40 462 pushq %r14 463 .cfi_offset %r14,-48 464 pushq %r15 465 .cfi_offset %r15,-56 466 .Lmul4x_prologue: 467 468 .byte 0x67 469 shll $3,%r9d 470 leaq (%r9,%r9,2),%r10 471 negq %r9 472 473 474 475 476 477 478 479 480 481 482 leaq -320(%rsp,%r9,2),%r11 483 movq %rsp,%rbp 484 subq %rdi,%r11 485 andq $4095,%r11 486 cmpq %r11,%r10 487 jb .Lmul4xsp_alt 488 subq %r11,%rbp 489 leaq -320(%rbp,%r9,2),%rbp 490 jmp .Lmul4xsp_done 491 492 .align 32 493 .Lmul4xsp_alt: 494 leaq 4096-320(,%r9,2),%r10 495 leaq -320(%rbp,%r9,2),%rbp 496 subq %r10,%r11 497 movq $0,%r10 498 cmovcq %r10,%r11 499 subq %r11,%rbp 500 .Lmul4xsp_done: 501 andq $-64,%rbp 502 movq %rsp,%r11 503 subq %rbp,%r11 504 andq $-4096,%r11 505 leaq (%r11,%rbp,1),%rsp 506 movq (%rsp),%r10 507 cmpq %rbp,%rsp 508 ja .Lmul4x_page_walk 509 jmp .Lmul4x_page_walk_done 510 511 .Lmul4x_page_walk: 512 leaq -4096(%rsp),%rsp 513 movq (%rsp),%r10 514 cmpq %rbp,%rsp 515 ja 
.Lmul4x_page_walk 516 .Lmul4x_page_walk_done: 517 518 negq %r9 519 520 movq %rax,40(%rsp) 521 .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 522 .Lmul4x_body: 523 524 call mul4x_internal 525 526 movq 40(%rsp),%rsi 527 .cfi_def_cfa %rsi,8 528 movq $1,%rax 529 530 movq -48(%rsi),%r15 531 .cfi_restore %r15 532 movq -40(%rsi),%r14 533 .cfi_restore %r14 534 movq -32(%rsi),%r13 535 .cfi_restore %r13 536 movq -24(%rsi),%r12 537 .cfi_restore %r12 538 movq -16(%rsi),%rbp 539 .cfi_restore %rbp 540 movq -8(%rsi),%rbx 541 .cfi_restore %rbx 542 leaq (%rsi),%rsp 543 .cfi_def_cfa_register %rsp 544 .Lmul4x_epilogue: 545 .byte 0xf3,0xc3 546 .cfi_endproc 547 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 548 549 .type mul4x_internal,@function 550 .align 32 551 mul4x_internal: 552 shlq $5,%r9 553 movd 8(%rax),%xmm5 554 leaq .Linc(%rip),%rax 555 leaq 128(%rdx,%r9,1),%r13 556 shrq $5,%r9 557 movdqa 0(%rax),%xmm0 558 movdqa 16(%rax),%xmm1 559 leaq 88-112(%rsp,%r9,1),%r10 560 leaq 128(%rdx),%r12 561 562 pshufd $0,%xmm5,%xmm5 563 movdqa %xmm1,%xmm4 564 .byte 0x67,0x67 565 movdqa %xmm1,%xmm2 566 paddd %xmm0,%xmm1 567 pcmpeqd %xmm5,%xmm0 568 .byte 0x67 569 movdqa %xmm4,%xmm3 570 paddd %xmm1,%xmm2 571 pcmpeqd %xmm5,%xmm1 572 movdqa %xmm0,112(%r10) 573 movdqa %xmm4,%xmm0 574 575 paddd %xmm2,%xmm3 576 pcmpeqd %xmm5,%xmm2 577 movdqa %xmm1,128(%r10) 578 movdqa %xmm4,%xmm1 579 580 paddd %xmm3,%xmm0 581 pcmpeqd %xmm5,%xmm3 582 movdqa %xmm2,144(%r10) 583 movdqa %xmm4,%xmm2 584 585 paddd %xmm0,%xmm1 586 pcmpeqd %xmm5,%xmm0 587 movdqa %xmm3,160(%r10) 588 movdqa %xmm4,%xmm3 589 paddd %xmm1,%xmm2 590 pcmpeqd %xmm5,%xmm1 591 movdqa %xmm0,176(%r10) 592 movdqa %xmm4,%xmm0 593 594 paddd %xmm2,%xmm3 595 pcmpeqd %xmm5,%xmm2 596 movdqa %xmm1,192(%r10) 597 movdqa %xmm4,%xmm1 598 599 paddd %xmm3,%xmm0 600 pcmpeqd %xmm5,%xmm3 601 movdqa %xmm2,208(%r10) 602 movdqa %xmm4,%xmm2 603 604 paddd %xmm0,%xmm1 605 pcmpeqd %xmm5,%xmm0 606 movdqa %xmm3,224(%r10) 607 movdqa %xmm4,%xmm3 608 paddd %xmm1,%xmm2 609 pcmpeqd 
%xmm5,%xmm1 610 movdqa %xmm0,240(%r10) 611 movdqa %xmm4,%xmm0 612 613 paddd %xmm2,%xmm3 614 pcmpeqd %xmm5,%xmm2 615 movdqa %xmm1,256(%r10) 616 movdqa %xmm4,%xmm1 617 618 paddd %xmm3,%xmm0 619 pcmpeqd %xmm5,%xmm3 620 movdqa %xmm2,272(%r10) 621 movdqa %xmm4,%xmm2 622 623 paddd %xmm0,%xmm1 624 pcmpeqd %xmm5,%xmm0 625 movdqa %xmm3,288(%r10) 626 movdqa %xmm4,%xmm3 627 paddd %xmm1,%xmm2 628 pcmpeqd %xmm5,%xmm1 629 movdqa %xmm0,304(%r10) 630 631 paddd %xmm2,%xmm3 632 .byte 0x67 633 pcmpeqd %xmm5,%xmm2 634 movdqa %xmm1,320(%r10) 635 636 pcmpeqd %xmm5,%xmm3 637 movdqa %xmm2,336(%r10) 638 pand 64(%r12),%xmm0 639 640 pand 80(%r12),%xmm1 641 pand 96(%r12),%xmm2 642 movdqa %xmm3,352(%r10) 643 pand 112(%r12),%xmm3 644 por %xmm2,%xmm0 645 por %xmm3,%xmm1 646 movdqa -128(%r12),%xmm4 647 movdqa -112(%r12),%xmm5 648 movdqa -96(%r12),%xmm2 649 pand 112(%r10),%xmm4 650 movdqa -80(%r12),%xmm3 651 pand 128(%r10),%xmm5 652 por %xmm4,%xmm0 653 pand 144(%r10),%xmm2 654 por %xmm5,%xmm1 655 pand 160(%r10),%xmm3 656 por %xmm2,%xmm0 657 por %xmm3,%xmm1 658 movdqa -64(%r12),%xmm4 659 movdqa -48(%r12),%xmm5 660 movdqa -32(%r12),%xmm2 661 pand 176(%r10),%xmm4 662 movdqa -16(%r12),%xmm3 663 pand 192(%r10),%xmm5 664 por %xmm4,%xmm0 665 pand 208(%r10),%xmm2 666 por %xmm5,%xmm1 667 pand 224(%r10),%xmm3 668 por %xmm2,%xmm0 669 por %xmm3,%xmm1 670 movdqa 0(%r12),%xmm4 671 movdqa 16(%r12),%xmm5 672 movdqa 32(%r12),%xmm2 673 pand 240(%r10),%xmm4 674 movdqa 48(%r12),%xmm3 675 pand 256(%r10),%xmm5 676 por %xmm4,%xmm0 677 pand 272(%r10),%xmm2 678 por %xmm5,%xmm1 679 pand 288(%r10),%xmm3 680 por %xmm2,%xmm0 681 por %xmm3,%xmm1 682 por %xmm1,%xmm0 683 pshufd $0x4e,%xmm0,%xmm1 684 por %xmm1,%xmm0 685 leaq 256(%r12),%r12 686 .byte 102,72,15,126,195 687 688 movq %r13,16+8(%rsp) 689 movq %rdi,56+8(%rsp) 690 691 movq (%r8),%r8 692 movq (%rsi),%rax 693 leaq (%rsi,%r9,1),%rsi 694 negq %r9 695 696 movq %r8,%rbp 697 mulq %rbx 698 movq %rax,%r10 699 movq (%rcx),%rax 700 701 imulq %r10,%rbp 702 leaq 64+8(%rsp),%r14 703 
movq %rdx,%r11 704 705 mulq %rbp 706 addq %rax,%r10 707 movq 8(%rsi,%r9,1),%rax 708 adcq $0,%rdx 709 movq %rdx,%rdi 710 711 mulq %rbx 712 addq %rax,%r11 713 movq 8(%rcx),%rax 714 adcq $0,%rdx 715 movq %rdx,%r10 716 717 mulq %rbp 718 addq %rax,%rdi 719 movq 16(%rsi,%r9,1),%rax 720 adcq $0,%rdx 721 addq %r11,%rdi 722 leaq 32(%r9),%r15 723 leaq 32(%rcx),%rcx 724 adcq $0,%rdx 725 movq %rdi,(%r14) 726 movq %rdx,%r13 727 jmp .L1st4x 728 729 .align 32 730 .L1st4x: 731 mulq %rbx 732 addq %rax,%r10 733 movq -16(%rcx),%rax 734 leaq 32(%r14),%r14 735 adcq $0,%rdx 736 movq %rdx,%r11 737 738 mulq %rbp 739 addq %rax,%r13 740 movq -8(%rsi,%r15,1),%rax 741 adcq $0,%rdx 742 addq %r10,%r13 743 adcq $0,%rdx 744 movq %r13,-24(%r14) 745 movq %rdx,%rdi 746 747 mulq %rbx 748 addq %rax,%r11 749 movq -8(%rcx),%rax 750 adcq $0,%rdx 751 movq %rdx,%r10 752 753 mulq %rbp 754 addq %rax,%rdi 755 movq (%rsi,%r15,1),%rax 756 adcq $0,%rdx 757 addq %r11,%rdi 758 adcq $0,%rdx 759 movq %rdi,-16(%r14) 760 movq %rdx,%r13 761 762 mulq %rbx 763 addq %rax,%r10 764 movq 0(%rcx),%rax 765 adcq $0,%rdx 766 movq %rdx,%r11 767 768 mulq %rbp 769 addq %rax,%r13 770 movq 8(%rsi,%r15,1),%rax 771 adcq $0,%rdx 772 addq %r10,%r13 773 adcq $0,%rdx 774 movq %r13,-8(%r14) 775 movq %rdx,%rdi 776 777 mulq %rbx 778 addq %rax,%r11 779 movq 8(%rcx),%rax 780 adcq $0,%rdx 781 movq %rdx,%r10 782 783 mulq %rbp 784 addq %rax,%rdi 785 movq 16(%rsi,%r15,1),%rax 786 adcq $0,%rdx 787 addq %r11,%rdi 788 leaq 32(%rcx),%rcx 789 adcq $0,%rdx 790 movq %rdi,(%r14) 791 movq %rdx,%r13 792 793 addq $32,%r15 794 jnz .L1st4x 795 796 mulq %rbx 797 addq %rax,%r10 798 movq -16(%rcx),%rax 799 leaq 32(%r14),%r14 800 adcq $0,%rdx 801 movq %rdx,%r11 802 803 mulq %rbp 804 addq %rax,%r13 805 movq -8(%rsi),%rax 806 adcq $0,%rdx 807 addq %r10,%r13 808 adcq $0,%rdx 809 movq %r13,-24(%r14) 810 movq %rdx,%rdi 811 812 mulq %rbx 813 addq %rax,%r11 814 movq -8(%rcx),%rax 815 adcq $0,%rdx 816 movq %rdx,%r10 817 818 mulq %rbp 819 addq %rax,%rdi 820 movq 
(%rsi,%r9,1),%rax 821 adcq $0,%rdx 822 addq %r11,%rdi 823 adcq $0,%rdx 824 movq %rdi,-16(%r14) 825 movq %rdx,%r13 826 827 leaq (%rcx,%r9,1),%rcx 828 829 xorq %rdi,%rdi 830 addq %r10,%r13 831 adcq $0,%rdi 832 movq %r13,-8(%r14) 833 834 jmp .Louter4x 835 836 .align 32 837 .Louter4x: 838 leaq 16+128(%r14),%rdx 839 pxor %xmm4,%xmm4 840 pxor %xmm5,%xmm5 841 movdqa -128(%r12),%xmm0 842 movdqa -112(%r12),%xmm1 843 movdqa -96(%r12),%xmm2 844 movdqa -80(%r12),%xmm3 845 pand -128(%rdx),%xmm0 846 pand -112(%rdx),%xmm1 847 por %xmm0,%xmm4 848 pand -96(%rdx),%xmm2 849 por %xmm1,%xmm5 850 pand -80(%rdx),%xmm3 851 por %xmm2,%xmm4 852 por %xmm3,%xmm5 853 movdqa -64(%r12),%xmm0 854 movdqa -48(%r12),%xmm1 855 movdqa -32(%r12),%xmm2 856 movdqa -16(%r12),%xmm3 857 pand -64(%rdx),%xmm0 858 pand -48(%rdx),%xmm1 859 por %xmm0,%xmm4 860 pand -32(%rdx),%xmm2 861 por %xmm1,%xmm5 862 pand -16(%rdx),%xmm3 863 por %xmm2,%xmm4 864 por %xmm3,%xmm5 865 movdqa 0(%r12),%xmm0 866 movdqa 16(%r12),%xmm1 867 movdqa 32(%r12),%xmm2 868 movdqa 48(%r12),%xmm3 869 pand 0(%rdx),%xmm0 870 pand 16(%rdx),%xmm1 871 por %xmm0,%xmm4 872 pand 32(%rdx),%xmm2 873 por %xmm1,%xmm5 874 pand 48(%rdx),%xmm3 875 por %xmm2,%xmm4 876 por %xmm3,%xmm5 877 movdqa 64(%r12),%xmm0 878 movdqa 80(%r12),%xmm1 879 movdqa 96(%r12),%xmm2 880 movdqa 112(%r12),%xmm3 881 pand 64(%rdx),%xmm0 882 pand 80(%rdx),%xmm1 883 por %xmm0,%xmm4 884 pand 96(%rdx),%xmm2 885 por %xmm1,%xmm5 886 pand 112(%rdx),%xmm3 887 por %xmm2,%xmm4 888 por %xmm3,%xmm5 889 por %xmm5,%xmm4 890 pshufd $0x4e,%xmm4,%xmm0 891 por %xmm4,%xmm0 892 leaq 256(%r12),%r12 893 .byte 102,72,15,126,195 894 895 movq (%r14,%r9,1),%r10 896 movq %r8,%rbp 897 mulq %rbx 898 addq %rax,%r10 899 movq (%rcx),%rax 900 adcq $0,%rdx 901 902 imulq %r10,%rbp 903 movq %rdx,%r11 904 movq %rdi,(%r14) 905 906 leaq (%r14,%r9,1),%r14 907 908 mulq %rbp 909 addq %rax,%r10 910 movq 8(%rsi,%r9,1),%rax 911 adcq $0,%rdx 912 movq %rdx,%rdi 913 914 mulq %rbx 915 addq %rax,%r11 916 movq 8(%rcx),%rax 917 adcq 
$0,%rdx 918 addq 8(%r14),%r11 919 adcq $0,%rdx 920 movq %rdx,%r10 921 922 mulq %rbp 923 addq %rax,%rdi 924 movq 16(%rsi,%r9,1),%rax 925 adcq $0,%rdx 926 addq %r11,%rdi 927 leaq 32(%r9),%r15 928 leaq 32(%rcx),%rcx 929 adcq $0,%rdx 930 movq %rdx,%r13 931 jmp .Linner4x 932 933 .align 32 934 .Linner4x: 935 mulq %rbx 936 addq %rax,%r10 937 movq -16(%rcx),%rax 938 adcq $0,%rdx 939 addq 16(%r14),%r10 940 leaq 32(%r14),%r14 941 adcq $0,%rdx 942 movq %rdx,%r11 943 944 mulq %rbp 945 addq %rax,%r13 946 movq -8(%rsi,%r15,1),%rax 947 adcq $0,%rdx 948 addq %r10,%r13 949 adcq $0,%rdx 950 movq %rdi,-32(%r14) 951 movq %rdx,%rdi 952 953 mulq %rbx 954 addq %rax,%r11 955 movq -8(%rcx),%rax 956 adcq $0,%rdx 957 addq -8(%r14),%r11 958 adcq $0,%rdx 959 movq %rdx,%r10 960 961 mulq %rbp 962 addq %rax,%rdi 963 movq (%rsi,%r15,1),%rax 964 adcq $0,%rdx 965 addq %r11,%rdi 966 adcq $0,%rdx 967 movq %r13,-24(%r14) 968 movq %rdx,%r13 969 970 mulq %rbx 971 addq %rax,%r10 972 movq 0(%rcx),%rax 973 adcq $0,%rdx 974 addq (%r14),%r10 975 adcq $0,%rdx 976 movq %rdx,%r11 977 978 mulq %rbp 979 addq %rax,%r13 980 movq 8(%rsi,%r15,1),%rax 981 adcq $0,%rdx 982 addq %r10,%r13 983 adcq $0,%rdx 984 movq %rdi,-16(%r14) 985 movq %rdx,%rdi 986 987 mulq %rbx 988 addq %rax,%r11 989 movq 8(%rcx),%rax 990 adcq $0,%rdx 991 addq 8(%r14),%r11 992 adcq $0,%rdx 993 movq %rdx,%r10 994 995 mulq %rbp 996 addq %rax,%rdi 997 movq 16(%rsi,%r15,1),%rax 998 adcq $0,%rdx 999 addq %r11,%rdi 1000 leaq 32(%rcx),%rcx 1001 adcq $0,%rdx 1002 movq %r13,-8(%r14) 1003 movq %rdx,%r13 1004 1005 addq $32,%r15 1006 jnz .Linner4x 1007 1008 mulq %rbx 1009 addq %rax,%r10 1010 movq -16(%rcx),%rax 1011 adcq $0,%rdx 1012 addq 16(%r14),%r10 1013 leaq 32(%r14),%r14 1014 adcq $0,%rdx 1015 movq %rdx,%r11 1016 1017 mulq %rbp 1018 addq %rax,%r13 1019 movq -8(%rsi),%rax 1020 adcq $0,%rdx 1021 addq %r10,%r13 1022 adcq $0,%rdx 1023 movq %rdi,-32(%r14) 1024 movq %rdx,%rdi 1025 1026 mulq %rbx 1027 addq %rax,%r11 1028 movq %rbp,%rax 1029 movq -8(%rcx),%rbp 1030 
adcq $0,%rdx 1031 addq -8(%r14),%r11 1032 adcq $0,%rdx 1033 movq %rdx,%r10 1034 1035 mulq %rbp 1036 addq %rax,%rdi 1037 movq (%rsi,%r9,1),%rax 1038 adcq $0,%rdx 1039 addq %r11,%rdi 1040 adcq $0,%rdx 1041 movq %r13,-24(%r14) 1042 movq %rdx,%r13 1043 1044 movq %rdi,-16(%r14) 1045 leaq (%rcx,%r9,1),%rcx 1046 1047 xorq %rdi,%rdi 1048 addq %r10,%r13 1049 adcq $0,%rdi 1050 addq (%r14),%r13 1051 adcq $0,%rdi 1052 movq %r13,-8(%r14) 1053 1054 cmpq 16+8(%rsp),%r12 1055 jb .Louter4x 1056 xorq %rax,%rax 1057 subq %r13,%rbp 1058 adcq %r15,%r15 1059 orq %r15,%rdi 1060 subq %rdi,%rax 1061 leaq (%r14,%r9,1),%rbx 1062 movq (%rcx),%r12 1063 leaq (%rcx),%rbp 1064 movq %r9,%rcx 1065 sarq $3+2,%rcx 1066 movq 56+8(%rsp),%rdi 1067 decq %r12 1068 xorq %r10,%r10 1069 movq 8(%rbp),%r13 1070 movq 16(%rbp),%r14 1071 movq 24(%rbp),%r15 1072 jmp .Lsqr4x_sub_entry 1073 .size mul4x_internal,.-mul4x_internal 1074 .globl bn_power5 1075 .hidden bn_power5 1076 .type bn_power5,@function 1077 .align 32 1078 bn_power5: 1079 .cfi_startproc 1080 movq %rsp,%rax 1081 .cfi_def_cfa_register %rax 1082 pushq %rbx 1083 .cfi_offset %rbx,-16 1084 pushq %rbp 1085 .cfi_offset %rbp,-24 1086 pushq %r12 1087 .cfi_offset %r12,-32 1088 pushq %r13 1089 .cfi_offset %r13,-40 1090 pushq %r14 1091 .cfi_offset %r14,-48 1092 pushq %r15 1093 .cfi_offset %r15,-56 1094 .Lpower5_prologue: 1095 1096 shll $3,%r9d 1097 leal (%r9,%r9,2),%r10d 1098 negq %r9 1099 movq (%r8),%r8 1100 1101 1102 1103 1104 1105 1106 1107 1108 leaq -320(%rsp,%r9,2),%r11 1109 movq %rsp,%rbp 1110 subq %rdi,%r11 1111 andq $4095,%r11 1112 cmpq %r11,%r10 1113 jb .Lpwr_sp_alt 1114 subq %r11,%rbp 1115 leaq -320(%rbp,%r9,2),%rbp 1116 jmp .Lpwr_sp_done 1117 1118 .align 32 1119 .Lpwr_sp_alt: 1120 leaq 4096-320(,%r9,2),%r10 1121 leaq -320(%rbp,%r9,2),%rbp 1122 subq %r10,%r11 1123 movq $0,%r10 1124 cmovcq %r10,%r11 1125 subq %r11,%rbp 1126 .Lpwr_sp_done: 1127 andq $-64,%rbp 1128 movq %rsp,%r11 1129 subq %rbp,%r11 1130 andq $-4096,%r11 1131 leaq (%r11,%rbp,1),%rsp 1132 
movq (%rsp),%r10 1133 cmpq %rbp,%rsp 1134 ja .Lpwr_page_walk 1135 jmp .Lpwr_page_walk_done 1136 1137 .Lpwr_page_walk: 1138 leaq -4096(%rsp),%rsp 1139 movq (%rsp),%r10 1140 cmpq %rbp,%rsp 1141 ja .Lpwr_page_walk 1142 .Lpwr_page_walk_done: 1143 1144 movq %r9,%r10 1145 negq %r9 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 movq %r8,32(%rsp) 1157 movq %rax,40(%rsp) 1158 .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 1159 .Lpower5_body: 1160 .byte 102,72,15,110,207 1161 .byte 102,72,15,110,209 1162 .byte 102,73,15,110,218 1163 .byte 102,72,15,110,226 1164 1165 call __bn_sqr8x_internal 1166 call __bn_post4x_internal 1167 call __bn_sqr8x_internal 1168 call __bn_post4x_internal 1169 call __bn_sqr8x_internal 1170 call __bn_post4x_internal 1171 call __bn_sqr8x_internal 1172 call __bn_post4x_internal 1173 call __bn_sqr8x_internal 1174 call __bn_post4x_internal 1175 1176 .byte 102,72,15,126,209 1177 .byte 102,72,15,126,226 1178 movq %rsi,%rdi 1179 movq 40(%rsp),%rax 1180 leaq 32(%rsp),%r8 1181 1182 call mul4x_internal 1183 1184 movq 40(%rsp),%rsi 1185 .cfi_def_cfa %rsi,8 1186 movq $1,%rax 1187 movq -48(%rsi),%r15 1188 .cfi_restore %r15 1189 movq -40(%rsi),%r14 1190 .cfi_restore %r14 1191 movq -32(%rsi),%r13 1192 .cfi_restore %r13 1193 movq -24(%rsi),%r12 1194 .cfi_restore %r12 1195 movq -16(%rsi),%rbp 1196 .cfi_restore %rbp 1197 movq -8(%rsi),%rbx 1198 .cfi_restore %rbx 1199 leaq (%rsi),%rsp 1200 .cfi_def_cfa_register %rsp 1201 .Lpower5_epilogue: 1202 .byte 0xf3,0xc3 1203 .cfi_endproc 1204 .size bn_power5,.-bn_power5 1205 1206 .globl bn_sqr8x_internal 1207 .hidden bn_sqr8x_internal 1208 .hidden bn_sqr8x_internal 1209 .type bn_sqr8x_internal,@function 1210 .align 32 1211 bn_sqr8x_internal: 1212 __bn_sqr8x_internal: 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 
1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 leaq 32(%r10),%rbp 1287 leaq (%rsi,%r9,1),%rsi 1288 1289 movq %r9,%rcx 1290 1291 1292 movq -32(%rsi,%rbp,1),%r14 1293 leaq 48+8(%rsp,%r9,2),%rdi 1294 movq -24(%rsi,%rbp,1),%rax 1295 leaq -32(%rdi,%rbp,1),%rdi 1296 movq -16(%rsi,%rbp,1),%rbx 1297 movq %rax,%r15 1298 1299 mulq %r14 1300 movq %rax,%r10 1301 movq %rbx,%rax 1302 movq %rdx,%r11 1303 movq %r10,-24(%rdi,%rbp,1) 1304 1305 mulq %r14 1306 addq %rax,%r11 1307 movq %rbx,%rax 1308 adcq $0,%rdx 1309 movq %r11,-16(%rdi,%rbp,1) 1310 movq %rdx,%r10 1311 1312 1313 movq -8(%rsi,%rbp,1),%rbx 1314 mulq %r15 1315 movq %rax,%r12 1316 movq %rbx,%rax 1317 movq %rdx,%r13 1318 1319 leaq (%rbp),%rcx 1320 mulq %r14 1321 addq %rax,%r10 1322 movq %rbx,%rax 1323 movq %rdx,%r11 1324 adcq $0,%r11 1325 addq %r12,%r10 1326 adcq $0,%r11 1327 movq %r10,-8(%rdi,%rcx,1) 1328 jmp .Lsqr4x_1st 1329 1330 .align 32 1331 .Lsqr4x_1st: 1332 movq (%rsi,%rcx,1),%rbx 1333 mulq %r15 1334 addq %rax,%r13 1335 movq %rbx,%rax 1336 movq %rdx,%r12 1337 adcq $0,%r12 1338 1339 mulq %r14 1340 addq %rax,%r11 1341 movq %rbx,%rax 1342 movq 8(%rsi,%rcx,1),%rbx 1343 movq %rdx,%r10 1344 adcq $0,%r10 1345 addq %r13,%r11 1346 adcq $0,%r10 1347 1348 1349 mulq %r15 1350 addq %rax,%r12 1351 movq %rbx,%rax 1352 movq %r11,(%rdi,%rcx,1) 1353 movq %rdx,%r13 1354 adcq $0,%r13 1355 1356 mulq %r14 1357 addq %rax,%r10 1358 movq %rbx,%rax 1359 movq 16(%rsi,%rcx,1),%rbx 1360 movq %rdx,%r11 1361 adcq $0,%r11 1362 addq %r12,%r10 1363 adcq $0,%r11 1364 1365 mulq %r15 1366 addq %rax,%r13 1367 movq %rbx,%rax 1368 movq %r10,8(%rdi,%rcx,1) 1369 movq %rdx,%r12 1370 adcq $0,%r12 1371 1372 mulq %r14 1373 addq %rax,%r11 1374 movq %rbx,%rax 1375 movq 24(%rsi,%rcx,1),%rbx 1376 movq %rdx,%r10 1377 adcq $0,%r10 1378 addq %r13,%r11 1379 adcq $0,%r10 1380 1381 1382 mulq %r15 1383 addq %rax,%r12 1384 movq %rbx,%rax 1385 movq %r11,16(%rdi,%rcx,1) 1386 movq %rdx,%r13 1387 adcq $0,%r13 
1388 leaq 32(%rcx),%rcx 1389 1390 mulq %r14 1391 addq %rax,%r10 1392 movq %rbx,%rax 1393 movq %rdx,%r11 1394 adcq $0,%r11 1395 addq %r12,%r10 1396 adcq $0,%r11 1397 movq %r10,-8(%rdi,%rcx,1) 1398 1399 cmpq $0,%rcx 1400 jne .Lsqr4x_1st 1401 1402 mulq %r15 1403 addq %rax,%r13 1404 leaq 16(%rbp),%rbp 1405 adcq $0,%rdx 1406 addq %r11,%r13 1407 adcq $0,%rdx 1408 1409 movq %r13,(%rdi) 1410 movq %rdx,%r12 1411 movq %rdx,8(%rdi) 1412 jmp .Lsqr4x_outer 1413 1414 .align 32 1415 .Lsqr4x_outer: 1416 movq -32(%rsi,%rbp,1),%r14 1417 leaq 48+8(%rsp,%r9,2),%rdi 1418 movq -24(%rsi,%rbp,1),%rax 1419 leaq -32(%rdi,%rbp,1),%rdi 1420 movq -16(%rsi,%rbp,1),%rbx 1421 movq %rax,%r15 1422 1423 mulq %r14 1424 movq -24(%rdi,%rbp,1),%r10 1425 addq %rax,%r10 1426 movq %rbx,%rax 1427 adcq $0,%rdx 1428 movq %r10,-24(%rdi,%rbp,1) 1429 movq %rdx,%r11 1430 1431 mulq %r14 1432 addq %rax,%r11 1433 movq %rbx,%rax 1434 adcq $0,%rdx 1435 addq -16(%rdi,%rbp,1),%r11 1436 movq %rdx,%r10 1437 adcq $0,%r10 1438 movq %r11,-16(%rdi,%rbp,1) 1439 1440 xorq %r12,%r12 1441 1442 movq -8(%rsi,%rbp,1),%rbx 1443 mulq %r15 1444 addq %rax,%r12 1445 movq %rbx,%rax 1446 adcq $0,%rdx 1447 addq -8(%rdi,%rbp,1),%r12 1448 movq %rdx,%r13 1449 adcq $0,%r13 1450 1451 mulq %r14 1452 addq %rax,%r10 1453 movq %rbx,%rax 1454 adcq $0,%rdx 1455 addq %r12,%r10 1456 movq %rdx,%r11 1457 adcq $0,%r11 1458 movq %r10,-8(%rdi,%rbp,1) 1459 1460 leaq (%rbp),%rcx 1461 jmp .Lsqr4x_inner 1462 1463 .align 32 1464 .Lsqr4x_inner: 1465 movq (%rsi,%rcx,1),%rbx 1466 mulq %r15 1467 addq %rax,%r13 1468 movq %rbx,%rax 1469 movq %rdx,%r12 1470 adcq $0,%r12 1471 addq (%rdi,%rcx,1),%r13 1472 adcq $0,%r12 1473 1474 .byte 0x67 1475 mulq %r14 1476 addq %rax,%r11 1477 movq %rbx,%rax 1478 movq 8(%rsi,%rcx,1),%rbx 1479 movq %rdx,%r10 1480 adcq $0,%r10 1481 addq %r13,%r11 1482 adcq $0,%r10 1483 1484 mulq %r15 1485 addq %rax,%r12 1486 movq %r11,(%rdi,%rcx,1) 1487 movq %rbx,%rax 1488 movq %rdx,%r13 1489 adcq $0,%r13 1490 addq 8(%rdi,%rcx,1),%r12 1491 leaq 
16(%rcx),%rcx 1492 adcq $0,%r13 1493 1494 mulq %r14 1495 addq %rax,%r10 1496 movq %rbx,%rax 1497 adcq $0,%rdx 1498 addq %r12,%r10 1499 movq %rdx,%r11 1500 adcq $0,%r11 1501 movq %r10,-8(%rdi,%rcx,1) 1502 1503 cmpq $0,%rcx 1504 jne .Lsqr4x_inner 1505 1506 .byte 0x67 1507 mulq %r15 1508 addq %rax,%r13 1509 adcq $0,%rdx 1510 addq %r11,%r13 1511 adcq $0,%rdx 1512 1513 movq %r13,(%rdi) 1514 movq %rdx,%r12 1515 movq %rdx,8(%rdi) 1516 1517 addq $16,%rbp 1518 jnz .Lsqr4x_outer 1519 1520 1521 movq -32(%rsi),%r14 1522 leaq 48+8(%rsp,%r9,2),%rdi 1523 movq -24(%rsi),%rax 1524 leaq -32(%rdi,%rbp,1),%rdi 1525 movq -16(%rsi),%rbx 1526 movq %rax,%r15 1527 1528 mulq %r14 1529 addq %rax,%r10 1530 movq %rbx,%rax 1531 movq %rdx,%r11 1532 adcq $0,%r11 1533 1534 mulq %r14 1535 addq %rax,%r11 1536 movq %rbx,%rax 1537 movq %r10,-24(%rdi) 1538 movq %rdx,%r10 1539 adcq $0,%r10 1540 addq %r13,%r11 1541 movq -8(%rsi),%rbx 1542 adcq $0,%r10 1543 1544 mulq %r15 1545 addq %rax,%r12 1546 movq %rbx,%rax 1547 movq %r11,-16(%rdi) 1548 movq %rdx,%r13 1549 adcq $0,%r13 1550 1551 mulq %r14 1552 addq %rax,%r10 1553 movq %rbx,%rax 1554 movq %rdx,%r11 1555 adcq $0,%r11 1556 addq %r12,%r10 1557 adcq $0,%r11 1558 movq %r10,-8(%rdi) 1559 1560 mulq %r15 1561 addq %rax,%r13 1562 movq -16(%rsi),%rax 1563 adcq $0,%rdx 1564 addq %r11,%r13 1565 adcq $0,%rdx 1566 1567 movq %r13,(%rdi) 1568 movq %rdx,%r12 1569 movq %rdx,8(%rdi) 1570 1571 mulq %rbx 1572 addq $16,%rbp 1573 xorq %r14,%r14 1574 subq %r9,%rbp 1575 xorq %r15,%r15 1576 1577 addq %r12,%rax 1578 adcq $0,%rdx 1579 movq %rax,8(%rdi) 1580 movq %rdx,16(%rdi) 1581 movq %r15,24(%rdi) 1582 1583 movq -16(%rsi,%rbp,1),%rax 1584 leaq 48+8(%rsp),%rdi 1585 xorq %r10,%r10 1586 movq 8(%rdi),%r11 1587 1588 leaq (%r14,%r10,2),%r12 1589 shrq $63,%r10 1590 leaq (%rcx,%r11,2),%r13 1591 shrq $63,%r11 1592 orq %r10,%r13 1593 movq 16(%rdi),%r10 1594 movq %r11,%r14 1595 mulq %rax 1596 negq %r15 1597 movq 24(%rdi),%r11 1598 adcq %rax,%r12 1599 movq -8(%rsi,%rbp,1),%rax 1600 movq 
%r12,(%rdi) 1601 adcq %rdx,%r13 1602 1603 leaq (%r14,%r10,2),%rbx 1604 movq %r13,8(%rdi) 1605 sbbq %r15,%r15 1606 shrq $63,%r10 1607 leaq (%rcx,%r11,2),%r8 1608 shrq $63,%r11 1609 orq %r10,%r8 1610 movq 32(%rdi),%r10 1611 movq %r11,%r14 1612 mulq %rax 1613 negq %r15 1614 movq 40(%rdi),%r11 1615 adcq %rax,%rbx 1616 movq 0(%rsi,%rbp,1),%rax 1617 movq %rbx,16(%rdi) 1618 adcq %rdx,%r8 1619 leaq 16(%rbp),%rbp 1620 movq %r8,24(%rdi) 1621 sbbq %r15,%r15 1622 leaq 64(%rdi),%rdi 1623 jmp .Lsqr4x_shift_n_add 1624 1625 .align 32 1626 .Lsqr4x_shift_n_add: 1627 leaq (%r14,%r10,2),%r12 1628 shrq $63,%r10 1629 leaq (%rcx,%r11,2),%r13 1630 shrq $63,%r11 1631 orq %r10,%r13 1632 movq -16(%rdi),%r10 1633 movq %r11,%r14 1634 mulq %rax 1635 negq %r15 1636 movq -8(%rdi),%r11 1637 adcq %rax,%r12 1638 movq -8(%rsi,%rbp,1),%rax 1639 movq %r12,-32(%rdi) 1640 adcq %rdx,%r13 1641 1642 leaq (%r14,%r10,2),%rbx 1643 movq %r13,-24(%rdi) 1644 sbbq %r15,%r15 1645 shrq $63,%r10 1646 leaq (%rcx,%r11,2),%r8 1647 shrq $63,%r11 1648 orq %r10,%r8 1649 movq 0(%rdi),%r10 1650 movq %r11,%r14 1651 mulq %rax 1652 negq %r15 1653 movq 8(%rdi),%r11 1654 adcq %rax,%rbx 1655 movq 0(%rsi,%rbp,1),%rax 1656 movq %rbx,-16(%rdi) 1657 adcq %rdx,%r8 1658 1659 leaq (%r14,%r10,2),%r12 1660 movq %r8,-8(%rdi) 1661 sbbq %r15,%r15 1662 shrq $63,%r10 1663 leaq (%rcx,%r11,2),%r13 1664 shrq $63,%r11 1665 orq %r10,%r13 1666 movq 16(%rdi),%r10 1667 movq %r11,%r14 1668 mulq %rax 1669 negq %r15 1670 movq 24(%rdi),%r11 1671 adcq %rax,%r12 1672 movq 8(%rsi,%rbp,1),%rax 1673 movq %r12,0(%rdi) 1674 adcq %rdx,%r13 1675 1676 leaq (%r14,%r10,2),%rbx 1677 movq %r13,8(%rdi) 1678 sbbq %r15,%r15 1679 shrq $63,%r10 1680 leaq (%rcx,%r11,2),%r8 1681 shrq $63,%r11 1682 orq %r10,%r8 1683 movq 32(%rdi),%r10 1684 movq %r11,%r14 1685 mulq %rax 1686 negq %r15 1687 movq 40(%rdi),%r11 1688 adcq %rax,%rbx 1689 movq 16(%rsi,%rbp,1),%rax 1690 movq %rbx,16(%rdi) 1691 adcq %rdx,%r8 1692 movq %r8,24(%rdi) 1693 sbbq %r15,%r15 1694 leaq 64(%rdi),%rdi 1695 addq 
$32,%rbp 1696 jnz .Lsqr4x_shift_n_add 1697 1698 leaq (%r14,%r10,2),%r12 1699 .byte 0x67 1700 shrq $63,%r10 1701 leaq (%rcx,%r11,2),%r13 1702 shrq $63,%r11 1703 orq %r10,%r13 1704 movq -16(%rdi),%r10 1705 movq %r11,%r14 1706 mulq %rax 1707 negq %r15 1708 movq -8(%rdi),%r11 1709 adcq %rax,%r12 1710 movq -8(%rsi),%rax 1711 movq %r12,-32(%rdi) 1712 adcq %rdx,%r13 1713 1714 leaq (%r14,%r10,2),%rbx 1715 movq %r13,-24(%rdi) 1716 sbbq %r15,%r15 1717 shrq $63,%r10 1718 leaq (%rcx,%r11,2),%r8 1719 shrq $63,%r11 1720 orq %r10,%r8 1721 mulq %rax 1722 negq %r15 1723 adcq %rax,%rbx 1724 adcq %rdx,%r8 1725 movq %rbx,-16(%rdi) 1726 movq %r8,-8(%rdi) 1727 .byte 102,72,15,126,213 1728 __bn_sqr8x_reduction: 1729 xorq %rax,%rax 1730 leaq (%r9,%rbp,1),%rcx 1731 leaq 48+8(%rsp,%r9,2),%rdx 1732 movq %rcx,0+8(%rsp) 1733 leaq 48+8(%rsp,%r9,1),%rdi 1734 movq %rdx,8+8(%rsp) 1735 negq %r9 1736 jmp .L8x_reduction_loop 1737 1738 .align 32 1739 .L8x_reduction_loop: 1740 leaq (%rdi,%r9,1),%rdi 1741 .byte 0x66 1742 movq 0(%rdi),%rbx 1743 movq 8(%rdi),%r9 1744 movq 16(%rdi),%r10 1745 movq 24(%rdi),%r11 1746 movq 32(%rdi),%r12 1747 movq 40(%rdi),%r13 1748 movq 48(%rdi),%r14 1749 movq 56(%rdi),%r15 1750 movq %rax,(%rdx) 1751 leaq 64(%rdi),%rdi 1752 1753 .byte 0x67 1754 movq %rbx,%r8 1755 imulq 32+8(%rsp),%rbx 1756 movq 0(%rbp),%rax 1757 movl $8,%ecx 1758 jmp .L8x_reduce 1759 1760 .align 32 1761 .L8x_reduce: 1762 mulq %rbx 1763 movq 8(%rbp),%rax 1764 negq %r8 1765 movq %rdx,%r8 1766 adcq $0,%r8 1767 1768 mulq %rbx 1769 addq %rax,%r9 1770 movq 16(%rbp),%rax 1771 adcq $0,%rdx 1772 addq %r9,%r8 1773 movq %rbx,48-8+8(%rsp,%rcx,8) 1774 movq %rdx,%r9 1775 adcq $0,%r9 1776 1777 mulq %rbx 1778 addq %rax,%r10 1779 movq 24(%rbp),%rax 1780 adcq $0,%rdx 1781 addq %r10,%r9 1782 movq 32+8(%rsp),%rsi 1783 movq %rdx,%r10 1784 adcq $0,%r10 1785 1786 mulq %rbx 1787 addq %rax,%r11 1788 movq 32(%rbp),%rax 1789 adcq $0,%rdx 1790 imulq %r8,%rsi 1791 addq %r11,%r10 1792 movq %rdx,%r11 1793 adcq $0,%r11 1794 1795 mulq %rbx 
1796 addq %rax,%r12 1797 movq 40(%rbp),%rax 1798 adcq $0,%rdx 1799 addq %r12,%r11 1800 movq %rdx,%r12 1801 adcq $0,%r12 1802 1803 mulq %rbx 1804 addq %rax,%r13 1805 movq 48(%rbp),%rax 1806 adcq $0,%rdx 1807 addq %r13,%r12 1808 movq %rdx,%r13 1809 adcq $0,%r13 1810 1811 mulq %rbx 1812 addq %rax,%r14 1813 movq 56(%rbp),%rax 1814 adcq $0,%rdx 1815 addq %r14,%r13 1816 movq %rdx,%r14 1817 adcq $0,%r14 1818 1819 mulq %rbx 1820 movq %rsi,%rbx 1821 addq %rax,%r15 1822 movq 0(%rbp),%rax 1823 adcq $0,%rdx 1824 addq %r15,%r14 1825 movq %rdx,%r15 1826 adcq $0,%r15 1827 1828 decl %ecx 1829 jnz .L8x_reduce 1830 1831 leaq 64(%rbp),%rbp 1832 xorq %rax,%rax 1833 movq 8+8(%rsp),%rdx 1834 cmpq 0+8(%rsp),%rbp 1835 jae .L8x_no_tail 1836 1837 .byte 0x66 1838 addq 0(%rdi),%r8 1839 adcq 8(%rdi),%r9 1840 adcq 16(%rdi),%r10 1841 adcq 24(%rdi),%r11 1842 adcq 32(%rdi),%r12 1843 adcq 40(%rdi),%r13 1844 adcq 48(%rdi),%r14 1845 adcq 56(%rdi),%r15 1846 sbbq %rsi,%rsi 1847 1848 movq 48+56+8(%rsp),%rbx 1849 movl $8,%ecx 1850 movq 0(%rbp),%rax 1851 jmp .L8x_tail 1852 1853 .align 32 1854 .L8x_tail: 1855 mulq %rbx 1856 addq %rax,%r8 1857 movq 8(%rbp),%rax 1858 movq %r8,(%rdi) 1859 movq %rdx,%r8 1860 adcq $0,%r8 1861 1862 mulq %rbx 1863 addq %rax,%r9 1864 movq 16(%rbp),%rax 1865 adcq $0,%rdx 1866 addq %r9,%r8 1867 leaq 8(%rdi),%rdi 1868 movq %rdx,%r9 1869 adcq $0,%r9 1870 1871 mulq %rbx 1872 addq %rax,%r10 1873 movq 24(%rbp),%rax 1874 adcq $0,%rdx 1875 addq %r10,%r9 1876 movq %rdx,%r10 1877 adcq $0,%r10 1878 1879 mulq %rbx 1880 addq %rax,%r11 1881 movq 32(%rbp),%rax 1882 adcq $0,%rdx 1883 addq %r11,%r10 1884 movq %rdx,%r11 1885 adcq $0,%r11 1886 1887 mulq %rbx 1888 addq %rax,%r12 1889 movq 40(%rbp),%rax 1890 adcq $0,%rdx 1891 addq %r12,%r11 1892 movq %rdx,%r12 1893 adcq $0,%r12 1894 1895 mulq %rbx 1896 addq %rax,%r13 1897 movq 48(%rbp),%rax 1898 adcq $0,%rdx 1899 addq %r13,%r12 1900 movq %rdx,%r13 1901 adcq $0,%r13 1902 1903 mulq %rbx 1904 addq %rax,%r14 1905 movq 56(%rbp),%rax 1906 adcq $0,%rdx 1907 
/*
 * Continuation of the .L8x_tail loop of bn_sqr8x_internal /
 * __bn_sqr8x_reduction (the function entry lies above this chunk, so
 * only the tail is visible here).  From the visible code: rbp walks
 * the modulus, rdi the result window, r8-r15 hold an 8-limb
 * accumulator, rcx counts the 8 limbs of the current word, and rsi
 * carries a borrow/carry mask between passes.  Code is byte-identical
 * to the generated original; comments only.
 */
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

/* last limb of this tail word: fetch next saved multiplier from the stack */
mulq %rbx
movq 48-16+8(%rsp,%rcx,8),%rbx
addq %rax,%r15
adcq $0,%rdx
addq %r15,%r14
movq 0(%rbp),%rax
movq %rdx,%r15
adcq $0,%r15

decl %ecx
jnz .L8x_tail

/* advance modulus pointer; continue while rbp < limit saved at 0+8(%rsp) */
leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae .L8x_tail_done

/* absorb the next 8 limbs of the result window, re-injecting the
 * saved carry (negq %rsi sets CF from the stored mask) */
movq 48+56+8(%rsp),%rbx
negq %rsi
movq 0(%rbp),%rax
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
sbbq %rsi,%rsi

movl $8,%ecx
jmp .L8x_tail

.align 32
.L8x_tail_done:
/* fold the saved top-word carry (stored at (%rdx)) into the accumulator;
 * rax collects the final carry out */
xorq %rax,%rax
addq (%rdx),%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
adcq $0,%rax

negq %rsi
.L8x_no_tail:
/* add the final 8 limbs of the source window */
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
movq -8(%rbp),%rcx
xorq %rsi,%rsi

.byte 102,72,15,126,213  /* movq %xmm2,%rbp -- NOTE(review): presumably restores the modulus pointer saved earlier; confirm against the function head */

/* store the reduced 8-limb word and advance */
movq %r8,0(%rdi)
movq %r9,8(%rdi)
.byte 102,73,15,126,217  /* movq %xmm3,%r9 -- restores a value stashed in xmm3 (purpose not visible here) */
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
leaq 64(%rdi),%rdi

cmpq %rdx,%rdi
jb .L8x_reduction_loop
.byte 0xf3,0xc3  /* rep ret */
.size bn_sqr8x_internal,.-bn_sqr8x_internal
.type __bn_post4x_internal,@function
.align 32
/*
 * __bn_post4x_internal: final constant-time conditional subtraction of
 * the modulus after the 4x/8x Montgomery code.  From the visible code:
 * rbp = modulus, r9 = length in bytes, rax = all-ones/all-zeros select
 * mask (negated below), 4 limbs (32 bytes) handled per loop iteration.
 */
__bn_post4x_internal:
movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx          /* rbx = end of the current window */
movq %r9,%rcx
.byte 102,72,15,126,207  /* movq %xmm1,%rdi -- NOTE(review): presumably restores the output pointer; confirm */
negq %rax
.byte 102,72,15,126,206  /* movq %xmm1,%rsi */
sarq $3+2,%rcx                  /* rcx = num/32: loop count, 4 limbs each */
decq %r12                       /* folds the +1 of the two's-complement subtract (tp - n = tp + ~n + 1) into the first limb */
xorq %r10,%r10
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqr4x_sub_entry
.align 16
.Lsqr4x_sub:
movq 0(%rbp),%r12
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
.Lsqr4x_sub_entry:
/*
 * Constant-time conditional subtract (body of __bn_post4x_internal):
 * rax is an all-ones/all-zeros mask, so (~n[i] & mask) added with the
 * running borrow computes tp - n when the mask is set and tp + 0
 * otherwise, with no data-dependent branch.  Borrow is kept in r10
 * between iterations (negq %r10 re-materializes CF).
 */
leaq 32(%rbp),%rbp
notq %r12
notq %r13
notq %r14
notq %r15
andq %rax,%r12
andq %rax,%r13
andq %rax,%r14
andq %rax,%r15

negq %r10
adcq 0(%rbx),%r12
adcq 8(%rbx),%r13
adcq 16(%rbx),%r14
adcq 24(%rbx),%r15
movq %r12,0(%rdi)
leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi

incq %rcx
jnz .Lsqr4x_sub

movq %r9,%r10
negq %r9
.byte 0xf3,0xc3  /* rep ret */
.size __bn_post4x_internal,.-__bn_post4x_internal
.globl bn_from_montgomery
.hidden bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
/*
 * bn_from_montgomery: dispatcher.  If the limb count in r9d (6th arg,
 * SysV) is a multiple of 8, tail-call the 8x implementation below;
 * otherwise return 0 (caller must use another path).
 */
bn_from_montgomery:
testl $7,%r9d
jz bn_from_mont8x
xorl %eax,%eax
.byte 0xf3,0xc3  /* rep ret */
.size bn_from_montgomery,.-bn_from_montgomery

.type bn_from_mont8x,@function
.align 32
/*
 * bn_from_mont8x: converts a value out of Montgomery form for num a
 * multiple of 8 limbs.  Visible structure: save callee-saved regs with
 * CFI, carve an aligned scratch frame (probing it page by page so the
 * guard page is touched -- .Lfrom_page_walk), copy the input into the
 * frame while zeroing its upper half (.Lmul_by_1), run
 * __bn_sqr8x_reduction + __bn_post4x_internal, then wipe the scratch
 * area (.Lfrom_mont_zero) before restoring registers.
 */
bn_from_mont8x:
.cfi_startproc
.byte 0x67  /* address-size prefix used as padding by the generator */
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lfrom_prologue:

shll $3,%r9d                    /* r9 = num in bytes */
leaq (%r9,%r9,2),%r10           /* r10 = 3*num bytes */
negq %r9
movq (%r8),%r8                  /* r8 = n0 (first word of *n0 argument) */

/* choose a frame position that avoids aliasing rdi modulo 4096 */
leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp .Lfrom_sp_done

.align 32
.Lfrom_sp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lfrom_sp_done:
andq $-64,%rbp
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lfrom_page_walk
jmp .Lfrom_page_walk_done

/* touch each page of the new frame so the stack guard page is hit in order */
.Lfrom_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lfrom_page_walk
.Lfrom_page_walk_done:

movq %r9,%r10
negq %r9

movq %r8,32(%rsp)               /* stash n0 */
movq %rax,40(%rsp)              /* stash original rsp for the epilogue */
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
movq %r9,%r11
leaq 48(%rsp),%rax
pxor %xmm0,%xmm0
jmp .Lmul_by_1

/* copy 64 input bytes per pass into the frame, zeroing the mirror half */
.align 32
.Lmul_by_1:
movdqu (%rsi),%xmm1
movdqu 16(%rsi),%xmm2
movdqu 32(%rsi),%xmm3
movdqa %xmm0,(%rax,%r9,1)
movdqu 48(%rsi),%xmm4
movdqa %xmm0,16(%rax,%r9,1)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00  /* leaq 64(%rsi),%rsi (7-byte form) */
movdqa %xmm1,(%rax)
movdqa %xmm0,32(%rax,%r9,1)
movdqa %xmm2,16(%rax)
movdqa %xmm0,48(%rax,%r9,1)
movdqa %xmm3,32(%rax)
movdqa %xmm4,48(%rax)
leaq 64(%rax),%rax
subq $64,%r11
jnz .Lmul_by_1

.byte 102,72,15,110,207  /* movq %rdi,%xmm1 -- save output pointer for __bn_post4x_internal */
.byte 102,72,15,110,209  /* movq %rcx,%xmm2 */
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218  /* movq %r10,%xmm3 */
call __bn_sqr8x_reduction
call __bn_post4x_internal

pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
jmp .Lfrom_mont_zero

/* wipe the scratch frame (64 bytes per pass) before leaving */
.align 32
.Lfrom_mont_zero:
movq 40(%rsp),%rsi
.cfi_def_cfa %rsi,8
movdqa %xmm0,0(%rax)
movdqa %xmm0,16(%rax)
movdqa %xmm0,32(%rax)
movdqa %xmm0,48(%rax)
leaq 64(%rax),%rax
subq $32,%r9
jnz .Lfrom_mont_zero

movq $1,%rax                    /* return 1: conversion performed */
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
.byte 0xf3,0xc3  /* rep ret */
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
.globl bn_scatter5
.hidden bn_scatter5
.type bn_scatter5,@function
.align 16
/*
 * bn_scatter5: store esi qwords from rdi into the table at rdx,
 * starting at slot rcx, one qword every 256 bytes (the stride that
 * bn_gather5 reads back).  No-op when the count is zero.
 */
bn_scatter5:
cmpl $0,%esi
jz .Lscatter_epilogue
leaq (%rdx,%rcx,8),%rdx
.Lscatter:
movq (%rdi),%rax
leaq 8(%rdi),%rdi
movq %rax,(%rdx)
leaq 256(%rdx),%rdx
subl $1,%esi
jnz .Lscatter
.Lscatter_epilogue:
.byte 0xf3,0xc3  /* rep ret */
.size bn_scatter5,.-bn_scatter5

.globl bn_gather5
.hidden bn_gather5
.type bn_gather5,@function
.align 32
/*
 * bn_gather5: cache-timing-safe table lookup.  Builds sixteen 16-byte
 * equality masks on the stack from the index in ecx (pcmpeqd against
 * the counters generated from .Linc), then in .Lgather reads EVERY
 * table entry and combines them with pand/por so the selected entry is
 * extracted without any secret-dependent memory address.
 */
bn_gather5:
.LSEH_begin_bn_gather5:

.byte 0x4c,0x8d,0x14,0x24              /* leaq (%rsp),%r10 -- save rsp */
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00  /* subq $0x108,%rsp */
leaq .Linc(%rip),%rax
andq $-16,%rsp

movd %ecx,%xmm5
movdqa 0(%rax),%xmm0
movdqa 16(%rax),%xmm1
leaq 128(%rdx),%r11
leaq 128(%rsp),%rax

/* generate the 16 index-equality masks at -128(%rax)..112(%rax) */
pshufd $0,%xmm5,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-128(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-112(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-96(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-80(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-64(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-48(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-16(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,0(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,16(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,48(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,64(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,80(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,96(%rax)
movdqa %xmm4,%xmm2
movdqa %xmm3,112(%rax)
jmp .Lgather

/* per output qword: AND all 16 table rows with their masks, OR together */
.align 32
.Lgather:
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r11),%xmm0
movdqa -112(%r11),%xmm1
movdqa -96(%r11),%xmm2
pand -128(%rax),%xmm0
movdqa -80(%r11),%xmm3
pand -112(%rax),%xmm1
por %xmm0,%xmm4
pand -96(%rax),%xmm2
por %xmm1,%xmm5
pand -80(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r11),%xmm0
movdqa -48(%r11),%xmm1
movdqa -32(%r11),%xmm2
pand -64(%rax),%xmm0
movdqa -16(%r11),%xmm3
pand -48(%rax),%xmm1
por %xmm0,%xmm4
pand -32(%rax),%xmm2
por %xmm1,%xmm5
pand -16(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r11),%xmm0
movdqa 16(%r11),%xmm1
movdqa 32(%r11),%xmm2
pand 0(%rax),%xmm0
movdqa 48(%r11),%xmm3
pand 16(%rax),%xmm1
por %xmm0,%xmm4
pand 32(%rax),%xmm2
por %xmm1,%xmm5
pand 48(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r11),%xmm0
movdqa 80(%r11),%xmm1
movdqa 96(%r11),%xmm2
pand 64(%rax),%xmm0
movdqa 112(%r11),%xmm3
pand 80(%rax),%xmm1
por %xmm0,%xmm4
pand 96(%rax),%xmm2
por %xmm1,%xmm5
pand 112(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
leaq 256(%r11),%r11
pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0                 /* fold high qword into low: selected value */
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather

leaq (%r10),%rsp                /* restore caller rsp saved at entry */
.byte 0xf3,0xc3  /* rep ret */
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
.align 64
/* counter seed vectors for the pcmpeqd mask generation above */
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
/* ASCII banner: "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif