#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice as fast. The most common case, rsa1024 sign, is improved
# by a respectable 50%. It remains to be seen whether loop unrolling
# and a dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Set $addx to one once build problems are resolved.
$addx = 0;
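
# The assembly below implements word-by-word ("CIOS"-style) Montgomery
# multiplication.  A rough, illustrative C sketch of what bn_mul_mont
# computes follows (not part of the generated code; mont_mul_sketch and
# its scratch argument tp[] are hypothetical, it assumes a compiler with
# unsigned __int128, and it ignores constant-time concerns):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	// rp[] = ap[] * bp[] * 2^(-64*num) mod np[],
#	// where n0[0] == -np[0]^-1 mod 2^64 and tp[] has num+2 scratch words.
#	static void mont_mul_sketch(uint64_t *rp, const uint64_t *ap,
#	                            const uint64_t *bp, const uint64_t *np,
#	                            const uint64_t *n0, int num, uint64_t *tp) {
#		memset(tp, 0, (num + 2) * sizeof(uint64_t));
#		for (int i = 0; i < num; i++) {
#			unsigned __int128 c = 0;
#			for (int j = 0; j < num; j++) {	// tp += ap[] * bp[i]
#				c += (unsigned __int128)ap[j] * bp[i] + tp[j];
#				tp[j] = (uint64_t)c;  c >>= 64;
#			}
#			c += tp[num];
#			tp[num] = (uint64_t)c;  tp[num + 1] = (uint64_t)(c >> 64);
#			uint64_t m = tp[0] * n0[0];	// reduction multiplier
#			c = (unsigned __int128)np[0] * m + tp[0];  c >>= 64;	// low word discarded
#			for (int j = 1; j < num; j++) {	// tp = (tp + np[]*m) / 2^64
#				c += (unsigned __int128)np[j] * m + tp[j];
#				tp[j - 1] = (uint64_t)c;  c >>= 64;
#			}
#			c += tp[num];
#			tp[num - 1] = (uint64_t)c;
#			tp[num] = tp[num + 1] + (uint64_t)(c >> 64);
#		}
#		unsigned __int128 b = 0;		// rp = tp - np, tracking the borrow
#		for (int j = 0; j < num; j++) {
#			b = (unsigned __int128)tp[j] - np[j] - b;
#			rp[j] = (uint64_t)b;  b >>= 127;
#		}
#		if (tp[num] < (uint64_t)b)		// tp < np: keep tp instead
#			memcpy(rp, tp, num * sizeof(uint64_t));
#	}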

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	8(%r11),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished with a SEGV. But page walking
	# does good even on other OSes, because it guarantees that a
	# villain thread hits the guard page before it can do damage
	# to an innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
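
	# Outer loop: for each word bp[i], accumulate ap[]*bp[i] into the
	# temporary vector tp[] kept at (%rsp), then add np[]*m1 with
	# m1 = tp[0]*n0 so that the lowest word becomes zero, and shift
	# tp[] down by one word.  After num iterations tp[] holds
	# ap*bp*2^(-64*num) mod np, up to one final subtraction of np.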
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:
	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:
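
	# Stack frame layout (same scheme as bn_mul_mont above):
	#   (%rsp)          ... tp[0..num], temporary vector plus overflow word
	#   8(%rsp,num,8)   ... saved original %rsp
	#   16(%rsp,num,8)  ... saved rp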
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]
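
	# Steps j=0 (above) and j=1 (below) of this outer iteration are
	# peeled off; execution then falls into the 4-way unrolled
	# .Linner4x loop, which handles four words of ap[]/np[] per pass.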
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!
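
	# Conditionally subtract the modulus: compute rp[] = tp[] - np[]
	# while recording the final borrow, then use a mask derived from
	# that borrow to copy either the difference (already in rp[]) or
	# tp[] itself back into rp[], without branching on the borrow.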
	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-4($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
	shr	\$2,$j			# j=num/4-1

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num); has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___ if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
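	# (In other words, keep the 4K-page offset of the tp[] frame away
	# from that of a[]: stores to tp[] and loads from a[] whose
	# addresses collide modulo 4096 can be mistaken for dependent by
	# the CPU's memory disambiguation and cause needless stalls.)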
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0, 32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	mov	8(%rax),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}
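
# The code path below is used when $addx is enabled.  It relies on the
# BMI2/ADX extensions: MULX takes its implicit multiplicand in %rdx and
# does not touch the flags, while ADCX and ADOX add with carry through CF
# and OF respectively, giving two independent carry chains that can be
# interleaved.  Roughly (illustrative only):
#
#	mulx	8(%rsi),%rax,%r10	# %r10:%rax = %rdx * 8(%rsi), flags unchanged
#	adcx	%rax,%r11		# accumulate via the CF chain
#	adox	%r10,%r12		# accumulate via the OF chain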
if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st
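
	# First-pass loop: four words of a[]*b[0] plus the matching words
	# of n[]*m per iteration, spreading the additions over the
	# independent CF (adcx) and OF (adox) carry chains set up above.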
.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner
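
	# Inner loop for the remaining b[i]: same structure as .Lmulx4x_1st,
	# but the previously accumulated tp[] words are folded in as well.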
.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
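
# The handlers below provide Win64 structured exception handling support.
# Given a fault anywhere between a function's prologue and epilogue labels,
# they recover the caller's stack pointer from the frame slot where it was
# saved and restore the callee-saved registers, so that the OS unwinder can
# continue past these functions.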
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler
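
# The .pdata section maps each function's code range to its unwind
# information; the .xdata entries below name the handler that services it
# together with the prologue/body/epilogue labels it receives as
# HandlerData[].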
.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT;