#!/usr/bin/env perl

# Copyright (c) 2014, Intel Corporation.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Developers and authors:
# Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center
# (2) University of Haifa

# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# 256 Bit Primes"

# Further optimization by <appro@openssl.org>:
#
#			this/original
# Opteron	+12-49%
# Bulldozer	+14-45%
# P4		+18-46%
# Westmere	+12-34%
# Sandy Bridge	+9-35%
# Ivy Bridge	+9-35%
# Haswell	+8-37%
# Broadwell	+18-58%
# Atom		+15-50%
# VIA Nano	+43-160%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# TODO: enable these after testing. $avx goes to two and $addx to one.
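# Illustrative usage (assumed typical perlasm invocation, not normative):
#
#	perl ecp_nistz256-x86_64.pl elf  ecp_nistz256-x86_64.S    # Linux/ELF
#	perl ecp_nistz256-x86_64.pl nasm ecp_nistz256-x86_64.asm  # Win64/NASM
#
# $flavour selects the output dialect handled by x86_64-xlate.pl; passing
# a single file name instead sends output to that file.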
$avx=0;
$addx=0;

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
___

{
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
.align	32
ecp_nistz256_neg:
	push	%r12
	push	%r13

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
___
}
{
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void ecp_nistz256_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);

.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,\@function,3
.align	32
ecp_nistz256_mul_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___	if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
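
################################################################################
# Outline of __ecp_nistz256_mul_montq below (illustrative pseudo-code only;
# the real code inlines everything into one register-resident flow):
#
#	for (i = 0; i < 4; i++) {
#		acc += a * b[i];	# one mulq chain per limb of b
#		reduce_step(acc);	# cancel the lowest limb via p256
#	}
#	if (acc >= p256) acc -= p256;	# branch-less, via sbb/cmovc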

.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
	# Due to the special form of p256 we do some optimizations
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# then we add acc[0] and get acc[0] x 2^96

	mov	$acc0, $t1
	shl	\$32, $acc0
	mulq	$poly3
	shr	\$32, $t1
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t1, $acc2
	adc	%rax, $acc3
	mov	8*1($b_ptr), %rax
	adc	%rdx, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0

	########################################################################
	# Multiply by b[1]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	mov	$acc1, $t1
	shl	\$32, $acc1
	mulq	$poly3
	shr	\$32, $t1
	add	$acc1, $acc2
	adc	$t1, $acc3
	adc	%rax, $acc4
	mov	8*2($b_ptr), %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1

	########################################################################
	# Multiply by b[2]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	mov	$acc2, $t1
	shl	\$32, $acc2
	mulq	$poly3
	shr	\$32, $t1
	add	$acc2, $acc3
	adc	$t1, $acc4
	adc	%rax, $acc5
	mov	8*3($b_ptr), %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2

	########################################################################
	# Multiply by b[3]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

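	########################################################################
	# Each reduction step (three above, the final one below) cancels the
	# lowest live accumulator limb u by adding u*p256.  Rough C-like
	# sketch of one step (illustrative only; carries elided):
	#
	#	u = acc[i];
	#	# u*(p[0] + p[1]*2^64) == u*(2^96 - 1), so together with the
	#	# implicit +u that zeroes acc[i] it degenerates to u<<96:
	#	acc[i+1] += u << 32;		# low  half of u*2^96
	#	acc[i+2] += u >> 32;		# high half of u*2^96
	#	acc[i+3..i+4] += u * 0xffffffff00000001;	# u*p[3] at 2^192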
	########################################################################
	# Final reduction step
	mov	$acc3, $t1
	shl	\$32, $acc3
	mulq	$poly3
	shr	\$32, $t1
	add	$acc3, $acc4
	adc	$t1, $acc5
	mov	$acc4, $t0
	adc	%rax, $acc0
	adc	%rdx, $acc1
	mov	$acc5, $t1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc0, $t2
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t3
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t0, $acc4
	cmovc	$t1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t2, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t3, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq

################################################################################
# void ecp_nistz256_sqr_mont(
#   uint64_t res[4],
#   uint64_t a[4]);

# we optimize the square according to S.Gueron and V.Krasnov,
# "Speeding up Big-Number Squaring"
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,\@function,2
.align	32
ecp_nistz256_sqr_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lsqr_montx
___
$code.=<<___;
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
___
$code.=<<___	if ($addx);
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_sqr_montx
___
$code.=<<___;
.Lsqr_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montq:
	mov	%rax, $acc5
	mulq	$acc6			# a[1]*a[0]
	mov	%rax, $acc1
	mov	$acc7, %rax
	mov	%rdx, $acc2

	mulq	$acc5			# a[0]*a[2]
	add	%rax, $acc2
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc5			# a[0]*a[3]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	#################################
	mulq	$acc6			# a[1]*a[2]
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	$acc6			# a[1]*a[3]
	add	%rax, $acc4
	mov	$acc0, %rax
	adc	\$0, %rdx
	add	$t1, $acc4
	mov	%rdx, $acc5
	adc	\$0, $acc5

	#################################
	mulq	$acc7			# a[2]*a[3]
	xor	$acc7, $acc7
	add	%rax, $acc5
	mov	8*0($a_ptr), %rax
	mov	%rdx, $acc6
	adc	\$0, $acc6

	add	$acc1, $acc1		# acc1:6<<1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7
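
	#################################
	# The doubling above is the squaring shortcut from the paper cited
	# at ecp_nistz256_sqr_mont: off-diagonal products a[i]*a[j], i<j,
	# are computed once and doubled, then the diagonal squares are
	# folded in below, i.e. (illustrative)
	#
	#	a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i)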

	mulq	%rax
	mov	%rax, $acc0
	mov	8*1($a_ptr), %rax
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc1
	adc	%rax, $acc2
	mov	8*2($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc3
	adc	%rax, $acc4
	mov	8*3($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc5
	adc	%rax, $acc6
	mov	$acc0, %rax
	adc	%rdx, $acc7

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1

	##########################################
	# Now the reduction
	# First iteration
	mov	$acc0, $t0
	shl	\$32, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	##########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t1
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t1
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	adc	%rax, $acc2
	adc	\$0, %rdx
	xor	$acc3, $acc3

	############################################
	# Add the rest of the acc
	add	$acc0, $acc4
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	%rdx, $acc7
	mov	$acc5, $acc1
	adc	\$0, $acc3

	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $t0
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $acc3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t0, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___

if ($addx) {
$code.=<<___;
.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montx:
	########################################################################
	# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mov	\$32, $poly1
	xor	$acc5, $acc5		# cf=0
	mulx	$acc3, $t1, $acc3
	mov	.Lpoly+8*3(%rip), $poly3
	adc	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	mov	$acc0, %rdx
	adc	$t1, $acc2
	shlx	$poly1,$acc0,$t1
	adc	$t0, $acc3
	shrx	$poly1,$acc0,$t0
	adc	\$0, $acc4

	########################################################################
	# First reduction step
	add	$t1, $acc1
	adc	$t0, $acc2

	mulx	$poly3, $t0, $t1
	mov	8*1($b_ptr), %rdx
	adc	$t0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

	########################################################################
	# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc1, %rdx
	adcx	$t0, $acc4
	shlx	$poly1, $acc1, $t0
	adox	$t1, $acc5
	shrx	$poly1, $acc1, $t1

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	add	$t0, $acc2
	adc	$t1, $acc3

	mulx	$poly3, $t0, $t1
	mov	8*2($b_ptr), %rdx
	adc	$t0, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1		# $acc1=0,cf=0,of=0

	########################################################################
	# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc2, %rdx
	adcx	$t0, $acc5
	shlx	$poly1, $acc2, $t0
	adox	$t1, $acc0
	shrx	$poly1, $acc2, $t1

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	add	$t0, $acc3
	adc	$t1, $acc4

	mulx	$poly3, $t0, $t1
	mov	8*3($b_ptr), %rdx
	adc	$t0, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

	########################################################################
	# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc3, %rdx
	adcx	$t0, $acc0
	shlx	$poly1, $acc3, $t0
	adox	$t1, $acc1
	shrx	$poly1, $acc3, $t1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2

	########################################################################
	# Fourth reduction step
	add	$t0, $acc4
	adc	$t1, $acc5

	mulx	$poly3, $t0, $t1
	mov	$acc4, $t2
	mov	.Lpoly+8*1(%rip), $poly1
	adc	$t0, $acc0
	mov	$acc5, $t3
	adc	$t1, $acc1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	xor	%eax, %eax
	mov	$acc0, $t0
	sbb	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t1
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t0, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t1, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
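
########################################################################
# Note on the mulx/adcx/adox code above and below: ADX keeps two
# independent carry chains alive (CF for adcx, OF for adox), so the low
# and high halves of each 64x64 product feed separate accumulation
# streams, roughly (illustrative):
#
#	cf: acc[j]   += lo(a[j]*b[i]);		# adcx
#	of: acc[j+1] += hi(a[j]*b[i]);		# adox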

.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	xor	%eax, %eax
	adc	$t0, $acc2
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	mov	$acc6, %rdx
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0

	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5

	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	8*0+128($a_ptr), %rdx
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	mulx	%rdx, $acc0, $t1
	mov	8*1+128($a_ptr), %rdx
	adcx	$acc3, $acc3
	adox	$t1, $acc1
	adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	mov	8*2+128($a_ptr), %rdx
	adcx	$acc5, $acc5
	adox	$t0, $acc2
	adcx	$acc6, $acc6
	.byte	0x67
	mulx	%rdx, $t0, $t1
	mov	8*3+128($a_ptr), %rdx
	adox	$t4, $acc3
	adcx	$acc7, $acc7
	adox	$t0, $acc4
	mov	\$32, $a_ptr
	adox	$t1, $acc5
	.byte	0x67,0x67
	mulx	%rdx, $t0, $t4
	mov	$acc0, %rdx
	adox	$t0, $acc6
	shlx	$a_ptr, $acc0, $t0
	adox	$t4, $acc7
	shrx	$a_ptr, $acc0, $t4
	mov	.Lpoly+8*3(%rip), $t1

	# reduction step 1
	add	$t0, $acc1
	adc	$t4, $acc2

	mulx	$t1, $t0, $acc0
	mov	$acc1, %rdx
	adc	$t0, $acc3
	shlx	$a_ptr, $acc1, $t0
	adc	\$0, $acc0
	shrx	$a_ptr, $acc1, $t4

	# reduction step 2
	add	$t0, $acc2
	adc	$t4, $acc3

	mulx	$t1, $t0, $acc1
	mov	$acc2, %rdx
	adc	$t0, $acc0
	shlx	$a_ptr, $acc2, $t0
	adc	\$0, $acc1
	shrx	$a_ptr, $acc2, $t4

	# reduction step 3
	add	$t0, $acc3
	adc	$t4, $acc0

	mulx	$t1, $t0, $acc2
	mov	$acc3, %rdx
	adc	$t0, $acc1
	shlx	$a_ptr, $acc3, $t0
	adc	\$0, $acc2
	shrx	$a_ptr, $acc3, $t4

	# reduction step 4
	add	$t0, $acc0
	adc	$t4, $acc1

	mulx	$t1, $t0, $acc3
	adc	$t0, $acc2
	adc	\$0, $acc3

	xor	$t3, $t3		# cf=0
	adc	$acc0, $acc4		# accumulate upper half
	mov	.Lpoly+8*1(%rip), $a_ptr
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	$acc3, $acc7
	mov	$acc5, $acc1
	adc	\$0, $t3

	xor	%eax, %eax		# cf=0
	sbb	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $acc3
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $t3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$acc3, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
}
}
{
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));

$code.=<<___;
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_select_w5:
___
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w5
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $ONE
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd
	pxor	$Re, $Re
	pxor	$Rf, $Rf

	movdqa	$ONE, $M0
	pshufd	\$0, $INDEX, $INDEX

	mov	\$16, %rax
.Lselect_loop_sse_w5:

	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	pcmpeqd	$INDEX, $TMP0

	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	movdqa	16*4($in_t), $T0e
	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	pand	$TMP0, $T0e
	por	$T0d, $Rd
	pand	$TMP0, $T0f
	por	$T0e, $Re
	por	$T0f, $Rf

	dec	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
	movdqu	$Re, 16*4($val)
	movdqu	$Rf, 16*5($val)
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_select_w7:
___
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w7
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $M0
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd

	movdqa	$M0, $ONE
	pshufd	\$0, $INDEX, $INDEX
	mov	\$64, %rax

.Lselect_loop_sse_w7:
	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	lea	16*4($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	prefetcht0	255($in_t)
	por	$T0d, $Rd

	dec	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));

$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w5:
.Lavx2_select_w5:
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LTwo(%rip), $TWO

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb
	vpxor	$Rc, $Rc, $Rc

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX

	mov	\$8, %rax
.Lselect_loop_avx2_w5:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b
	vmovdqa	32*2($in_t), $T0c

	vmovdqa	32*3($in_t), $T1a
	vmovdqa	32*4($in_t), $T1b
	vmovdqa	32*5($in_t), $T1c

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1

	vpaddd	$TWO, $M0, $M0
	vpaddd	$TWO, $M1, $M1
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP0, $T0c, $T0c
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP1, $T1c, $T1c

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T0c, $Rc, $Rc
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T1c, $Rc, $Rc

	dec	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vmovdqu	$Rc, 32*2($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
___
}
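########################################################################
# The SSE paths above and the AVX2 paths below implement the same
# constant-time table lookup: every table entry is read and masked, so
# the memory access pattern is independent of the secret index.  A
# scalar C sketch (illustrative only; constant_time_eq is a hypothetical
# helper returning all-ones or all-zero):
#
#	for (i = 0; i < N_ENTRIES; i++) {
#		uint64_t mask = constant_time_eq(i + 1, index);
#		for (j = 0; j < WORDS; j++)
#			val[j] |= in_t[i*WORDS + j] & mask;
#	}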
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));

$code.=<<___;

################################################################################
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w7:
.Lavx2_select_w7:
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LThree(%rip), $THREE

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1
	vmovdqa	.LThree(%rip), $M2

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX
	# Skip index = 0, because it is implicitly the point at infinity

	mov	\$21, %rax
.Lselect_loop_avx2_w7:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vmovdqa	32*2($in_t), $T1a
	vmovdqa	32*3($in_t), $T1b

	vmovdqa	32*4($in_t), $T2a
	vmovdqa	32*5($in_t), $T2b

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1
	vpcmpeqd	$INDEX, $M2, $TMP2

	vpaddd	$THREE, $M0, $M0
	vpaddd	$THREE, $M1, $M1
	vpaddd	$THREE, $M2, $M2
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP2, $T2a, $T2a
	vpand	$TMP2, $T2b, $T2b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T2a, $Ra, $Ra
	vpxor	$T2b, $Rb, $Rb

	dec	%rax
	jnz	.Lselect_loop_avx2_w7


	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vpcmpeqd	$INDEX, $M0, $TMP0

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow
# out-of-order execution logic to overlap computations from next step
# with tail processing from current step. By using tailored calling
# sequence we minimize inter-step overhead to give processor better
# shot at overlapping operations...
#
# You will notice that input data is copied to stack. Trouble is that
# there are no registers to spare for holding original pointers, and
# reloading the pointers from memory would create undesired dependencies
# on the effective-address calculation paths. In other words, the
# copying is done deliberately to favour out-of-order execution logic.
# <appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	8*0+$a, $src0
	mov	8*1+$a, $acc6
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc7
	mov	8*3+$a, $acc0"
}

{
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_toq,\@abi-omnipotent
.align	32
__ecp_nistz256_add_toq:
	xor	$t4,$t4
	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq

.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromq:
	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	add	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
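
	########################################################################
	# All modular add/sub helpers in this block share one branch-less
	# pattern (illustrative C-like sketch):
	#
	#	r  = a op b;		# full 256-bit add/sub, carry kept
	#	r2 = r -/+ p256;	# tentative correction
	#	r  = select(r, r2);	# cmov on the carry/borrow, no branch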

.type	__ecp_nistz256_subq,\@abi-omnipotent
.align	32
__ecp_nistz256_subq:
	sub	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	$t4, $t4

	add	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3
	test	$t4, $t4

	cmovnz	$t0, $a0
	cmovnz	$t1, $a1
	cmovnz	$t2, $a2
	cmovnz	$t3, $a3

	ret
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq

.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2q:
	xor	$t4, $t4
	add	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
___
}
sub gen_double () {
my $x = shift;
my ($src0,$sfx,$bias);
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,\@function,2
.align	32
ecp_nistz256_point_double:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_doublex
___
} else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_doublex,\@function,2
.align	32
ecp_nistz256_point_doublex:
.Lpoint_doublex:
___
}
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*5+8, %rsp

.Lpoint_double_shortcut$x:
	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr.x
	mov	$a_ptr, $b_ptr		# backup copy
	movdqu	0x10($a_ptr), %xmm1
	mov	0x20+8*0($a_ptr), $acc4	# load in_y in "5-4-0-1" order
	mov	0x20+8*1($a_ptr), $acc5
	mov	0x20+8*2($a_ptr), $acc0
	mov	0x20+8*3($a_ptr), $acc1
	mov	.Lpoly+8*1(%rip), $poly1
	mov	.Lpoly+8*3(%rip), $poly3
	movdqa	%xmm0, $in_x(%rsp)
	movdqa	%xmm1, $in_x+0x10(%rsp)
	lea	0x20($r_ptr), $acc2
	lea	0x40($r_ptr), $acc3
	movq	$r_ptr, %xmm0
	movq	$acc2, %xmm1
	movq	$acc3, %xmm2

	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);

	mov	0x40+8*0($a_ptr), $src0
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	lea	0x40-$bias($a_ptr), $a_ptr
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);

`&load_for_sqr("$S(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);

	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
	mov	0x40+8*0($b_ptr), $acc1
	mov	0x40+8*1($b_ptr), $acc2
	mov	0x40+8*2($b_ptr), $acc3
	mov	0x40+8*3($b_ptr), $acc4
	lea	0x40-$bias($b_ptr), $a_ptr
	lea	0x20($b_ptr), $b_ptr
	movq	%xmm2, $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);

`&load_for_sqr("$S(%rsp)", "$src0")`
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
___
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
my ($poly1,$poly3)=($a_ptr,$t1);
my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);

$code.=<<___;
	xor	$t4, $t4
	mov	$a0, $t0
	add	\$-1, $a0
	mov	$a1, $t1
	adc	$poly1, $a1
	mov	$a2, $t2
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	mov	$a0, 8*0($r_ptr)
	shr	\$1, $a3
	mov	$a1, 8*1($r_ptr)
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
___
}
$code.=<<___;
`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x

	lea	$M(%rsp), $b_ptr
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);

`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);

`&load_for_sqr("$M(%rsp)", "$src0")`
	movq	%xmm0, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);

	lea	$tmp0(%rsp), $b_ptr
	mov	$acc6, $acc0			# harmonize sqr output and sub input
	mov	$acc7, $acc1
	mov	$a_ptr, $poly1
	mov	$t1, $poly3
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);

	mov	$S+8*0(%rsp), $t0
	mov	$S+8*1(%rsp), $t1
	mov	$S+8*2(%rsp), $t2
	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);

	mov	$M(%rsp), $src0
	lea	$M(%rsp), $b_ptr
	mov	$acc4, $acc6			# harmonize sub output and mul input
	xor	%ecx, %ecx
	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
	mov	$acc5, $acc2
	mov	$acc5, $S+8*1(%rsp)
	cmovz	$acc0, $acc3
	mov	$acc0, $S+8*2(%rsp)
	lea	$S-$bias(%rsp), $a_ptr
	cmovz	$acc1, $acc4
	mov	$acc1, $S+8*3(%rsp)
	mov	$acc6, $acc1
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);

	movq	%xmm1, $b_ptr
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
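
	########################################################################
	# The calls above implement the usual Jacobian doubling (cf. the
	# p256_* comments): with Zsqr = Z^2,
	#	S = 4*X*Y^2,  M = 3*(X + Zsqr)*(X - Zsqr),
	#	res_x = M^2 - 2*S,
	#	res_y = M*(S - res_x) - 8*Y^4,
	#	res_z = 2*Y*Z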

	add	\$32*5+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
___
}
&gen_double("q");

sub gen_add () {
my $x = shift;
my ($src0,$sfx,$bias);
my ($H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2,
    $res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,\@function,3
.align	32
ecp_nistz256_point_add:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_addx
___
} else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_addx,\@function,3
.align	32
ecp_nistz256_point_addx:
.Lpoint_addx:
___
}
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*18+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	$a_ptr, $b_ptr		# reassign
	mov	$b_org, $a_ptr		# reassign
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($a_ptr), %xmm3
	mov	0x40+8*0($a_ptr), $src0	# load original in2_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	movdqu	0x40($a_ptr), %xmm0	# in2_z again
	movdqu	0x50($a_ptr), %xmm1
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0		# save $r_ptr

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	mov	$src0, $in2_z+8*0(%rsp)	# make in2_z copy
	mov	$acc6, $in2_z+8*1(%rsp)
	mov	$acc7, $in2_z+8*2(%rsp)
	mov	$acc0, $in2_z+8*3(%rsp)
	lea	$Z2sqr(%rsp), $r_ptr	# Z2^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm1, %xmm4
	por	%xmm1, %xmm4
	pshufd	\$0, %xmm5, %xmm5	# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4	# in2infty
	mov	0x40+8*0($b_ptr), $src0	# load original in1_z
	mov	0x40+8*1($b_ptr), $acc6
	mov	0x40+8*2($b_ptr), $acc7
	mov	0x40+8*3($b_ptr), $acc0
	movq	$b_ptr, %xmm1

	lea	0x40-$bias($b_ptr), $a_ptr
	lea	$Z1sqr(%rsp), $r_ptr	# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr	# S1 = Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);

`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr	# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr	# S1 = Y1*Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);

`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr	# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$S1(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr	# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);

	or	$acc5, $acc4		# see if result is zero
	movdqa	%xmm4, %xmm2
	or	$acc0, $acc4
	or	$acc1, $acc4
	por	%xmm5, %xmm2		# in1infty || in2infty
	movq	$acc4, %xmm3

`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U1(%rsp), $r_ptr	# U1 = X1*Z2^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);

`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr	# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);

	lea	$U1(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr	# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);

	or	$acc5, $acc4		# see if result is zero
	or	$acc0, $acc4
	or	$acc1, $acc4

	.byte	0x3e			# predict taken
	jnz	.Ladd_proceed$x		# is_equal(U1,U2)?
	movq	%xmm2, $acc0
	movq	%xmm3, $acc1
	test	$acc0, $acc0
	jnz	.Ladd_proceed$x		# (in1infty || in2infty)?
	test	$acc1, $acc1
	jz	.Ladd_double$x		# is_equal(S1,S2)?

	movq	%xmm0, $r_ptr		# restore $r_ptr
	pxor	%xmm0, %xmm0
	movdqu	%xmm0, 0x00($r_ptr)
	movdqu	%xmm0, 0x10($r_ptr)
	movdqu	%xmm0, 0x20($r_ptr)
	movdqu	%xmm0, 0x30($r_ptr)
	movdqu	%xmm0, 0x40($r_ptr)
	movdqu	%xmm0, 0x50($r_ptr)
	jmp	.Ladd_done$x

.align	32
.Ladd_double$x:
	movq	%xmm1, $a_ptr		# restore $a_ptr
	movq	%xmm0, $r_ptr		# restore $r_ptr
	add	\$`32*(18-5)`, %rsp	# difference in frame sizes
	jmp	.Lpoint_double_shortcut$x

.align	32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr	# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr	# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr	# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr	# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);

`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr	# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr	# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
___
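########################################################################
# At this point the classic Jacobian addition intermediates are ready:
#	U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
#	H = U2 - U1,  R = S2 - S1
# and the code below assembles
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3
#	Z3 = H*Z1*Z2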
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$res_y(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't write it
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);

`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);

	movq	%xmm0, $r_ptr		# restore $r_ptr
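
	########################################################################
	# The pandn/pand/por blocks below implement copy_conditional() with
	# the all-ones/all-zero in1infty/in2infty masks computed earlier,
	# i.e. (illustrative): res = (src & mask) | (res & ~mask);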

	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_z(%rsp), %xmm2
	pand	$in2_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

.Ladd_done$x:
	add	\$32*18+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");

sub gen_add_affine () {
my $x = shift;
my ($src0,$sfx,$bias);
my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
    $res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y)=map(32*$_,(0..14));
my $Z1sqr = $S2;

if ($x ne "x") {
	$src0 = "%rax";
	$sfx = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
.align	32
ecp_nistz256_point_add_affine:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
} else {
	$src0 = "%rdx";
	$sfx = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
}
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr		# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	0x40+8*0($a_ptr), $src0	# load original in1_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($b_ptr), %xmm0	# copy *(P256_POINT_AFFINE *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr	# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	mov	0x00($b_ptr), $src0	# $b_ptr is still valid
	#lea	0x00($b_ptr), $b_ptr
	mov	$acc4, $acc1		# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5	# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4	# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr	# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
#lea	$U2(%rsp), $a_ptr
#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't write it back
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
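	# At this point S2 = Y1*H^3 and H = R*(U1*H^2 - X3), so the
	# subtraction below yields Y3 = R*(U1*H^2 - X3) - Y1*H^3.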
	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE_mont, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
if ($addx) { {
	########################################################################
	# operate in 4-5-0-1 "name space" that matches multiplication output
	#
	my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
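	# The helpers above and below share one branchless pattern: run the
	# add/sub through a carry chain, unconditionally apply the modulus,
	# then cmov the pre-reduction value back if the reduction overshot.
	# A rough, self-contained C model of the addition case (illustration
	# only; p256_add is a hypothetical name, P mirrors .Lpoly):
	#
	#	#include <stdint.h>
	#	#include <string.h>
	#	static const uint64_t P[4] = {
	#	    0xffffffffffffffffULL, 0x00000000ffffffffULL,
	#	    0x0000000000000000ULL, 0xffffffff00000001ULL };
	#	void p256_add(uint64_t r[4], const uint64_t a[4],
	#	              const uint64_t b[4])
	#	{
	#	    uint64_t t[4], s[4], carry, borrow = 0;
	#	    unsigned __int128 acc = 0;
	#	    for (int i = 0; i < 4; i++) {	/* t = a + b */
	#	        acc += (unsigned __int128)a[i] + b[i];
	#	        t[i] = (uint64_t)acc;  acc >>= 64;
	#	    }
	#	    carry = (uint64_t)acc;
	#	    for (int i = 0; i < 4; i++) {	/* s = t - P */
	#	        unsigned __int128 d =
	#	            (unsigned __int128)t[i] - P[i] - borrow;
	#	        s[i] = (uint64_t)d;  borrow = (uint64_t)(d >> 64) & 1;
	#	    }
	#	    /* borrow > carry means t < P: keep t, else keep s */
	#	    memcpy(r, borrow > carry ? t : s, 32);
	#	}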
.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3, $a3
	adc	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;