1 #!/usr/bin/env perl 2 3 # Copyright (c) 2014, Intel Corporation. 4 # 5 # Permission to use, copy, modify, and/or distribute this software for any 6 # purpose with or without fee is hereby granted, provided that the above 7 # copyright notice and this permission notice appear in all copies. 8 # 9 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 12 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 14 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 15 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 17 # Developers and authors: 18 # Shay Gueron (1, 2), and Vlad Krasnov (1) 19 # (1) Intel Corporation, Israel Development Center 20 # (2) University of Haifa 21 22 # Reference: 23 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 24 # 256 Bit Primes" 25 26 # Further optimization by <appro (at] openssl.org>: 27 # 28 # this/original 29 # Opteron +12-49% 30 # Bulldozer +14-45% 31 # P4 +18-46% 32 # Westmere +12-34% 33 # Sandy Bridge +9-35% 34 # Ivy Bridge +9-35% 35 # Haswell +8-37% 36 # Broadwell +18-58% 37 # Atom +15-50% 38 # VIA Nano +43-160% 39 # 40 # Ranges denote minimum and maximum improvement coefficients depending 41 # on benchmark. 42 43 $flavour = shift; 44 $output = shift; 45 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 46 47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 48 49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 51 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 52 die "can't locate x86_64-xlate.pl"; 53 54 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 55 *STDOUT=*OUT; 56 57 # TODO(davidben): Set $addx to one once build problems are resolved. 
58 $avx = 2; 59 $addx = 0; 60 61 $code.=<<___; 62 .text 63 .extern OPENSSL_ia32cap_P 64 65 # The polynomial 66 .align 64 67 .Lpoly: 68 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 69 70 .LOne: 71 .long 1,1,1,1,1,1,1,1 72 .LTwo: 73 .long 2,2,2,2,2,2,2,2 74 .LThree: 75 .long 3,3,3,3,3,3,3,3 76 .LONE_mont: 77 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe 78 ___ 79 80 { 81 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); 82 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); 83 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); 84 85 $code.=<<___; 86 87 ################################################################################ 88 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); 89 .globl ecp_nistz256_neg 90 .type ecp_nistz256_neg,\@function,2 91 .align 32 92 ecp_nistz256_neg: 93 push %r12 94 push %r13 95 96 xor $a0, $a0 97 xor $a1, $a1 98 xor $a2, $a2 99 xor $a3, $a3 100 xor $t4, $t4 101 102 sub 8*0($a_ptr), $a0 103 sbb 8*1($a_ptr), $a1 104 sbb 8*2($a_ptr), $a2 105 mov $a0, $t0 106 sbb 8*3($a_ptr), $a3 107 lea .Lpoly(%rip), $a_ptr 108 mov $a1, $t1 109 sbb \$0, $t4 110 111 add 8*0($a_ptr), $a0 112 mov $a2, $t2 113 adc 8*1($a_ptr), $a1 114 adc 8*2($a_ptr), $a2 115 mov $a3, $t3 116 adc 8*3($a_ptr), $a3 117 test $t4, $t4 118 119 cmovz $t0, $a0 120 cmovz $t1, $a1 121 mov $a0, 8*0($r_ptr) 122 cmovz $t2, $a2 123 mov $a1, 8*1($r_ptr) 124 cmovz $t3, $a3 125 mov $a2, 8*2($r_ptr) 126 mov $a3, 8*3($r_ptr) 127 128 pop %r13 129 pop %r12 130 ret 131 .size ecp_nistz256_neg,.-ecp_nistz256_neg 132 ___ 133 } 134 { 135 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 136 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 137 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); 138 my ($poly1,$poly3)=($acc6,$acc7); 139 140 $code.=<<___; 141 ################################################################################ 142 # void ecp_nistz256_mul_mont( 143 # uint64_t res[4], 144 # uint64_t a[4], 145 # uint64_t b[4]); 146 147 .globl ecp_nistz256_mul_mont 148 .type ecp_nistz256_mul_mont,\@function,3 149 .align 32 150 ecp_nistz256_mul_mont: 151 ___ 152 $code.=<<___ if ($addx); 153 leaq OPENSSL_ia32cap_P(%rip), %rcx 154 mov 8(%rcx), %rcx 155 and \$0x80100, %ecx 156 ___ 157 $code.=<<___; 158 .Lmul_mont: 159 push %rbp 160 push %rbx 161 push %r12 162 push %r13 163 push %r14 164 push %r15 165 ___ 166 $code.=<<___ if ($addx); 167 cmp \$0x80100, %ecx 168 je .Lmul_montx 169 ___ 170 $code.=<<___; 171 mov $b_org, $b_ptr 172 mov 8*0($b_org), %rax 173 mov 8*0($a_ptr), $acc1 174 mov 8*1($a_ptr), $acc2 175 mov 8*2($a_ptr), $acc3 176 mov 8*3($a_ptr), $acc4 177 178 call __ecp_nistz256_mul_montq 179 ___ 180 $code.=<<___ if ($addx); 181 jmp .Lmul_mont_done 182 183 .align 32 184 .Lmul_montx: 185 mov $b_org, $b_ptr 186 mov 8*0($b_org), %rdx 187 mov 8*0($a_ptr), $acc1 188 mov 8*1($a_ptr), $acc2 189 mov 8*2($a_ptr), $acc3 190 mov 8*3($a_ptr), $acc4 191 lea -128($a_ptr), $a_ptr # control u-op density 192 193 call __ecp_nistz256_mul_montx 194 ___ 195 $code.=<<___; 196 .Lmul_mont_done: 197 pop %r15 198 pop %r14 199 pop %r13 200 pop %r12 201 pop %rbx 202 pop %rbp 203 ret 204 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 205 206 .type __ecp_nistz256_mul_montq,\@abi-omnipotent 207 .align 32 208 __ecp_nistz256_mul_montq: 209 ######################################################################## 210 # Multiply a by b[0] 211 mov %rax, $t1 212 mulq $acc1 213 mov .Lpoly+8*1(%rip),$poly1 214 mov 
%rax, $acc0 215 mov $t1, %rax 216 mov %rdx, $acc1 217 218 mulq $acc2 219 mov .Lpoly+8*3(%rip),$poly3 220 add %rax, $acc1 221 mov $t1, %rax 222 adc \$0, %rdx 223 mov %rdx, $acc2 224 225 mulq $acc3 226 add %rax, $acc2 227 mov $t1, %rax 228 adc \$0, %rdx 229 mov %rdx, $acc3 230 231 mulq $acc4 232 add %rax, $acc3 233 mov $acc0, %rax 234 adc \$0, %rdx 235 xor $acc5, $acc5 236 mov %rdx, $acc4 237 238 ######################################################################## 239 # First reduction step 240 # Basically now we want to multiply acc[0] by p256, 241 # and add the result to the acc. 242 # Due to the special form of p256 we do some optimizations 243 # 244 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 245 # then we add acc[0] and get acc[0] x 2^96 246 247 mov $acc0, $t1 248 shl \$32, $acc0 249 mulq $poly3 250 shr \$32, $t1 251 add $acc0, $acc1 # +=acc[0]<<96 252 adc $t1, $acc2 253 adc %rax, $acc3 254 mov 8*1($b_ptr), %rax 255 adc %rdx, $acc4 256 adc \$0, $acc5 257 xor $acc0, $acc0 258 259 ######################################################################## 260 # Multiply by b[1] 261 mov %rax, $t1 262 mulq 8*0($a_ptr) 263 add %rax, $acc1 264 mov $t1, %rax 265 adc \$0, %rdx 266 mov %rdx, $t0 267 268 mulq 8*1($a_ptr) 269 add $t0, $acc2 270 adc \$0, %rdx 271 add %rax, $acc2 272 mov $t1, %rax 273 adc \$0, %rdx 274 mov %rdx, $t0 275 276 mulq 8*2($a_ptr) 277 add $t0, $acc3 278 adc \$0, %rdx 279 add %rax, $acc3 280 mov $t1, %rax 281 adc \$0, %rdx 282 mov %rdx, $t0 283 284 mulq 8*3($a_ptr) 285 add $t0, $acc4 286 adc \$0, %rdx 287 add %rax, $acc4 288 mov $acc1, %rax 289 adc %rdx, $acc5 290 adc \$0, $acc0 291 292 ######################################################################## 293 # Second reduction step 294 mov $acc1, $t1 295 shl \$32, $acc1 296 mulq $poly3 297 shr \$32, $t1 298 add $acc1, $acc2 299 adc $t1, $acc3 300 adc %rax, $acc4 301 mov 8*2($b_ptr), %rax 302 adc %rdx, $acc5 303 adc \$0, $acc0 304 xor $acc1, $acc1 305 306 ######################################################################## 307 # Multiply by b[2] 308 mov %rax, $t1 309 mulq 8*0($a_ptr) 310 add %rax, $acc2 311 mov $t1, %rax 312 adc \$0, %rdx 313 mov %rdx, $t0 314 315 mulq 8*1($a_ptr) 316 add $t0, $acc3 317 adc \$0, %rdx 318 add %rax, $acc3 319 mov $t1, %rax 320 adc \$0, %rdx 321 mov %rdx, $t0 322 323 mulq 8*2($a_ptr) 324 add $t0, $acc4 325 adc \$0, %rdx 326 add %rax, $acc4 327 mov $t1, %rax 328 adc \$0, %rdx 329 mov %rdx, $t0 330 331 mulq 8*3($a_ptr) 332 add $t0, $acc5 333 adc \$0, %rdx 334 add %rax, $acc5 335 mov $acc2, %rax 336 adc %rdx, $acc0 337 adc \$0, $acc1 338 339 ######################################################################## 340 # Third reduction step 341 mov $acc2, $t1 342 shl \$32, $acc2 343 mulq $poly3 344 shr \$32, $t1 345 add $acc2, $acc3 346 adc $t1, $acc4 347 adc %rax, $acc5 348 mov 8*3($b_ptr), %rax 349 adc %rdx, $acc0 350 adc \$0, $acc1 351 xor $acc2, $acc2 352 353 ######################################################################## 354 # Multiply by b[3] 355 mov %rax, $t1 356 mulq 8*0($a_ptr) 357 add %rax, $acc3 358 mov $t1, %rax 359 adc \$0, %rdx 360 mov %rdx, $t0 361 362 mulq 8*1($a_ptr) 363 add $t0, $acc4 364 adc \$0, %rdx 365 add %rax, $acc4 366 mov $t1, %rax 367 adc \$0, %rdx 368 mov %rdx, $t0 369 370 mulq 8*2($a_ptr) 371 add $t0, $acc5 372 adc \$0, %rdx 373 add %rax, $acc5 374 mov $t1, %rax 375 adc \$0, %rdx 376 mov %rdx, $t0 377 378 mulq 8*3($a_ptr) 379 add $t0, $acc0 380 adc \$0, %rdx 381 add %rax, $acc0 382 mov $acc3, %rax 383 adc %rdx, $acc1 384 adc \$0, $acc2 385 386 
######################################################################## 387 # Final reduction step 388 mov $acc3, $t1 389 shl \$32, $acc3 390 mulq $poly3 391 shr \$32, $t1 392 add $acc3, $acc4 393 adc $t1, $acc5 394 mov $acc4, $t0 395 adc %rax, $acc0 396 adc %rdx, $acc1 397 mov $acc5, $t1 398 adc \$0, $acc2 399 400 ######################################################################## 401 # Branch-less conditional subtraction of P 402 sub \$-1, $acc4 # .Lpoly[0] 403 mov $acc0, $t2 404 sbb $poly1, $acc5 # .Lpoly[1] 405 sbb \$0, $acc0 # .Lpoly[2] 406 mov $acc1, $t3 407 sbb $poly3, $acc1 # .Lpoly[3] 408 sbb \$0, $acc2 409 410 cmovc $t0, $acc4 411 cmovc $t1, $acc5 412 mov $acc4, 8*0($r_ptr) 413 cmovc $t2, $acc0 414 mov $acc5, 8*1($r_ptr) 415 cmovc $t3, $acc1 416 mov $acc0, 8*2($r_ptr) 417 mov $acc1, 8*3($r_ptr) 418 419 ret 420 .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 421 422 ################################################################################ 423 # void ecp_nistz256_sqr_mont( 424 # uint64_t res[4], 425 # uint64_t a[4]); 426 427 # we optimize the square according to S.Gueron and V.Krasnov, 428 # "Speeding up Big-Number Squaring" 429 .globl ecp_nistz256_sqr_mont 430 .type ecp_nistz256_sqr_mont,\@function,2 431 .align 32 432 ecp_nistz256_sqr_mont: 433 ___ 434 $code.=<<___ if ($addx); 435 leaq OPENSSL_ia32cap_P(%rip), %rcx 436 mov 8(%rcx), %rcx 437 and \$0x80100, %ecx 438 ___ 439 $code.=<<___; 440 push %rbp 441 push %rbx 442 push %r12 443 push %r13 444 push %r14 445 push %r15 446 ___ 447 $code.=<<___ if ($addx); 448 cmp \$0x80100, %ecx 449 je .Lsqr_montx 450 ___ 451 $code.=<<___; 452 mov 8*0($a_ptr), %rax 453 mov 8*1($a_ptr), $acc6 454 mov 8*2($a_ptr), $acc7 455 mov 8*3($a_ptr), $acc0 456 457 call __ecp_nistz256_sqr_montq 458 ___ 459 $code.=<<___ if ($addx); 460 jmp .Lsqr_mont_done 461 462 .align 32 463 .Lsqr_montx: 464 mov 8*0($a_ptr), %rdx 465 mov 8*1($a_ptr), $acc6 466 mov 8*2($a_ptr), $acc7 467 mov 8*3($a_ptr), $acc0 468 lea -128($a_ptr), $a_ptr # control u-op density 469 470 call __ecp_nistz256_sqr_montx 471 ___ 472 $code.=<<___; 473 .Lsqr_mont_done: 474 pop %r15 475 pop %r14 476 pop %r13 477 pop %r12 478 pop %rbx 479 pop %rbp 480 ret 481 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 482 483 .type __ecp_nistz256_sqr_montq,\@abi-omnipotent 484 .align 32 485 __ecp_nistz256_sqr_montq: 486 mov %rax, $acc5 487 mulq $acc6 # a[1]*a[0] 488 mov %rax, $acc1 489 mov $acc7, %rax 490 mov %rdx, $acc2 491 492 mulq $acc5 # a[0]*a[2] 493 add %rax, $acc2 494 mov $acc0, %rax 495 adc \$0, %rdx 496 mov %rdx, $acc3 497 498 mulq $acc5 # a[0]*a[3] 499 add %rax, $acc3 500 mov $acc7, %rax 501 adc \$0, %rdx 502 mov %rdx, $acc4 503 504 ################################# 505 mulq $acc6 # a[1]*a[2] 506 add %rax, $acc3 507 mov $acc0, %rax 508 adc \$0, %rdx 509 mov %rdx, $t1 510 511 mulq $acc6 # a[1]*a[3] 512 add %rax, $acc4 513 mov $acc0, %rax 514 adc \$0, %rdx 515 add $t1, $acc4 516 mov %rdx, $acc5 517 adc \$0, $acc5 518 519 ################################# 520 mulq $acc7 # a[2]*a[3] 521 xor $acc7, $acc7 522 add %rax, $acc5 523 mov 8*0($a_ptr), %rax 524 mov %rdx, $acc6 525 adc \$0, $acc6 526 527 add $acc1, $acc1 # acc1:6<<1 528 adc $acc2, $acc2 529 adc $acc3, $acc3 530 adc $acc4, $acc4 531 adc $acc5, $acc5 532 adc $acc6, $acc6 533 adc \$0, $acc7 534 535 mulq %rax 536 mov %rax, $acc0 537 mov 8*1($a_ptr), %rax 538 mov %rdx, $t0 539 540 mulq %rax 541 add $t0, $acc1 542 adc %rax, $acc2 543 mov 8*2($a_ptr), %rax 544 adc \$0, %rdx 545 mov %rdx, $t0 546 547 mulq %rax 548 add $t0, $acc3 549 
adc %rax, $acc4 550 mov 8*3($a_ptr), %rax 551 adc \$0, %rdx 552 mov %rdx, $t0 553 554 mulq %rax 555 add $t0, $acc5 556 adc %rax, $acc6 557 mov $acc0, %rax 558 adc %rdx, $acc7 559 560 mov .Lpoly+8*1(%rip), $a_ptr 561 mov .Lpoly+8*3(%rip), $t1 562 563 ########################################## 564 # Now the reduction 565 # First iteration 566 mov $acc0, $t0 567 shl \$32, $acc0 568 mulq $t1 569 shr \$32, $t0 570 add $acc0, $acc1 # +=acc[0]<<96 571 adc $t0, $acc2 572 adc %rax, $acc3 573 mov $acc1, %rax 574 adc \$0, %rdx 575 576 ########################################## 577 # Second iteration 578 mov $acc1, $t0 579 shl \$32, $acc1 580 mov %rdx, $acc0 581 mulq $t1 582 shr \$32, $t0 583 add $acc1, $acc2 584 adc $t0, $acc3 585 adc %rax, $acc0 586 mov $acc2, %rax 587 adc \$0, %rdx 588 589 ########################################## 590 # Third iteration 591 mov $acc2, $t0 592 shl \$32, $acc2 593 mov %rdx, $acc1 594 mulq $t1 595 shr \$32, $t0 596 add $acc2, $acc3 597 adc $t0, $acc0 598 adc %rax, $acc1 599 mov $acc3, %rax 600 adc \$0, %rdx 601 602 ########################################### 603 # Last iteration 604 mov $acc3, $t0 605 shl \$32, $acc3 606 mov %rdx, $acc2 607 mulq $t1 608 shr \$32, $t0 609 add $acc3, $acc0 610 adc $t0, $acc1 611 adc %rax, $acc2 612 adc \$0, %rdx 613 xor $acc3, $acc3 614 615 ############################################ 616 # Add the rest of the acc 617 add $acc0, $acc4 618 adc $acc1, $acc5 619 mov $acc4, $acc0 620 adc $acc2, $acc6 621 adc %rdx, $acc7 622 mov $acc5, $acc1 623 adc \$0, $acc3 624 625 sub \$-1, $acc4 # .Lpoly[0] 626 mov $acc6, $acc2 627 sbb $a_ptr, $acc5 # .Lpoly[1] 628 sbb \$0, $acc6 # .Lpoly[2] 629 mov $acc7, $t0 630 sbb $t1, $acc7 # .Lpoly[3] 631 sbb \$0, $acc3 632 633 cmovc $acc0, $acc4 634 cmovc $acc1, $acc5 635 mov $acc4, 8*0($r_ptr) 636 cmovc $acc2, $acc6 637 mov $acc5, 8*1($r_ptr) 638 cmovc $t0, $acc7 639 mov $acc6, 8*2($r_ptr) 640 mov $acc7, 8*3($r_ptr) 641 642 ret 643 .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 644 ___ 645 646 if ($addx) { 647 $code.=<<___; 648 .type __ecp_nistz256_mul_montx,\@abi-omnipotent 649 .align 32 650 __ecp_nistz256_mul_montx: 651 ######################################################################## 652 # Multiply by b[0] 653 mulx $acc1, $acc0, $acc1 654 mulx $acc2, $t0, $acc2 655 mov \$32, $poly1 656 xor $acc5, $acc5 # cf=0 657 mulx $acc3, $t1, $acc3 658 mov .Lpoly+8*3(%rip), $poly3 659 adc $t0, $acc1 660 mulx $acc4, $t0, $acc4 661 mov $acc0, %rdx 662 adc $t1, $acc2 663 shlx $poly1,$acc0,$t1 664 adc $t0, $acc3 665 shrx $poly1,$acc0,$t0 666 adc \$0, $acc4 667 668 ######################################################################## 669 # First reduction step 670 add $t1, $acc1 671 adc $t0, $acc2 672 673 mulx $poly3, $t0, $t1 674 mov 8*1($b_ptr), %rdx 675 adc $t0, $acc3 676 adc $t1, $acc4 677 adc \$0, $acc5 678 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 679 680 ######################################################################## 681 # Multiply by b[1] 682 mulx 8*0+128($a_ptr), $t0, $t1 683 adcx $t0, $acc1 684 adox $t1, $acc2 685 686 mulx 8*1+128($a_ptr), $t0, $t1 687 adcx $t0, $acc2 688 adox $t1, $acc3 689 690 mulx 8*2+128($a_ptr), $t0, $t1 691 adcx $t0, $acc3 692 adox $t1, $acc4 693 694 mulx 8*3+128($a_ptr), $t0, $t1 695 mov $acc1, %rdx 696 adcx $t0, $acc4 697 shlx $poly1, $acc1, $t0 698 adox $t1, $acc5 699 shrx $poly1, $acc1, $t1 700 701 adcx $acc0, $acc5 702 adox $acc0, $acc0 703 adc \$0, $acc0 704 705 ######################################################################## 706 # Second reduction step 
707 add $t0, $acc2 708 adc $t1, $acc3 709 710 mulx $poly3, $t0, $t1 711 mov 8*2($b_ptr), %rdx 712 adc $t0, $acc4 713 adc $t1, $acc5 714 adc \$0, $acc0 715 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 716 717 ######################################################################## 718 # Multiply by b[2] 719 mulx 8*0+128($a_ptr), $t0, $t1 720 adcx $t0, $acc2 721 adox $t1, $acc3 722 723 mulx 8*1+128($a_ptr), $t0, $t1 724 adcx $t0, $acc3 725 adox $t1, $acc4 726 727 mulx 8*2+128($a_ptr), $t0, $t1 728 adcx $t0, $acc4 729 adox $t1, $acc5 730 731 mulx 8*3+128($a_ptr), $t0, $t1 732 mov $acc2, %rdx 733 adcx $t0, $acc5 734 shlx $poly1, $acc2, $t0 735 adox $t1, $acc0 736 shrx $poly1, $acc2, $t1 737 738 adcx $acc1, $acc0 739 adox $acc1, $acc1 740 adc \$0, $acc1 741 742 ######################################################################## 743 # Third reduction step 744 add $t0, $acc3 745 adc $t1, $acc4 746 747 mulx $poly3, $t0, $t1 748 mov 8*3($b_ptr), %rdx 749 adc $t0, $acc5 750 adc $t1, $acc0 751 adc \$0, $acc1 752 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 753 754 ######################################################################## 755 # Multiply by b[3] 756 mulx 8*0+128($a_ptr), $t0, $t1 757 adcx $t0, $acc3 758 adox $t1, $acc4 759 760 mulx 8*1+128($a_ptr), $t0, $t1 761 adcx $t0, $acc4 762 adox $t1, $acc5 763 764 mulx 8*2+128($a_ptr), $t0, $t1 765 adcx $t0, $acc5 766 adox $t1, $acc0 767 768 mulx 8*3+128($a_ptr), $t0, $t1 769 mov $acc3, %rdx 770 adcx $t0, $acc0 771 shlx $poly1, $acc3, $t0 772 adox $t1, $acc1 773 shrx $poly1, $acc3, $t1 774 775 adcx $acc2, $acc1 776 adox $acc2, $acc2 777 adc \$0, $acc2 778 779 ######################################################################## 780 # Fourth reduction step 781 add $t0, $acc4 782 adc $t1, $acc5 783 784 mulx $poly3, $t0, $t1 785 mov $acc4, $t2 786 mov .Lpoly+8*1(%rip), $poly1 787 adc $t0, $acc0 788 mov $acc5, $t3 789 adc $t1, $acc1 790 adc \$0, $acc2 791 792 ######################################################################## 793 # Branch-less conditional subtraction of P 794 xor %eax, %eax 795 mov $acc0, $t0 796 sbb \$-1, $acc4 # .Lpoly[0] 797 sbb $poly1, $acc5 # .Lpoly[1] 798 sbb \$0, $acc0 # .Lpoly[2] 799 mov $acc1, $t1 800 sbb $poly3, $acc1 # .Lpoly[3] 801 sbb \$0, $acc2 802 803 cmovc $t2, $acc4 804 cmovc $t3, $acc5 805 mov $acc4, 8*0($r_ptr) 806 cmovc $t0, $acc0 807 mov $acc5, 8*1($r_ptr) 808 cmovc $t1, $acc1 809 mov $acc0, 8*2($r_ptr) 810 mov $acc1, 8*3($r_ptr) 811 812 ret 813 .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 814 815 .type __ecp_nistz256_sqr_montx,\@abi-omnipotent 816 .align 32 817 __ecp_nistz256_sqr_montx: 818 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 819 mulx $acc7, $t0, $acc3 # a[0]*a[2] 820 xor %eax, %eax 821 adc $t0, $acc2 822 mulx $acc0, $t1, $acc4 # a[0]*a[3] 823 mov $acc6, %rdx 824 adc $t1, $acc3 825 adc \$0, $acc4 826 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 827 828 ################################# 829 mulx $acc7, $t0, $t1 # a[1]*a[2] 830 adcx $t0, $acc3 831 adox $t1, $acc4 832 833 mulx $acc0, $t0, $t1 # a[1]*a[3] 834 mov $acc7, %rdx 835 adcx $t0, $acc4 836 adox $t1, $acc5 837 adc \$0, $acc5 838 839 ################################# 840 mulx $acc0, $t0, $acc6 # a[2]*a[3] 841 mov 8*0+128($a_ptr), %rdx 842 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 843 adcx $acc1, $acc1 # acc1:6<<1 844 adox $t0, $acc5 845 adcx $acc2, $acc2 846 adox $acc7, $acc6 # of=0 847 848 mulx %rdx, $acc0, $t1 849 mov 8*1+128($a_ptr), %rdx 850 adcx $acc3, $acc3 851 adox $t1, $acc1 852 adcx $acc4, $acc4 853 mulx %rdx, $t0, $t4 854 mov 8*2+128($a_ptr), %rdx 855 
adcx $acc5, $acc5 856 adox $t0, $acc2 857 adcx $acc6, $acc6 858 .byte 0x67 859 mulx %rdx, $t0, $t1 860 mov 8*3+128($a_ptr), %rdx 861 adox $t4, $acc3 862 adcx $acc7, $acc7 863 adox $t0, $acc4 864 mov \$32, $a_ptr 865 adox $t1, $acc5 866 .byte 0x67,0x67 867 mulx %rdx, $t0, $t4 868 mov .Lpoly+8*3(%rip), %rdx 869 adox $t0, $acc6 870 shlx $a_ptr, $acc0, $t0 871 adox $t4, $acc7 872 shrx $a_ptr, $acc0, $t4 873 mov %rdx,$t1 874 875 # reduction step 1 876 add $t0, $acc1 877 adc $t4, $acc2 878 879 mulx $acc0, $t0, $acc0 880 adc $t0, $acc3 881 shlx $a_ptr, $acc1, $t0 882 adc \$0, $acc0 883 shrx $a_ptr, $acc1, $t4 884 885 # reduction step 2 886 add $t0, $acc2 887 adc $t4, $acc3 888 889 mulx $acc1, $t0, $acc1 890 adc $t0, $acc0 891 shlx $a_ptr, $acc2, $t0 892 adc \$0, $acc1 893 shrx $a_ptr, $acc2, $t4 894 895 # reduction step 3 896 add $t0, $acc3 897 adc $t4, $acc0 898 899 mulx $acc2, $t0, $acc2 900 adc $t0, $acc1 901 shlx $a_ptr, $acc3, $t0 902 adc \$0, $acc2 903 shrx $a_ptr, $acc3, $t4 904 905 # reduction step 4 906 add $t0, $acc0 907 adc $t4, $acc1 908 909 mulx $acc3, $t0, $acc3 910 adc $t0, $acc2 911 adc \$0, $acc3 912 913 xor $t3, $t3 914 add $acc0, $acc4 # accumulate upper half 915 mov .Lpoly+8*1(%rip), $a_ptr 916 adc $acc1, $acc5 917 mov $acc4, $acc0 918 adc $acc2, $acc6 919 adc $acc3, $acc7 920 mov $acc5, $acc1 921 adc \$0, $t3 922 923 sub \$-1, $acc4 # .Lpoly[0] 924 mov $acc6, $acc2 925 sbb $a_ptr, $acc5 # .Lpoly[1] 926 sbb \$0, $acc6 # .Lpoly[2] 927 mov $acc7, $acc3 928 sbb $t1, $acc7 # .Lpoly[3] 929 sbb \$0, $t3 930 931 cmovc $acc0, $acc4 932 cmovc $acc1, $acc5 933 mov $acc4, 8*0($r_ptr) 934 cmovc $acc2, $acc6 935 mov $acc5, 8*1($r_ptr) 936 cmovc $acc3, $acc7 937 mov $acc6, 8*2($r_ptr) 938 mov $acc7, 8*3($r_ptr) 939 940 ret 941 .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 942 ___ 943 } 944 } 945 { 946 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 947 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 948 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 949 my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 950 951 $code.=<<___; 952 ################################################################################ 953 # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 954 .globl ecp_nistz256_select_w5 955 .type ecp_nistz256_select_w5,\@abi-omnipotent 956 .align 32 957 ecp_nistz256_select_w5: 958 ___ 959 $code.=<<___ if ($avx>1); 960 leaq OPENSSL_ia32cap_P(%rip), %rax 961 mov 8(%rax), %rax 962 test \$`1<<5`, %eax 963 jnz .Lavx2_select_w5 964 ___ 965 $code.=<<___ if ($win64); 966 lea -0x88(%rsp), %rax 967 .LSEH_begin_ecp_nistz256_select_w5: 968 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 969 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 970 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 971 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 972 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 973 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 974 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 975 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 976 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 977 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 978 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 979 ___ 980 $code.=<<___; 981 movdqa .LOne(%rip), $ONE 982 movd $index, $INDEX 983 984 pxor $Ra, $Ra 985 pxor $Rb, $Rb 986 pxor $Rc, $Rc 987 pxor $Rd, $Rd 988 pxor $Re, $Re 989 pxor $Rf, $Rf 990 991 movdqa 
$ONE, $M0 992 pshufd \$0, $INDEX, $INDEX 993 994 mov \$16, %rax 995 .Lselect_loop_sse_w5: 996 997 movdqa $M0, $TMP0 998 paddd $ONE, $M0 999 pcmpeqd $INDEX, $TMP0 1000 1001 movdqa 16*0($in_t), $T0a 1002 movdqa 16*1($in_t), $T0b 1003 movdqa 16*2($in_t), $T0c 1004 movdqa 16*3($in_t), $T0d 1005 movdqa 16*4($in_t), $T0e 1006 movdqa 16*5($in_t), $T0f 1007 lea 16*6($in_t), $in_t 1008 1009 pand $TMP0, $T0a 1010 pand $TMP0, $T0b 1011 por $T0a, $Ra 1012 pand $TMP0, $T0c 1013 por $T0b, $Rb 1014 pand $TMP0, $T0d 1015 por $T0c, $Rc 1016 pand $TMP0, $T0e 1017 por $T0d, $Rd 1018 pand $TMP0, $T0f 1019 por $T0e, $Re 1020 por $T0f, $Rf 1021 1022 dec %rax 1023 jnz .Lselect_loop_sse_w5 1024 1025 movdqu $Ra, 16*0($val) 1026 movdqu $Rb, 16*1($val) 1027 movdqu $Rc, 16*2($val) 1028 movdqu $Rd, 16*3($val) 1029 movdqu $Re, 16*4($val) 1030 movdqu $Rf, 16*5($val) 1031 ___ 1032 $code.=<<___ if ($win64); 1033 movaps (%rsp), %xmm6 1034 movaps 0x10(%rsp), %xmm7 1035 movaps 0x20(%rsp), %xmm8 1036 movaps 0x30(%rsp), %xmm9 1037 movaps 0x40(%rsp), %xmm10 1038 movaps 0x50(%rsp), %xmm11 1039 movaps 0x60(%rsp), %xmm12 1040 movaps 0x70(%rsp), %xmm13 1041 movaps 0x80(%rsp), %xmm14 1042 movaps 0x90(%rsp), %xmm15 1043 lea 0xa8(%rsp), %rsp 1044 .LSEH_end_ecp_nistz256_select_w5: 1045 ___ 1046 $code.=<<___; 1047 ret 1048 .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 1049 1050 ################################################################################ 1051 # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 1052 .globl ecp_nistz256_select_w7 1053 .type ecp_nistz256_select_w7,\@abi-omnipotent 1054 .align 32 1055 ecp_nistz256_select_w7: 1056 ___ 1057 $code.=<<___ if ($avx>1); 1058 leaq OPENSSL_ia32cap_P(%rip), %rax 1059 mov 8(%rax), %rax 1060 test \$`1<<5`, %eax 1061 jnz .Lavx2_select_w7 1062 ___ 1063 $code.=<<___ if ($win64); 1064 lea -0x88(%rsp), %rax 1065 .LSEH_begin_ecp_nistz256_select_w7: 1066 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1067 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 1068 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 1069 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 1070 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 1071 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 1072 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 1073 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 1074 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 1075 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 1076 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 1077 ___ 1078 $code.=<<___; 1079 movdqa .LOne(%rip), $M0 1080 movd $index, $INDEX 1081 1082 pxor $Ra, $Ra 1083 pxor $Rb, $Rb 1084 pxor $Rc, $Rc 1085 pxor $Rd, $Rd 1086 1087 movdqa $M0, $ONE 1088 pshufd \$0, $INDEX, $INDEX 1089 mov \$64, %rax 1090 1091 .Lselect_loop_sse_w7: 1092 movdqa $M0, $TMP0 1093 paddd $ONE, $M0 1094 movdqa 16*0($in_t), $T0a 1095 movdqa 16*1($in_t), $T0b 1096 pcmpeqd $INDEX, $TMP0 1097 movdqa 16*2($in_t), $T0c 1098 movdqa 16*3($in_t), $T0d 1099 lea 16*4($in_t), $in_t 1100 1101 pand $TMP0, $T0a 1102 pand $TMP0, $T0b 1103 por $T0a, $Ra 1104 pand $TMP0, $T0c 1105 por $T0b, $Rb 1106 pand $TMP0, $T0d 1107 por $T0c, $Rc 1108 prefetcht0 255($in_t) 1109 por $T0d, $Rd 1110 1111 dec %rax 1112 jnz .Lselect_loop_sse_w7 1113 1114 movdqu $Ra, 16*0($val) 1115 movdqu $Rb, 16*1($val) 1116 movdqu $Rc, 16*2($val) 1117 movdqu $Rd, 16*3($val) 1118 ___ 1119 $code.=<<___ if ($win64); 1120 movaps (%rsp), %xmm6 1121 movaps 0x10(%rsp), %xmm7 1122 movaps 0x20(%rsp), 
%xmm8 1123 movaps 0x30(%rsp), %xmm9 1124 movaps 0x40(%rsp), %xmm10 1125 movaps 0x50(%rsp), %xmm11 1126 movaps 0x60(%rsp), %xmm12 1127 movaps 0x70(%rsp), %xmm13 1128 movaps 0x80(%rsp), %xmm14 1129 movaps 0x90(%rsp), %xmm15 1130 lea 0xa8(%rsp), %rsp 1131 .LSEH_end_ecp_nistz256_select_w7: 1132 ___ 1133 $code.=<<___; 1134 ret 1135 .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 1136 ___ 1137 } 1138 if ($avx>1) { 1139 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1140 my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 1141 my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 1142 my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 1143 1144 $code.=<<___; 1145 ################################################################################ 1146 # void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 1147 .type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 1148 .align 32 1149 ecp_nistz256_avx2_select_w5: 1150 .Lavx2_select_w5: 1151 vzeroupper 1152 ___ 1153 $code.=<<___ if ($win64); 1154 lea -0x88(%rsp), %rax 1155 .LSEH_begin_ecp_nistz256_avx2_select_w5: 1156 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1157 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) 1158 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) 1159 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) 1160 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) 1161 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) 1162 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) 1163 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) 1164 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) 1165 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) 1166 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) 1167 ___ 1168 $code.=<<___; 1169 vmovdqa .LTwo(%rip), $TWO 1170 1171 vpxor $Ra, $Ra, $Ra 1172 vpxor $Rb, $Rb, $Rb 1173 vpxor $Rc, $Rc, $Rc 1174 1175 vmovdqa .LOne(%rip), $M0 1176 vmovdqa .LTwo(%rip), $M1 1177 1178 vmovd $index, %xmm1 1179 vpermd $INDEX, $Ra, $INDEX 1180 1181 mov \$8, %rax 1182 .Lselect_loop_avx2_w5: 1183 1184 vmovdqa 32*0($in_t), $T0a 1185 vmovdqa 32*1($in_t), $T0b 1186 vmovdqa 32*2($in_t), $T0c 1187 1188 vmovdqa 32*3($in_t), $T1a 1189 vmovdqa 32*4($in_t), $T1b 1190 vmovdqa 32*5($in_t), $T1c 1191 1192 vpcmpeqd $INDEX, $M0, $TMP0 1193 vpcmpeqd $INDEX, $M1, $TMP1 1194 1195 vpaddd $TWO, $M0, $M0 1196 vpaddd $TWO, $M1, $M1 1197 lea 32*6($in_t), $in_t 1198 1199 vpand $TMP0, $T0a, $T0a 1200 vpand $TMP0, $T0b, $T0b 1201 vpand $TMP0, $T0c, $T0c 1202 vpand $TMP1, $T1a, $T1a 1203 vpand $TMP1, $T1b, $T1b 1204 vpand $TMP1, $T1c, $T1c 1205 1206 vpxor $T0a, $Ra, $Ra 1207 vpxor $T0b, $Rb, $Rb 1208 vpxor $T0c, $Rc, $Rc 1209 vpxor $T1a, $Ra, $Ra 1210 vpxor $T1b, $Rb, $Rb 1211 vpxor $T1c, $Rc, $Rc 1212 1213 dec %rax 1214 jnz .Lselect_loop_avx2_w5 1215 1216 vmovdqu $Ra, 32*0($val) 1217 vmovdqu $Rb, 32*1($val) 1218 vmovdqu $Rc, 32*2($val) 1219 vzeroupper 1220 ___ 1221 $code.=<<___ if ($win64); 1222 movaps (%rsp), %xmm6 1223 movaps 0x10(%rsp), %xmm7 1224 movaps 0x20(%rsp), %xmm8 1225 movaps 0x30(%rsp), %xmm9 1226 movaps 0x40(%rsp), %xmm10 1227 movaps 0x50(%rsp), %xmm11 1228 movaps 0x60(%rsp), %xmm12 1229 movaps 0x70(%rsp), %xmm13 1230 movaps 0x80(%rsp), %xmm14 1231 movaps 0x90(%rsp), %xmm15 1232 lea 0xa8(%rsp), %rsp 1233 .LSEH_end_ecp_nistz256_avx2_select_w5: 1234 ___ 1235 $code.=<<___; 1236 ret 1237 .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 1238 ___ 1239 } 1240 if ($avx>1) { 1241 my 
($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1242 my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 1243 my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 1244 my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 1245 my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 1246 1247 $code.=<<___; 1248 1249 ################################################################################ 1250 # void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 1251 .globl ecp_nistz256_avx2_select_w7 1252 .type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 1253 .align 32 1254 ecp_nistz256_avx2_select_w7: 1255 .Lavx2_select_w7: 1256 vzeroupper 1257 ___ 1258 $code.=<<___ if ($win64); 1259 lea -0x88(%rsp), %rax 1260 .LSEH_begin_ecp_nistz256_avx2_select_w7: 1261 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1262 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) 1263 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) 1264 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) 1265 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) 1266 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) 1267 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) 1268 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) 1269 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) 1270 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) 1271 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) 1272 ___ 1273 $code.=<<___; 1274 vmovdqa .LThree(%rip), $THREE 1275 1276 vpxor $Ra, $Ra, $Ra 1277 vpxor $Rb, $Rb, $Rb 1278 1279 vmovdqa .LOne(%rip), $M0 1280 vmovdqa .LTwo(%rip), $M1 1281 vmovdqa .LThree(%rip), $M2 1282 1283 vmovd $index, %xmm1 1284 vpermd $INDEX, $Ra, $INDEX 1285 # Skip index = 0, because it is implicitly the point at infinity 1286 1287 mov \$21, %rax 1288 .Lselect_loop_avx2_w7: 1289 1290 vmovdqa 32*0($in_t), $T0a 1291 vmovdqa 32*1($in_t), $T0b 1292 1293 vmovdqa 32*2($in_t), $T1a 1294 vmovdqa 32*3($in_t), $T1b 1295 1296 vmovdqa 32*4($in_t), $T2a 1297 vmovdqa 32*5($in_t), $T2b 1298 1299 vpcmpeqd $INDEX, $M0, $TMP0 1300 vpcmpeqd $INDEX, $M1, $TMP1 1301 vpcmpeqd $INDEX, $M2, $TMP2 1302 1303 vpaddd $THREE, $M0, $M0 1304 vpaddd $THREE, $M1, $M1 1305 vpaddd $THREE, $M2, $M2 1306 lea 32*6($in_t), $in_t 1307 1308 vpand $TMP0, $T0a, $T0a 1309 vpand $TMP0, $T0b, $T0b 1310 vpand $TMP1, $T1a, $T1a 1311 vpand $TMP1, $T1b, $T1b 1312 vpand $TMP2, $T2a, $T2a 1313 vpand $TMP2, $T2b, $T2b 1314 1315 vpxor $T0a, $Ra, $Ra 1316 vpxor $T0b, $Rb, $Rb 1317 vpxor $T1a, $Ra, $Ra 1318 vpxor $T1b, $Rb, $Rb 1319 vpxor $T2a, $Ra, $Ra 1320 vpxor $T2b, $Rb, $Rb 1321 1322 dec %rax 1323 jnz .Lselect_loop_avx2_w7 1324 1325 1326 vmovdqa 32*0($in_t), $T0a 1327 vmovdqa 32*1($in_t), $T0b 1328 1329 vpcmpeqd $INDEX, $M0, $TMP0 1330 1331 vpand $TMP0, $T0a, $T0a 1332 vpand $TMP0, $T0b, $T0b 1333 1334 vpxor $T0a, $Ra, $Ra 1335 vpxor $T0b, $Rb, $Rb 1336 1337 vmovdqu $Ra, 32*0($val) 1338 vmovdqu $Rb, 32*1($val) 1339 vzeroupper 1340 ___ 1341 $code.=<<___ if ($win64); 1342 movaps (%rsp), %xmm6 1343 movaps 0x10(%rsp), %xmm7 1344 movaps 0x20(%rsp), %xmm8 1345 movaps 0x30(%rsp), %xmm9 1346 movaps 0x40(%rsp), %xmm10 1347 movaps 0x50(%rsp), %xmm11 1348 movaps 0x60(%rsp), %xmm12 1349 movaps 0x70(%rsp), %xmm13 1350 movaps 0x80(%rsp), %xmm14 1351 movaps 0x90(%rsp), %xmm15 1352 lea 0xa8(%rsp), %rsp 1353 .LSEH_end_ecp_nistz256_avx2_select_w7: 1354 ___ 1355 $code.=<<___; 1356 ret 1357 .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 1358 ___ 1359 } else { 1360 $code.=<<___; 
1361 .globl ecp_nistz256_avx2_select_w7 1362 .type ecp_nistz256_avx2_select_w7,\@function,3 1363 .align 32 1364 ecp_nistz256_avx2_select_w7: 1365 .byte 0x0f,0x0b # ud2 1366 ret 1367 .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 1368 ___ 1369 } 1370 {{{ 1371 ######################################################################## 1372 # This block implements higher level point_double, point_add and 1373 # point_add_affine. The key to performance in this case is to allow 1374 # out-of-order execution logic to overlap computations from next step 1375 # with tail processing from current step. By using tailored calling 1376 # sequence we minimize inter-step overhead to give processor better 1377 # shot at overlapping operations... 1378 # 1379 # You will notice that input data is copied to stack. Trouble is that 1380 # there are no registers to spare for holding original pointers and 1381 # reloading them, pointers, would create undesired dependencies on 1382 # effective addresses calculation paths. In other words it's too done 1383 # to favour out-of-order execution logic. 1384 # <appro (at] openssl.org> 1385 1386 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 1387 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 1388 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); 1389 my ($poly1,$poly3)=($acc6,$acc7); 1390 1391 sub load_for_mul () { 1392 my ($a,$b,$src0) = @_; 1393 my $bias = $src0 eq "%rax" ? 0 : -128; 1394 1395 " mov $b, $src0 1396 lea $b, $b_ptr 1397 mov 8*0+$a, $acc1 1398 mov 8*1+$a, $acc2 1399 lea $bias+$a, $a_ptr 1400 mov 8*2+$a, $acc3 1401 mov 8*3+$a, $acc4" 1402 } 1403 1404 sub load_for_sqr () { 1405 my ($a,$src0) = @_; 1406 my $bias = $src0 eq "%rax" ? 0 : -128; 1407 1408 " mov 8*0+$a, $src0 1409 mov 8*1+$a, $acc6 1410 lea $bias+$a, $a_ptr 1411 mov 8*2+$a, $acc7 1412 mov 8*3+$a, $acc0" 1413 } 1414 1415 { 1416 ######################################################################## 1417 # operate in 4-5-0-1 "name space" that matches multiplication output 1418 # 1419 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 1420 1421 $code.=<<___; 1422 .type __ecp_nistz256_add_toq,\@abi-omnipotent 1423 .align 32 1424 __ecp_nistz256_add_toq: 1425 xor $t4,$t4 1426 add 8*0($b_ptr), $a0 1427 adc 8*1($b_ptr), $a1 1428 mov $a0, $t0 1429 adc 8*2($b_ptr), $a2 1430 adc 8*3($b_ptr), $a3 1431 mov $a1, $t1 1432 adc \$0, $t4 1433 1434 sub \$-1, $a0 1435 mov $a2, $t2 1436 sbb $poly1, $a1 1437 sbb \$0, $a2 1438 mov $a3, $t3 1439 sbb $poly3, $a3 1440 sbb \$0, $t4 1441 1442 cmovc $t0, $a0 1443 cmovc $t1, $a1 1444 mov $a0, 8*0($r_ptr) 1445 cmovc $t2, $a2 1446 mov $a1, 8*1($r_ptr) 1447 cmovc $t3, $a3 1448 mov $a2, 8*2($r_ptr) 1449 mov $a3, 8*3($r_ptr) 1450 1451 ret 1452 .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 1453 1454 .type __ecp_nistz256_sub_fromq,\@abi-omnipotent 1455 .align 32 1456 __ecp_nistz256_sub_fromq: 1457 sub 8*0($b_ptr), $a0 1458 sbb 8*1($b_ptr), $a1 1459 mov $a0, $t0 1460 sbb 8*2($b_ptr), $a2 1461 sbb 8*3($b_ptr), $a3 1462 mov $a1, $t1 1463 sbb $t4, $t4 1464 1465 add \$-1, $a0 1466 mov $a2, $t2 1467 adc $poly1, $a1 1468 adc \$0, $a2 1469 mov $a3, $t3 1470 adc $poly3, $a3 1471 test $t4, $t4 1472 1473 cmovz $t0, $a0 1474 cmovz $t1, $a1 1475 mov $a0, 8*0($r_ptr) 1476 cmovz $t2, $a2 1477 mov $a1, 8*1($r_ptr) 1478 cmovz $t3, $a3 1479 mov $a2, 8*2($r_ptr) 1480 mov $a3, 8*3($r_ptr) 1481 1482 ret 1483 .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 1484 1485 .type 
__ecp_nistz256_subq,\@abi-omnipotent 1486 .align 32 1487 __ecp_nistz256_subq: 1488 sub $a0, $t0 1489 sbb $a1, $t1 1490 mov $t0, $a0 1491 sbb $a2, $t2 1492 sbb $a3, $t3 1493 mov $t1, $a1 1494 sbb $t4, $t4 1495 1496 add \$-1, $t0 1497 mov $t2, $a2 1498 adc $poly1, $t1 1499 adc \$0, $t2 1500 mov $t3, $a3 1501 adc $poly3, $t3 1502 test $t4, $t4 1503 1504 cmovnz $t0, $a0 1505 cmovnz $t1, $a1 1506 cmovnz $t2, $a2 1507 cmovnz $t3, $a3 1508 1509 ret 1510 .size __ecp_nistz256_subq,.-__ecp_nistz256_subq 1511 1512 .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 1513 .align 32 1514 __ecp_nistz256_mul_by_2q: 1515 xor $t4, $t4 1516 add $a0, $a0 # a0:a3+a0:a3 1517 adc $a1, $a1 1518 mov $a0, $t0 1519 adc $a2, $a2 1520 adc $a3, $a3 1521 mov $a1, $t1 1522 adc \$0, $t4 1523 1524 sub \$-1, $a0 1525 mov $a2, $t2 1526 sbb $poly1, $a1 1527 sbb \$0, $a2 1528 mov $a3, $t3 1529 sbb $poly3, $a3 1530 sbb \$0, $t4 1531 1532 cmovc $t0, $a0 1533 cmovc $t1, $a1 1534 mov $a0, 8*0($r_ptr) 1535 cmovc $t2, $a2 1536 mov $a1, 8*1($r_ptr) 1537 cmovc $t3, $a3 1538 mov $a2, 8*2($r_ptr) 1539 mov $a3, 8*3($r_ptr) 1540 1541 ret 1542 .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 1543 ___ 1544 } 1545 sub gen_double () { 1546 my $x = shift; 1547 my ($src0,$sfx,$bias); 1548 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1549 1550 if ($x ne "x") { 1551 $src0 = "%rax"; 1552 $sfx = ""; 1553 $bias = 0; 1554 1555 $code.=<<___; 1556 .globl ecp_nistz256_point_double 1557 .type ecp_nistz256_point_double,\@function,2 1558 .align 32 1559 ecp_nistz256_point_double: 1560 ___ 1561 $code.=<<___ if ($addx); 1562 leaq OPENSSL_ia32cap_P(%rip), %rcx 1563 mov 8(%rcx), %rcx 1564 and \$0x80100, %ecx 1565 cmp \$0x80100, %ecx 1566 je .Lpoint_doublex 1567 ___ 1568 } else { 1569 $src0 = "%rdx"; 1570 $sfx = "x"; 1571 $bias = 128; 1572 1573 $code.=<<___; 1574 .type ecp_nistz256_point_doublex,\@function,2 1575 .align 32 1576 ecp_nistz256_point_doublex: 1577 .Lpoint_doublex: 1578 ___ 1579 } 1580 $code.=<<___; 1581 push %rbp 1582 push %rbx 1583 push %r12 1584 push %r13 1585 push %r14 1586 push %r15 1587 sub \$32*5+8, %rsp 1588 1589 .Lpoint_double_shortcut$x: 1590 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 1591 mov $a_ptr, $b_ptr # backup copy 1592 movdqu 0x10($a_ptr), %xmm1 1593 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 1594 mov 0x20+8*1($a_ptr), $acc5 1595 mov 0x20+8*2($a_ptr), $acc0 1596 mov 0x20+8*3($a_ptr), $acc1 1597 mov .Lpoly+8*1(%rip), $poly1 1598 mov .Lpoly+8*3(%rip), $poly3 1599 movdqa %xmm0, $in_x(%rsp) 1600 movdqa %xmm1, $in_x+0x10(%rsp) 1601 lea 0x20($r_ptr), $acc2 1602 lea 0x40($r_ptr), $acc3 1603 movq $r_ptr, %xmm0 1604 movq $acc2, %xmm1 1605 movq $acc3, %xmm2 1606 1607 lea $S(%rsp), $r_ptr 1608 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 1609 1610 mov 0x40+8*0($a_ptr), $src0 1611 mov 0x40+8*1($a_ptr), $acc6 1612 mov 0x40+8*2($a_ptr), $acc7 1613 mov 0x40+8*3($a_ptr), $acc0 1614 lea 0x40-$bias($a_ptr), $a_ptr 1615 lea $Zsqr(%rsp), $r_ptr 1616 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 1617 1618 `&load_for_sqr("$S(%rsp)", "$src0")` 1619 lea $S(%rsp), $r_ptr 1620 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 1621 1622 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 1623 mov 0x40+8*0($b_ptr), $acc1 1624 mov 0x40+8*1($b_ptr), $acc2 1625 mov 0x40+8*2($b_ptr), $acc3 1626 mov 0x40+8*3($b_ptr), $acc4 1627 lea 0x40-$bias($b_ptr), $a_ptr 1628 lea 0x20($b_ptr), $b_ptr 1629 movq %xmm2, $r_ptr 1630 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 1631 call 
__ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 1632 1633 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 1634 mov $in_x+8*1(%rsp), $acc5 1635 lea $Zsqr(%rsp), $b_ptr 1636 mov $in_x+8*2(%rsp), $acc0 1637 mov $in_x+8*3(%rsp), $acc1 1638 lea $M(%rsp), $r_ptr 1639 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 1640 1641 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 1642 mov $in_x+8*1(%rsp), $acc5 1643 lea $Zsqr(%rsp), $b_ptr 1644 mov $in_x+8*2(%rsp), $acc0 1645 mov $in_x+8*3(%rsp), $acc1 1646 lea $Zsqr(%rsp), $r_ptr 1647 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 1648 1649 `&load_for_sqr("$S(%rsp)", "$src0")` 1650 movq %xmm1, $r_ptr 1651 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 1652 ___ 1653 { 1654 ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 1655 # operate in 4-5-6-7 "name space" that matches squaring output 1656 # 1657 my ($poly1,$poly3)=($a_ptr,$t1); 1658 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 1659 1660 $code.=<<___; 1661 xor $t4, $t4 1662 mov $a0, $t0 1663 add \$-1, $a0 1664 mov $a1, $t1 1665 adc $poly1, $a1 1666 mov $a2, $t2 1667 adc \$0, $a2 1668 mov $a3, $t3 1669 adc $poly3, $a3 1670 adc \$0, $t4 1671 xor $a_ptr, $a_ptr # borrow $a_ptr 1672 test \$1, $t0 1673 1674 cmovz $t0, $a0 1675 cmovz $t1, $a1 1676 cmovz $t2, $a2 1677 cmovz $t3, $a3 1678 cmovz $a_ptr, $t4 1679 1680 mov $a1, $t0 # a0:a3>>1 1681 shr \$1, $a0 1682 shl \$63, $t0 1683 mov $a2, $t1 1684 shr \$1, $a1 1685 or $t0, $a0 1686 shl \$63, $t1 1687 mov $a3, $t2 1688 shr \$1, $a2 1689 or $t1, $a1 1690 shl \$63, $t2 1691 mov $a0, 8*0($r_ptr) 1692 shr \$1, $a3 1693 mov $a1, 8*1($r_ptr) 1694 shl \$63, $t4 1695 or $t2, $a2 1696 or $t4, $a3 1697 mov $a2, 8*2($r_ptr) 1698 mov $a3, 8*3($r_ptr) 1699 ___ 1700 } 1701 $code.=<<___; 1702 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 1703 lea $M(%rsp), $r_ptr 1704 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 1705 1706 lea $tmp0(%rsp), $r_ptr 1707 call __ecp_nistz256_mul_by_2$x 1708 1709 lea $M(%rsp), $b_ptr 1710 lea $M(%rsp), $r_ptr 1711 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 1712 1713 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 1714 lea $S(%rsp), $r_ptr 1715 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 1716 1717 lea $tmp0(%rsp), $r_ptr 1718 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 1719 1720 `&load_for_sqr("$M(%rsp)", "$src0")` 1721 movq %xmm0, $r_ptr 1722 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 1723 1724 lea $tmp0(%rsp), $b_ptr 1725 mov $acc6, $acc0 # harmonize sqr output and sub input 1726 mov $acc7, $acc1 1727 mov $a_ptr, $poly1 1728 mov $t1, $poly3 1729 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 1730 1731 mov $S+8*0(%rsp), $t0 1732 mov $S+8*1(%rsp), $t1 1733 mov $S+8*2(%rsp), $t2 1734 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 1735 lea $S(%rsp), $r_ptr 1736 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 1737 1738 mov $M(%rsp), $src0 1739 lea $M(%rsp), $b_ptr 1740 mov $acc4, $acc6 # harmonize sub output and mul input 1741 xor %ecx, %ecx 1742 mov $acc4, $S+8*0(%rsp) # have to save:-( 1743 mov $acc5, $acc2 1744 mov $acc5, $S+8*1(%rsp) 1745 cmovz $acc0, $acc3 1746 mov $acc0, $S+8*2(%rsp) 1747 lea $S-$bias(%rsp), $a_ptr 1748 cmovz $acc1, $acc4 1749 mov $acc1, $S+8*3(%rsp) 1750 mov $acc6, $acc1 1751 lea $S(%rsp), $r_ptr 1752 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 1753 1754 movq %xmm1, $b_ptr 1755 movq %xmm1, $r_ptr 1756 call __ecp_nistz256_sub_from$x 
# p256_sub(res_y, S, res_y); 1757 1758 add \$32*5+8, %rsp 1759 pop %r15 1760 pop %r14 1761 pop %r13 1762 pop %r12 1763 pop %rbx 1764 pop %rbp 1765 ret 1766 .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 1767 ___ 1768 } 1769 &gen_double("q"); 1770 1771 sub gen_add () { 1772 my $x = shift; 1773 my ($src0,$sfx,$bias); 1774 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 1775 $U1,$U2,$S1,$S2, 1776 $res_x,$res_y,$res_z, 1777 $in1_x,$in1_y,$in1_z, 1778 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 1779 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1780 1781 if ($x ne "x") { 1782 $src0 = "%rax"; 1783 $sfx = ""; 1784 $bias = 0; 1785 1786 $code.=<<___; 1787 .globl ecp_nistz256_point_add 1788 .type ecp_nistz256_point_add,\@function,3 1789 .align 32 1790 ecp_nistz256_point_add: 1791 ___ 1792 $code.=<<___ if ($addx); 1793 leaq OPENSSL_ia32cap_P(%rip), %rcx 1794 mov 8(%rcx), %rcx 1795 and \$0x80100, %ecx 1796 cmp \$0x80100, %ecx 1797 je .Lpoint_addx 1798 ___ 1799 } else { 1800 $src0 = "%rdx"; 1801 $sfx = "x"; 1802 $bias = 128; 1803 1804 $code.=<<___; 1805 .type ecp_nistz256_point_addx,\@function,3 1806 .align 32 1807 ecp_nistz256_point_addx: 1808 .Lpoint_addx: 1809 ___ 1810 } 1811 $code.=<<___; 1812 push %rbp 1813 push %rbx 1814 push %r12 1815 push %r13 1816 push %r14 1817 push %r15 1818 sub \$32*18+8, %rsp 1819 1820 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 1821 movdqu 0x10($a_ptr), %xmm1 1822 movdqu 0x20($a_ptr), %xmm2 1823 movdqu 0x30($a_ptr), %xmm3 1824 movdqu 0x40($a_ptr), %xmm4 1825 movdqu 0x50($a_ptr), %xmm5 1826 mov $a_ptr, $b_ptr # reassign 1827 mov $b_org, $a_ptr # reassign 1828 movdqa %xmm0, $in1_x(%rsp) 1829 movdqa %xmm1, $in1_x+0x10(%rsp) 1830 movdqa %xmm2, $in1_y(%rsp) 1831 movdqa %xmm3, $in1_y+0x10(%rsp) 1832 movdqa %xmm4, $in1_z(%rsp) 1833 movdqa %xmm5, $in1_z+0x10(%rsp) 1834 por %xmm4, %xmm5 1835 1836 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 1837 pshufd \$0xb1, %xmm5, %xmm3 1838 movdqu 0x10($a_ptr), %xmm1 1839 movdqu 0x20($a_ptr), %xmm2 1840 por %xmm3, %xmm5 1841 movdqu 0x30($a_ptr), %xmm3 1842 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 1843 mov 0x40+8*1($a_ptr), $acc6 1844 mov 0x40+8*2($a_ptr), $acc7 1845 mov 0x40+8*3($a_ptr), $acc0 1846 movdqa %xmm0, $in2_x(%rsp) 1847 pshufd \$0x1e, %xmm5, %xmm4 1848 movdqa %xmm1, $in2_x+0x10(%rsp) 1849 movdqu 0x40($a_ptr),%xmm0 # in2_z again 1850 movdqu 0x50($a_ptr),%xmm1 1851 movdqa %xmm2, $in2_y(%rsp) 1852 movdqa %xmm3, $in2_y+0x10(%rsp) 1853 por %xmm4, %xmm5 1854 pxor %xmm4, %xmm4 1855 por %xmm0, %xmm1 1856 movq $r_ptr, %xmm0 # save $r_ptr 1857 1858 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 1859 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 1860 mov $acc6, $in2_z+8*1(%rsp) 1861 mov $acc7, $in2_z+8*2(%rsp) 1862 mov $acc0, $in2_z+8*3(%rsp) 1863 lea $Z2sqr(%rsp), $r_ptr # Z2^2 1864 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 1865 1866 pcmpeqd %xmm4, %xmm5 1867 pshufd \$0xb1, %xmm1, %xmm4 1868 por %xmm1, %xmm4 1869 pshufd \$0, %xmm5, %xmm5 # in1infty 1870 pshufd \$0x1e, %xmm4, %xmm3 1871 por %xmm3, %xmm4 1872 pxor %xmm3, %xmm3 1873 pcmpeqd %xmm3, %xmm4 1874 pshufd \$0, %xmm4, %xmm4 # in2infty 1875 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 1876 mov 0x40+8*1($b_ptr), $acc6 1877 mov 0x40+8*2($b_ptr), $acc7 1878 mov 0x40+8*3($b_ptr), $acc0 1879 movq $b_ptr, %xmm1 1880 1881 lea 0x40-$bias($b_ptr), $a_ptr 1882 lea $Z1sqr(%rsp), $r_ptr # Z1^2 1883 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 1884 1885 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 1886 lea $S1(%rsp), 
$r_ptr # S1 = Z2^3 1887 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 1888 1889 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 1890 lea $S2(%rsp), $r_ptr # S2 = Z1^3 1891 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 1892 1893 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 1894 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 1895 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 1896 1897 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 1898 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 1899 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 1900 1901 lea $S1(%rsp), $b_ptr 1902 lea $R(%rsp), $r_ptr # R = S2 - S1 1903 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 1904 1905 or $acc5, $acc4 # see if result is zero 1906 movdqa %xmm4, %xmm2 1907 or $acc0, $acc4 1908 or $acc1, $acc4 1909 por %xmm5, %xmm2 # in1infty || in2infty 1910 movq $acc4, %xmm3 1911 1912 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 1913 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 1914 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 1915 1916 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 1917 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 1918 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 1919 1920 lea $U1(%rsp), $b_ptr 1921 lea $H(%rsp), $r_ptr # H = U2 - U1 1922 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 1923 1924 or $acc5, $acc4 # see if result is zero 1925 or $acc0, $acc4 1926 or $acc1, $acc4 1927 1928 .byte 0x3e # predict taken 1929 jnz .Ladd_proceed$x # is_equal(U1,U2)? 1930 movq %xmm2, $acc0 1931 movq %xmm3, $acc1 1932 test $acc0, $acc0 1933 jnz .Ladd_proceed$x # (in1infty || in2infty)? 1934 test $acc1, $acc1 1935 jz .Ladd_double$x # is_equal(S1,S2)? 
1936 1937 movq %xmm0, $r_ptr # restore $r_ptr 1938 pxor %xmm0, %xmm0 1939 movdqu %xmm0, 0x00($r_ptr) 1940 movdqu %xmm0, 0x10($r_ptr) 1941 movdqu %xmm0, 0x20($r_ptr) 1942 movdqu %xmm0, 0x30($r_ptr) 1943 movdqu %xmm0, 0x40($r_ptr) 1944 movdqu %xmm0, 0x50($r_ptr) 1945 jmp .Ladd_done$x 1946 1947 .align 32 1948 .Ladd_double$x: 1949 movq %xmm1, $a_ptr # restore $a_ptr 1950 movq %xmm0, $r_ptr # restore $r_ptr 1951 add \$`32*(18-5)`, %rsp # difference in frame sizes 1952 jmp .Lpoint_double_shortcut$x 1953 1954 .align 32 1955 .Ladd_proceed$x: 1956 `&load_for_sqr("$R(%rsp)", "$src0")` 1957 lea $Rsqr(%rsp), $r_ptr # R^2 1958 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 1959 1960 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 1961 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 1962 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 1963 1964 `&load_for_sqr("$H(%rsp)", "$src0")` 1965 lea $Hsqr(%rsp), $r_ptr # H^2 1966 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 1967 1968 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 1969 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 1970 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 1971 1972 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 1973 lea $Hcub(%rsp), $r_ptr # H^3 1974 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 1975 1976 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 1977 lea $U2(%rsp), $r_ptr # U1*H^2 1978 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 1979 ___ 1980 { 1981 ####################################################################### 1982 # operate in 4-5-0-1 "name space" that matches multiplication output 1983 # 1984 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 1985 my ($poly1, $poly3)=($acc6,$acc7); 1986 1987 $code.=<<___; 1988 #lea $U2(%rsp), $a_ptr 1989 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 1990 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 1991 1992 xor $t4, $t4 1993 add $acc0, $acc0 # a0:a3+a0:a3 1994 lea $Rsqr(%rsp), $a_ptr 1995 adc $acc1, $acc1 1996 mov $acc0, $t0 1997 adc $acc2, $acc2 1998 adc $acc3, $acc3 1999 mov $acc1, $t1 2000 adc \$0, $t4 2001 2002 sub \$-1, $acc0 2003 mov $acc2, $t2 2004 sbb $poly1, $acc1 2005 sbb \$0, $acc2 2006 mov $acc3, $t3 2007 sbb $poly3, $acc3 2008 sbb \$0, $t4 2009 2010 cmovc $t0, $acc0 2011 mov 8*0($a_ptr), $t0 2012 cmovc $t1, $acc1 2013 mov 8*1($a_ptr), $t1 2014 cmovc $t2, $acc2 2015 mov 8*2($a_ptr), $t2 2016 cmovc $t3, $acc3 2017 mov 8*3($a_ptr), $t3 2018 2019 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 2020 2021 lea $Hcub(%rsp), $b_ptr 2022 lea $res_x(%rsp), $r_ptr 2023 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 2024 2025 mov $U2+8*0(%rsp), $t0 2026 mov $U2+8*1(%rsp), $t1 2027 mov $U2+8*2(%rsp), $t2 2028 mov $U2+8*3(%rsp), $t3 2029 lea $res_y(%rsp), $r_ptr 2030 2031 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 2032 2033 mov $acc0, 8*0($r_ptr) # save the result, as 2034 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 2035 mov $acc2, 8*2($r_ptr) 2036 mov $acc3, 8*3($r_ptr) 2037 ___ 2038 } 2039 $code.=<<___; 2040 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 2041 lea $S2(%rsp), $r_ptr 2042 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 2043 2044 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 2045 lea $res_y(%rsp), $r_ptr 2046 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 2047 2048 lea $S2(%rsp), $b_ptr 2049 lea $res_y(%rsp), $r_ptr 2050 call 
__ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 2051 2052 movq %xmm0, $r_ptr # restore $r_ptr 2053 2054 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 2055 movdqa %xmm5, %xmm1 2056 pandn $res_z(%rsp), %xmm0 2057 movdqa %xmm5, %xmm2 2058 pandn $res_z+0x10(%rsp), %xmm1 2059 movdqa %xmm5, %xmm3 2060 pand $in2_z(%rsp), %xmm2 2061 pand $in2_z+0x10(%rsp), %xmm3 2062 por %xmm0, %xmm2 2063 por %xmm1, %xmm3 2064 2065 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 2066 movdqa %xmm4, %xmm1 2067 pandn %xmm2, %xmm0 2068 movdqa %xmm4, %xmm2 2069 pandn %xmm3, %xmm1 2070 movdqa %xmm4, %xmm3 2071 pand $in1_z(%rsp), %xmm2 2072 pand $in1_z+0x10(%rsp), %xmm3 2073 por %xmm0, %xmm2 2074 por %xmm1, %xmm3 2075 movdqu %xmm2, 0x40($r_ptr) 2076 movdqu %xmm3, 0x50($r_ptr) 2077 2078 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 2079 movdqa %xmm5, %xmm1 2080 pandn $res_x(%rsp), %xmm0 2081 movdqa %xmm5, %xmm2 2082 pandn $res_x+0x10(%rsp), %xmm1 2083 movdqa %xmm5, %xmm3 2084 pand $in2_x(%rsp), %xmm2 2085 pand $in2_x+0x10(%rsp), %xmm3 2086 por %xmm0, %xmm2 2087 por %xmm1, %xmm3 2088 2089 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 2090 movdqa %xmm4, %xmm1 2091 pandn %xmm2, %xmm0 2092 movdqa %xmm4, %xmm2 2093 pandn %xmm3, %xmm1 2094 movdqa %xmm4, %xmm3 2095 pand $in1_x(%rsp), %xmm2 2096 pand $in1_x+0x10(%rsp), %xmm3 2097 por %xmm0, %xmm2 2098 por %xmm1, %xmm3 2099 movdqu %xmm2, 0x00($r_ptr) 2100 movdqu %xmm3, 0x10($r_ptr) 2101 2102 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 2103 movdqa %xmm5, %xmm1 2104 pandn $res_y(%rsp), %xmm0 2105 movdqa %xmm5, %xmm2 2106 pandn $res_y+0x10(%rsp), %xmm1 2107 movdqa %xmm5, %xmm3 2108 pand $in2_y(%rsp), %xmm2 2109 pand $in2_y+0x10(%rsp), %xmm3 2110 por %xmm0, %xmm2 2111 por %xmm1, %xmm3 2112 2113 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 2114 movdqa %xmm4, %xmm1 2115 pandn %xmm2, %xmm0 2116 movdqa %xmm4, %xmm2 2117 pandn %xmm3, %xmm1 2118 movdqa %xmm4, %xmm3 2119 pand $in1_y(%rsp), %xmm2 2120 pand $in1_y+0x10(%rsp), %xmm3 2121 por %xmm0, %xmm2 2122 por %xmm1, %xmm3 2123 movdqu %xmm2, 0x20($r_ptr) 2124 movdqu %xmm3, 0x30($r_ptr) 2125 2126 .Ladd_done$x: 2127 add \$32*18+8, %rsp 2128 pop %r15 2129 pop %r14 2130 pop %r13 2131 pop %r12 2132 pop %rbx 2133 pop %rbp 2134 ret 2135 .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx 2136 ___ 2137 } 2138 &gen_add("q"); 2139 2140 sub gen_add_affine () { 2141 my $x = shift; 2142 my ($src0,$sfx,$bias); 2143 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 2144 $res_x,$res_y,$res_z, 2145 $in1_x,$in1_y,$in1_z, 2146 $in2_x,$in2_y)=map(32*$_,(0..14)); 2147 my $Z1sqr = $S2; 2148 2149 if ($x ne "x") { 2150 $src0 = "%rax"; 2151 $sfx = ""; 2152 $bias = 0; 2153 2154 $code.=<<___; 2155 .globl ecp_nistz256_point_add_affine 2156 .type ecp_nistz256_point_add_affine,\@function,3 2157 .align 32 2158 ecp_nistz256_point_add_affine: 2159 ___ 2160 $code.=<<___ if ($addx); 2161 leaq OPENSSL_ia32cap_P(%rip), %rcx 2162 mov 8(%rcx), %rcx 2163 and \$0x80100, %ecx 2164 cmp \$0x80100, %ecx 2165 je .Lpoint_add_affinex 2166 ___ 2167 } else { 2168 $src0 = "%rdx"; 2169 $sfx = "x"; 2170 $bias = 128; 2171 2172 $code.=<<___; 2173 .type ecp_nistz256_point_add_affinex,\@function,3 2174 .align 32 2175 ecp_nistz256_point_add_affinex: 2176 .Lpoint_add_affinex: 2177 ___ 2178 } 2179 $code.=<<___; 2180 push %rbp 2181 push %rbx 2182 push %r12 2183 push %r13 2184 push %r14 2185 push %r15 2186 sub \$32*15+8, %rsp 2187 2188 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 
	movdqu	0x00($a_ptr), %xmm0	# copy *(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr		# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	0x40+8*0($a_ptr), $src0	# load original in1_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($b_ptr), %xmm0	# copy *(P256_POINT_AFFINE *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	#lea	0x00($b_ptr), $b_ptr
	mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
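########################################################################
# The braced block that follows open-codes ecp_nistz256_mul_by_2 on the
# value still sitting in the accumulator registers (U2 = in1_x*H^2),
# using the 4-5-0-1 register order the multiplication above left behind,
# and folds it straight into the x-coordinate:
#
#	res_x = R^2 - H^3 - 2*U2 (mod p)
#
# A rough sketch of the open-coded part, illustrative only and not the
# emitted instructions:
#
#	t = U2 + U2;				# add/adc chain, top carry kept aside
#	t = t - p; if (borrow) t = t + p;	# sbb chain + cmovc reduction
#	res_x = (Rsqr - t - Hcub) mod p;	# __ecp_nistz256_sub{,_from}
########################################################################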
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
	mov	$acc2, 8*2($r_ptr)		# write it back to memory
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr		# restore $r_ptr

	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

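	# The pand/pandn/por triplets implementing copy_conditional above and
	# below are branch-free selects: the mask (in1infty or in2infty) is
	# either all-ones or all-zeros, so per 128-bit half this amounts to,
	# as an illustrative C sketch and not generated code:
	#
	#	dst = (mask & src) | (~mask & dst);
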
	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
if ($addx) { {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3, $a3
	adc	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

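	# These AD*X helpers share the same branch-free reduction idiom as
	# their "q" counterparts: propagate the add/sub across the four limbs
	# keeping the final carry (or borrow) in a spare register, then
	# speculatively subtract (or add) the modulus and let cmov pick the
	# in-range value.  As an illustrative sketch, not the emitted code:
	#
	#	(carry, t) = a + b;		# adc chain
	#	u          = t - p;		# sbb chain, borrow folded into carry
	#	result     = borrow ? t : u;	# cmovc
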
.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

# Expand the backtick-quoted Perl snippets accumulated in $code (the
# load_for_sqr/load_for_mul helpers used above) at generation time,
# then emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
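########################################################################
# Caller-side view of the entry points emitted above, as an illustrative
# sketch only (the struct and limb type names are assumptions; the
# authoritative prototypes live with the C portion of ecp_nistz256):
#
#	typedef struct { BN_ULONG X[4], Y[4], Z[4]; } P256_POINT;
#	typedef struct { BN_ULONG X[4], Y[4]; } P256_POINT_AFFINE;
#
#	/* r = a + b; all coordinates in Montgomery form, b affine (Z = 1) */
#	void ecp_nistz256_point_add_affine(P256_POINT *r,
#	                                   const P256_POINT *a,
#	                                   const P256_POINT_AFFINE *b);
########################################################################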