1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 10 # August 2011. 11 # 12 # Companion to x86_64-mont.pl that optimizes cache-timing attack 13 # countermeasures. The subroutines are produced by replacing bp[i] 14 # references in their x86_64-mont.pl counterparts with cache-neutral 15 # references to powers table computed in BN_mod_exp_mont_consttime. 16 # In addition subroutine that scatters elements of the powers table 17 # is implemented, so that scatter-/gathering can be tuned without 18 # bn_exp.c modifications. 19 20 # August 2013. 21 # 22 # Add MULX/AD*X code paths and additional interfaces to optimize for 23 # branch prediction unit. For input lengths that are multiples of 8 24 # the np argument is not just modulus value, but one interleaved 25 # with 0. This is to optimize post-condition... 
# Command-line handling: perlasm generators take an output "flavour"
# (elf, macosx, mingw64, nasm, masm, ...) and an output file name.
# A single argument containing a dot is treated as the output name
# with no explicit flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator either next to this script or in
# the sibling perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT below is piped through the translator and
# ends up in $output. Fail loudly if the translator cannot be spawned;
# the original left the open unchecked, so a fork failure silently
# produced no output. Low-precedence "or" is required here: "|| die"
# would bind to the command string, not to open().
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for MULX/AD*X support (see the August 2013 note in
# the header); $addx gates the .Lmulx4x/.Lpowerx5 code paths emitted
# below. GNU as >= 2.23, NASM >= 2.10 or MASM >= 12 is required.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# Argument registers, per the x86_64 calling convention:
# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
# Scratch registers for the 1x (one word of b per iteration) Montgomery
# multiplication loop generated below.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

# bn_mul_mont_gather5 entry: a length divisible by 8 is diverted to the
# 4x-unrolled path (with %r11d preloaded from OPENSSL_ia32cap_P when the
# MULX/AD*X paths were compiled in); all other lengths fall through to
# the scalar .Lmul_enter path.
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$7,${num}d
	jnz	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
# From here on $bp lives in %r12: %rdx (the ABI home of $bp) is clobbered
# by every mulq below.
				$bp="%r12";
				$STRIDE=2**5*8;		# 5 is "window size"
				$N=$STRIDE/4;		# should match cache line size
# Cache-neutral gather setup: %r10 (7th argument, the power index) is
# turned into a set of pand masks in %xmm4-7, then every candidate slot
# in the stride is loaded and masked so the requested b[] element is
# selected without an index-dependent memory access pattern.
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	mov	$num,$j			# j=num
.align	16
.Lcopy:					# copy or in-place refresh
	mov	(%rsp,$i,8),$ap
	mov	($rp,$i,8),$np
	xor	$np,$ap			# conditional select:
	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
	xor	$np,$ap			# ap = borrow?tp:rp
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
# 4x-unrolled code path (num divisible by 8). @A accumulates a*b partial
# products, @N accumulates n*m1 products in the loops below.
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
# %r11d was loaded from OPENSSL_ia32cap_P at the common entry above;
# divert to the MULX/AD*X implementation when both feature bits are set.
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d
	shl	\$3+2,%r10d		# 4*$num
	neg	$num			# -$num

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic. [excessive frame is allocated in order
	# to allow bn_from_mont8x to clear it.]
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$ap,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmul4xsp_alt
	sub	%r11,%rsp		# align with $ap
	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	lea	4096-64(,$num,2),%r10
	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lmul4xsp_done:
	and	\$-64,%rsp
	neg	$num

	mov	%rax,40(%rsp)
.Lmul4x_body:

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

.type	mul4x_internal,\@abi-omnipotent
.align	32
mul4x_internal:
	shl	\$5,$num
	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
	lea	256(%rdx,$num),%r13
	shr	\$5,$num		# restore $num
___
# As in the 1x path: $bp moves to %r12 (mulq clobbers %rdx), and the
# gather walks the powers table with a fixed $STRIDE so the access
# pattern is independent of the requested index.
				$bp="%r12";
				$STRIDE=2**5*8;		# 5 is "window size"
				$N=$STRIDE/4;		# should match cache line size
				$tp=$i;
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	add	\$7,%r11
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7
	and	\$7,%r11

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	lea	$STRIDE($bp),$tp	# borrow $tp
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	.byte	0x67
	por	%xmm1,%xmm0
	movq	`0*$STRIDE/4-96`($tp),%xmm1
	.byte	0x67
	pand	%xmm7,%xmm3
	.byte	0x67
	por	%xmm2,%xmm0
	movq	`1*$STRIDE/4-96`($tp),%xmm2
	.byte	0x67
	pand	%xmm4,%xmm1
	.byte	0x67
	por	%xmm3,%xmm0
	movq	`2*$STRIDE/4-96`($tp),%xmm3

	movq	%xmm0,$m0		# m0=bp[0]
	movq	`3*$STRIDE/4-96`($tp),%xmm0
	mov	%r13,16+8(%rsp)		# save end of b[num]
	mov	$rp, 56+8(%rsp)		# save $rp

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax
	lea	($ap,$num),$ap		# end of a[num]
	neg	$num

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	pand	%xmm5,%xmm2
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	$A[0],$m1		# "tp[0]"*n0
	##############################################################
	# $tp is chosen so that writing to top-most element of the
	# vector occurs just "above" references to powers table,
	# "above" modulo cache-line size, which effectively precludes
	# possibility of memory disambiguation logic failure when
	# accessing the table.
	#
	lea	64+8(%rsp,%r11,8),$tp
	mov	%rdx,$A[1]

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	lea	2*$STRIDE($bp),$bp
	por	%xmm1,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap,$num),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap,$num),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4*8($num),$j		# j=4
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[1],($tp)
	mov	%rdx,$N[0]
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	16*0($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[1],($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	add	\$32,$j			# j+=4
	jnz	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$num),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]
	lea	($np,$num,2),$np	# rewind $np

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8($tp)

	jmp	.Louter4x

.align	32
.Louter4x:
	mov	($tp,$num),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	.byte	0x67
	mov	%rdx,$A[1]
	mov	$N[1],($tp)		# store upmost overflow bit

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	($tp,$num),$tp		# rewind $tp
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap,$num),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
	adc	\$0,%rdx
	add	8($tp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$num),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4*8($num),$j		# j=4
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	adc	\$0,%rdx
	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	add	-8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	16*0($np),%rax
	adc	\$0,%rdx
	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	16*1($np),%rax
	adc	\$0,%rdx
	add	8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[0],-8($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	add	\$32,$j			# j+=4
	jnz	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	adc	\$0,%rdx
	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	$m1,%rax
	mov	-16*1($np),$m1
	adc	\$0,%rdx
	add	-8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$num),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]
	mov	$N[1],-16($tp)		# tp[j-1]
	lea	($np,$num,2),$np	# rewind $np

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	($tp),$N[0]		# pull upmost overflow bit
	adc	\$0,$N[1]		# upmost overflow bit
	mov	$N[0],-8($tp)

	cmp	16+8(%rsp),$bp
	jb	.Louter4x
___
# Two alternative tails: the live branch (if (1)) reuses .Lsqr4x_sub
# (defined elsewhere in this file) for the final conditional
# subtraction; the dead branch keeps a self-contained .Lsub4x loop.
if (1) {
$code.=<<___;
	sub	$N[0],$m1		# compare top-most words
	adc	$j,$j			# $j is zero
	or	$j,$N[1]
	xor	\$1,$N[1]
	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
	mov	%r9,%rcx
	sar	\$3+2,%rcx		# cf=0
	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
	jmp	.Lsqr4x_sub
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
my $rp="%rdx";
$code.=<<___
	xor	\$1,$N[1]
	lea	($tp,$num),$tp		# rewind $tp
	sar	\$5,$num		# cf=0
	lea	($np,$N[1],8),$np
	mov	56+8(%rsp),$rp		# restore $rp
	jmp	.Lsub4x

.align	32
.Lsub4x:
	.byte	0x66
	mov	8*0($tp),@ri[0]
	mov	8*1($tp),@ri[1]
	.byte	0x66
	sbb	16*0($np),@ri[0]
	mov	8*2($tp),@ri[2]
	sbb	16*1($np),@ri[1]
	mov	3*8($tp),@ri[3]
	lea	4*8($tp),$tp
	sbb	16*2($np),@ri[2]
	mov	@ri[0],8*0($rp)
	sbb	16*3($np),@ri[3]
	lea	16*4($np),$np
	mov	@ri[1],8*1($rp)
	mov	@ri[2],8*2($rp)
	mov	@ri[3],8*3($rp)
	lea	8*4($rp),$rp

	inc	$num
	jnz	.Lsub4x

	ret
___
}
$code.=<<___;
.size	mul4x_internal,.-mul4x_internal
___
}}}
{{{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr

# Register map for the squaring code; $tptr aliases $rptr because the
# result pointer is saved in %xmm1 across __bn_sqr8x_internal.
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.globl	bn_power5
.type	bn_power5,\@function,6
.align	32
bn_power5:
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lpowerx5_enter
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic.
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwr_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lpwr_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0, 32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lpower5_body:
	movq	$rptr,%xmm1		# save $rptr
	movq	$nptr,%xmm2		# save $nptr
	movq	%r10, %xmm3		# -$num
	movq	$bptr,%xmm4

	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

	movq	%xmm2,$nptr
	movq	%xmm4,$bptr
	mov	$aptr,$rptr
	mov	40(%rsp),%rax
	lea	32(%rsp),$n0

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lpower5_epilogue:
	ret
.size	bn_power5,.-bn_power5

.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,\@abi-omnipotent
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
	##############################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
############################################################## 1039 # a[1]a[0] 1040 # a[2]a[0] 1041 # a[3]a[0] 1042 # a[2]a[1] 1043 # a[4]a[0] 1044 # a[3]a[1] 1045 # a[5]a[0] 1046 # a[4]a[1] 1047 # a[3]a[2] 1048 # a[6]a[0] 1049 # a[5]a[1] 1050 # a[4]a[2] 1051 # a[7]a[0] 1052 # a[6]a[1] 1053 # a[5]a[2] 1054 # a[4]a[3] 1055 # a[7]a[1] 1056 # a[6]a[2] 1057 # a[5]a[3] 1058 # a[7]a[2] 1059 # a[6]a[3] 1060 # a[5]a[4] 1061 # a[7]a[3] 1062 # a[6]a[4] 1063 # a[7]a[4] 1064 # a[6]a[5] 1065 # a[7]a[5] 1066 # a[7]a[6] 1067 # a[1]a[0] 1068 # a[2]a[0] 1069 # a[3]a[0] 1070 # a[4]a[0] 1071 # a[5]a[0] 1072 # a[6]a[0] 1073 # a[7]a[0] 1074 # a[2]a[1] 1075 # a[3]a[1] 1076 # a[4]a[1] 1077 # a[5]a[1] 1078 # a[6]a[1] 1079 # a[7]a[1] 1080 # a[3]a[2] 1081 # a[4]a[2] 1082 # a[5]a[2] 1083 # a[6]a[2] 1084 # a[7]a[2] 1085 # a[4]a[3] 1086 # a[5]a[3] 1087 # a[6]a[3] 1088 # a[7]a[3] 1089 # a[5]a[4] 1090 # a[6]a[4] 1091 # a[7]a[4] 1092 # a[6]a[5] 1093 # a[7]a[5] 1094 # a[7]a[6] 1095 # a[0]a[0] 1096 # a[1]a[1] 1097 # a[2]a[2] 1098 # a[3]a[3] 1099 # a[4]a[4] 1100 # a[5]a[5] 1101 # a[6]a[6] 1102 # a[7]a[7] 1103 1104 lea 32(%r10),$i # $i=-($num-32) 1105 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1106 1107 mov $num,$j # $j=$num 1108 1109 # comments apply to $num==8 case 1110 mov -32($aptr,$i),$a0 # a[0] 1111 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1112 mov -24($aptr,$i),%rax # a[1] 1113 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1114 mov -16($aptr,$i),$ai # a[2] 1115 mov %rax,$a1 1116 1117 mul $a0 # a[1]*a[0] 1118 mov %rax,$A0[0] # a[1]*a[0] 1119 mov $ai,%rax # a[2] 1120 mov %rdx,$A0[1] 1121 mov $A0[0],-24($tptr,$i) # t[1] 1122 1123 mul $a0 # a[2]*a[0] 1124 add %rax,$A0[1] 1125 mov $ai,%rax 1126 adc \$0,%rdx 1127 mov $A0[1],-16($tptr,$i) # t[2] 1128 mov %rdx,$A0[0] 1129 1130 1131 mov -8($aptr,$i),$ai # a[3] 1132 mul $a1 # a[2]*a[1] 1133 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1134 mov $ai,%rax 1135 mov %rdx,$A1[1] 1136 1137 lea ($i),$j 1138 mul $a0 # 
a[3]*a[0] 1139 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1140 mov $ai,%rax 1141 mov %rdx,$A0[1] 1142 adc \$0,$A0[1] 1143 add $A1[0],$A0[0] 1144 adc \$0,$A0[1] 1145 mov $A0[0],-8($tptr,$j) # t[3] 1146 jmp .Lsqr4x_1st 1147 1148 .align 32 1149 .Lsqr4x_1st: 1150 mov ($aptr,$j),$ai # a[4] 1151 mul $a1 # a[3]*a[1] 1152 add %rax,$A1[1] # a[3]*a[1]+t[4] 1153 mov $ai,%rax 1154 mov %rdx,$A1[0] 1155 adc \$0,$A1[0] 1156 1157 mul $a0 # a[4]*a[0] 1158 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1159 mov $ai,%rax # a[3] 1160 mov 8($aptr,$j),$ai # a[5] 1161 mov %rdx,$A0[0] 1162 adc \$0,$A0[0] 1163 add $A1[1],$A0[1] 1164 adc \$0,$A0[0] 1165 1166 1167 mul $a1 # a[4]*a[3] 1168 add %rax,$A1[0] # a[4]*a[3]+t[5] 1169 mov $ai,%rax 1170 mov $A0[1],($tptr,$j) # t[4] 1171 mov %rdx,$A1[1] 1172 adc \$0,$A1[1] 1173 1174 mul $a0 # a[5]*a[2] 1175 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1176 mov $ai,%rax 1177 mov 16($aptr,$j),$ai # a[6] 1178 mov %rdx,$A0[1] 1179 adc \$0,$A0[1] 1180 add $A1[0],$A0[0] 1181 adc \$0,$A0[1] 1182 1183 mul $a1 # a[5]*a[3] 1184 add %rax,$A1[1] # a[5]*a[3]+t[6] 1185 mov $ai,%rax 1186 mov $A0[0],8($tptr,$j) # t[5] 1187 mov %rdx,$A1[0] 1188 adc \$0,$A1[0] 1189 1190 mul $a0 # a[6]*a[2] 1191 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1192 mov $ai,%rax # a[3] 1193 mov 24($aptr,$j),$ai # a[7] 1194 mov %rdx,$A0[0] 1195 adc \$0,$A0[0] 1196 add $A1[1],$A0[1] 1197 adc \$0,$A0[0] 1198 1199 1200 mul $a1 # a[6]*a[5] 1201 add %rax,$A1[0] # a[6]*a[5]+t[7] 1202 mov $ai,%rax 1203 mov $A0[1],16($tptr,$j) # t[6] 1204 mov %rdx,$A1[1] 1205 adc \$0,$A1[1] 1206 lea 32($j),$j 1207 1208 mul $a0 # a[7]*a[4] 1209 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1210 mov $ai,%rax 1211 mov %rdx,$A0[1] 1212 adc \$0,$A0[1] 1213 add $A1[0],$A0[0] 1214 adc \$0,$A0[1] 1215 mov $A0[0],-8($tptr,$j) # t[7] 1216 1217 cmp \$0,$j 1218 jne .Lsqr4x_1st 1219 1220 mul $a1 # a[7]*a[5] 1221 add %rax,$A1[1] 1222 lea 16($i),$i 1223 adc \$0,%rdx 1224 add $A0[1],$A1[1] 1225 adc \$0,%rdx 1226 1227 mov $A1[1],($tptr) # t[8] 
1228 mov %rdx,$A1[0] 1229 mov %rdx,8($tptr) # t[9] 1230 jmp .Lsqr4x_outer 1231 1232 .align 32 1233 .Lsqr4x_outer: # comments apply to $num==6 case 1234 mov -32($aptr,$i),$a0 # a[0] 1235 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1236 mov -24($aptr,$i),%rax # a[1] 1237 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1238 mov -16($aptr,$i),$ai # a[2] 1239 mov %rax,$a1 1240 1241 mul $a0 # a[1]*a[0] 1242 mov -24($tptr,$i),$A0[0] # t[1] 1243 add %rax,$A0[0] # a[1]*a[0]+t[1] 1244 mov $ai,%rax # a[2] 1245 adc \$0,%rdx 1246 mov $A0[0],-24($tptr,$i) # t[1] 1247 mov %rdx,$A0[1] 1248 1249 mul $a0 # a[2]*a[0] 1250 add %rax,$A0[1] 1251 mov $ai,%rax 1252 adc \$0,%rdx 1253 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1254 mov %rdx,$A0[0] 1255 adc \$0,$A0[0] 1256 mov $A0[1],-16($tptr,$i) # t[2] 1257 1258 xor $A1[0],$A1[0] 1259 1260 mov -8($aptr,$i),$ai # a[3] 1261 mul $a1 # a[2]*a[1] 1262 add %rax,$A1[0] # a[2]*a[1]+t[3] 1263 mov $ai,%rax 1264 adc \$0,%rdx 1265 add -8($tptr,$i),$A1[0] 1266 mov %rdx,$A1[1] 1267 adc \$0,$A1[1] 1268 1269 mul $a0 # a[3]*a[0] 1270 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1271 mov $ai,%rax 1272 adc \$0,%rdx 1273 add $A1[0],$A0[0] 1274 mov %rdx,$A0[1] 1275 adc \$0,$A0[1] 1276 mov $A0[0],-8($tptr,$i) # t[3] 1277 1278 lea ($i),$j 1279 jmp .Lsqr4x_inner 1280 1281 .align 32 1282 .Lsqr4x_inner: 1283 mov ($aptr,$j),$ai # a[4] 1284 mul $a1 # a[3]*a[1] 1285 add %rax,$A1[1] # a[3]*a[1]+t[4] 1286 mov $ai,%rax 1287 mov %rdx,$A1[0] 1288 adc \$0,$A1[0] 1289 add ($tptr,$j),$A1[1] 1290 adc \$0,$A1[0] 1291 1292 .byte 0x67 1293 mul $a0 # a[4]*a[0] 1294 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1295 mov $ai,%rax # a[3] 1296 mov 8($aptr,$j),$ai # a[5] 1297 mov %rdx,$A0[0] 1298 adc \$0,$A0[0] 1299 add $A1[1],$A0[1] 1300 adc \$0,$A0[0] 1301 1302 mul $a1 # a[4]*a[3] 1303 add %rax,$A1[0] # a[4]*a[3]+t[5] 1304 mov $A0[1],($tptr,$j) # t[4] 1305 mov $ai,%rax 1306 mov %rdx,$A1[1] 1307 adc \$0,$A1[1] 1308 add 8($tptr,$j),$A1[0] 1309 lea 16($j),$j # 
j++ 1310 adc \$0,$A1[1] 1311 1312 mul $a0 # a[5]*a[2] 1313 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1314 mov $ai,%rax 1315 adc \$0,%rdx 1316 add $A1[0],$A0[0] 1317 mov %rdx,$A0[1] 1318 adc \$0,$A0[1] 1319 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1320 1321 cmp \$0,$j 1322 jne .Lsqr4x_inner 1323 1324 .byte 0x67 1325 mul $a1 # a[5]*a[3] 1326 add %rax,$A1[1] 1327 adc \$0,%rdx 1328 add $A0[1],$A1[1] 1329 adc \$0,%rdx 1330 1331 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1332 mov %rdx,$A1[0] 1333 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1334 1335 add \$16,$i 1336 jnz .Lsqr4x_outer 1337 1338 # comments apply to $num==4 case 1339 mov -32($aptr),$a0 # a[0] 1340 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1341 mov -24($aptr),%rax # a[1] 1342 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1343 mov -16($aptr),$ai # a[2] 1344 mov %rax,$a1 1345 1346 mul $a0 # a[1]*a[0] 1347 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1348 mov $ai,%rax # a[2] 1349 mov %rdx,$A0[1] 1350 adc \$0,$A0[1] 1351 1352 mul $a0 # a[2]*a[0] 1353 add %rax,$A0[1] 1354 mov $ai,%rax 1355 mov $A0[0],-24($tptr) # t[1] 1356 mov %rdx,$A0[0] 1357 adc \$0,$A0[0] 1358 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1359 mov -8($aptr),$ai # a[3] 1360 adc \$0,$A0[0] 1361 1362 mul $a1 # a[2]*a[1] 1363 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1364 mov $ai,%rax 1365 mov $A0[1],-16($tptr) # t[2] 1366 mov %rdx,$A1[1] 1367 adc \$0,$A1[1] 1368 1369 mul $a0 # a[3]*a[0] 1370 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1371 mov $ai,%rax 1372 mov %rdx,$A0[1] 1373 adc \$0,$A0[1] 1374 add $A1[0],$A0[0] 1375 adc \$0,$A0[1] 1376 mov $A0[0],-8($tptr) # t[3] 1377 1378 mul $a1 # a[3]*a[1] 1379 add %rax,$A1[1] 1380 mov -16($aptr),%rax # a[2] 1381 adc \$0,%rdx 1382 add $A0[1],$A1[1] 1383 adc \$0,%rdx 1384 1385 mov $A1[1],($tptr) # t[4] 1386 mov %rdx,$A1[0] 1387 mov %rdx,8($tptr) # t[5] 1388 1389 mul $ai # a[2]*a[3] 1390 ___ 1391 { 1392 my 
($shift,$carry)=($a0,$a1); 1393 my @S=(@A1,$ai,$n0); 1394 $code.=<<___; 1395 add \$16,$i 1396 xor $shift,$shift 1397 sub $num,$i # $i=16-$num 1398 xor $carry,$carry 1399 1400 add $A1[0],%rax # t[5] 1401 adc \$0,%rdx 1402 mov %rax,8($tptr) # t[5] 1403 mov %rdx,16($tptr) # t[6] 1404 mov $carry,24($tptr) # t[7] 1405 1406 mov -16($aptr,$i),%rax # a[0] 1407 lea 48+8(%rsp),$tptr 1408 xor $A0[0],$A0[0] # t[0] 1409 mov 8($tptr),$A0[1] # t[1] 1410 1411 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1412 shr \$63,$A0[0] 1413 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1414 shr \$63,$A0[1] 1415 or $A0[0],$S[1] # | t[2*i]>>63 1416 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1417 mov $A0[1],$shift # shift=t[2*i+1]>>63 1418 mul %rax # a[i]*a[i] 1419 neg $carry # mov $carry,cf 1420 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1421 adc %rax,$S[0] 1422 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1423 mov $S[0],($tptr) 1424 adc %rdx,$S[1] 1425 1426 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1427 mov $S[1],8($tptr) 1428 sbb $carry,$carry # mov cf,$carry 1429 shr \$63,$A0[0] 1430 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1431 shr \$63,$A0[1] 1432 or $A0[0],$S[3] # | t[2*i]>>63 1433 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1434 mov $A0[1],$shift # shift=t[2*i+1]>>63 1435 mul %rax # a[i]*a[i] 1436 neg $carry # mov $carry,cf 1437 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1438 adc %rax,$S[2] 1439 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1440 mov $S[2],16($tptr) 1441 adc %rdx,$S[3] 1442 lea 16($i),$i 1443 mov $S[3],24($tptr) 1444 sbb $carry,$carry # mov cf,$carry 1445 lea 64($tptr),$tptr 1446 jmp .Lsqr4x_shift_n_add 1447 1448 .align 32 1449 .Lsqr4x_shift_n_add: 1450 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1451 shr \$63,$A0[0] 1452 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1453 shr \$63,$A0[1] 1454 or $A0[0],$S[1] # | t[2*i]>>63 1455 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1456 mov $A0[1],$shift # shift=t[2*i+1]>>63 1457 mul %rax # a[i]*a[i] 1458 neg $carry # mov $carry,cf 1459 
mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1460 adc %rax,$S[0] 1461 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1462 mov $S[0],-32($tptr) 1463 adc %rdx,$S[1] 1464 1465 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1466 mov $S[1],-24($tptr) 1467 sbb $carry,$carry # mov cf,$carry 1468 shr \$63,$A0[0] 1469 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1470 shr \$63,$A0[1] 1471 or $A0[0],$S[3] # | t[2*i]>>63 1472 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1473 mov $A0[1],$shift # shift=t[2*i+1]>>63 1474 mul %rax # a[i]*a[i] 1475 neg $carry # mov $carry,cf 1476 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1477 adc %rax,$S[2] 1478 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1479 mov $S[2],-16($tptr) 1480 adc %rdx,$S[3] 1481 1482 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1483 mov $S[3],-8($tptr) 1484 sbb $carry,$carry # mov cf,$carry 1485 shr \$63,$A0[0] 1486 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1487 shr \$63,$A0[1] 1488 or $A0[0],$S[1] # | t[2*i]>>63 1489 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1490 mov $A0[1],$shift # shift=t[2*i+1]>>63 1491 mul %rax # a[i]*a[i] 1492 neg $carry # mov $carry,cf 1493 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1494 adc %rax,$S[0] 1495 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1496 mov $S[0],0($tptr) 1497 adc %rdx,$S[1] 1498 1499 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1500 mov $S[1],8($tptr) 1501 sbb $carry,$carry # mov cf,$carry 1502 shr \$63,$A0[0] 1503 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1504 shr \$63,$A0[1] 1505 or $A0[0],$S[3] # | t[2*i]>>63 1506 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1507 mov $A0[1],$shift # shift=t[2*i+1]>>63 1508 mul %rax # a[i]*a[i] 1509 neg $carry # mov $carry,cf 1510 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1511 adc %rax,$S[2] 1512 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1513 mov $S[2],16($tptr) 1514 adc %rdx,$S[3] 1515 mov $S[3],24($tptr) 1516 sbb $carry,$carry # mov cf,$carry 1517 lea 64($tptr),$tptr 1518 add \$32,$i 1519 jnz .Lsqr4x_shift_n_add 1520 1521 lea ($shift,$A0[0],2),$S[0] # 
t[2*i]<<1 | shift 1522 .byte 0x67 1523 shr \$63,$A0[0] 1524 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1525 shr \$63,$A0[1] 1526 or $A0[0],$S[1] # | t[2*i]>>63 1527 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1528 mov $A0[1],$shift # shift=t[2*i+1]>>63 1529 mul %rax # a[i]*a[i] 1530 neg $carry # mov $carry,cf 1531 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1532 adc %rax,$S[0] 1533 mov -8($aptr),%rax # a[i+1] # prefetch 1534 mov $S[0],-32($tptr) 1535 adc %rdx,$S[1] 1536 1537 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1538 mov $S[1],-24($tptr) 1539 sbb $carry,$carry # mov cf,$carry 1540 shr \$63,$A0[0] 1541 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1542 shr \$63,$A0[1] 1543 or $A0[0],$S[3] # | t[2*i]>>63 1544 mul %rax # a[i]*a[i] 1545 neg $carry # mov $carry,cf 1546 adc %rax,$S[2] 1547 adc %rdx,$S[3] 1548 mov $S[2],-16($tptr) 1549 mov $S[3],-8($tptr) 1550 ___ 1551 } 1553 ###################################################################### 1554 # Montgomery reduction part, "word-by-word" algorithm. 1555 # 1556 # This new path is inspired by multiple submissions from Intel, by 1557 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1558 # Vinodh Gopal... 
{
my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");

# Word-by-word Montgomery reduction of the double-width result in t[].
# t[] is processed in 8-limb windows (.L8x_reduction_loop); for each
# window the inner .L8x_reduce pass computes the eight n0*t[i]
# multipliers (stashed at 48-8+8(%rsp,%rcx,8) for reuse) while folding
# them against the first 8 limbs of n[], and .L8x_tail then propagates
# the same multipliers across the remaining limbs of n[].
# $nptr strides in 16-byte steps because for this code path n[] is
# interleaved with zeros (see the note at the top of the file).
# NOTE(review): register roles -- $m0 carries the current n0*t[i]
# multiplier, %r8..%r15 the running 8-limb accumulator window; the
# "modulo-scheduled" imulq computes the next multiplier one iteration
# early. Statement order is carry-chain critical; do not reorder.
$code.=<<___;
	movq	%xmm2,$nptr
sqr8x_reduction:
	xor	%rax,%rax
	lea	($nptr,$num,2),%rcx	# end of n[]
	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
	mov	%rcx,0+8(%rsp)
	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
	mov	%rdx,8+8(%rsp)
	neg	$num
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	lea	($tptr,$num),$tptr	# start of current t[] window
	.byte	0x66
	mov	8*0($tptr),$m0
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,(%rdx)		# store top-most carry bit
	lea	8*8($tptr),$tptr

	.byte	0x67
	mov	$m0,%r8
	imulq	32+8(%rsp),$m0		# n0*a[0]
	mov	16*0($nptr),%rax	# n[0]
	mov	\$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	$m0
	mov	16*1($nptr),%rax	# n[1]
	neg	%r8
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	mov	16*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	mov	16*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	mov	32+8(%rsp),$carry	# pull n0, borrow $carry
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	mov	16*4($nptr),%rax
	adc	\$0,%rdx
	imulq	%r8,$carry		# modulo-scheduled
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	mov	16*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	mov	16*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	mov	16*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	mov	$carry,$m0		# n0*a[i]
	add	%rax,%r15
	mov	16*0($nptr),%rax	# n[0]
	adc	\$0,%rdx
	add	%r15,%r14
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_reduce

	lea	16*8($nptr),$nptr
	xor	%rax,%rax
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_no_tail

	.byte	0x66
	add	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	mov	\$8,%ecx
	mov	16*0($nptr),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	$m0
	add	%rax,%r8
	mov	16*1($nptr),%rax
	mov	%r8,($tptr)		# save result
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	mov	16*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	lea	8($tptr),$tptr		# $tptr++
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	mov	16*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	mov	16*4($nptr),%rax
	adc	\$0,%rdx
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	mov	16*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	mov	16*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	mov	16*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
	add	%rax,%r15
	adc	\$0,%rdx
	add	%r15,%r14
	mov	16*0($nptr),%rax	# pull n[0]
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_tail

	lea	16*8($nptr),$nptr
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_tail_done		# break out of loop

	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	neg	$carry
	mov	8*0($nptr),%rax		# pull n[0]
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	\$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	add	(%rdx),%r8		# can this overflow?
	xor	%rax,%rax

	neg	$carry
.L8x_no_tail:
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	adc	\$0,%rax		# top-most carry
	mov	-16($nptr),%rcx		# np[num-1]
	xor	$carry,$carry

	movq	%xmm2,$nptr		# restore $nptr

	mov	%r8,8*0($tptr)		# store top 512 bits
	mov	%r9,8*1($tptr)
	movq	%xmm3,$num		# $num is %r9, can't be moved upwards
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)
	lea	8*8($tptr),$tptr

	cmp	%rdx,$tptr		# end of t[]?
	jb	.L8x_reduction_loop
___
}

##############################################################
# Post-condition, 4x unrolled
#
# Conditional (branchless) subtraction of the modulus from the
# reduced result.  %rax holds the top-most carry from the reduction;
# after the xor \$1 it is 0 or 1, and the lea below offsets $nptr by
# 0 or 8 bytes so the sbb chain subtracts either n[] or its zero
# interleave -- no secret-dependent branch is taken.
# NOTE(review): this relies on n[] being interleaved with zeros
# (16-byte stride, see the file header) -- confirm against bn_exp.c
# before touching the offsets.
#
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
	#xor	%rsi,%rsi		# %rsi was $carry above
	sub	%r15,%rcx		# compare top-most words
	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
	adc	%rsi,%rsi
	mov	$num,%rcx
	or	%rsi,%rax
	movq	%xmm1,$rptr		# restore $rptr
	xor	\$1,%rax
	movq	%xmm1,$aptr		# prepare for back-to-back call
	lea	($nptr,%rax,8),$nptr
	sar	\$3+2,%rcx		# cf=0
	jmp	.Lsqr4x_sub

.align	32
.Lsqr4x_sub:
	.byte	0x66
	mov	8*0($tptr),%r12
	mov	8*1($tptr),%r13
	sbb	16*0($nptr),%r12
	mov	8*2($tptr),%r14
	sbb	16*1($nptr),%r13
	mov	8*3($tptr),%r15
	lea	8*4($tptr),$tptr
	sbb	16*2($nptr),%r14
	mov	%r12,8*0($rptr)
	sbb	16*3($nptr),%r15
	lea	16*4($nptr),$nptr
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx			# pass %cf
	jnz	.Lsqr4x_sub
___
}

# Tail of bn_sqr8x_internal: $num was negated earlier, so restore it
# and leave -$num in %r10 so bn_power5 can chain squarings
# back-to-back without recomputing it.
$code.=<<___;
	mov	$num,%r10	# prepare for back-to-back call
	neg	$num		# restore $num
	ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
___
{
# bn_from_montgomery: dispatcher for conversion out of Montgomery
# representation.  Takes the 8x path only when num (6th argument:
# 48(%rsp) on Win64, %r9d on SysV) is a multiple of 8; otherwise it
# returns 0 so the caller can fall back to a generic path.
$code.=<<___;
.globl	bn_from_montgomery
.type	bn_from_montgomery,\@abi-omnipotent
.align	32
bn_from_montgomery:
	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
	jz	bn_from_mont8x
	xor	%eax,%eax
	ret
.size	bn_from_montgomery,.-bn_from_montgomery

.type	bn_from_mont8x,\@function,6
.align	32
bn_from_mont8x:
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Windows x64 ABI: %xmm6/%xmm7 are callee-saved -- spill them.
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic.
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lfrom_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lfrom_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0, 32(%rsp)
	mov	%rax, 40(%rsp)	# save original %rsp
.Lfrom_body:
	mov	$num,%r11
	lea	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	movdqu	($aptr),%xmm1
	movdqu	16($aptr),%xmm2
	movdqu	32($aptr),%xmm3
	movdqa	%xmm0,(%rax,$num)
	movdqu	48($aptr),%xmm4
	movdqa	%xmm0,16(%rax,$num)
	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea 64($aptr),$aptr
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,$num)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,$num)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	lea	64(%rax),%rax
	sub	\$64,%r11
	jnz	.Lmul_by_1

	movq	$rptr,%xmm1
	movq	$nptr,%xmm2
	.byte	0x67
	mov	$nptr,%rbp
	movq	%r10, %xmm3		# -num
___
# If the assembler supports ADX/MULX ($addx, probed in the preamble),
# additionally emit a runtime CPU-capability check that selects the
# sqrx8x reduction path; the plain sqr8x path follows as fallback.
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d 1974 and \$0x80100,%r11d 1975 cmp \$0x80100,%r11d 1976 jne .Lfrom_mont_nox 1977 1978 lea (%rax,$num),$rptr 1979 call sqrx8x_reduction 1980 1981 pxor %xmm0,%xmm0 1982 lea 48(%rsp),%rax 1983 mov 40(%rsp),%rsi # restore %rsp 1984 jmp .Lfrom_mont_zero 1985 1986 .align 32 1987 .Lfrom_mont_nox: 1988 ___ 1989 $code.=<<___; 1990 call sqr8x_reduction 1991 1992 pxor %xmm0,%xmm0 1993 lea 48(%rsp),%rax 1994 mov 40(%rsp),%rsi # restore %rsp 1995 jmp .Lfrom_mont_zero 1996 1997 .align 32 1998 .Lfrom_mont_zero: 1999 movdqa %xmm0,16*0(%rax) 2000 movdqa %xmm0,16*1(%rax) 2001 movdqa %xmm0,16*2(%rax) 2002 movdqa %xmm0,16*3(%rax) 2003 lea 16*4(%rax),%rax 2004 sub \$32,$num 2005 jnz .Lfrom_mont_zero 2006 2007 mov \$1,%rax 2008 mov -48(%rsi),%r15 2009 mov -40(%rsi),%r14 2010 mov -32(%rsi),%r13 2011 mov -24(%rsi),%r12 2012 mov -16(%rsi),%rbp 2013 mov -8(%rsi),%rbx 2014 lea (%rsi),%rsp 2015 .Lfrom_epilogue: 2016 ret 2017 .size bn_from_mont8x,.-bn_from_mont8x 2018 ___ 2019 } 2020 }}} 2021 2023 if ($addx) {{{ 2024 my $bp="%rdx"; # restore original value 2025 2026 $code.=<<___; 2027 .type bn_mulx4x_mont_gather5,\@function,6 2028 .align 32 2029 bn_mulx4x_mont_gather5: 2030 .Lmulx4x_enter: 2031 .byte 0x67 2032 mov %rsp,%rax 2033 push %rbx 2034 push %rbp 2035 push %r12 2036 push %r13 2037 push %r14 2038 push %r15 2039 ___ 2040 $code.=<<___ if ($win64); 2041 lea -0x28(%rsp),%rsp 2042 movaps %xmm6,(%rsp) 2043 movaps %xmm7,0x10(%rsp) 2044 ___ 2045 $code.=<<___; 2046 .byte 0x67 2047 mov ${num}d,%r10d 2048 shl \$3,${num}d # convert $num to bytes 2049 shl \$3+2,%r10d # 4*$num 2050 neg $num # -$num 2051 mov ($n0),$n0 # *n0 2052 2053 ############################################################## 2054 # ensure that stack frame doesn't alias with $aptr+4*$num 2055 # modulo 4096, which covers a[num], ret[num] and n[2*num] 2056 # (see bn_exp.c). this is done to allow memory disambiguation 2057 # logic do its magic. 
[excessive frame is allocated in order 2058 # to allow bn_from_mont8x to clear it.] 2059 # 2060 lea -64(%rsp,$num,2),%r11 2061 sub $ap,%r11 2062 and \$4095,%r11 2063 cmp %r11,%r10 2064 jb .Lmulx4xsp_alt 2065 sub %r11,%rsp # align with $aptr 2066 lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) 2067 jmp .Lmulx4xsp_done 2068 2069 .align 32 2070 .Lmulx4xsp_alt: 2071 lea 4096-64(,$num,2),%r10 # 4096-frame-$num 2072 lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) 2073 sub %r10,%r11 2074 mov \$0,%r10 2075 cmovc %r10,%r11 2076 sub %r11,%rsp 2077 .Lmulx4xsp_done: 2078 and \$-64,%rsp # ensure alignment 2079 ############################################################## 2080 # Stack layout 2081 # +0 -num 2082 # +8 off-loaded &b[i] 2083 # +16 end of b[num] 2084 # +24 inner counter 2085 # +32 saved n0 2086 # +40 saved %rsp 2087 # +48 2088 # +56 saved rp 2089 # +64 tmp[num+1] 2090 # 2091 mov $n0, 32(%rsp) # save *n0 2092 mov %rax,40(%rsp) # save original %rsp 2093 .Lmulx4x_body: 2094 call mulx4x_internal 2095 2096 mov 40(%rsp),%rsi # restore %rsp 2097 mov \$1,%rax 2098 ___ 2099 $code.=<<___ if ($win64); 2100 movaps -88(%rsi),%xmm6 2101 movaps -72(%rsi),%xmm7 2102 ___ 2103 $code.=<<___; 2104 mov -48(%rsi),%r15 2105 mov -40(%rsi),%r14 2106 mov -32(%rsi),%r13 2107 mov -24(%rsi),%r12 2108 mov -16(%rsi),%rbp 2109 mov -8(%rsi),%rbx 2110 lea (%rsi),%rsp 2111 .Lmulx4x_epilogue: 2112 ret 2113 .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2114 2115 .type mulx4x_internal,\@abi-omnipotent 2116 .align 32 2117 mulx4x_internal: 2118 .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num 2119 .byte 0x67 2120 neg $num # restore $num 2121 shl \$5,$num 2122 lea 256($bp,$num),%r13 2123 shr \$5+5,$num 2124 mov `($win64?56:8)`(%rax),%r10d # load 7th argument 2125 sub \$1,$num 2126 mov %r13,16+8(%rsp) # end of b[num] 2127 mov $num,24+8(%rsp) # inner counter 2128 mov $rp, 56+8(%rsp) # save $rp 2129 ___ 2130 my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2131 
("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2132 my $rptr=$bptr; 2133 my $STRIDE=2**5*8; # 5 is "window size" 2134 my $N=$STRIDE/4; # should match cache line size 2135 $code.=<<___; 2136 mov %r10,%r11 2137 shr \$`log($N/8)/log(2)`,%r10 2138 and \$`$N/8-1`,%r11 2139 not %r10 2140 lea .Lmagic_masks(%rip),%rax 2141 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 2142 lea 96($bp,%r11,8),$bptr # pointer within 1st cache line 2143 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 2144 movq 8(%rax,%r10,8),%xmm5 # cache line contains element 2145 add \$7,%r11 2146 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 2147 movq 24(%rax,%r10,8),%xmm7 2148 and \$7,%r11 2149 2150 movq `0*$STRIDE/4-96`($bptr),%xmm0 2151 lea $STRIDE($bptr),$tptr # borrow $tptr 2152 movq `1*$STRIDE/4-96`($bptr),%xmm1 2153 pand %xmm4,%xmm0 2154 movq `2*$STRIDE/4-96`($bptr),%xmm2 2155 pand %xmm5,%xmm1 2156 movq `3*$STRIDE/4-96`($bptr),%xmm3 2157 pand %xmm6,%xmm2 2158 por %xmm1,%xmm0 2159 movq `0*$STRIDE/4-96`($tptr),%xmm1 2160 pand %xmm7,%xmm3 2161 por %xmm2,%xmm0 2162 movq `1*$STRIDE/4-96`($tptr),%xmm2 2163 por %xmm3,%xmm0 2164 .byte 0x67,0x67 2165 pand %xmm4,%xmm1 2166 movq `2*$STRIDE/4-96`($tptr),%xmm3 2167 2168 movq %xmm0,%rdx # bp[0] 2169 movq `3*$STRIDE/4-96`($tptr),%xmm0 2170 lea 2*$STRIDE($bptr),$bptr # next &b[i] 2171 pand %xmm5,%xmm2 2172 .byte 0x67,0x67 2173 pand %xmm6,%xmm3 2174 ############################################################## 2175 # $tptr is chosen so that writing to top-most element of the 2176 # vector occurs just "above" references to powers table, 2177 # "above" modulo cache-line size, which effectively precludes 2178 # possibility of memory disambiguation logic failure when 2179 # accessing the table. 2180 # 2181 lea 64+8*4+8(%rsp,%r11,8),$tptr 2182 2183 mov %rdx,$bi 2184 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2185 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2186 add %rax,%r11 2187 mulx 2*8($aptr),%rax,%r13 # ... 
2188 adc %rax,%r12 2189 adc \$0,%r13 2190 mulx 3*8($aptr),%rax,%r14 2191 2192 mov $mi,%r15 2193 imulq 32+8(%rsp),$mi # "t[0]"*n0 2194 xor $zero,$zero # cf=0, of=0 2195 mov $mi,%rdx 2196 2197 por %xmm2,%xmm1 2198 pand %xmm7,%xmm0 2199 por %xmm3,%xmm1 2200 mov $bptr,8+8(%rsp) # off-load &b[i] 2201 por %xmm1,%xmm0 2202 2203 .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr 2204 adcx %rax,%r13 2205 adcx $zero,%r14 # cf=0 2206 2207 mulx 0*16($nptr),%rax,%r10 2208 adcx %rax,%r15 # discarded 2209 adox %r11,%r10 2210 mulx 1*16($nptr),%rax,%r11 2211 adcx %rax,%r10 2212 adox %r12,%r11 2213 mulx 2*16($nptr),%rax,%r12 2214 mov 24+8(%rsp),$bptr # counter value 2215 .byte 0x66 2216 mov %r10,-8*4($tptr) 2217 adcx %rax,%r11 2218 adox %r13,%r12 2219 mulx 3*16($nptr),%rax,%r15 2220 .byte 0x67,0x67 2221 mov $bi,%rdx 2222 mov %r11,-8*3($tptr) 2223 adcx %rax,%r12 2224 adox $zero,%r15 # of=0 2225 .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr 2226 mov %r12,-8*2($tptr) 2227 #jmp .Lmulx4x_1st 2228 2229 .align 32 2230 .Lmulx4x_1st: 2231 adcx $zero,%r15 # cf=0, modulo-scheduled 2232 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2233 adcx %r14,%r10 2234 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2235 adcx %rax,%r11 2236 mulx 2*8($aptr),%r12,%rax # ... 
2237 adcx %r14,%r12 2238 mulx 3*8($aptr),%r13,%r14 2239 .byte 0x67,0x67 2240 mov $mi,%rdx 2241 adcx %rax,%r13 2242 adcx $zero,%r14 # cf=0 2243 lea 4*8($aptr),$aptr 2244 lea 4*8($tptr),$tptr 2245 2246 adox %r15,%r10 2247 mulx 0*16($nptr),%rax,%r15 2248 adcx %rax,%r10 2249 adox %r15,%r11 2250 mulx 1*16($nptr),%rax,%r15 2251 adcx %rax,%r11 2252 adox %r15,%r12 2253 mulx 2*16($nptr),%rax,%r15 2254 mov %r10,-5*8($tptr) 2255 adcx %rax,%r12 2256 mov %r11,-4*8($tptr) 2257 adox %r15,%r13 2258 mulx 3*16($nptr),%rax,%r15 2259 mov $bi,%rdx 2260 mov %r12,-3*8($tptr) 2261 adcx %rax,%r13 2262 adox $zero,%r15 2263 lea 4*16($nptr),$nptr 2264 mov %r13,-2*8($tptr) 2265 2266 dec $bptr # of=0, pass cf 2267 jnz .Lmulx4x_1st 2268 2269 mov 8(%rsp),$num # load -num 2270 movq %xmm0,%rdx # bp[1] 2271 adc $zero,%r15 # modulo-scheduled 2272 lea ($aptr,$num),$aptr # rewind $aptr 2273 add %r15,%r14 2274 mov 8+8(%rsp),$bptr # re-load &b[i] 2275 adc $zero,$zero # top-most carry 2276 mov %r14,-1*8($tptr) 2277 jmp .Lmulx4x_outer 2278 2279 .align 32 2280 .Lmulx4x_outer: 2281 mov $zero,($tptr) # save top-most carry 2282 lea 4*8($tptr,$num),$tptr # rewind $tptr 2283 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2284 xor $zero,$zero # cf=0, of=0 2285 mov %rdx,$bi 2286 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2287 adox -4*8($tptr),$mi # +t[0] 2288 adcx %r14,%r11 2289 mulx 2*8($aptr),%r15,%r13 # ... 
2290 adox -3*8($tptr),%r11 2291 adcx %r15,%r12 2292 mulx 3*8($aptr),%rdx,%r14 2293 adox -2*8($tptr),%r12 2294 adcx %rdx,%r13 2295 lea ($nptr,$num,2),$nptr # rewind $nptr 2296 lea 4*8($aptr),$aptr 2297 adox -1*8($tptr),%r13 2298 adcx $zero,%r14 2299 adox $zero,%r14 2300 2301 .byte 0x67 2302 mov $mi,%r15 2303 imulq 32+8(%rsp),$mi # "t[0]"*n0 2304 2305 movq `0*$STRIDE/4-96`($bptr),%xmm0 2306 .byte 0x67,0x67 2307 mov $mi,%rdx 2308 movq `1*$STRIDE/4-96`($bptr),%xmm1 2309 .byte 0x67 2310 pand %xmm4,%xmm0 2311 movq `2*$STRIDE/4-96`($bptr),%xmm2 2312 .byte 0x67 2313 pand %xmm5,%xmm1 2314 movq `3*$STRIDE/4-96`($bptr),%xmm3 2315 add \$$STRIDE,$bptr # next &b[i] 2316 .byte 0x67 2317 pand %xmm6,%xmm2 2318 por %xmm1,%xmm0 2319 pand %xmm7,%xmm3 2320 xor $zero,$zero # cf=0, of=0 2321 mov $bptr,8+8(%rsp) # off-load &b[i] 2322 2323 mulx 0*16($nptr),%rax,%r10 2324 adcx %rax,%r15 # discarded 2325 adox %r11,%r10 2326 mulx 1*16($nptr),%rax,%r11 2327 adcx %rax,%r10 2328 adox %r12,%r11 2329 mulx 2*16($nptr),%rax,%r12 2330 adcx %rax,%r11 2331 adox %r13,%r12 2332 mulx 3*16($nptr),%rax,%r15 2333 mov $bi,%rdx 2334 por %xmm2,%xmm0 2335 mov 24+8(%rsp),$bptr # counter value 2336 mov %r10,-8*4($tptr) 2337 por %xmm3,%xmm0 2338 adcx %rax,%r12 2339 mov %r11,-8*3($tptr) 2340 adox $zero,%r15 # of=0 2341 mov %r12,-8*2($tptr) 2342 lea 4*16($nptr),$nptr 2343 jmp .Lmulx4x_inner 2344 2345 .align 32 2346 .Lmulx4x_inner: 2347 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2348 adcx $zero,%r15 # cf=0, modulo-scheduled 2349 adox %r14,%r10 2350 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2351 adcx 0*8($tptr),%r10 2352 adox %rax,%r11 2353 mulx 2*8($aptr),%r12,%rax # ... 
2354 adcx 1*8($tptr),%r11 2355 adox %r14,%r12 2356 mulx 3*8($aptr),%r13,%r14 2357 mov $mi,%rdx 2358 adcx 2*8($tptr),%r12 2359 adox %rax,%r13 2360 adcx 3*8($tptr),%r13 2361 adox $zero,%r14 # of=0 2362 lea 4*8($aptr),$aptr 2363 lea 4*8($tptr),$tptr 2364 adcx $zero,%r14 # cf=0 2365 2366 adox %r15,%r10 2367 mulx 0*16($nptr),%rax,%r15 2368 adcx %rax,%r10 2369 adox %r15,%r11 2370 mulx 1*16($nptr),%rax,%r15 2371 adcx %rax,%r11 2372 adox %r15,%r12 2373 mulx 2*16($nptr),%rax,%r15 2374 mov %r10,-5*8($tptr) 2375 adcx %rax,%r12 2376 adox %r15,%r13 2377 mov %r11,-4*8($tptr) 2378 mulx 3*16($nptr),%rax,%r15 2379 mov $bi,%rdx 2380 lea 4*16($nptr),$nptr 2381 mov %r12,-3*8($tptr) 2382 adcx %rax,%r13 2383 adox $zero,%r15 2384 mov %r13,-2*8($tptr) 2385 2386 dec $bptr # of=0, pass cf 2387 jnz .Lmulx4x_inner 2388 2389 mov 0+8(%rsp),$num # load -num 2390 movq %xmm0,%rdx # bp[i+1] 2391 adc $zero,%r15 # modulo-scheduled 2392 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2393 mov 8+8(%rsp),$bptr # re-load &b[i] 2394 mov 16+8(%rsp),%r10 2395 adc %r15,%r14 2396 lea ($aptr,$num),$aptr # rewind $aptr 2397 adc $zero,$zero # top-most carry 2398 mov %r14,-1*8($tptr) 2399 2400 cmp %r10,$bptr 2401 jb .Lmulx4x_outer 2402 2403 mov -16($nptr),%r10 2404 xor %r15,%r15 2405 sub %r14,%r10 # compare top-most words 2406 adc %r15,%r15 2407 or %r15,$zero 2408 xor \$1,$zero 2409 lea ($tptr,$num),%rdi # rewind $tptr 2410 lea ($nptr,$num,2),$nptr # rewind $nptr 2411 .byte 0x67,0x67 2412 sar \$3+2,$num # cf=0 2413 lea ($nptr,$zero,8),%rbp 2414 mov 56+8(%rsp),%rdx # restore rp 2415 mov $num,%rcx 2416 jmp .Lsqrx4x_sub # common post-condition 2417 .size mulx4x_internal,.-mulx4x_internal 2418 ___ 2419 }{ 2421 ###################################################################### 2422 # void bn_power5( 2423 my $rptr="%rdi"; # BN_ULONG *rptr, 2424 my $aptr="%rsi"; # const BN_ULONG *aptr, 2425 my $bptr="%rdx"; # const void *table, 2426 my $nptr="%rcx"; # const BN_ULONG *nptr, 2427 my $n0 ="%r8"; # const BN_ULONG 
*n0); 2428 my $num ="%r9"; # int num, has to be divisible by 8 2429 # int pwr); 2430 2431 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2432 my @A0=("%r10","%r11"); 2433 my @A1=("%r12","%r13"); 2434 my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2435 2436 $code.=<<___; 2437 .type bn_powerx5,\@function,6 2438 .align 32 2439 bn_powerx5: 2440 .Lpowerx5_enter: 2441 .byte 0x67 2442 mov %rsp,%rax 2443 push %rbx 2444 push %rbp 2445 push %r12 2446 push %r13 2447 push %r14 2448 push %r15 2449 ___ 2450 $code.=<<___ if ($win64); 2451 lea -0x28(%rsp),%rsp 2452 movaps %xmm6,(%rsp) 2453 movaps %xmm7,0x10(%rsp) 2454 ___ 2455 $code.=<<___; 2456 .byte 0x67 2457 mov ${num}d,%r10d 2458 shl \$3,${num}d # convert $num to bytes 2459 shl \$3+2,%r10d # 4*$num 2460 neg $num 2461 mov ($n0),$n0 # *n0 2462 2463 ############################################################## 2464 # ensure that stack frame doesn't alias with $aptr+4*$num 2465 # modulo 4096, which covers ret[num], am[num] and n[2*num] 2466 # (see bn_exp.c). this is done to allow memory disambiguation 2467 # logic do its magic. 
2468 # 2469 lea -64(%rsp,$num,2),%r11 2470 sub $aptr,%r11 2471 and \$4095,%r11 2472 cmp %r11,%r10 2473 jb .Lpwrx_sp_alt 2474 sub %r11,%rsp # align with $aptr 2475 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 2476 jmp .Lpwrx_sp_done 2477 2478 .align 32 2479 .Lpwrx_sp_alt: 2480 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num 2481 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 2482 sub %r10,%r11 2483 mov \$0,%r10 2484 cmovc %r10,%r11 2485 sub %r11,%rsp 2486 .Lpwrx_sp_done: 2487 and \$-64,%rsp 2488 mov $num,%r10 2489 neg $num 2490 2491 ############################################################## 2492 # Stack layout 2493 # 2494 # +0 saved $num, used in reduction section 2495 # +8 &t[2*$num], used in reduction section 2496 # +16 intermediate carry bit 2497 # +24 top-most carry bit, used in reduction section 2498 # +32 saved *n0 2499 # +40 saved %rsp 2500 # +48 t[2*$num] 2501 # 2502 pxor %xmm0,%xmm0 2503 movq $rptr,%xmm1 # save $rptr 2504 movq $nptr,%xmm2 # save $nptr 2505 movq %r10, %xmm3 # -$num 2506 movq $bptr,%xmm4 2507 mov $n0, 32(%rsp) 2508 mov %rax, 40(%rsp) # save original %rsp 2509 .Lpowerx5_body: 2510 2511 call __bn_sqrx8x_internal 2512 call __bn_sqrx8x_internal 2513 call __bn_sqrx8x_internal 2514 call __bn_sqrx8x_internal 2515 call __bn_sqrx8x_internal 2516 2517 mov %r10,$num # -num 2518 mov $aptr,$rptr 2519 movq %xmm2,$nptr 2520 movq %xmm4,$bptr 2521 mov 40(%rsp),%rax 2522 2523 call mulx4x_internal 2524 2525 mov 40(%rsp),%rsi # restore %rsp 2526 mov \$1,%rax 2527 ___ 2528 $code.=<<___ if ($win64); 2529 movaps -88(%rsi),%xmm6 2530 movaps -72(%rsi),%xmm7 2531 ___ 2532 $code.=<<___; 2533 mov -48(%rsi),%r15 2534 mov -40(%rsi),%r14 2535 mov -32(%rsi),%r13 2536 mov -24(%rsi),%r12 2537 mov -16(%rsi),%rbp 2538 mov -8(%rsi),%rbx 2539 lea (%rsi),%rsp 2540 .Lpowerx5_epilogue: 2541 ret 2542 .size bn_powerx5,.-bn_powerx5 2543 2544 .globl bn_sqrx8x_internal 2545 .hidden bn_sqrx8x_internal 2546 .type bn_sqrx8x_internal,\@abi-omnipotent 2547 .align 32 2548 
bn_sqrx8x_internal: 2549 __bn_sqrx8x_internal: 2550 ################################################################## 2551 # Squaring part: 2552 # 2553 # a) multiply-n-add everything but a[i]*a[i]; 2554 # b) shift result of a) by 1 to the left and accumulate 2555 # a[i]*a[i] products; 2556 # 2557 ################################################################## 2558 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2559 # a[1]a[0] 2560 # a[2]a[0] 2561 # a[3]a[0] 2562 # a[2]a[1] 2563 # a[3]a[1] 2564 # a[3]a[2] 2565 # 2566 # a[4]a[0] 2567 # a[5]a[0] 2568 # a[6]a[0] 2569 # a[7]a[0] 2570 # a[4]a[1] 2571 # a[5]a[1] 2572 # a[6]a[1] 2573 # a[7]a[1] 2574 # a[4]a[2] 2575 # a[5]a[2] 2576 # a[6]a[2] 2577 # a[7]a[2] 2578 # a[4]a[3] 2579 # a[5]a[3] 2580 # a[6]a[3] 2581 # a[7]a[3] 2582 # 2583 # a[5]a[4] 2584 # a[6]a[4] 2585 # a[7]a[4] 2586 # a[6]a[5] 2587 # a[7]a[5] 2588 # a[7]a[6] 2589 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2590 ___ 2591 { 2592 my ($zero,$carry)=("%rbp","%rcx"); 2593 my $aaptr=$zero; 2594 $code.=<<___; 2595 lea 48+8(%rsp),$tptr 2596 lea ($aptr,$num),$aaptr 2597 mov $num,0+8(%rsp) # save $num 2598 mov $aaptr,8+8(%rsp) # save end of $aptr 2599 jmp .Lsqr8x_zero_start 2600 2601 .align 32 2602 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2603 .Lsqrx8x_zero: 2604 .byte 0x3e 2605 movdqa %xmm0,0*8($tptr) 2606 movdqa %xmm0,2*8($tptr) 2607 movdqa %xmm0,4*8($tptr) 2608 movdqa %xmm0,6*8($tptr) 2609 .Lsqr8x_zero_start: # aligned at 32 2610 movdqa %xmm0,8*8($tptr) 2611 movdqa %xmm0,10*8($tptr) 2612 movdqa %xmm0,12*8($tptr) 2613 movdqa %xmm0,14*8($tptr) 2614 lea 16*8($tptr),$tptr 2615 sub \$64,$num 2616 jnz .Lsqrx8x_zero 2617 2618 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2619 #xor %r9,%r9 # t[1], ex-$num, zero already 2620 xor %r10,%r10 2621 xor %r11,%r11 2622 xor %r12,%r12 2623 xor %r13,%r13 2624 xor %r14,%r14 2625 xor %r15,%r15 2626 lea 48+8(%rsp),$tptr 2627 xor $zero,$zero # cf=0, cf=0 2628 jmp 
.Lsqrx8x_outer_loop 2629 2630 .align 32 2631 .Lsqrx8x_outer_loop: 2632 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2633 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2634 adox %rax,%r10 2635 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 2636 adcx %r10,%r9 2637 adox %rax,%r11 2638 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 2639 adcx %r11,%r10 2640 adox %rax,%r12 2641 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 2642 adcx %r12,%r11 2643 adox %rax,%r13 2644 mulx 5*8($aptr),%r12,%rax 2645 adcx %r13,%r12 2646 adox %rax,%r14 2647 mulx 6*8($aptr),%r13,%rax 2648 adcx %r14,%r13 2649 adox %r15,%rax 2650 mulx 7*8($aptr),%r14,%r15 2651 mov 1*8($aptr),%rdx # a[1] 2652 adcx %rax,%r14 2653 adox $zero,%r15 2654 adc 8*8($tptr),%r15 2655 mov %r8,1*8($tptr) # t[1] 2656 mov %r9,2*8($tptr) # t[2] 2657 sbb $carry,$carry # mov %cf,$carry 2658 xor $zero,$zero # cf=0, of=0 2659 2660 2661 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 2662 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 2663 adcx %r10,%r8 2664 adox %rbx,%r9 2665 mulx 4*8($aptr),%r10,%rbx # ... 2666 adcx %r11,%r9 2667 adox %rax,%r10 2668 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 2669 adcx %r12,%r10 2670 adox %rbx,%r11 2671 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 2672 adcx %r13,%r11 2673 adox %r14,%r12 2674 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 2675 mov 2*8($aptr),%rdx # a[2] 2676 adcx %rax,%r12 2677 adox %rbx,%r13 2678 adcx %r15,%r13 2679 adox $zero,%r14 # of=0 2680 adcx $zero,%r14 # cf=0 2681 2682 mov %r8,3*8($tptr) # t[3] 2683 mov %r9,4*8($tptr) # t[4] 2684 2685 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 2686 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 2687 adcx %r10,%r8 2688 adox %rbx,%r9 2689 mulx 5*8($aptr),%r10,%rbx # ... 
2690 adcx %r11,%r9 2691 adox %rax,%r10 2692 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 2693 adcx %r12,%r10 2694 adox %r13,%r11 2695 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 2696 .byte 0x3e 2697 mov 3*8($aptr),%rdx # a[3] 2698 adcx %rbx,%r11 2699 adox %rax,%r12 2700 adcx %r14,%r12 2701 mov %r8,5*8($tptr) # t[5] 2702 mov %r9,6*8($tptr) # t[6] 2703 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 2704 adox $zero,%r13 # of=0 2705 adcx $zero,%r13 # cf=0 2706 2707 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 2708 adcx %r10,%r8 2709 adox %rax,%r9 2710 mulx 6*8($aptr),%r10,%rax # ... 2711 adcx %r11,%r9 2712 adox %r12,%r10 2713 mulx 7*8($aptr),%r11,%r12 2714 mov 4*8($aptr),%rdx # a[4] 2715 mov 5*8($aptr),%r14 # a[5] 2716 adcx %rbx,%r10 2717 adox %rax,%r11 2718 mov 6*8($aptr),%r15 # a[6] 2719 adcx %r13,%r11 2720 adox $zero,%r12 # of=0 2721 adcx $zero,%r12 # cf=0 2722 2723 mov %r8,7*8($tptr) # t[7] 2724 mov %r9,8*8($tptr) # t[8] 2725 2726 mulx %r14,%r9,%rax # a[5]*a[4] 2727 mov 7*8($aptr),%r8 # a[7] 2728 adcx %r10,%r9 2729 mulx %r15,%r10,%rbx # a[6]*a[4] 2730 adox %rax,%r10 2731 adcx %r11,%r10 2732 mulx %r8,%r11,%rax # a[7]*a[4] 2733 mov %r14,%rdx # a[5] 2734 adox %rbx,%r11 2735 adcx %r12,%r11 2736 #adox $zero,%rax # of=0 2737 adcx $zero,%rax # cf=0 2738 2739 mulx %r15,%r14,%rbx # a[6]*a[5] 2740 mulx %r8,%r12,%r13 # a[7]*a[5] 2741 mov %r15,%rdx # a[6] 2742 lea 8*8($aptr),$aptr 2743 adcx %r14,%r11 2744 adox %rbx,%r12 2745 adcx %rax,%r12 2746 adox $zero,%r13 2747 2748 .byte 0x67,0x67 2749 mulx %r8,%r8,%r14 # a[7]*a[6] 2750 adcx %r8,%r13 2751 adcx $zero,%r14 2752 2753 cmp 8+8(%rsp),$aptr 2754 je .Lsqrx8x_outer_break 2755 2756 neg $carry # mov $carry,%cf 2757 mov \$-8,%rcx 2758 mov $zero,%r15 2759 mov 8*8($tptr),%r8 2760 adcx 9*8($tptr),%r9 # +=t[9] 2761 adcx 10*8($tptr),%r10 # ... 
2762 adcx 11*8($tptr),%r11 2763 adc 12*8($tptr),%r12 2764 adc 13*8($tptr),%r13 2765 adc 14*8($tptr),%r14 2766 adc 15*8($tptr),%r15 2767 lea ($aptr),$aaptr 2768 lea 2*64($tptr),$tptr 2769 sbb %rax,%rax # mov %cf,$carry 2770 2771 mov -64($aptr),%rdx # a[0] 2772 mov %rax,16+8(%rsp) # offload $carry 2773 mov $tptr,24+8(%rsp) 2774 2775 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 2776 xor %eax,%eax # cf=0, of=0 2777 jmp .Lsqrx8x_loop 2778 2779 .align 32 2780 .Lsqrx8x_loop: 2781 mov %r8,%rbx 2782 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 2783 adcx %rax,%rbx # +=t[8] 2784 adox %r9,%r8 2785 2786 mulx 1*8($aaptr),%rax,%r9 # ... 2787 adcx %rax,%r8 2788 adox %r10,%r9 2789 2790 mulx 2*8($aaptr),%rax,%r10 2791 adcx %rax,%r9 2792 adox %r11,%r10 2793 2794 mulx 3*8($aaptr),%rax,%r11 2795 adcx %rax,%r10 2796 adox %r12,%r11 2797 2798 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 2799 adcx %rax,%r11 2800 adox %r13,%r12 2801 2802 mulx 5*8($aaptr),%rax,%r13 2803 adcx %rax,%r12 2804 adox %r14,%r13 2805 2806 mulx 6*8($aaptr),%rax,%r14 2807 mov %rbx,($tptr,%rcx,8) # store t[8+i] 2808 mov \$0,%ebx 2809 adcx %rax,%r13 2810 adox %r15,%r14 2811 2812 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 2813 mov 8($aptr,%rcx,8),%rdx # a[i] 2814 adcx %rax,%r14 2815 adox %rbx,%r15 # %rbx is 0, of=0 2816 adcx %rbx,%r15 # cf=0 2817 2818 .byte 0x67 2819 inc %rcx # of=0 2820 jnz .Lsqrx8x_loop 2821 2822 lea 8*8($aaptr),$aaptr 2823 mov \$-8,%rcx 2824 cmp 8+8(%rsp),$aaptr # done? 
2825 je .Lsqrx8x_break 2826 2827 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 2828 .byte 0x66 2829 mov -64($aptr),%rdx 2830 adcx 0*8($tptr),%r8 2831 adcx 1*8($tptr),%r9 2832 adc 2*8($tptr),%r10 2833 adc 3*8($tptr),%r11 2834 adc 4*8($tptr),%r12 2835 adc 5*8($tptr),%r13 2836 adc 6*8($tptr),%r14 2837 adc 7*8($tptr),%r15 2838 lea 8*8($tptr),$tptr 2839 .byte 0x67 2840 sbb %rax,%rax # mov %cf,%rax 2841 xor %ebx,%ebx # cf=0, of=0 2842 mov %rax,16+8(%rsp) # offload carry 2843 jmp .Lsqrx8x_loop 2844 2845 .align 32 2846 .Lsqrx8x_break: 2847 sub 16+8(%rsp),%r8 # consume last carry 2848 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 2849 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 2850 xor %ebp,%ebp # xor $zero,$zero 2851 mov %r8,0*8($tptr) 2852 cmp $carry,$tptr # cf=0, of=0 2853 je .Lsqrx8x_outer_loop 2854 2855 mov %r9,1*8($tptr) 2856 mov 1*8($carry),%r9 2857 mov %r10,2*8($tptr) 2858 mov 2*8($carry),%r10 2859 mov %r11,3*8($tptr) 2860 mov 3*8($carry),%r11 2861 mov %r12,4*8($tptr) 2862 mov 4*8($carry),%r12 2863 mov %r13,5*8($tptr) 2864 mov 5*8($carry),%r13 2865 mov %r14,6*8($tptr) 2866 mov 6*8($carry),%r14 2867 mov %r15,7*8($tptr) 2868 mov 7*8($carry),%r15 2869 mov $carry,$tptr 2870 jmp .Lsqrx8x_outer_loop 2871 2872 .align 32 2873 .Lsqrx8x_outer_break: 2874 mov %r9,9*8($tptr) # t[9] 2875 movq %xmm3,%rcx # -$num 2876 mov %r10,10*8($tptr) # ... 
2877 mov %r11,11*8($tptr) 2878 mov %r12,12*8($tptr) 2879 mov %r13,13*8($tptr) 2880 mov %r14,14*8($tptr) 2881 ___ 2882 }{ 2884 my $i="%rcx"; 2885 $code.=<<___; 2886 lea 48+8(%rsp),$tptr 2887 mov ($aptr,$i),%rdx # a[0] 2888 2889 mov 8($tptr),$A0[1] # t[1] 2890 xor $A0[0],$A0[0] # t[0], of=0, cf=0 2891 mov 0+8(%rsp),$num # restore $num 2892 adox $A0[1],$A0[1] 2893 mov 16($tptr),$A1[0] # t[2] # prefetch 2894 mov 24($tptr),$A1[1] # t[3] # prefetch 2895 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 2896 2897 .align 32 2898 .Lsqrx4x_shift_n_add: 2899 mulx %rdx,%rax,%rbx 2900 adox $A1[0],$A1[0] 2901 adcx $A0[0],%rax 2902 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 2903 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 2904 adox $A1[1],$A1[1] 2905 adcx $A0[1],%rbx 2906 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 2907 mov %rax,0($tptr) 2908 mov %rbx,8($tptr) 2909 2910 mulx %rdx,%rax,%rbx 2911 adox $A0[0],$A0[0] 2912 adcx $A1[0],%rax 2913 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 2914 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 2915 adox $A0[1],$A0[1] 2916 adcx $A1[1],%rbx 2917 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 2918 mov %rax,16($tptr) 2919 mov %rbx,24($tptr) 2920 2921 mulx %rdx,%rax,%rbx 2922 adox $A1[0],$A1[0] 2923 adcx $A0[0],%rax 2924 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 2925 lea 32($i),$i 2926 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 2927 adox $A1[1],$A1[1] 2928 adcx $A0[1],%rbx 2929 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 2930 mov %rax,32($tptr) 2931 mov %rbx,40($tptr) 2932 2933 mulx %rdx,%rax,%rbx 2934 adox $A0[0],$A0[0] 2935 adcx $A1[0],%rax 2936 jrcxz .Lsqrx4x_shift_n_add_break 2937 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 2938 adox $A0[1],$A0[1] 2939 adcx $A1[1],%rbx 2940 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 2941 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 2942 mov %rax,48($tptr) 2943 mov %rbx,56($tptr) 
2944 lea 64($tptr),$tptr 2945 nop 2946 jmp .Lsqrx4x_shift_n_add 2947 2948 .align 32 2949 .Lsqrx4x_shift_n_add_break: 2950 adcx $A1[1],%rbx 2951 mov %rax,48($tptr) 2952 mov %rbx,56($tptr) 2953 lea 64($tptr),$tptr # end of t[] buffer 2954 ___ 2955 } 2957 ###################################################################### 2958 # Montgomery reduction part, "word-by-word" algorithm. 2959 # 2960 # This new path is inspired by multiple submissions from Intel, by 2961 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 2962 # Vinodh Gopal... 2963 { 2964 my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 2965 2966 $code.=<<___; 2967 movq %xmm2,$nptr 2968 sqrx8x_reduction: 2969 xor %eax,%eax # initial top-most carry bit 2970 mov 32+8(%rsp),%rbx # n0 2971 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 2972 lea -128($nptr,$num,2),%rcx # end of n[] 2973 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 2974 mov %rcx, 0+8(%rsp) # save end of n[] 2975 mov $tptr,8+8(%rsp) # save end of t[] 2976 2977 lea 48+8(%rsp),$tptr # initial t[] window 2978 jmp .Lsqrx8x_reduction_loop 2979 2980 .align 32 2981 .Lsqrx8x_reduction_loop: 2982 mov 8*1($tptr),%r9 2983 mov 8*2($tptr),%r10 2984 mov 8*3($tptr),%r11 2985 mov 8*4($tptr),%r12 2986 mov %rdx,%r8 2987 imulq %rbx,%rdx # n0*a[i] 2988 mov 8*5($tptr),%r13 2989 mov 8*6($tptr),%r14 2990 mov 8*7($tptr),%r15 2991 mov %rax,24+8(%rsp) # store top-most carry bit 2992 2993 lea 8*8($tptr),$tptr 2994 xor $carry,$carry # cf=0,of=0 2995 mov \$-8,%rcx 2996 jmp .Lsqrx8x_reduce 2997 2998 .align 32 2999 .Lsqrx8x_reduce: 3000 mov %r8, %rbx 3001 mulx 16*0($nptr),%rax,%r8 # n[0] 3002 adcx %rbx,%rax # discarded 3003 adox %r9,%r8 3004 3005 mulx 16*1($nptr),%rbx,%r9 # n[1] 3006 adcx %rbx,%r8 3007 adox %r10,%r9 3008 3009 mulx 16*2($nptr),%rbx,%r10 3010 adcx %rbx,%r9 3011 adox %r11,%r10 3012 3013 mulx 16*3($nptr),%rbx,%r11 3014 adcx %rbx,%r10 3015 adox %r12,%r11 3016 3017 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 3018 mov %rdx,%rax 
3019 mov %r8,%rdx 3020 adcx %rbx,%r11 3021 adox %r13,%r12 3022 3023 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3024 mov %rax,%rdx 3025 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3026 3027 mulx 16*5($nptr),%rax,%r13 3028 adcx %rax,%r12 3029 adox %r14,%r13 3030 3031 mulx 16*6($nptr),%rax,%r14 3032 adcx %rax,%r13 3033 adox %r15,%r14 3034 3035 mulx 16*7($nptr),%rax,%r15 3036 mov %rbx,%rdx 3037 adcx %rax,%r14 3038 adox $carry,%r15 # $carry is 0 3039 adcx $carry,%r15 # cf=0 3040 3041 .byte 0x67,0x67,0x67 3042 inc %rcx # of=0 3043 jnz .Lsqrx8x_reduce 3044 3045 mov $carry,%rax # xor %rax,%rax 3046 cmp 0+8(%rsp),$nptr # end of n[]? 3047 jae .Lsqrx8x_no_tail 3048 3049 mov 48+8(%rsp),%rdx # pull n0*a[0] 3050 add 8*0($tptr),%r8 3051 lea 16*8($nptr),$nptr 3052 mov \$-8,%rcx 3053 adcx 8*1($tptr),%r9 3054 adcx 8*2($tptr),%r10 3055 adc 8*3($tptr),%r11 3056 adc 8*4($tptr),%r12 3057 adc 8*5($tptr),%r13 3058 adc 8*6($tptr),%r14 3059 adc 8*7($tptr),%r15 3060 lea 8*8($tptr),$tptr 3061 sbb %rax,%rax # top carry 3062 3063 xor $carry,$carry # of=0, cf=0 3064 mov %rax,16+8(%rsp) 3065 jmp .Lsqrx8x_tail 3066 3067 .align 32 3068 .Lsqrx8x_tail: 3069 mov %r8,%rbx 3070 mulx 16*0($nptr),%rax,%r8 3071 adcx %rax,%rbx 3072 adox %r9,%r8 3073 3074 mulx 16*1($nptr),%rax,%r9 3075 adcx %rax,%r8 3076 adox %r10,%r9 3077 3078 mulx 16*2($nptr),%rax,%r10 3079 adcx %rax,%r9 3080 adox %r11,%r10 3081 3082 mulx 16*3($nptr),%rax,%r11 3083 adcx %rax,%r10 3084 adox %r12,%r11 3085 3086 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 3087 adcx %rax,%r11 3088 adox %r13,%r12 3089 3090 mulx 16*5($nptr),%rax,%r13 3091 adcx %rax,%r12 3092 adox %r14,%r13 3093 3094 mulx 16*6($nptr),%rax,%r14 3095 adcx %rax,%r13 3096 adox %r15,%r14 3097 3098 mulx 16*7($nptr),%rax,%r15 3099 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3100 adcx %rax,%r14 3101 adox $carry,%r15 3102 mov %rbx,($tptr,%rcx,8) # save result 3103 mov %r8,%rbx 3104 adcx $carry,%r15 # cf=0 3105 3106 inc %rcx # of=0 3107 jnz 
.Lsqrx8x_tail 3108 3109 cmp 0+8(%rsp),$nptr # end of n[]? 3110 jae .Lsqrx8x_tail_done # break out of loop 3111 3112 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3113 mov 48+8(%rsp),%rdx # pull n0*a[0] 3114 lea 16*8($nptr),$nptr 3115 adc 8*0($tptr),%r8 3116 adc 8*1($tptr),%r9 3117 adc 8*2($tptr),%r10 3118 adc 8*3($tptr),%r11 3119 adc 8*4($tptr),%r12 3120 adc 8*5($tptr),%r13 3121 adc 8*6($tptr),%r14 3122 adc 8*7($tptr),%r15 3123 lea 8*8($tptr),$tptr 3124 sbb %rax,%rax 3125 sub \$8,%rcx # mov \$-8,%rcx 3126 3127 xor $carry,$carry # of=0, cf=0 3128 mov %rax,16+8(%rsp) 3129 jmp .Lsqrx8x_tail 3130 3131 .align 32 3132 .Lsqrx8x_tail_done: 3133 add 24+8(%rsp),%r8 # can this overflow? 3134 mov $carry,%rax # xor %rax,%rax 3135 3136 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3137 .Lsqrx8x_no_tail: # %cf is 0 if jumped here 3138 adc 8*0($tptr),%r8 3139 movq %xmm3,%rcx 3140 adc 8*1($tptr),%r9 3141 mov 16*7($nptr),$carry 3142 movq %xmm2,$nptr # restore $nptr 3143 adc 8*2($tptr),%r10 3144 adc 8*3($tptr),%r11 3145 adc 8*4($tptr),%r12 3146 adc 8*5($tptr),%r13 3147 adc 8*6($tptr),%r14 3148 adc 8*7($tptr),%r15 3149 adc %rax,%rax # top-most carry 3150 3151 mov 32+8(%rsp),%rbx # n0 3152 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3153 3154 mov %r8,8*0($tptr) # store top 512 bits 3155 lea 8*8($tptr),%r8 # borrow %r8 3156 mov %r9,8*1($tptr) 3157 mov %r10,8*2($tptr) 3158 mov %r11,8*3($tptr) 3159 mov %r12,8*4($tptr) 3160 mov %r13,8*5($tptr) 3161 mov %r14,8*6($tptr) 3162 mov %r15,8*7($tptr) 3163 3164 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3165 cmp 8+8(%rsp),%r8 # end of t[]? 
	jb	.Lsqrx8x_reduction_loop
___
}
##############################################################
# Post-condition, 4x unrolled.
#
# Conditionally subtracts the modulus n[] from the reduced result in
# t[], selecting the operand pointer branchlessly (via the or/xor/lea
# on %rax) so the subtraction itself is performed unconditionally —
# keeps the code constant-time.  Result is stored to rp[].
{
my ($rptr,$nptr)=("%rdx","%rbp");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
	xor	%rbx,%rbx
	sub	%r15,%rsi		# compare top-most words
	adc	%rbx,%rbx
	mov	%rcx,%r10		# -$num
	.byte	0x67
	or	%rbx,%rax
	.byte	0x67
	mov	%rcx,%r9		# -$num
	xor	\$1,%rax
	sar	\$3+2,%rcx		# cf=0, loop counter = -$num/32
	#lea	48+8(%rsp,%r9),$tptr
	lea	($nptr,%rax,8),$nptr	# branchless select of n[] pointer
	movq	%xmm1,$rptr		# restore $rptr
	movq	%xmm1,$aptr		# prepare for back-to-back call
	jmp	.Lsqrx4x_sub

.align	32
.Lsqrx4x_sub:
	.byte	0x66
	mov	8*0($tptr),%r12
	mov	8*1($tptr),%r13
	sbb	16*0($nptr),%r12	# n[] is interleaved with 0, hence 16* stride
	mov	8*2($tptr),%r14
	sbb	16*1($nptr),%r13
	mov	8*3($tptr),%r15
	lea	8*4($tptr),$tptr
	sbb	16*2($nptr),%r14
	mov	%r12,8*0($rptr)
	sbb	16*3($nptr),%r15
	lea	16*4($nptr),$nptr
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx			# counts up from -$num/32 to 0
	jnz	.Lsqrx4x_sub
___
}
$code.=<<___;
	neg	%r9			# restore $num

	ret
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}}}

##############################################################
# Scatter/gather subroutines for the powers table used by
# BN_mod_exp_mont_consttime.  The table holds 2^5 vectors of $num
# 64-bit words each, interleaved so that element i of power idx lives
# at tbl[i*2^5 + idx] — one table "row" therefore spans several cache
# lines, and bn_gather5 touches every cache line of a row.
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") :	# Win64 order
			   ("%rdi","%esi","%rdx","%ecx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;	# 32 powers * 8 bytes
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	# Store $num words from $inp into column $idx of the interleaved
	# powers table at $tbl (stride 32*8 bytes between rows).
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
	# Inverse of bn_scatter5: read column $idx back into $inp (reused
	# as $out).  Masks make the element selection data-independent at
	# cache-line granularity — each iteration loads from all four
	# quarters of the row and ANDs three of them away.
	# NOTE(review): selection is cache-line-, not element-granular;
	# intra-line access patterns may still be observable on CPUs with
	# banked L1 (cf. CacheBleed) — confirm against current upstream.
___
$code.=<<___ if ($win64);
.LSEH_begin_bn_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
___
$code.=<<___;
	mov	$idx,%r11d
	shr	\$`log($N/8)/log(2)`,$idx
	and	\$`$N/8-1`,%r11
	not	$idx
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,$idx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	`0*$STRIDE/4-128`($tbl),%xmm0
	movq	`1*$STRIDE/4-128`($tbl),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-128`($tbl),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-128`($tbl),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	.byte	0x67,0x67
	por	%xmm2,%xmm0
	lea	$STRIDE($tbl),$tbl
	por	%xmm3,%xmm0

	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	lea	0x28(%rsp),%rsp
___
$code.=<<___;
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.align	64
# Mask table for bn_gather5: four 64-bit lanes per entry; exactly one
# lane is all-ones, so the pand/por cascade above keeps one quadword.
.Lmagic_masks:
	.long	0,0, 0,0, 0,0, -1,-1
	.long	0,0, 0,0, 0,0, 0,0
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
# Win64 structured-exception unwind handler shared by all the mont5
# entry points.  Uses HandlerData[] (prologue/epilogue label offsets)
# to decide whether the frame is live; if it is, recovers the saved
# stack pointer and restores the non-volatile GPRs and xmm6/xmm7 into
# the CONTEXT record before delegating to RtlVirtualUnwind.
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	# Two frame layouts exist: routines past .Lmul_epilogue save the
	# original rsp at 8(%rax,$num,8), the first routine at 40(%rax).
	lea	.Lmul_epilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lbody_40

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	jmp	.Lbody_proceed

.Lbody_40:
	mov	40(%rax),%rax		# pull saved stack pointer
.Lbody_proceed:

	movaps	-88(%rax),%xmm0
	movaps	-72(%rax),%xmm1

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	movups	%xmm0,512($context)	# restore context->Xmm6
	movups	%xmm1,528($context)	# restore context->Xmm7

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords (154*8=1232)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

# Function table: one begin/end/info triple per exported entry point.
.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_power5
	.rva	.LSEH_end_bn_power5
	.rva	.LSEH_info_bn_power5

	.rva	.LSEH_begin_bn_from_mont8x
	.rva	.LSEH_end_bn_from_mont8x
	.rva	.LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
	.rva	.LSEH_end_bn_mulx4x_mont_gather5
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

# Unwind info: ".byte 9,0,0,0" = UNW_FLAG_EHANDLER language-specific
# handler; HandlerData[] carries the body/epilogue labels mul_handler
# consumes.  bn_gather5 instead uses plain unwind codes matching its
# hand-encoded prologue.
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_body,.Lfrom_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0d,0x05,0x00
	.byte	0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
	.byte	0x04,0x42,0x00,0x00	#sub	rsp,0x28
.align	8
___
}

# Resolve the `...` arithmetic placeholders embedded in the assembly.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
# STDOUT is a pipe to the xlate post-processor; a failed close here
# means truncated assembly, which must not be ignored.
close STDOUT or die "error closing STDOUT: $!";