#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to powers table computed in BN_mod_exp_mont_consttime.
# In addition subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# Command line: [flavour] [output-file]; a single argument containing
# a dot is taken to be the output file name with no flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI/SEH code paths are selected for masm/nasm/mingw64 flavours
# or an .asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# All generated code is piped through the translator.  Fail loudly if
# the pipe cannot be spawned, instead of silently producing no output.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

# Argument registers (Unix x86-64 calling convention, remapped by the
# translator for Win64):
# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
# Scratch registers for the 1x (word-by-word) Montgomery loop.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

# Entry point: sizes that are not a multiple of 4, or smaller than 8,
# take the 1x path below; everything else jumps to the 4x routine
# emitted further down.
$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# xmm6/xmm7 are callee-saved under the Win64 ABI and must be preserved.
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul_alloca:
___
$code.=<<___;
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
$bp="%r12";		# $bp now lives in a callee-saved register
$STRIDE=2**5*8;		# 5 is "window size"
$N=$STRIDE/4;		# should match cache line size
# Constant-time gather: the element index is split into a position
# within a cache line (%r11) and a cache-line selector (%r10); the
# selector picks one of four 64-bit masks so that every cache line of
# the powers table is touched on every gather, regardless of idx.
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
# Win64: restore the xmm6/xmm7 saved in the prologue before unwinding.
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
# Pop callee-saved GPRs and return 1 (success).
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
# 4x-unrolled variant: two register pairs hold the running ap[j]*bp[i]
# (@A) and np[j]*m1 (@N) products.
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64: preserve callee-saved xmm6/xmm7.
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul4x_alloca:
___
$code.=<<___;
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
$bp="%r12";		# $bp now lives in a callee-saved register
$STRIDE=2**5*8;		# 5 is "window size"
$N=$STRIDE/4;		# should match cache line size
# Same constant-time mask/gather setup as in the 1x routine above.
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]
	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	xor	$j,$j			# j=0
	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
# Final conditional subtraction: four rotating registers stream tp[]
# while np[] is subtracted with borrow propagation.
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
# Shared epilogue of the 4x routine: unwind stack and saved registers.
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}

# bn_scatter5/bn_gather5: helpers used by BN_mod_exp_mont_consttime to
# write/read the powers table with the 32-element interleaved layout.
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
			 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
___
# Win64 prologue is emitted as raw bytes so the SEH unwind descriptors
# below match the exact instruction encodings.
$code.=<<___ if ($win64);
.LSEH_begin_bn_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
___
# Constant-time gather loop, same masking scheme as in the multiply
# routines above.
$code.=<<___;
	mov	$idx,%r11
	shr	\$`log($N/8)/log(2)`,$idx
	and	\$`$N/8-1`,%r11
	not	$idx
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,$idx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	`0*$STRIDE/4-96`($tbl),%xmm0
	movq	`1*$STRIDE/4-96`($tbl),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($tbl),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($tbl),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($tbl),$tbl
	por	%xmm3,%xmm0

	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
899 sub \$1,$num 900 jnz .Lgather 901 ___ 902 $code.=<<___ if ($win64); 903 movaps %xmm6,(%rsp) 904 movaps %xmm7,0x10(%rsp) 905 lea 0x28(%rsp),%rsp 906 ___ 907 $code.=<<___; 908 ret 909 .LSEH_end_bn_gather5: 910 .size bn_gather5,.-bn_gather5 911 ___ 912 } 913 $code.=<<___; 914 .align 64 915 .Lmagic_masks: 916 .long 0,0, 0,0, 0,0, -1,-1 917 .long 0,0, 0,0, 0,0, 0,0 918 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 919 ___ 920 921 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 922 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 923 if ($win64) { 924 $rec="%rcx"; 925 $frame="%rdx"; 926 $context="%r8"; 927 $disp="%r9"; 928 929 $code.=<<___; 930 .extern __imp_RtlVirtualUnwind 931 .type mul_handler,\@abi-omnipotent 932 .align 16 933 mul_handler: 934 push %rsi 935 push %rdi 936 push %rbx 937 push %rbp 938 push %r12 939 push %r13 940 push %r14 941 push %r15 942 pushfq 943 sub \$64,%rsp 944 945 mov 120($context),%rax # pull context->Rax 946 mov 248($context),%rbx # pull context->Rip 947 948 mov 8($disp),%rsi # disp->ImageBase 949 mov 56($disp),%r11 # disp->HandlerData 950 951 mov 0(%r11),%r10d # HandlerData[0] 952 lea (%rsi,%r10),%r10 # end of prologue label 953 cmp %r10,%rbx # context->Rip<end of prologue label 954 jb .Lcommon_seh_tail 955 956 lea `40+48`(%rax),%rax 957 958 mov 4(%r11),%r10d # HandlerData[1] 959 lea (%rsi,%r10),%r10 # end of alloca label 960 cmp %r10,%rbx # context->Rip<end of alloca label 961 jb .Lcommon_seh_tail 962 963 mov 152($context),%rax # pull context->Rsp 964 965 mov 8(%r11),%r10d # HandlerData[2] 966 lea (%rsi,%r10),%r10 # epilogue label 967 cmp %r10,%rbx # context->Rip>=epilogue label 968 jae .Lcommon_seh_tail 969 970 mov 192($context),%r10 # pull $num 971 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 972 973 movaps (%rax),%xmm0 974 movaps 16(%rax),%xmm1 975 lea `40+48`(%rax),%rax 976 977 mov -8(%rax),%rbx 978 mov -16(%rax),%rbp 979 mov -24(%rax),%r12 980 mov 
-32(%rax),%r13 981 mov -40(%rax),%r14 982 mov -48(%rax),%r15 983 mov %rbx,144($context) # restore context->Rbx 984 mov %rbp,160($context) # restore context->Rbp 985 mov %r12,216($context) # restore context->R12 986 mov %r13,224($context) # restore context->R13 987 mov %r14,232($context) # restore context->R14 988 mov %r15,240($context) # restore context->R15 989 movups %xmm0,512($context) # restore context->Xmm6 990 movups %xmm1,528($context) # restore context->Xmm7 991 992 .Lcommon_seh_tail: 993 mov 8(%rax),%rdi 994 mov 16(%rax),%rsi 995 mov %rax,152($context) # restore context->Rsp 996 mov %rsi,168($context) # restore context->Rsi 997 mov %rdi,176($context) # restore context->Rdi 998 999 mov 40($disp),%rdi # disp->ContextRecord 1000 mov $context,%rsi # context 1001 mov \$154,%ecx # sizeof(CONTEXT) 1002 .long 0xa548f3fc # cld; rep movsq 1003 1004 mov $disp,%rsi 1005 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1006 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1007 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1008 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1009 mov 40(%rsi),%r10 # disp->ContextRecord 1010 lea 56(%rsi),%r11 # &disp->HandlerData 1011 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1012 mov %r10,32(%rsp) # arg5 1013 mov %r11,40(%rsp) # arg6 1014 mov %r12,48(%rsp) # arg7 1015 mov %rcx,56(%rsp) # arg8, (NULL) 1016 call *__imp_RtlVirtualUnwind(%rip) 1017 1018 mov \$1,%eax # ExceptionContinueSearch 1019 add \$64,%rsp 1020 popfq 1021 pop %r15 1022 pop %r14 1023 pop %r13 1024 pop %r12 1025 pop %rbp 1026 pop %rbx 1027 pop %rdi 1028 pop %rsi 1029 ret 1030 .size mul_handler,.-mul_handler 1031 1032 .section .pdata 1033 .align 4 1034 .rva .LSEH_begin_bn_mul_mont_gather5 1035 .rva .LSEH_end_bn_mul_mont_gather5 1036 .rva .LSEH_info_bn_mul_mont_gather5 1037 1038 .rva .LSEH_begin_bn_mul4x_mont_gather5 1039 .rva .LSEH_end_bn_mul4x_mont_gather5 1040 .rva .LSEH_info_bn_mul4x_mont_gather5 1041 1042 .rva .LSEH_begin_bn_gather5 1043 .rva .LSEH_end_bn_gather5 1044 .rva 
.LSEH_info_bn_gather5 1045 1046 .section .xdata 1047 .align 8 1048 .LSEH_info_bn_mul_mont_gather5: 1049 .byte 9,0,0,0 1050 .rva mul_handler 1051 .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] 1052 .align 8 1053 .LSEH_info_bn_mul4x_mont_gather5: 1054 .byte 9,0,0,0 1055 .rva mul_handler 1056 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 1057 .align 8 1058 .LSEH_info_bn_gather5: 1059 .byte 0x01,0x0d,0x05,0x00 1060 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 1061 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 1062 .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 1063 .align 8 1064 ___ 1065 } 1066 1067 $code =~ s/\`([^\`]*)\`/eval($1)/gem; 1068 1069 print $code; 1070 close STDOUT; 1071