#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without        #
#  modification, are permitted provided that the following conditions are    #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright         #
#     notice, this list of conditions and the following disclaimer.          #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright      #
#     notice, this list of conditions and the following disclaimer in the    #
#     documentation and/or other materials provided with the                 #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its         #
#     contributors may be used to endorse or promote products derived from   #
#     this software without specific prior written permission.               #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY            #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
# (2) University of Haifa, Israel                                            #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
# [2] S. Gueron: "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE         #
#     Proceedings of 9th International Conference on Information Technology: #
#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
# [4] S. Gueron, V.
#     Krasnov: "[PATCH] Efficient and side channel analysis                  #
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
#     on AVX2 capable x86_64 platforms",                                     #
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

# Command-line protocol shared by all perlasm scripts: either
#   rsaz-avx2.pl <flavour> <output>  or  rsaz-avx2.pl <output-file>
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows calling convention / assembler dialect requested?
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# canonical perlasm directory; it converts the "perlasm" dialect emitted
# below into the requested assembler syntax.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
$avx = 0;
$addx = 0;

# Pipe our perlasm output through the xlate translator; check the open so a
# missing/broken interpreter fails loudly here instead of at first print.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $np="%rdx";	# const BN_ULONG *np,
my $n0="%ecx";	# const BN_ULONG n0,
my $rep="%r8d";	# int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";
# Registers that hold the broadcasted words of bp, currently used
my $B1="%ymm10";
my $B2="%ymm11";
# Registers that hold the broadcasted words of Y, currently used
my $Y1="%ymm12";
my $Y2="%ymm13";
# Helper registers
my $TEMP1="%ymm14";
my $AND_MASK="%ymm15";
# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";			# loop counter
my $tmp = "%r15";

my $FrameSize=32*18+32*8;	# place for A^2 and 2*A

my $aap=$r0;
my $tp0="%rbx";
my $tp1=$r3;
my $tpa=$tmp;

$np="%r13";			# reassigned argument

$code.=<<___;
.text

.globl	rsaz_1024_sqr_avx2
.type	rsaz_1024_sqr_avx2,\@function,5
.align	64
rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
	lea	(%rsp), %rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lsqr_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
	mov	%rdx, $np			# reassigned argument
	sub	\$$FrameSize, %rsp
	mov	$np, $tmp
	sub	\$-128, $rp			# size optimization
	sub	\$-128, $ap
	sub	\$-128, $np

	and	\$4095, $tmp			# see if $np crosses page
	add	\$32*10, $tmp
	shr	\$12, $tmp
	vpxor	$ACC9,$ACC9,$ACC9
	jz	.Lsqr_1024_no_n_copy

	# unaligned 256-bit load that crosses page boundary can
	# cause >2x performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
	sub	\$32*10,%rsp
	vmovdqu	32*0-128($np), $ACC0
	and	\$-2048, %rsp
	vmovdqu	32*1-128($np), $ACC1
	vmovdqu	32*2-128($np), $ACC2
	vmovdqu	32*3-128($np), $ACC3
	vmovdqu	32*4-128($np), $ACC4
	vmovdqu	32*5-128($np), $ACC5
	vmovdqu	32*6-128($np), $ACC6
	vmovdqu	32*7-128($np), $ACC7
	vmovdqu	32*8-128($np), $ACC8
	lea	$FrameSize+128(%rsp),$np
	vmovdqu	$ACC0, 32*0-128($np)
	vmovdqu	$ACC1, 32*1-128($np)
	vmovdqu	$ACC2, 32*2-128($np)
	vmovdqu	$ACC3, 32*3-128($np)
	vmovdqu	$ACC4, 32*4-128($np)
	vmovdqu	$ACC5, 32*5-128($np)
	vmovdqu	$ACC6, 32*6-128($np)
	vmovdqu	$ACC7, 32*7-128($np)
	vmovdqu	$ACC8, 32*8-128($np)
	vmovdqu	$ACC9, 32*9-128($np)		# $ACC9 is zero

.Lsqr_1024_no_n_copy:
	and	\$-1024, %rsp

	vmovdqu	32*1-128($ap), $ACC1
	vmovdqu	32*2-128($ap), $ACC2
	vmovdqu	32*3-128($ap), $ACC3
	vmovdqu	32*4-128($ap), $ACC4
	vmovdqu	32*5-128($ap), $ACC5
	vmovdqu	32*6-128($ap), $ACC6
	vmovdqu	32*7-128($ap), $ACC7
	vmovdqu	32*8-128($ap), $ACC8

	lea	192(%rsp), $tp0			# 64+128=192
	vpbroadcastq	.Land_mask(%rip), $AND_MASK
	jmp	.LOOP_GRANDE_SQR_1024

.align	32
.LOOP_GRANDE_SQR_1024:
	lea	32*18+128(%rsp), $aap		# size optimization
	lea	448(%rsp), $tp1			# 64+128+256=448

	# the squaring is performed as described in Variant B of
	# "Speeding up Big-Number Squaring", so start by calculating
	# the A*2=A+A vector
	vpaddq		$ACC1, $ACC1, $ACC1
	vpbroadcastq	32*0-128($ap), $B1
	vpaddq		$ACC2, $ACC2, $ACC2
	vmovdqa		$ACC1, 32*0-128($aap)
	vpaddq		$ACC3, $ACC3, $ACC3
	vmovdqa		$ACC2, 32*1-128($aap)
	vpaddq		$ACC4, $ACC4, $ACC4
	vmovdqa		$ACC3, 32*2-128($aap)
	vpaddq		$ACC5, $ACC5, $ACC5
	vmovdqa		$ACC4, 32*3-128($aap)
	vpaddq		$ACC6, $ACC6, $ACC6
	vmovdqa		$ACC5, 32*4-128($aap)
	vpaddq		$ACC7, $ACC7, $ACC7
	vmovdqa		$ACC6, 32*5-128($aap)
	vpaddq		$ACC8, $ACC8, $ACC8
	vmovdqa		$ACC7, 32*6-128($aap)
	vpxor		$ACC9, $ACC9, $ACC9
	vmovdqa		$ACC8, 32*7-128($aap)

	vpmuludq	32*0-128($ap), $B1, $ACC0
	vpbroadcastq	32*1-128($ap), $B2
	vmovdqu		$ACC9, 32*9-192($tp0)	# zero upper half
	vpmuludq	$B1, $ACC1, $ACC1
	vmovdqu		$ACC9, 32*10-448($tp1)
	vpmuludq	$B1, $ACC2, $ACC2
	vmovdqu		$ACC9, 32*11-448($tp1)
	vpmuludq	$B1, $ACC3, $ACC3
	vmovdqu		$ACC9, 32*12-448($tp1)
	vpmuludq	$B1, $ACC4, $ACC4
	vmovdqu		$ACC9, 32*13-448($tp1)
	vpmuludq	$B1, $ACC5, $ACC5
	vmovdqu		$ACC9, 32*14-448($tp1)
	vpmuludq	$B1, $ACC6, $ACC6
	vmovdqu		$ACC9, 32*15-448($tp1)
	vpmuludq	$B1, $ACC7, $ACC7
	vmovdqu		$ACC9, 32*16-448($tp1)
	vpmuludq	$B1, $ACC8, $ACC8
	vpbroadcastq	32*2-128($ap), $B1
	vmovdqu		$ACC9, 32*17-448($tp1)

	mov	$ap, $tpa
	mov	\$4, $i
	jmp	.Lsqr_entry_1024
___
$TEMP0=$Y1;
$TEMP2=$Y2;
$code.=<<___;
.align	32
.LOOP_SQR_1024:
	vpbroadcastq	32*1-128($tpa), $B2
	vpmuludq	32*0-128($ap), $B1, $ACC0
	vpaddq		32*0-192($tp0), $ACC0, $ACC0
	vpmuludq	32*0-128($aap), $B1, $ACC1
	vpaddq		32*1-192($tp0), $ACC1, $ACC1
	vpmuludq	32*1-128($aap), $B1, $ACC2
	vpaddq		32*2-192($tp0), $ACC2, $ACC2
	vpmuludq	32*2-128($aap), $B1, $ACC3
	vpaddq		32*3-192($tp0), $ACC3, $ACC3
	vpmuludq	32*3-128($aap), $B1, $ACC4
	vpaddq		32*4-192($tp0), $ACC4, $ACC4
	vpmuludq	32*4-128($aap), $B1, $ACC5
	vpaddq		32*5-192($tp0), $ACC5, $ACC5
	vpmuludq	32*5-128($aap), $B1, $ACC6
	vpaddq		32*6-192($tp0), $ACC6, $ACC6
	vpmuludq	32*6-128($aap), $B1, $ACC7
	vpaddq		32*7-192($tp0), $ACC7, $ACC7
	vpmuludq	32*7-128($aap), $B1, $ACC8
	vpbroadcastq	32*2-128($tpa), $B1
	vpaddq		32*8-192($tp0), $ACC8, $ACC8
.Lsqr_entry_1024:
	vmovdqu		$ACC0, 32*0-192($tp0)
	vmovdqu		$ACC1, 32*1-192($tp0)

	vpmuludq	32*1-128($ap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC2, $ACC2
	vpmuludq	32*1-128($aap), $B2, $TEMP1
	vpaddq		$TEMP1, $ACC3, $ACC3
	vpmuludq	32*2-128($aap), $B2, $TEMP2
	vpaddq		$TEMP2, $ACC4, $ACC4
	vpmuludq	32*3-128($aap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC5, $ACC5
	vpmuludq	32*4-128($aap), $B2, $TEMP1
	vpaddq		$TEMP1, $ACC6, $ACC6
	vpmuludq	32*5-128($aap), $B2, $TEMP2
	vpaddq		$TEMP2, $ACC7, $ACC7
	vpmuludq	32*6-128($aap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC8, $ACC8
	vpmuludq	32*7-128($aap), $B2, $ACC0
	vpbroadcastq	32*3-128($tpa), $B2
	vpaddq		32*9-192($tp0), $ACC0, $ACC0

	vmovdqu		$ACC2, 32*2-192($tp0)
	vmovdqu		$ACC3, 32*3-192($tp0)

	vpmuludq	32*2-128($ap), $B1, $TEMP2
	vpaddq		$TEMP2, $ACC4, $ACC4
	vpmuludq	32*2-128($aap), $B1, $TEMP0
	vpaddq		$TEMP0, $ACC5, $ACC5
	vpmuludq	32*3-128($aap), $B1, $TEMP1
	vpaddq		$TEMP1, $ACC6, $ACC6
	vpmuludq	32*4-128($aap), $B1, $TEMP2
	vpaddq		$TEMP2, $ACC7, $ACC7
	vpmuludq	32*5-128($aap), $B1, $TEMP0
	vpaddq		$TEMP0, $ACC8, $ACC8
	vpmuludq	32*6-128($aap), $B1, $TEMP1
	vpaddq		$TEMP1, $ACC0, $ACC0
	vpmuludq	32*7-128($aap), $B1, $ACC1
	vpbroadcastq	32*4-128($tpa), $B1
	vpaddq		32*10-448($tp1), $ACC1, $ACC1

	vmovdqu		$ACC4, 32*4-192($tp0)
	vmovdqu		$ACC5, 32*5-192($tp0)

	vpmuludq	32*3-128($ap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC6, $ACC6
	vpmuludq	32*3-128($aap), $B2, $TEMP1
	vpaddq		$TEMP1, $ACC7, $ACC7
	vpmuludq	32*4-128($aap), $B2, $TEMP2
	vpaddq		$TEMP2, $ACC8, $ACC8
	vpmuludq	32*5-128($aap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC0, $ACC0
	vpmuludq	32*6-128($aap), $B2, $TEMP1
	vpaddq		$TEMP1, $ACC1, $ACC1
	vpmuludq	32*7-128($aap), $B2, $ACC2
	vpbroadcastq	32*5-128($tpa), $B2
	vpaddq		32*11-448($tp1), $ACC2, $ACC2

	vmovdqu		$ACC6, 32*6-192($tp0)
	vmovdqu		$ACC7, 32*7-192($tp0)

	vpmuludq	32*4-128($ap), $B1, $TEMP0
	vpaddq		$TEMP0, $ACC8, $ACC8
	vpmuludq	32*4-128($aap), $B1, $TEMP1
	vpaddq		$TEMP1, $ACC0, $ACC0
	vpmuludq	32*5-128($aap), $B1, $TEMP2
	vpaddq		$TEMP2, $ACC1, $ACC1
	vpmuludq	32*6-128($aap), $B1, $TEMP0
	vpaddq		$TEMP0, $ACC2, $ACC2
	vpmuludq	32*7-128($aap), $B1, $ACC3
	vpbroadcastq	32*6-128($tpa), $B1
	vpaddq		32*12-448($tp1), $ACC3, $ACC3

	vmovdqu		$ACC8, 32*8-192($tp0)
	vmovdqu		$ACC0, 32*9-192($tp0)
	lea		8($tp0), $tp0

	vpmuludq	32*5-128($ap), $B2, $TEMP2
	vpaddq		$TEMP2, $ACC1, $ACC1
	vpmuludq	32*5-128($aap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC2, $ACC2
	vpmuludq	32*6-128($aap), $B2, $TEMP1
	vpaddq		$TEMP1, $ACC3, $ACC3
	vpmuludq	32*7-128($aap), $B2, $ACC4
	vpbroadcastq	32*7-128($tpa), $B2
	vpaddq		32*13-448($tp1), $ACC4, $ACC4

	vmovdqu		$ACC1, 32*10-448($tp1)
	vmovdqu		$ACC2, 32*11-448($tp1)

	vpmuludq	32*6-128($ap), $B1, $TEMP0
	vpaddq		$TEMP0, $ACC3, $ACC3
	vpmuludq	32*6-128($aap), $B1, $TEMP1
	vpbroadcastq	32*8-128($tpa), $ACC0	# borrow $ACC0 for $B1
	vpaddq		$TEMP1, $ACC4, $ACC4
	vpmuludq	32*7-128($aap), $B1, $ACC5
	vpbroadcastq	32*0+8-128($tpa), $B1	# for next iteration
	vpaddq		32*14-448($tp1), $ACC5, $ACC5

	vmovdqu		$ACC3, 32*12-448($tp1)
	vmovdqu		$ACC4, 32*13-448($tp1)
	lea		8($tpa), $tpa

	vpmuludq	32*7-128($ap), $B2, $TEMP0
	vpaddq		$TEMP0, $ACC5, $ACC5
	vpmuludq	32*7-128($aap), $B2, $ACC6
	vpaddq		32*15-448($tp1), $ACC6, $ACC6

	vpmuludq	32*8-128($ap), $ACC0, $ACC7
	vmovdqu		$ACC5, 32*14-448($tp1)
	vpaddq		32*16-448($tp1), $ACC7, $ACC7
	vmovdqu		$ACC6, 32*15-448($tp1)
	vmovdqu		$ACC7, 32*16-448($tp1)
	lea		8($tp1), $tp1

	dec	$i
	jnz	.LOOP_SQR_1024
___
$ZERO = $ACC9;
$TEMP0 = $B1;
$TEMP2 = $B2;
$TEMP3 = $Y1;
$TEMP4 = $Y2;
$code.=<<___;
	# we need to fix indexes 32-39 to avoid overflow
	vmovdqu		32*8(%rsp), $ACC8	# 32*8-192($tp0),
	vmovdqu		32*9(%rsp), $ACC1	# 32*9-192($tp0)
	vmovdqu		32*10(%rsp), $ACC2	# 32*10-192($tp0)
	lea		192(%rsp), $tp0		# 64+128=192

	vpsrlq		\$29, $ACC8, $TEMP1
	vpand		$AND_MASK, $ACC8, $ACC8
	vpsrlq		\$29, $ACC1, $TEMP2
	vpand		$AND_MASK, $ACC1, $ACC1

	vpermq		\$0x93, $TEMP1, $TEMP1
	vpxor		$ZERO, $ZERO, $ZERO
	vpermq		\$0x93, $TEMP2, $TEMP2

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC8, $ACC8
	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
	vpaddq		$TEMP1, $ACC1, $ACC1
	vpaddq		$TEMP2, $ACC2, $ACC2
	vmovdqu		$ACC1, 32*9-192($tp0)
	vmovdqu		$ACC2, 32*10-192($tp0)

	mov	(%rsp), %rax
	mov	8(%rsp), $r1
	mov	16(%rsp), $r2
	mov	24(%rsp), $r3
	vmovdqu	32*1(%rsp), $ACC1
	vmovdqu	32*2-192($tp0), $ACC2
	vmovdqu	32*3-192($tp0), $ACC3
	vmovdqu	32*4-192($tp0), $ACC4
	vmovdqu	32*5-192($tp0), $ACC5
	vmovdqu	32*6-192($tp0), $ACC6
	vmovdqu	32*7-192($tp0), $ACC7

	mov	%rax, $r0
	imull	$n0, %eax
	and	\$0x1fffffff, %eax
	vmovd	%eax, $Y1

	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpbroadcastq	$Y1, $Y1
	add	%rax, $r0
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	shr	\$29, $r0
	add	%rax, $r1
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	add	$r0, $r1
	add	%rax, $r2
	imulq	24-128($np), %rdx
	add	%rdx, $r3

	mov	$r1, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	mov	\$9, $i
	jmp	.LOOP_REDUCE_1024

.align	32
.LOOP_REDUCE_1024:
	vmovd	%eax, $Y2
	vpbroadcastq	$Y2, $Y2

	vpmuludq	32*1-128($np), $Y1, $TEMP0
	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpaddq		$TEMP0, $ACC1, $ACC1
	add	%rax, $r1
	vpmuludq	32*2-128($np), $Y1, $TEMP1
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	vpaddq		$TEMP1, $ACC2, $ACC2
	vpmuludq	32*3-128($np), $Y1, $TEMP2
	.byte	0x67
	add	%rax, $r2
	.byte	0x67
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	shr	\$29, $r1
	vpaddq		$TEMP2, $ACC3, $ACC3
	vpmuludq	32*4-128($np), $Y1, $TEMP0
	add	%rax, $r3
	add	$r1, $r2
	vpaddq		$TEMP0, $ACC4, $ACC4
	vpmuludq	32*5-128($np), $Y1, $TEMP1
	mov	$r2, %rax
	imull	$n0, %eax
	vpaddq		$TEMP1, $ACC5, $ACC5
	vpmuludq	32*6-128($np), $Y1, $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq		$TEMP2, $ACC6, $ACC6
	vpmuludq	32*7-128($np), $Y1, $TEMP0
	vpaddq		$TEMP0, $ACC7, $ACC7
	vpmuludq	32*8-128($np), $Y1, $TEMP1
	vmovd	%eax, $Y1
	#vmovdqu	32*1-8-128($np), $TEMP2		# moved below
	vpaddq		$TEMP1, $ACC8, $ACC8
	#vmovdqu	32*2-8-128($np), $TEMP0		# moved below
	vpbroadcastq	$Y1, $Y1

	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
	vmovdqu		32*3-8-128($np), $TEMP1
	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpaddq		$TEMP2, $ACC1, $ACC1
	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
	vmovdqu		32*4-8-128($np), $TEMP2
	add	%rax, $r2
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	vpaddq		$TEMP0, $ACC2, $ACC2
	add	$r3, %rax
	shr	\$29, $r2
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu		32*5-8-128($np), $TEMP0
	add	$r2, %rax
	vpaddq		$TEMP1, $ACC3, $ACC3
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu		32*6-8-128($np), $TEMP1
	.byte	0x67
	mov	%rax, $r3
	imull	$n0, %eax
	vpaddq		$TEMP2, $ACC4, $ACC4
	vpmuludq	$Y2, $TEMP0, $TEMP0
	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu	32*7-8-128($np), $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq		$TEMP0, $ACC5, $ACC5
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu		32*8-8-128($np), $TEMP0
	vpaddq		$TEMP1, $ACC6, $ACC6
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu		32*9-8-128($np), $ACC9
	vmovd	%eax, $ACC0			# borrow ACC0 for Y2
	imulq	-128($np), %rax
	vpaddq		$TEMP2, $ACC7, $ACC7
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vmovdqu		32*1-16-128($np), $TEMP1
	vpbroadcastq	$ACC0, $ACC0
	vpaddq		$TEMP0, $ACC8, $ACC8
	vpmuludq	$Y2, $ACC9, $ACC9
	vmovdqu		32*2-16-128($np), $TEMP2
	add	%rax, $r3

___
($ACC0,$Y2)=($Y2,$ACC0);
$code.=<<___;
	vmovdqu		32*1-24-128($np), $ACC0
	vpmuludq	$Y1, $TEMP1, $TEMP1
	vmovdqu		32*3-16-128($np), $TEMP0
	vpaddq		$TEMP1, $ACC1, $ACC1
	vpmuludq	$Y2, $ACC0, $ACC0
	vpmuludq	$Y1, $TEMP2, $TEMP2
	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu	32*4-16-128($np), $TEMP1
	vpaddq		$ACC1, $ACC0, $ACC0
	vpaddq		$TEMP2, $ACC2, $ACC2
	vpmuludq	$Y1, $TEMP0, $TEMP0
	vmovdqu		32*5-16-128($np), $TEMP2
	.byte	0x67
	vmovq		$ACC0, %rax
	vmovdqu		$ACC0, (%rsp)		# transfer $r0-$r3
	vpaddq		$TEMP0, $ACC3, $ACC3
	vpmuludq	$Y1, $TEMP1, $TEMP1
	vmovdqu		32*6-16-128($np), $TEMP0
	vpaddq		$TEMP1, $ACC4, $ACC4
	vpmuludq	$Y1, $TEMP2, $TEMP2
	vmovdqu		32*7-16-128($np), $TEMP1
	vpaddq		$TEMP2, $ACC5, $ACC5
	vpmuludq	$Y1, $TEMP0, $TEMP0
	vmovdqu		32*8-16-128($np), $TEMP2
	vpaddq		$TEMP0, $ACC6, $ACC6
	vpmuludq	$Y1, $TEMP1, $TEMP1
	shr	\$29, $r3
	vmovdqu		32*9-16-128($np), $TEMP0
	add	$r3, %rax
	vpaddq		$TEMP1, $ACC7, $ACC7
	vpmuludq	$Y1, $TEMP2, $TEMP2
	#vmovdqu	32*2-24-128($np), $TEMP1	# moved below
	mov	%rax, $r0
	imull	$n0, %eax
	vpaddq		$TEMP2, $ACC8, $ACC8
	vpmuludq	$Y1, $TEMP0, $TEMP0
	and	\$0x1fffffff, %eax
	vmovd	%eax, $Y1
	vmovdqu		32*3-24-128($np), $TEMP2
	.byte	0x67
	vpaddq		$TEMP0, $ACC9, $ACC9
	vpbroadcastq	$Y1, $Y1

	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
	vmovdqu		32*4-24-128($np), $TEMP0
	mov	%rax, %rdx
	imulq	-128($np), %rax
	mov	8(%rsp), $r1
	vpaddq		$TEMP1, $ACC2, $ACC1
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu		32*5-24-128($np), $TEMP1
	add	%rax, $r0
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	.byte	0x67
	shr	\$29, $r0
	mov	16(%rsp), $r2
	vpaddq		$TEMP2, $ACC3, $ACC2
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vmovdqu		32*6-24-128($np), $TEMP2
	add	%rax, $r1
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	vpaddq		$TEMP0, $ACC4, $ACC3
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu		32*7-24-128($np), $TEMP0
	imulq	24-128($np), %rdx		# future $r3
	add	%rax, $r2
	lea	($r0,$r1), %rax
	vpaddq		$TEMP1, $ACC5, $ACC4
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu		32*8-24-128($np), $TEMP1
	mov	%rax, $r1
	imull	$n0, %eax
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vpaddq		$TEMP2, $ACC6, $ACC5
	vmovdqu		32*9-24-128($np), $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq		$TEMP0, $ACC7, $ACC6
	vpmuludq	$Y2, $TEMP1, $TEMP1
	add	24(%rsp), %rdx
	vpaddq		$TEMP1, $ACC8, $ACC7
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vpaddq		$TEMP2, $ACC9, $ACC8
	vmovq	$r3, $ACC9
	mov	%rdx, $r3

	dec	$i
	jnz	.LOOP_REDUCE_1024
___
($ACC0,$Y2)=($Y2,$ACC0);
$code.=<<___;
	lea	448(%rsp), $tp1			# size optimization
	vpaddq	$ACC9, $Y2, $ACC0
	vpxor	$ZERO, $ZERO, $ZERO

	vpaddq	32*9-192($tp0), $ACC0, $ACC0
	vpaddq	32*10-448($tp1), $ACC1, $ACC1
	vpaddq	32*11-448($tp1), $ACC2, $ACC2
	vpaddq	32*12-448($tp1), $ACC3, $ACC3
	vpaddq	32*13-448($tp1), $ACC4, $ACC4
	vpaddq	32*14-448($tp1), $ACC5, $ACC5
	vpaddq	32*15-448($tp1), $ACC6, $ACC6
	vpaddq	32*16-448($tp1), $ACC7, $ACC7
	vpaddq	32*17-448($tp1), $ACC8, $ACC8

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq		$TEMP3, $ACC3, $ACC3
	vpaddq		$TEMP4, $ACC4, $ACC4

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC1, $ACC1
	vmovdqu		$ACC0, 32*0-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC2, $ACC2
	vmovdqu		$ACC1, 32*1-128($rp)
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq		$TEMP3, $ACC3, $ACC3
	vmovdqu		$ACC2, 32*2-128($rp)
	vpaddq		$TEMP4, $ACC4, $ACC4
	vmovdqu		$ACC3, 32*3-128($rp)
___
$TEMP5=$ACC0;
$code.=<<___;
	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq		$TEMP3, $ACC7, $ACC7
	vpaddq		$TEMP4, $ACC8, $ACC8

	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC5, $ACC5
	vmovdqu		$ACC4, 32*4-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC6, $ACC6
	vmovdqu		$ACC5, 32*5-128($rp)
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq		$TEMP3, $ACC7, $ACC7
	vmovdqu		$ACC6, 32*6-128($rp)
	vpaddq		$TEMP4, $ACC8, $ACC8
	vmovdqu		$ACC7, 32*7-128($rp)
	vmovdqu		$ACC8, 32*8-128($rp)

	mov	$rp, $ap
	dec	$rep
	jne	.LOOP_GRANDE_SQR_1024

	vzeroall
	mov	%rbp, %rax
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lsqr_1024_epilogue:
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}

{ # void AMM_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $bp="%rdx";	# const BN_ULONG *bp,
my $np="%rcx";	#
const BN_ULONG *np, 837 my $n0="%r8d"; # unsigned int n0); 838 839 # The registers that hold the accumulated redundant result 840 # The AMM works on 1024 bit operands, and redundant word size is 29 841 # Therefore: ceil(1024/29)/4 = 9 842 my $ACC0="%ymm0"; 843 my $ACC1="%ymm1"; 844 my $ACC2="%ymm2"; 845 my $ACC3="%ymm3"; 846 my $ACC4="%ymm4"; 847 my $ACC5="%ymm5"; 848 my $ACC6="%ymm6"; 849 my $ACC7="%ymm7"; 850 my $ACC8="%ymm8"; 851 my $ACC9="%ymm9"; 852 853 # Registers that hold the broadcasted words of multiplier, currently used 854 my $Bi="%ymm10"; 855 my $Yi="%ymm11"; 856 857 # Helper registers 858 my $TEMP0=$ACC0; 859 my $TEMP1="%ymm12"; 860 my $TEMP2="%ymm13"; 861 my $ZERO="%ymm14"; 862 my $AND_MASK="%ymm15"; 863 864 # alu registers that hold the first words of the ACC 865 my $r0="%r9"; 866 my $r1="%r10"; 867 my $r2="%r11"; 868 my $r3="%r12"; 869 870 my $i="%r14d"; 871 my $tmp="%r15"; 872 873 $bp="%r13"; # reassigned argument 874 875 $code.=<<___; 876 .globl rsaz_1024_mul_avx2 877 .type rsaz_1024_mul_avx2,\@function,5 878 .align 64 879 rsaz_1024_mul_avx2: 880 lea (%rsp), %rax 881 push %rbx 882 push %rbp 883 push %r12 884 push %r13 885 push %r14 886 push %r15 887 ___ 888 $code.=<<___ if ($win64); 889 vzeroupper 890 lea -0xa8(%rsp),%rsp 891 vmovaps %xmm6,-0xd8(%rax) 892 vmovaps %xmm7,-0xc8(%rax) 893 vmovaps %xmm8,-0xb8(%rax) 894 vmovaps %xmm9,-0xa8(%rax) 895 vmovaps %xmm10,-0x98(%rax) 896 vmovaps %xmm11,-0x88(%rax) 897 vmovaps %xmm12,-0x78(%rax) 898 vmovaps %xmm13,-0x68(%rax) 899 vmovaps %xmm14,-0x58(%rax) 900 vmovaps %xmm15,-0x48(%rax) 901 .Lmul_1024_body: 902 ___ 903 $code.=<<___; 904 mov %rax,%rbp 905 vzeroall 906 mov %rdx, $bp # reassigned argument 907 sub \$64,%rsp 908 909 # unaligned 256-bit load that crosses page boundary can 910 # cause severe performance degradation here, so if $ap does 911 # cross page boundary, swap it with $bp [meaning that caller 912 # is advised to lay down $ap and $bp next to each other, so 913 # that only one can cross page 
boundary]. 914 .byte 0x67,0x67 915 mov $ap, $tmp 916 and \$4095, $tmp 917 add \$32*10, $tmp 918 shr \$12, $tmp 919 mov $ap, $tmp 920 cmovnz $bp, $ap 921 cmovnz $tmp, $bp 922 923 mov $np, $tmp 924 sub \$-128,$ap # size optimization 925 sub \$-128,$np 926 sub \$-128,$rp 927 928 and \$4095, $tmp # see if $np crosses page 929 add \$32*10, $tmp 930 .byte 0x67,0x67 931 shr \$12, $tmp 932 jz .Lmul_1024_no_n_copy 933 934 # unaligned 256-bit load that crosses page boundary can 935 # cause severe performance degradation here, so if $np does 936 # cross page boundary, copy it to stack and make sure stack 937 # frame doesn't... 938 sub \$32*10,%rsp 939 vmovdqu 32*0-128($np), $ACC0 940 and \$-512, %rsp 941 vmovdqu 32*1-128($np), $ACC1 942 vmovdqu 32*2-128($np), $ACC2 943 vmovdqu 32*3-128($np), $ACC3 944 vmovdqu 32*4-128($np), $ACC4 945 vmovdqu 32*5-128($np), $ACC5 946 vmovdqu 32*6-128($np), $ACC6 947 vmovdqu 32*7-128($np), $ACC7 948 vmovdqu 32*8-128($np), $ACC8 949 lea 64+128(%rsp),$np 950 vmovdqu $ACC0, 32*0-128($np) 951 vpxor $ACC0, $ACC0, $ACC0 952 vmovdqu $ACC1, 32*1-128($np) 953 vpxor $ACC1, $ACC1, $ACC1 954 vmovdqu $ACC2, 32*2-128($np) 955 vpxor $ACC2, $ACC2, $ACC2 956 vmovdqu $ACC3, 32*3-128($np) 957 vpxor $ACC3, $ACC3, $ACC3 958 vmovdqu $ACC4, 32*4-128($np) 959 vpxor $ACC4, $ACC4, $ACC4 960 vmovdqu $ACC5, 32*5-128($np) 961 vpxor $ACC5, $ACC5, $ACC5 962 vmovdqu $ACC6, 32*6-128($np) 963 vpxor $ACC6, $ACC6, $ACC6 964 vmovdqu $ACC7, 32*7-128($np) 965 vpxor $ACC7, $ACC7, $ACC7 966 vmovdqu $ACC8, 32*8-128($np) 967 vmovdqa $ACC0, $ACC8 968 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 969 .Lmul_1024_no_n_copy: 970 and \$-64,%rsp 971 972 mov ($bp), %rbx 973 vpbroadcastq ($bp), $Bi 974 vmovdqu $ACC0, (%rsp) # clear top of stack 975 xor $r0, $r0 976 .byte 0x67 977 xor $r1, $r1 978 xor $r2, $r2 979 xor $r3, $r3 980 981 vmovdqu .Land_mask(%rip), $AND_MASK 982 mov \$9, $i 983 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 984 jmp .Loop_mul_1024 985 986 
.align 32 987 .Loop_mul_1024: 988 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 989 mov %rbx, %rax 990 imulq -128($ap), %rax 991 add $r0, %rax 992 mov %rbx, $r1 993 imulq 8-128($ap), $r1 994 add 8(%rsp), $r1 995 996 mov %rax, $r0 997 imull $n0, %eax 998 and \$0x1fffffff, %eax 999 1000 mov %rbx, $r2 1001 imulq 16-128($ap), $r2 1002 add 16(%rsp), $r2 1003 1004 mov %rbx, $r3 1005 imulq 24-128($ap), $r3 1006 add 24(%rsp), $r3 1007 vpmuludq 32*1-128($ap),$Bi,$TEMP0 1008 vmovd %eax, $Yi 1009 vpaddq $TEMP0,$ACC1,$ACC1 1010 vpmuludq 32*2-128($ap),$Bi,$TEMP1 1011 vpbroadcastq $Yi, $Yi 1012 vpaddq $TEMP1,$ACC2,$ACC2 1013 vpmuludq 32*3-128($ap),$Bi,$TEMP2 1014 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 1015 vpaddq $TEMP2,$ACC3,$ACC3 1016 vpmuludq 32*4-128($ap),$Bi,$TEMP0 1017 vpaddq $TEMP0,$ACC4,$ACC4 1018 vpmuludq 32*5-128($ap),$Bi,$TEMP1 1019 vpaddq $TEMP1,$ACC5,$ACC5 1020 vpmuludq 32*6-128($ap),$Bi,$TEMP2 1021 vpaddq $TEMP2,$ACC6,$ACC6 1022 vpmuludq 32*7-128($ap),$Bi,$TEMP0 1023 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 1024 vpaddq $TEMP0,$ACC7,$ACC7 1025 vpmuludq 32*8-128($ap),$Bi,$TEMP1 1026 vpbroadcastq 8($bp), $Bi 1027 vpaddq $TEMP1,$ACC8,$ACC8 1028 1029 mov %rax,%rdx 1030 imulq -128($np),%rax 1031 add %rax,$r0 1032 mov %rdx,%rax 1033 imulq 8-128($np),%rax 1034 add %rax,$r1 1035 mov %rdx,%rax 1036 imulq 16-128($np),%rax 1037 add %rax,$r2 1038 shr \$29, $r0 1039 imulq 24-128($np),%rdx 1040 add %rdx,$r3 1041 add $r0, $r1 1042 1043 vpmuludq 32*1-128($np),$Yi,$TEMP2 1044 vmovq $Bi, %rbx 1045 vpaddq $TEMP2,$ACC1,$ACC1 1046 vpmuludq 32*2-128($np),$Yi,$TEMP0 1047 vpaddq $TEMP0,$ACC2,$ACC2 1048 vpmuludq 32*3-128($np),$Yi,$TEMP1 1049 vpaddq $TEMP1,$ACC3,$ACC3 1050 vpmuludq 32*4-128($np),$Yi,$TEMP2 1051 vpaddq $TEMP2,$ACC4,$ACC4 1052 vpmuludq 32*5-128($np),$Yi,$TEMP0 1053 vpaddq $TEMP0,$ACC5,$ACC5 1054 vpmuludq 32*6-128($np),$Yi,$TEMP1 1055 vpaddq $TEMP1,$ACC6,$ACC6 1056 vpmuludq 32*7-128($np),$Yi,$TEMP2 1057 vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3 1058 vpaddq 
$TEMP2,$ACC7,$ACC7 1059 vpmuludq 32*8-128($np),$Yi,$TEMP0 1060 vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3 1061 vpaddq $TEMP0,$ACC8,$ACC8 1062 1063 mov %rbx, %rax 1064 imulq -128($ap),%rax 1065 add %rax,$r1 1066 vmovdqu -8+32*1-128($ap),$TEMP1 1067 mov %rbx, %rax 1068 imulq 8-128($ap),%rax 1069 add %rax,$r2 1070 vmovdqu -8+32*2-128($ap),$TEMP2 1071 1072 mov $r1, %rax 1073 imull $n0, %eax 1074 and \$0x1fffffff, %eax 1075 1076 imulq 16-128($ap),%rbx 1077 add %rbx,$r3 1078 vpmuludq $Bi,$TEMP1,$TEMP1 1079 vmovd %eax, $Yi 1080 vmovdqu -8+32*3-128($ap),$TEMP0 1081 vpaddq $TEMP1,$ACC1,$ACC1 1082 vpmuludq $Bi,$TEMP2,$TEMP2 1083 vpbroadcastq $Yi, $Yi 1084 vmovdqu -8+32*4-128($ap),$TEMP1 1085 vpaddq $TEMP2,$ACC2,$ACC2 1086 vpmuludq $Bi,$TEMP0,$TEMP0 1087 vmovdqu -8+32*5-128($ap),$TEMP2 1088 vpaddq $TEMP0,$ACC3,$ACC3 1089 vpmuludq $Bi,$TEMP1,$TEMP1 1090 vmovdqu -8+32*6-128($ap),$TEMP0 1091 vpaddq $TEMP1,$ACC4,$ACC4 1092 vpmuludq $Bi,$TEMP2,$TEMP2 1093 vmovdqu -8+32*7-128($ap),$TEMP1 1094 vpaddq $TEMP2,$ACC5,$ACC5 1095 vpmuludq $Bi,$TEMP0,$TEMP0 1096 vmovdqu -8+32*8-128($ap),$TEMP2 1097 vpaddq $TEMP0,$ACC6,$ACC6 1098 vpmuludq $Bi,$TEMP1,$TEMP1 1099 vmovdqu -8+32*9-128($ap),$ACC9 1100 vpaddq $TEMP1,$ACC7,$ACC7 1101 vpmuludq $Bi,$TEMP2,$TEMP2 1102 vpaddq $TEMP2,$ACC8,$ACC8 1103 vpmuludq $Bi,$ACC9,$ACC9 1104 vpbroadcastq 16($bp), $Bi 1105 1106 mov %rax,%rdx 1107 imulq -128($np),%rax 1108 add %rax,$r1 1109 vmovdqu -8+32*1-128($np),$TEMP0 1110 mov %rdx,%rax 1111 imulq 8-128($np),%rax 1112 add %rax,$r2 1113 vmovdqu -8+32*2-128($np),$TEMP1 1114 shr \$29, $r1 1115 imulq 16-128($np),%rdx 1116 add %rdx,$r3 1117 add $r1, $r2 1118 1119 vpmuludq $Yi,$TEMP0,$TEMP0 1120 vmovq $Bi, %rbx 1121 vmovdqu -8+32*3-128($np),$TEMP2 1122 vpaddq $TEMP0,$ACC1,$ACC1 1123 vpmuludq $Yi,$TEMP1,$TEMP1 1124 vmovdqu -8+32*4-128($np),$TEMP0 1125 vpaddq $TEMP1,$ACC2,$ACC2 1126 vpmuludq $Yi,$TEMP2,$TEMP2 1127 vmovdqu -8+32*5-128($np),$TEMP1 1128 vpaddq $TEMP2,$ACC3,$ACC3 1129 vpmuludq $Yi,$TEMP0,$TEMP0 1130 vmovdqu 
-8+32*6-128($np),$TEMP2 1131 vpaddq $TEMP0,$ACC4,$ACC4 1132 vpmuludq $Yi,$TEMP1,$TEMP1 1133 vmovdqu -8+32*7-128($np),$TEMP0 1134 vpaddq $TEMP1,$ACC5,$ACC5 1135 vpmuludq $Yi,$TEMP2,$TEMP2 1136 vmovdqu -8+32*8-128($np),$TEMP1 1137 vpaddq $TEMP2,$ACC6,$ACC6 1138 vpmuludq $Yi,$TEMP0,$TEMP0 1139 vmovdqu -8+32*9-128($np),$TEMP2 1140 vpaddq $TEMP0,$ACC7,$ACC7 1141 vpmuludq $Yi,$TEMP1,$TEMP1 1142 vpaddq $TEMP1,$ACC8,$ACC8 1143 vpmuludq $Yi,$TEMP2,$TEMP2 1144 vpaddq $TEMP2,$ACC9,$ACC9 1145 1146 vmovdqu -16+32*1-128($ap),$TEMP0 1147 mov %rbx,%rax 1148 imulq -128($ap),%rax 1149 add $r2,%rax 1150 1151 vmovdqu -16+32*2-128($ap),$TEMP1 1152 mov %rax,$r2 1153 imull $n0, %eax 1154 and \$0x1fffffff, %eax 1155 1156 imulq 8-128($ap),%rbx 1157 add %rbx,$r3 1158 vpmuludq $Bi,$TEMP0,$TEMP0 1159 vmovd %eax, $Yi 1160 vmovdqu -16+32*3-128($ap),$TEMP2 1161 vpaddq $TEMP0,$ACC1,$ACC1 1162 vpmuludq $Bi,$TEMP1,$TEMP1 1163 vpbroadcastq $Yi, $Yi 1164 vmovdqu -16+32*4-128($ap),$TEMP0 1165 vpaddq $TEMP1,$ACC2,$ACC2 1166 vpmuludq $Bi,$TEMP2,$TEMP2 1167 vmovdqu -16+32*5-128($ap),$TEMP1 1168 vpaddq $TEMP2,$ACC3,$ACC3 1169 vpmuludq $Bi,$TEMP0,$TEMP0 1170 vmovdqu -16+32*6-128($ap),$TEMP2 1171 vpaddq $TEMP0,$ACC4,$ACC4 1172 vpmuludq $Bi,$TEMP1,$TEMP1 1173 vmovdqu -16+32*7-128($ap),$TEMP0 1174 vpaddq $TEMP1,$ACC5,$ACC5 1175 vpmuludq $Bi,$TEMP2,$TEMP2 1176 vmovdqu -16+32*8-128($ap),$TEMP1 1177 vpaddq $TEMP2,$ACC6,$ACC6 1178 vpmuludq $Bi,$TEMP0,$TEMP0 1179 vmovdqu -16+32*9-128($ap),$TEMP2 1180 vpaddq $TEMP0,$ACC7,$ACC7 1181 vpmuludq $Bi,$TEMP1,$TEMP1 1182 vpaddq $TEMP1,$ACC8,$ACC8 1183 vpmuludq $Bi,$TEMP2,$TEMP2 1184 vpbroadcastq 24($bp), $Bi 1185 vpaddq $TEMP2,$ACC9,$ACC9 1186 1187 vmovdqu -16+32*1-128($np),$TEMP0 1188 mov %rax,%rdx 1189 imulq -128($np),%rax 1190 add %rax,$r2 1191 vmovdqu -16+32*2-128($np),$TEMP1 1192 imulq 8-128($np),%rdx 1193 add %rdx,$r3 1194 shr \$29, $r2 1195 1196 vpmuludq $Yi,$TEMP0,$TEMP0 1197 vmovq $Bi, %rbx 1198 vmovdqu -16+32*3-128($np),$TEMP2 1199 vpaddq $TEMP0,$ACC1,$ACC1 1200 
vpmuludq $Yi,$TEMP1,$TEMP1 1201 vmovdqu -16+32*4-128($np),$TEMP0 1202 vpaddq $TEMP1,$ACC2,$ACC2 1203 vpmuludq $Yi,$TEMP2,$TEMP2 1204 vmovdqu -16+32*5-128($np),$TEMP1 1205 vpaddq $TEMP2,$ACC3,$ACC3 1206 vpmuludq $Yi,$TEMP0,$TEMP0 1207 vmovdqu -16+32*6-128($np),$TEMP2 1208 vpaddq $TEMP0,$ACC4,$ACC4 1209 vpmuludq $Yi,$TEMP1,$TEMP1 1210 vmovdqu -16+32*7-128($np),$TEMP0 1211 vpaddq $TEMP1,$ACC5,$ACC5 1212 vpmuludq $Yi,$TEMP2,$TEMP2 1213 vmovdqu -16+32*8-128($np),$TEMP1 1214 vpaddq $TEMP2,$ACC6,$ACC6 1215 vpmuludq $Yi,$TEMP0,$TEMP0 1216 vmovdqu -16+32*9-128($np),$TEMP2 1217 vpaddq $TEMP0,$ACC7,$ACC7 1218 vpmuludq $Yi,$TEMP1,$TEMP1 1219 vmovdqu -24+32*1-128($ap),$TEMP0 1220 vpaddq $TEMP1,$ACC8,$ACC8 1221 vpmuludq $Yi,$TEMP2,$TEMP2 1222 vmovdqu -24+32*2-128($ap),$TEMP1 1223 vpaddq $TEMP2,$ACC9,$ACC9 1224 1225 add $r2, $r3 1226 imulq -128($ap),%rbx 1227 add %rbx,$r3 1228 1229 mov $r3, %rax 1230 imull $n0, %eax 1231 and \$0x1fffffff, %eax 1232 1233 vpmuludq $Bi,$TEMP0,$TEMP0 1234 vmovd %eax, $Yi 1235 vmovdqu -24+32*3-128($ap),$TEMP2 1236 vpaddq $TEMP0,$ACC1,$ACC1 1237 vpmuludq $Bi,$TEMP1,$TEMP1 1238 vpbroadcastq $Yi, $Yi 1239 vmovdqu -24+32*4-128($ap),$TEMP0 1240 vpaddq $TEMP1,$ACC2,$ACC2 1241 vpmuludq $Bi,$TEMP2,$TEMP2 1242 vmovdqu -24+32*5-128($ap),$TEMP1 1243 vpaddq $TEMP2,$ACC3,$ACC3 1244 vpmuludq $Bi,$TEMP0,$TEMP0 1245 vmovdqu -24+32*6-128($ap),$TEMP2 1246 vpaddq $TEMP0,$ACC4,$ACC4 1247 vpmuludq $Bi,$TEMP1,$TEMP1 1248 vmovdqu -24+32*7-128($ap),$TEMP0 1249 vpaddq $TEMP1,$ACC5,$ACC5 1250 vpmuludq $Bi,$TEMP2,$TEMP2 1251 vmovdqu -24+32*8-128($ap),$TEMP1 1252 vpaddq $TEMP2,$ACC6,$ACC6 1253 vpmuludq $Bi,$TEMP0,$TEMP0 1254 vmovdqu -24+32*9-128($ap),$TEMP2 1255 vpaddq $TEMP0,$ACC7,$ACC7 1256 vpmuludq $Bi,$TEMP1,$TEMP1 1257 vpaddq $TEMP1,$ACC8,$ACC8 1258 vpmuludq $Bi,$TEMP2,$TEMP2 1259 vpbroadcastq 32($bp), $Bi 1260 vpaddq $TEMP2,$ACC9,$ACC9 1261 add \$32, $bp # $bp++ 1262 1263 vmovdqu -24+32*1-128($np),$TEMP0 1264 imulq -128($np),%rax 1265 add %rax,$r3 1266 shr \$29, $r3 1267 
1268 vmovdqu -24+32*2-128($np),$TEMP1 1269 vpmuludq $Yi,$TEMP0,$TEMP0 1270 vmovq $Bi, %rbx 1271 vmovdqu -24+32*3-128($np),$TEMP2 1272 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 1273 vpmuludq $Yi,$TEMP1,$TEMP1 1274 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 1275 vpaddq $TEMP1,$ACC2,$ACC1 1276 vmovdqu -24+32*4-128($np),$TEMP0 1277 vpmuludq $Yi,$TEMP2,$TEMP2 1278 vmovdqu -24+32*5-128($np),$TEMP1 1279 vpaddq $TEMP2,$ACC3,$ACC2 1280 vpmuludq $Yi,$TEMP0,$TEMP0 1281 vmovdqu -24+32*6-128($np),$TEMP2 1282 vpaddq $TEMP0,$ACC4,$ACC3 1283 vpmuludq $Yi,$TEMP1,$TEMP1 1284 vmovdqu -24+32*7-128($np),$TEMP0 1285 vpaddq $TEMP1,$ACC5,$ACC4 1286 vpmuludq $Yi,$TEMP2,$TEMP2 1287 vmovdqu -24+32*8-128($np),$TEMP1 1288 vpaddq $TEMP2,$ACC6,$ACC5 1289 vpmuludq $Yi,$TEMP0,$TEMP0 1290 vmovdqu -24+32*9-128($np),$TEMP2 1291 mov $r3, $r0 1292 vpaddq $TEMP0,$ACC7,$ACC6 1293 vpmuludq $Yi,$TEMP1,$TEMP1 1294 add (%rsp), $r0 1295 vpaddq $TEMP1,$ACC8,$ACC7 1296 vpmuludq $Yi,$TEMP2,$TEMP2 1297 vmovq $r3, $TEMP1 1298 vpaddq $TEMP2,$ACC9,$ACC8 1299 1300 dec $i 1301 jnz .Loop_mul_1024 1302 ___ 1303 1304 # (*) Original implementation was correcting ACC1-ACC3 for overflow 1305 # after 7 loop runs, or after 28 iterations, or 56 additions. 1306 # But as we underutilize resources, it's possible to correct in 1307 # each iteration with marginal performance loss. But then, as 1308 # we do it in each iteration, we can correct less digits, and 1309 # avoid performance penalties completely. Also note that we 1310 # correct only three digits out of four. This works because 1311 # most significant digit is subjected to less additions. 
1312 1313 $TEMP0 = $ACC9; 1314 $TEMP3 = $Bi; 1315 $TEMP4 = $Yi; 1316 $code.=<<___; 1317 vpermq \$0, $AND_MASK, $AND_MASK 1318 vpaddq (%rsp), $TEMP1, $ACC0 1319 1320 vpsrlq \$29, $ACC0, $TEMP1 1321 vpand $AND_MASK, $ACC0, $ACC0 1322 vpsrlq \$29, $ACC1, $TEMP2 1323 vpand $AND_MASK, $ACC1, $ACC1 1324 vpsrlq \$29, $ACC2, $TEMP3 1325 vpermq \$0x93, $TEMP1, $TEMP1 1326 vpand $AND_MASK, $ACC2, $ACC2 1327 vpsrlq \$29, $ACC3, $TEMP4 1328 vpermq \$0x93, $TEMP2, $TEMP2 1329 vpand $AND_MASK, $ACC3, $ACC3 1330 1331 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1332 vpermq \$0x93, $TEMP3, $TEMP3 1333 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1334 vpermq \$0x93, $TEMP4, $TEMP4 1335 vpaddq $TEMP0, $ACC0, $ACC0 1336 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1337 vpaddq $TEMP1, $ACC1, $ACC1 1338 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1339 vpaddq $TEMP2, $ACC2, $ACC2 1340 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1341 vpaddq $TEMP3, $ACC3, $ACC3 1342 vpaddq $TEMP4, $ACC4, $ACC4 1343 1344 vpsrlq \$29, $ACC0, $TEMP1 1345 vpand $AND_MASK, $ACC0, $ACC0 1346 vpsrlq \$29, $ACC1, $TEMP2 1347 vpand $AND_MASK, $ACC1, $ACC1 1348 vpsrlq \$29, $ACC2, $TEMP3 1349 vpermq \$0x93, $TEMP1, $TEMP1 1350 vpand $AND_MASK, $ACC2, $ACC2 1351 vpsrlq \$29, $ACC3, $TEMP4 1352 vpermq \$0x93, $TEMP2, $TEMP2 1353 vpand $AND_MASK, $ACC3, $ACC3 1354 vpermq \$0x93, $TEMP3, $TEMP3 1355 1356 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1357 vpermq \$0x93, $TEMP4, $TEMP4 1358 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1359 vpaddq $TEMP0, $ACC0, $ACC0 1360 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1361 vpaddq $TEMP1, $ACC1, $ACC1 1362 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1363 vpaddq $TEMP2, $ACC2, $ACC2 1364 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1365 vpaddq $TEMP3, $ACC3, $ACC3 1366 vpaddq $TEMP4, $ACC4, $ACC4 1367 1368 vmovdqu $ACC0, 0-128($rp) 1369 vmovdqu $ACC1, 32-128($rp) 1370 vmovdqu $ACC2, 64-128($rp) 1371 vmovdqu $ACC3, 96-128($rp) 1372 ___ 1373 1374 $TEMP5=$ACC0; 1375 $code.=<<___; 1376 vpsrlq \$29, $ACC4, $TEMP1 1377 vpand $AND_MASK, $ACC4, $ACC4 1378 vpsrlq 
\$29, $ACC5, $TEMP2 1379 vpand $AND_MASK, $ACC5, $ACC5 1380 vpsrlq \$29, $ACC6, $TEMP3 1381 vpermq \$0x93, $TEMP1, $TEMP1 1382 vpand $AND_MASK, $ACC6, $ACC6 1383 vpsrlq \$29, $ACC7, $TEMP4 1384 vpermq \$0x93, $TEMP2, $TEMP2 1385 vpand $AND_MASK, $ACC7, $ACC7 1386 vpsrlq \$29, $ACC8, $TEMP5 1387 vpermq \$0x93, $TEMP3, $TEMP3 1388 vpand $AND_MASK, $ACC8, $ACC8 1389 vpermq \$0x93, $TEMP4, $TEMP4 1390 1391 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1392 vpermq \$0x93, $TEMP5, $TEMP5 1393 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1394 vpaddq $TEMP0, $ACC4, $ACC4 1395 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1396 vpaddq $TEMP1, $ACC5, $ACC5 1397 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1398 vpaddq $TEMP2, $ACC6, $ACC6 1399 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1400 vpaddq $TEMP3, $ACC7, $ACC7 1401 vpaddq $TEMP4, $ACC8, $ACC8 1402 1403 vpsrlq \$29, $ACC4, $TEMP1 1404 vpand $AND_MASK, $ACC4, $ACC4 1405 vpsrlq \$29, $ACC5, $TEMP2 1406 vpand $AND_MASK, $ACC5, $ACC5 1407 vpsrlq \$29, $ACC6, $TEMP3 1408 vpermq \$0x93, $TEMP1, $TEMP1 1409 vpand $AND_MASK, $ACC6, $ACC6 1410 vpsrlq \$29, $ACC7, $TEMP4 1411 vpermq \$0x93, $TEMP2, $TEMP2 1412 vpand $AND_MASK, $ACC7, $ACC7 1413 vpsrlq \$29, $ACC8, $TEMP5 1414 vpermq \$0x93, $TEMP3, $TEMP3 1415 vpand $AND_MASK, $ACC8, $ACC8 1416 vpermq \$0x93, $TEMP4, $TEMP4 1417 1418 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1419 vpermq \$0x93, $TEMP5, $TEMP5 1420 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1421 vpaddq $TEMP0, $ACC4, $ACC4 1422 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1423 vpaddq $TEMP1, $ACC5, $ACC5 1424 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1425 vpaddq $TEMP2, $ACC6, $ACC6 1426 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1427 vpaddq $TEMP3, $ACC7, $ACC7 1428 vpaddq $TEMP4, $ACC8, $ACC8 1429 1430 vmovdqu $ACC4, 128-128($rp) 1431 vmovdqu $ACC5, 160-128($rp) 1432 vmovdqu $ACC6, 192-128($rp) 1433 vmovdqu $ACC7, 224-128($rp) 1434 vmovdqu $ACC8, 256-128($rp) 1435 vzeroupper 1436 1437 mov %rbp, %rax 1438 ___ 1439 $code.=<<___ if ($win64); 1440 movaps -0xd8(%rax),%xmm6 1441 movaps 
-0xc8(%rax),%xmm7 1442 movaps -0xb8(%rax),%xmm8 1443 movaps -0xa8(%rax),%xmm9 1444 movaps -0x98(%rax),%xmm10 1445 movaps -0x88(%rax),%xmm11 1446 movaps -0x78(%rax),%xmm12 1447 movaps -0x68(%rax),%xmm13 1448 movaps -0x58(%rax),%xmm14 1449 movaps -0x48(%rax),%xmm15 1450 ___ 1451 $code.=<<___; 1452 mov -48(%rax),%r15 1453 mov -40(%rax),%r14 1454 mov -32(%rax),%r13 1455 mov -24(%rax),%r12 1456 mov -16(%rax),%rbp 1457 mov -8(%rax),%rbx 1458 lea (%rax),%rsp # restore %rsp 1459 .Lmul_1024_epilogue: 1460 ret 1461 .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 1462 ___ 1463 } 1464 { 1465 my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); 1466 my @T = map("%r$_",(8..11)); 1467 1468 $code.=<<___; 1469 .globl rsaz_1024_red2norm_avx2 1470 .type rsaz_1024_red2norm_avx2,\@abi-omnipotent 1471 .align 32 1472 rsaz_1024_red2norm_avx2: 1473 sub \$-128,$inp # size optimization 1474 xor %rax,%rax 1475 ___ 1476 1477 for ($j=0,$i=0; $i<16; $i++) { 1478 my $k=0; 1479 while (29*$j<64*($i+1)) { # load data till boundary 1480 $code.=" mov `8*$j-128`($inp), @T[0]\n"; 1481 $j++; $k++; push(@T,shift(@T)); 1482 } 1483 $l=$k; 1484 while ($k>1) { # shift loaded data but last value 1485 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; 1486 $k--; 1487 } 1488 $code.=<<___; # shift last value 1489 mov @T[-1], @T[0] 1490 shl \$`29*($j-1)`, @T[-1] 1491 shr \$`-29*($j-1)`, @T[0] 1492 ___ 1493 while ($l) { # accumulate all values 1494 $code.=" add @T[-$l], %rax\n"; 1495 $l--; 1496 } 1497 $code.=<<___; 1498 adc \$0, @T[0] # consume eventual carry 1499 mov %rax, 8*$i($out) 1500 mov @T[0], %rax 1501 ___ 1502 push(@T,shift(@T)); 1503 } 1504 $code.=<<___; 1505 ret 1506 .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 1507 1508 .globl rsaz_1024_norm2red_avx2 1509 .type rsaz_1024_norm2red_avx2,\@abi-omnipotent 1510 .align 32 1511 rsaz_1024_norm2red_avx2: 1512 sub \$-128,$out # size optimization 1513 mov ($inp),@T[0] 1514 mov \$0x1fffffff,%eax 1515 ___ 1516 for ($j=0,$i=0; $i<16; $i++) { 1517 $code.=" mov 
`8*($i+1)`($inp),@T[1]\n" if ($i<15); 1518 $code.=" xor @T[1],@T[1]\n" if ($i==15); 1519 my $k=1; 1520 while (29*($j+1)<64*($i+1)) { 1521 $code.=<<___; 1522 mov @T[0],@T[-$k] 1523 shr \$`29*$j`,@T[-$k] 1524 and %rax,@T[-$k] # &0x1fffffff 1525 mov @T[-$k],`8*$j-128`($out) 1526 ___ 1527 $j++; $k++; 1528 } 1529 $code.=<<___; 1530 shrd \$`29*$j`,@T[1],@T[0] 1531 and %rax,@T[0] 1532 mov @T[0],`8*$j-128`($out) 1533 ___ 1534 $j++; 1535 push(@T,shift(@T)); 1536 } 1537 $code.=<<___; 1538 mov @T[0],`8*$j-128`($out) # zero 1539 mov @T[0],`8*($j+1)-128`($out) 1540 mov @T[0],`8*($j+2)-128`($out) 1541 mov @T[0],`8*($j+3)-128`($out) 1542 ret 1543 .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 1544 ___ 1545 } 1546 { 1547 my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 1548 1549 $code.=<<___; 1550 .globl rsaz_1024_scatter5_avx2 1551 .type rsaz_1024_scatter5_avx2,\@abi-omnipotent 1552 .align 32 1553 rsaz_1024_scatter5_avx2: 1554 vzeroupper 1555 vmovdqu .Lscatter_permd(%rip),%ymm5 1556 shl \$4,$power 1557 lea ($out,$power),$out 1558 mov \$9,%eax 1559 jmp .Loop_scatter_1024 1560 1561 .align 32 1562 .Loop_scatter_1024: 1563 vmovdqu ($inp),%ymm0 1564 lea 32($inp),$inp 1565 vpermd %ymm0,%ymm5,%ymm0 1566 vmovdqu %xmm0,($out) 1567 lea 16*32($out),$out 1568 dec %eax 1569 jnz .Loop_scatter_1024 1570 1571 vzeroupper 1572 ret 1573 .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1574 1575 .globl rsaz_1024_gather5_avx2 1576 .type rsaz_1024_gather5_avx2,\@abi-omnipotent 1577 .align 32 1578 rsaz_1024_gather5_avx2: 1579 ___ 1580 $code.=<<___ if ($win64); 1581 lea -0x88(%rsp),%rax 1582 vzeroupper 1583 .LSEH_begin_rsaz_1024_gather5: 1584 # I can't trust assembler to use specific encoding:-( 1585 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 1586 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax) 1587 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax) 1588 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax) 1589 .byte 
0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax) 1590 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax) 1591 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax) 1592 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax) 1593 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax) 1594 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax) 1595 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax) 1596 ___ 1597 $code.=<<___; 1598 lea .Lgather_table(%rip),%r11 1599 mov $power,%eax 1600 and \$3,$power 1601 shr \$2,%eax # cache line number 1602 shl \$4,$power # offset within cache line 1603 1604 vmovdqu -32(%r11),%ymm7 # .Lgather_permd 1605 vpbroadcastb 8(%r11,%rax), %xmm8 1606 vpbroadcastb 7(%r11,%rax), %xmm9 1607 vpbroadcastb 6(%r11,%rax), %xmm10 1608 vpbroadcastb 5(%r11,%rax), %xmm11 1609 vpbroadcastb 4(%r11,%rax), %xmm12 1610 vpbroadcastb 3(%r11,%rax), %xmm13 1611 vpbroadcastb 2(%r11,%rax), %xmm14 1612 vpbroadcastb 1(%r11,%rax), %xmm15 1613 1614 lea 64($inp,$power),$inp 1615 mov \$64,%r11 # size optimization 1616 mov \$9,%eax 1617 jmp .Loop_gather_1024 1618 1619 .align 32 1620 .Loop_gather_1024: 1621 vpand -64($inp), %xmm8,%xmm0 1622 vpand ($inp), %xmm9,%xmm1 1623 vpand 64($inp), %xmm10,%xmm2 1624 vpand ($inp,%r11,2), %xmm11,%xmm3 1625 vpor %xmm0,%xmm1,%xmm1 1626 vpand 64($inp,%r11,2), %xmm12,%xmm4 1627 vpor %xmm2,%xmm3,%xmm3 1628 vpand ($inp,%r11,4), %xmm13,%xmm5 1629 vpor %xmm1,%xmm3,%xmm3 1630 vpand 64($inp,%r11,4), %xmm14,%xmm6 1631 vpor %xmm4,%xmm5,%xmm5 1632 vpand -128($inp,%r11,8), %xmm15,%xmm2 1633 lea ($inp,%r11,8),$inp 1634 vpor %xmm3,%xmm5,%xmm5 1635 vpor %xmm2,%xmm6,%xmm6 1636 vpor %xmm5,%xmm6,%xmm6 1637 vpermd %ymm6,%ymm7,%ymm6 1638 vmovdqu %ymm6,($out) 1639 lea 32($out),$out 1640 dec %eax 1641 jnz .Loop_gather_1024 1642 1643 vpxor %ymm0,%ymm0,%ymm0 1644 vmovdqu %ymm0,($out) 1645 vzeroupper 1646 ___ 1647 $code.=<<___ if ($win64); 1648 movaps (%rsp),%xmm6 1649 movaps 0x10(%rsp),%xmm7 1650 movaps 0x20(%rsp),%xmm8 1651 movaps 
0x30(%rsp),%xmm9 1652 movaps 0x40(%rsp),%xmm10 1653 movaps 0x50(%rsp),%xmm11 1654 movaps 0x60(%rsp),%xmm12 1655 movaps 0x70(%rsp),%xmm13 1656 movaps 0x80(%rsp),%xmm14 1657 movaps 0x90(%rsp),%xmm15 1658 lea 0xa8(%rsp),%rsp 1659 .LSEH_end_rsaz_1024_gather5: 1660 ___ 1661 $code.=<<___; 1662 ret 1663 .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 1664 ___ 1665 } 1666 1667 $code.=<<___; 1668 .extern OPENSSL_ia32cap_P 1669 .globl rsaz_avx2_eligible 1670 .type rsaz_avx2_eligible,\@abi-omnipotent 1671 .align 32 1672 rsaz_avx2_eligible: 1673 mov OPENSSL_ia32cap_P+8(%rip),%eax 1674 ___ 1675 $code.=<<___ if ($addx); 1676 mov \$`1<<8|1<<19`,%ecx 1677 mov \$0,%edx 1678 and %eax,%ecx 1679 cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X 1680 cmove %edx,%eax 1681 ___ 1682 $code.=<<___; 1683 and \$`1<<5`,%eax 1684 shr \$5,%eax 1685 ret 1686 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1687 1688 .align 64 1689 .Land_mask: 1690 .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 1691 .Lscatter_permd: 1692 .long 0,2,4,6,7,7,7,7 1693 .Lgather_permd: 1694 .long 0,7,1,7,2,7,3,7 1695 .Lgather_table: 1696 .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 1697 .align 64 1698 ___ 1699 1700 if ($win64) { 1701 $rec="%rcx"; 1702 $frame="%rdx"; 1703 $context="%r8"; 1704 $disp="%r9"; 1705 1706 $code.=<<___ 1707 .extern __imp_RtlVirtualUnwind 1708 .type rsaz_se_handler,\@abi-omnipotent 1709 .align 16 1710 rsaz_se_handler: 1711 push %rsi 1712 push %rdi 1713 push %rbx 1714 push %rbp 1715 push %r12 1716 push %r13 1717 push %r14 1718 push %r15 1719 pushfq 1720 sub \$64,%rsp 1721 1722 mov 120($context),%rax # pull context->Rax 1723 mov 248($context),%rbx # pull context->Rip 1724 1725 mov 8($disp),%rsi # disp->ImageBase 1726 mov 56($disp),%r11 # disp->HandlerData 1727 1728 mov 0(%r11),%r10d # HandlerData[0] 1729 lea (%rsi,%r10),%r10 # prologue label 1730 cmp %r10,%rbx # context->Rip<prologue label 1731 jb .Lcommon_seh_tail 1732 1733 mov 152($context),%rax # pull context->Rsp 1734 1735 mov 4(%r11),%r10d # 
HandlerData[1] 1736 lea (%rsi,%r10),%r10 # epilogue label 1737 cmp %r10,%rbx # context->Rip>=epilogue label 1738 jae .Lcommon_seh_tail 1739 1740 mov 160($context),%rax # pull context->Rbp 1741 1742 mov -48(%rax),%r15 1743 mov -40(%rax),%r14 1744 mov -32(%rax),%r13 1745 mov -24(%rax),%r12 1746 mov -16(%rax),%rbp 1747 mov -8(%rax),%rbx 1748 mov %r15,240($context) 1749 mov %r14,232($context) 1750 mov %r13,224($context) 1751 mov %r12,216($context) 1752 mov %rbp,160($context) 1753 mov %rbx,144($context) 1754 1755 lea -0xd8(%rax),%rsi # %xmm save area 1756 lea 512($context),%rdi # & context.Xmm6 1757 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 1758 .long 0xa548f3fc # cld; rep movsq 1759 1760 .Lcommon_seh_tail: 1761 mov 8(%rax),%rdi 1762 mov 16(%rax),%rsi 1763 mov %rax,152($context) # restore context->Rsp 1764 mov %rsi,168($context) # restore context->Rsi 1765 mov %rdi,176($context) # restore context->Rdi 1766 1767 mov 40($disp),%rdi # disp->ContextRecord 1768 mov $context,%rsi # context 1769 mov \$154,%ecx # sizeof(CONTEXT) 1770 .long 0xa548f3fc # cld; rep movsq 1771 1772 mov $disp,%rsi 1773 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1774 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1775 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1776 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1777 mov 40(%rsi),%r10 # disp->ContextRecord 1778 lea 56(%rsi),%r11 # &disp->HandlerData 1779 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1780 mov %r10,32(%rsp) # arg5 1781 mov %r11,40(%rsp) # arg6 1782 mov %r12,48(%rsp) # arg7 1783 mov %rcx,56(%rsp) # arg8, (NULL) 1784 call *__imp_RtlVirtualUnwind(%rip) 1785 1786 mov \$1,%eax # ExceptionContinueSearch 1787 add \$64,%rsp 1788 popfq 1789 pop %r15 1790 pop %r14 1791 pop %r13 1792 pop %r12 1793 pop %rbp 1794 pop %rbx 1795 pop %rdi 1796 pop %rsi 1797 ret 1798 .size rsaz_se_handler,.-rsaz_se_handler 1799 1800 .section .pdata 1801 .align 4 1802 .rva .LSEH_begin_rsaz_1024_sqr_avx2 1803 .rva .LSEH_end_rsaz_1024_sqr_avx2 1804 .rva .LSEH_info_rsaz_1024_sqr_avx2 
1805 1806 .rva .LSEH_begin_rsaz_1024_mul_avx2 1807 .rva .LSEH_end_rsaz_1024_mul_avx2 1808 .rva .LSEH_info_rsaz_1024_mul_avx2 1809 1810 .rva .LSEH_begin_rsaz_1024_gather5 1811 .rva .LSEH_end_rsaz_1024_gather5 1812 .rva .LSEH_info_rsaz_1024_gather5 1813 .section .xdata 1814 .align 8 1815 .LSEH_info_rsaz_1024_sqr_avx2: 1816 .byte 9,0,0,0 1817 .rva rsaz_se_handler 1818 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue 1819 .LSEH_info_rsaz_1024_mul_avx2: 1820 .byte 9,0,0,0 1821 .rva rsaz_se_handler 1822 .rva .Lmul_1024_body,.Lmul_1024_epilogue 1823 .LSEH_info_rsaz_1024_gather5: 1824 .byte 0x01,0x33,0x16,0x00 1825 .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15 1826 .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14 1827 .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13 1828 .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12 1829 .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11 1830 .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10 1831 .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9 1832 .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8 1833 .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7 1834 .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6 1835 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 1836 ___ 1837 } 1838 1839 foreach (split("\n",$code)) { 1840 s/\`([^\`]*)\`/eval($1)/ge; 1841 1842 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1843 1844 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1845 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1846 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1847 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1848 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1849 print $_,"\n"; 1850 } 1851 1852 }}} else {{{ 1853 print <<___; # assembler is too old 1854 .text 1855 1856 .globl rsaz_avx2_eligible 1857 .type rsaz_avx2_eligible,\@abi-omnipotent 1858 rsaz_avx2_eligible: 1859 xor %eax,%eax 1860 ret 1861 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1862 1863 .globl rsaz_1024_sqr_avx2 1864 .globl rsaz_1024_mul_avx2 1865 .globl 
rsaz_1024_norm2red_avx2 1866 .globl rsaz_1024_red2norm_avx2 1867 .globl rsaz_1024_scatter5_avx2 1868 .globl rsaz_1024_gather5_avx2 1869 .type rsaz_1024_sqr_avx2,\@abi-omnipotent 1870 rsaz_1024_sqr_avx2: 1871 rsaz_1024_mul_avx2: 1872 rsaz_1024_norm2red_avx2: 1873 rsaz_1024_red2norm_avx2: 1874 rsaz_1024_scatter5_avx2: 1875 rsaz_1024_gather5_avx2: 1876 .byte 0x0f,0x0b # ud2 1877 ret 1878 .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 1879 ___ 1880 }}} 1881 1882 close STDOUT; 1883