#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY           #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
# (2) University of Haifa, Israel                                            #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
# [2] S. Gueron: "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
#     Proceedings of 9th International Conference on Information Technology: #
#     New Generations (ITNG 2012), pp. 821-823 (2012)                        #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
#     on AVX2 capable x86_64 platforms",                                     #
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
#
# (*) if system doesn't support AVX2, for reference purposes;

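# Overview (editorial note): this file emits the following AVX2 entry points,
# used by the RSAZ 1024-bit modular exponentiation code: rsaz_1024_sqr_avx2,
# rsaz_1024_mul_avx2, rsaz_1024_norm2red_avx2, rsaz_1024_red2norm_avx2,
# rsaz_1024_scatter5_avx2, rsaz_1024_gather5_avx2 and rsaz_avx2_eligible.
# The names are taken from the .globl directives below.
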
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

open OUT,"| $^X $xlate $flavour $output";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $np="%rdx";	# const BN_ULONG *np,
my $n0="%ecx";	# const BN_ULONG n0,
my $rep="%r8d";	# int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";
# Registers that hold the broadcasted words of bp, currently used
my $B1="%ymm10";
my $B2="%ymm11";
# Registers that hold the broadcasted words of Y, currently used
my $Y1="%ymm12";
my $Y2="%ymm13";
# Helper registers
my $TEMP1="%ymm14";
my $AND_MASK="%ymm15";
# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";			# loop counter
my $tmp = "%r15";

my $FrameSize=32*18+32*8;	# place for A^2 and 2*A

my $aap=$r0;
my $tp0="%rbx";
my $tp1=$r3;
my $tpa=$tmp;

$np="%r13";			# reassigned argument

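# Note on the redundant representation (editorial summary, cf. [1]-[3]):
# a 1024-bit operand is split into ceil(1024/29)=36 digits of 29 bits each,
# held one per 64-bit lane, four digits per %ymm register, hence the nine
# $ACCx registers above.  Keeping digits at 29 bits leaves 64-2*29=6 bits of
# headroom per lane, so a number of 29x29-bit products can be accumulated
# with vpaddq before carries have to be propagated.
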
$code.=<<___;
.text

.globl	rsaz_1024_sqr_avx2
.type	rsaz_1024_sqr_avx2,\@function,5
.align	64
rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
	lea	(%rsp), %rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lsqr_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
	mov	%rdx, $np			# reassigned argument
	sub	\$$FrameSize, %rsp
	mov	$np, $tmp
	sub	\$-128, $rp			# size optimization
	sub	\$-128, $ap
	sub	\$-128, $np

	and	\$4095, $tmp			# see if $np crosses page
	add	\$32*10, $tmp
	shr	\$12, $tmp
	vpxor	$ACC9,$ACC9,$ACC9
	jz	.Lsqr_1024_no_n_copy

	# unaligned 256-bit load that crosses page boundary can
	# cause >2x performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
	sub	\$32*10,%rsp
	vmovdqu	32*0-128($np), $ACC0
	and	\$-2048, %rsp
	vmovdqu	32*1-128($np), $ACC1
	vmovdqu	32*2-128($np), $ACC2
	vmovdqu	32*3-128($np), $ACC3
	vmovdqu	32*4-128($np), $ACC4
	vmovdqu	32*5-128($np), $ACC5
	vmovdqu	32*6-128($np), $ACC6
	vmovdqu	32*7-128($np), $ACC7
	vmovdqu	32*8-128($np), $ACC8
	lea	$FrameSize+128(%rsp),$np
	vmovdqu	$ACC0, 32*0-128($np)
	vmovdqu	$ACC1, 32*1-128($np)
	vmovdqu	$ACC2, 32*2-128($np)
	vmovdqu	$ACC3, 32*3-128($np)
	vmovdqu	$ACC4, 32*4-128($np)
	vmovdqu	$ACC5, 32*5-128($np)
	vmovdqu	$ACC6, 32*6-128($np)
	vmovdqu	$ACC7, 32*7-128($np)
	vmovdqu	$ACC8, 32*8-128($np)
	vmovdqu	$ACC9, 32*9-128($np)		# $ACC9 is zero

.Lsqr_1024_no_n_copy:
	and	\$-1024, %rsp

	vmovdqu	32*1-128($ap), $ACC1
	vmovdqu	32*2-128($ap), $ACC2
	vmovdqu	32*3-128($ap), $ACC3
	vmovdqu	32*4-128($ap), $ACC4
	vmovdqu	32*5-128($ap), $ACC5
	vmovdqu	32*6-128($ap), $ACC6
	vmovdqu	32*7-128($ap), $ACC7
	vmovdqu	32*8-128($ap), $ACC8

	lea	192(%rsp), $tp0			# 64+128=192
	vpbroadcastq	.Land_mask(%rip), $AND_MASK
	jmp	.LOOP_GRANDE_SQR_1024

.align	32
.LOOP_GRANDE_SQR_1024:
	lea	32*18+128(%rsp), $aap		# size optimization
	lea	448(%rsp), $tp1			# 64+128+256=448

	# the squaring is performed as described in Variant B of
	# "Speeding up Big-Number Squaring", so start by calculating
	# the A*2=A+A vector
	vpaddq	$ACC1, $ACC1, $ACC1
	vpbroadcastq	32*0-128($ap), $B1
	vpaddq	$ACC2, $ACC2, $ACC2
	vmovdqa	$ACC1, 32*0-128($aap)
	vpaddq	$ACC3, $ACC3, $ACC3
	vmovdqa	$ACC2, 32*1-128($aap)
	vpaddq	$ACC4, $ACC4, $ACC4
	vmovdqa	$ACC3, 32*2-128($aap)
	vpaddq	$ACC5, $ACC5, $ACC5
	vmovdqa	$ACC4, 32*3-128($aap)
	vpaddq	$ACC6, $ACC6, $ACC6
	vmovdqa	$ACC5, 32*4-128($aap)
	vpaddq	$ACC7, $ACC7, $ACC7
	vmovdqa	$ACC6, 32*5-128($aap)
	vpaddq	$ACC8, $ACC8, $ACC8
	vmovdqa	$ACC7, 32*6-128($aap)
	vpxor	$ACC9, $ACC9, $ACC9
	vmovdqa	$ACC8, 32*7-128($aap)

	vpmuludq	32*0-128($ap), $B1, $ACC0
	vpbroadcastq	32*1-128($ap), $B2
	vmovdqu	$ACC9, 32*9-192($tp0)		# zero upper half
	vpmuludq	$B1, $ACC1, $ACC1
	vmovdqu	$ACC9, 32*10-448($tp1)
	vpmuludq	$B1, $ACC2, $ACC2
	vmovdqu	$ACC9, 32*11-448($tp1)
	vpmuludq	$B1, $ACC3, $ACC3
	vmovdqu	$ACC9, 32*12-448($tp1)
	vpmuludq	$B1, $ACC4, $ACC4
	vmovdqu	$ACC9, 32*13-448($tp1)
	vpmuludq	$B1, $ACC5, $ACC5
	vmovdqu	$ACC9, 32*14-448($tp1)
	vpmuludq	$B1, $ACC6, $ACC6
	vmovdqu	$ACC9, 32*15-448($tp1)
	vpmuludq	$B1, $ACC7, $ACC7
	vmovdqu	$ACC9, 32*16-448($tp1)
	vpmuludq	$B1, $ACC8, $ACC8
	vpbroadcastq	32*2-128($ap), $B1
	vmovdqu	$ACC9, 32*17-448($tp1)

	mov	$ap, $tpa
	mov	\$4, $i
	jmp	.Lsqr_entry_1024
___
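# Note (editorial): the block above peels the first pass of the squaring
# loop: with nothing accumulated in $tp0/$tp1 yet, the first set of column
# products is computed directly, and the loop below is entered at
# .Lsqr_entry_1024, i.e. past its own multiply-accumulate prologue.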
$TEMP0=$Y1;
$TEMP2=$Y2;
$code.=<<___;
.align	32
.LOOP_SQR_1024:
	vpbroadcastq	32*1-128($tpa), $B2
	vpmuludq	32*0-128($ap), $B1, $ACC0
	vpaddq	32*0-192($tp0), $ACC0, $ACC0
	vpmuludq	32*0-128($aap), $B1, $ACC1
	vpaddq	32*1-192($tp0), $ACC1, $ACC1
	vpmuludq	32*1-128($aap), $B1, $ACC2
	vpaddq	32*2-192($tp0), $ACC2, $ACC2
	vpmuludq	32*2-128($aap), $B1, $ACC3
	vpaddq	32*3-192($tp0), $ACC3, $ACC3
	vpmuludq	32*3-128($aap), $B1, $ACC4
	vpaddq	32*4-192($tp0), $ACC4, $ACC4
	vpmuludq	32*4-128($aap), $B1, $ACC5
	vpaddq	32*5-192($tp0), $ACC5, $ACC5
	vpmuludq	32*5-128($aap), $B1, $ACC6
	vpaddq	32*6-192($tp0), $ACC6, $ACC6
	vpmuludq	32*6-128($aap), $B1, $ACC7
	vpaddq	32*7-192($tp0), $ACC7, $ACC7
	vpmuludq	32*7-128($aap), $B1, $ACC8
	vpbroadcastq	32*2-128($tpa), $B1
	vpaddq	32*8-192($tp0), $ACC8, $ACC8
.Lsqr_entry_1024:
	vmovdqu	$ACC0, 32*0-192($tp0)
	vmovdqu	$ACC1, 32*1-192($tp0)

	vpmuludq	32*1-128($ap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC2, $ACC2
	vpmuludq	32*1-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC3, $ACC3
	vpmuludq	32*2-128($aap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC4, $ACC4
	vpmuludq	32*3-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	32*4-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC6, $ACC6
	vpmuludq	32*5-128($aap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC7, $ACC7
	vpmuludq	32*6-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	32*7-128($aap), $B2, $ACC0
	vpbroadcastq	32*3-128($tpa), $B2
	vpaddq	32*9-192($tp0), $ACC0, $ACC0

	vmovdqu	$ACC2, 32*2-192($tp0)
	vmovdqu	$ACC3, 32*3-192($tp0)

	vpmuludq	32*2-128($ap), $B1, $TEMP2
	vpaddq	$TEMP2, $ACC4, $ACC4
	vpmuludq	32*2-128($aap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	32*3-128($aap), $B1, $TEMP1
	vpaddq	$TEMP1, $ACC6, $ACC6
	vpmuludq	32*4-128($aap), $B1, $TEMP2
	vpaddq	$TEMP2, $ACC7, $ACC7
	vpmuludq	32*5-128($aap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	32*6-128($aap), $B1, $TEMP1
	vpaddq	$TEMP1, $ACC0, $ACC0
	vpmuludq	32*7-128($aap), $B1, $ACC1
	vpbroadcastq	32*4-128($tpa), $B1
	vpaddq	32*10-448($tp1), $ACC1, $ACC1

	vmovdqu	$ACC4, 32*4-192($tp0)
	vmovdqu	$ACC5, 32*5-192($tp0)

	vpmuludq	32*3-128($ap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC6, $ACC6
	vpmuludq	32*3-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC7, $ACC7
	vpmuludq	32*4-128($aap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC8, $ACC8
	vpmuludq	32*5-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpmuludq	32*6-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpmuludq	32*7-128($aap), $B2, $ACC2
	vpbroadcastq	32*5-128($tpa), $B2
	vpaddq	32*11-448($tp1), $ACC2, $ACC2

	vmovdqu	$ACC6, 32*6-192($tp0)
	vmovdqu	$ACC7, 32*7-192($tp0)

	vpmuludq	32*4-128($ap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	32*4-128($aap), $B1, $TEMP1
	vpaddq	$TEMP1, $ACC0, $ACC0
	vpmuludq	32*5-128($aap), $B1, $TEMP2
	vpaddq	$TEMP2, $ACC1, $ACC1
	vpmuludq	32*6-128($aap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC2, $ACC2
	vpmuludq	32*7-128($aap), $B1, $ACC3
	vpbroadcastq	32*6-128($tpa), $B1
	vpaddq	32*12-448($tp1), $ACC3, $ACC3

	vmovdqu	$ACC8, 32*8-192($tp0)
	vmovdqu	$ACC0, 32*9-192($tp0)
	lea	8($tp0), $tp0

	vpmuludq	32*5-128($ap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC1, $ACC1
	vpmuludq	32*5-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC2, $ACC2
	vpmuludq	32*6-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC3, $ACC3
	vpmuludq	32*7-128($aap), $B2, $ACC4
	vpbroadcastq	32*7-128($tpa), $B2
	vpaddq	32*13-448($tp1), $ACC4, $ACC4

	vmovdqu	$ACC1, 32*10-448($tp1)
	vmovdqu	$ACC2, 32*11-448($tp1)

	vpmuludq	32*6-128($ap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC3, $ACC3
	vpmuludq	32*6-128($aap), $B1, $TEMP1
	vpbroadcastq	32*8-128($tpa), $ACC0	# borrow $ACC0 for $B1
	vpaddq	$TEMP1, $ACC4, $ACC4
	vpmuludq	32*7-128($aap), $B1, $ACC5
	vpbroadcastq	32*0+8-128($tpa), $B1	# for next iteration
	vpaddq	32*14-448($tp1), $ACC5, $ACC5

	vmovdqu	$ACC3, 32*12-448($tp1)
	vmovdqu	$ACC4, 32*13-448($tp1)
	lea	8($tpa), $tpa

	vpmuludq	32*7-128($ap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	32*7-128($aap), $B2, $ACC6
	vpaddq	32*15-448($tp1), $ACC6, $ACC6

	vpmuludq	32*8-128($ap), $ACC0, $ACC7
	vmovdqu	$ACC5, 32*14-448($tp1)
	vpaddq	32*16-448($tp1), $ACC7, $ACC7
	vmovdqu	$ACC6, 32*15-448($tp1)
	vmovdqu	$ACC7, 32*16-448($tp1)
	lea	8($tp1), $tp1

	dec	$i
	jnz	.LOOP_SQR_1024
___
$ZERO = $ACC9;
$TEMP0 = $B1;
$TEMP2 = $B2;
$TEMP3 = $Y1;
$TEMP4 = $Y2;
$code.=<<___;
	# we need to fix indices 32-39 to avoid overflow
	vmovdqu	32*8(%rsp), $ACC8		# 32*8-192($tp0),
	vmovdqu	32*9(%rsp), $ACC1		# 32*9-192($tp0)
	vmovdqu	32*10(%rsp), $ACC2		# 32*10-192($tp0)
	lea	192(%rsp), $tp0			# 64+128=192

	vpsrlq	\$29, $ACC8, $TEMP1
	vpand	$AND_MASK, $ACC8, $ACC8
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1

	vpermq	\$0x93, $TEMP1, $TEMP1
	vpxor	$ZERO, $ZERO, $ZERO
	vpermq	\$0x93, $TEMP2, $TEMP2

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpaddq	$TEMP2, $ACC2, $ACC2
	vmovdqu	$ACC1, 32*9-192($tp0)
	vmovdqu	$ACC2, 32*10-192($tp0)

	mov	(%rsp), %rax
	mov	8(%rsp), $r1
	mov	16(%rsp), $r2
	mov	24(%rsp), $r3
	vmovdqu	32*1(%rsp), $ACC1
	vmovdqu	32*2-192($tp0), $ACC2
	vmovdqu	32*3-192($tp0), $ACC3
	vmovdqu	32*4-192($tp0), $ACC4
	vmovdqu	32*5-192($tp0), $ACC5
	vmovdqu	32*6-192($tp0), $ACC6
	vmovdqu	32*7-192($tp0), $ACC7

	mov	%rax, $r0
	imull	$n0, %eax
	and	\$0x1fffffff, %eax
	vmovd	%eax, $Y1

	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpbroadcastq	$Y1, $Y1
	add	%rax, $r0
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	shr	\$29, $r0
	add	%rax, $r1
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	add	$r0, $r1
	add	%rax, $r2
	imulq	24-128($np), %rdx
	add	%rdx, $r3

	mov	$r1, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	mov	\$9, $i
	jmp	.LOOP_REDUCE_1024

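	# Note (editorial): the loop below performs the word-by-word
	# Montgomery reduction of the 36-digit redundant result.  One step,
	# in scalar form, is roughly:
	#	y    = (t[0] * n0) & 0x1fffffff	# make low digit divisible
	#	t   += y * N			# by 2^29, then add y*N
	#	t  >>= 29			# and drop the zero digit
	# The interleaved ALU code keeps the lowest digits in $r0-$r3 so that
	# each next y is available before the vector lanes catch up.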
.align	32
.LOOP_REDUCE_1024:
	vmovd	%eax, $Y2
	vpbroadcastq	$Y2, $Y2

	vpmuludq	32*1-128($np), $Y1, $TEMP0
	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpaddq	$TEMP0, $ACC1, $ACC1
	add	%rax, $r1
	vpmuludq	32*2-128($np), $Y1, $TEMP1
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	vpaddq	$TEMP1, $ACC2, $ACC2
	vpmuludq	32*3-128($np), $Y1, $TEMP2
	.byte	0x67
	add	%rax, $r2
	.byte	0x67
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	shr	\$29, $r1
	vpaddq	$TEMP2, $ACC3, $ACC3
	vpmuludq	32*4-128($np), $Y1, $TEMP0
	add	%rax, $r3
	add	$r1, $r2
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpmuludq	32*5-128($np), $Y1, $TEMP1
	mov	$r2, %rax
	imull	$n0, %eax
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpmuludq	32*6-128($np), $Y1, $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpmuludq	32*7-128($np), $Y1, $TEMP0
	vpaddq	$TEMP0, $ACC7, $ACC7
	vpmuludq	32*8-128($np), $Y1, $TEMP1
	vmovd	%eax, $Y1
	#vmovdqu	32*1-8-128($np), $TEMP2		# moved below
	vpaddq	$TEMP1, $ACC8, $ACC8
	#vmovdqu	32*2-8-128($np), $TEMP0		# moved below
	vpbroadcastq	$Y1, $Y1

	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
	vmovdqu	32*3-8-128($np), $TEMP1
	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpaddq	$TEMP2, $ACC1, $ACC1
	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
	vmovdqu	32*4-8-128($np), $TEMP2
	add	%rax, $r2
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	vpaddq	$TEMP0, $ACC2, $ACC2
	add	$r3, %rax
	shr	\$29, $r2
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu	32*5-8-128($np), $TEMP0
	add	$r2, %rax
	vpaddq	$TEMP1, $ACC3, $ACC3
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*6-8-128($np), $TEMP1
	.byte	0x67
	mov	%rax, $r3
	imull	$n0, %eax
	vpaddq	$TEMP2, $ACC4, $ACC4
	vpmuludq	$Y2, $TEMP0, $TEMP0
	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu	32*7-8-128($np), $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu	32*8-8-128($np), $TEMP0
	vpaddq	$TEMP1, $ACC6, $ACC6
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*9-8-128($np), $ACC9
	vmovd	%eax, $ACC0			# borrow ACC0 for Y2
	imulq	-128($np), %rax
	vpaddq	$TEMP2, $ACC7, $ACC7
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vmovdqu	32*1-16-128($np), $TEMP1
	vpbroadcastq	$ACC0, $ACC0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	$Y2, $ACC9, $ACC9
	vmovdqu	32*2-16-128($np), $TEMP2
	add	%rax, $r3

___
($ACC0,$Y2)=($Y2,$ACC0);
$code.=<<___;
	vmovdqu	32*1-24-128($np), $ACC0
	vpmuludq	$Y1, $TEMP1, $TEMP1
	vmovdqu	32*3-16-128($np), $TEMP0
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpmuludq	$Y2, $ACC0, $ACC0
	vpmuludq	$Y1, $TEMP2, $TEMP2
	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu	32*4-16-128($np), $TEMP1
	vpaddq	$ACC1, $ACC0, $ACC0
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpmuludq	$Y1, $TEMP0, $TEMP0
	vmovdqu	32*5-16-128($np), $TEMP2
	.byte	0x67
	vmovq	$ACC0, %rax
	vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
	vpaddq	$TEMP0, $ACC3, $ACC3
	vpmuludq	$Y1, $TEMP1, $TEMP1
	vmovdqu	32*6-16-128($np), $TEMP0
	vpaddq	$TEMP1, $ACC4, $ACC4
	vpmuludq	$Y1, $TEMP2, $TEMP2
	vmovdqu	32*7-16-128($np), $TEMP1
	vpaddq	$TEMP2, $ACC5, $ACC5
	vpmuludq	$Y1, $TEMP0, $TEMP0
	vmovdqu	32*8-16-128($np), $TEMP2
	vpaddq	$TEMP0, $ACC6, $ACC6
	vpmuludq	$Y1, $TEMP1, $TEMP1
	shr	\$29, $r3
	vmovdqu	32*9-16-128($np), $TEMP0
	add	$r3, %rax
	vpaddq	$TEMP1, $ACC7, $ACC7
	vpmuludq	$Y1, $TEMP2, $TEMP2
	#vmovdqu	32*2-24-128($np), $TEMP1	# moved below
	mov	%rax, $r0
	imull	$n0, %eax
	vpaddq	$TEMP2, $ACC8, $ACC8
	vpmuludq	$Y1, $TEMP0, $TEMP0
	and	\$0x1fffffff, %eax
	vmovd	%eax, $Y1
	vmovdqu	32*3-24-128($np), $TEMP2
	.byte	0x67
	vpaddq	$TEMP0, $ACC9, $ACC9
	vpbroadcastq	$Y1, $Y1

	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
	vmovdqu	32*4-24-128($np), $TEMP0
	mov	%rax, %rdx
	imulq	-128($np), %rax
	mov	8(%rsp), $r1
	vpaddq	$TEMP1, $ACC2, $ACC1
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*5-24-128($np), $TEMP1
	add	%rax, $r0
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	.byte	0x67
	shr	\$29, $r0
	mov	16(%rsp), $r2
	vpaddq	$TEMP2, $ACC3, $ACC2
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vmovdqu	32*6-24-128($np), $TEMP2
	add	%rax, $r1
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	vpaddq	$TEMP0, $ACC4, $ACC3
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu	32*7-24-128($np), $TEMP0
	imulq	24-128($np), %rdx		# future $r3
	add	%rax, $r2
	lea	($r0,$r1), %rax
	vpaddq	$TEMP1, $ACC5, $ACC4
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*8-24-128($np), $TEMP1
	mov	%rax, $r1
	imull	$n0, %eax
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vpaddq	$TEMP2, $ACC6, $ACC5
	vmovdqu	32*9-24-128($np), $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq	$TEMP0, $ACC7, $ACC6
	vpmuludq	$Y2, $TEMP1, $TEMP1
	add	24(%rsp), %rdx
	vpaddq	$TEMP1, $ACC8, $ACC7
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vpaddq	$TEMP2, $ACC9, $ACC8
	vmovq	$r3, $ACC9
	mov	%rdx, $r3

	dec	$i
	jnz	.LOOP_REDUCE_1024
___
($ACC0,$Y2)=($Y2,$ACC0);
$code.=<<___;
	lea	448(%rsp), $tp1			# size optimization
	vpaddq	$ACC9, $Y2, $ACC0
	vpxor	$ZERO, $ZERO, $ZERO

	vpaddq	32*9-192($tp0), $ACC0, $ACC0
	vpaddq	32*10-448($tp1), $ACC1, $ACC1
	vpaddq	32*11-448($tp1), $ACC2, $ACC2
	vpaddq	32*12-448($tp1), $ACC3, $ACC3
	vpaddq	32*13-448($tp1), $ACC4, $ACC4
	vpaddq	32*14-448($tp1), $ACC5, $ACC5
	vpaddq	32*15-448($tp1), $ACC6, $ACC6
	vpaddq	32*16-448($tp1), $ACC7, $ACC7
	vpaddq	32*17-448($tp1), $ACC8, $ACC8

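	# Note (editorial): the blocks below propagate carries between the
	# 29-bit digits held four-per-register.  The recurring pattern is:
	#   vpsrlq \$29   - extract the carry out of each digit
	#   vpand         - keep the low 29 bits of each digit
	#   vpermq \$0x93 - rotate the four carries up one lane (lane 3
	#                   wraps around to lane 0)
	#   vpblendd \$3  - splice the wrapped-around carry into lane 0 of
	#                   the next register's carry vector (or zero it at
	#                   the ends of the chain)
	#   vpaddq        - add the carries into the next-higher digits
	# Two such passes bound every digit by 29 bits plus a small carry;
	# a scalar sketch of one pass (illustrative only) is:
	#	for i in 0..34: t[i+1] += t[i] >> 29; t[i] &= 0x1fffffff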
	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vpaddq	$TEMP4, $ACC4, $ACC4

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vmovdqu	$ACC0, 32*0-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vmovdqu	$ACC1, 32*1-128($rp)
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vmovdqu	$ACC2, 32*2-128($rp)
	vpaddq	$TEMP4, $ACC4, $ACC4
	vmovdqu	$ACC3, 32*3-128($rp)
___
$TEMP5=$ACC0;
$code.=<<___;
	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vpaddq	$TEMP4, $ACC8, $ACC8

	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vmovdqu	$ACC4, 32*4-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vmovdqu	$ACC5, 32*5-128($rp)
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vmovdqu	$ACC6, 32*6-128($rp)
	vpaddq	$TEMP4, $ACC8, $ACC8
	vmovdqu	$ACC7, 32*7-128($rp)
	vmovdqu	$ACC8, 32*8-128($rp)

	mov	$rp, $ap
	dec	$rep
	jne	.LOOP_GRANDE_SQR_1024

	vzeroall
	mov	%rbp, %rax
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lsqr_1024_epilogue:
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}

{ # void AMM_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $bp="%rdx";	# const BN_ULONG *bp,
my $np="%rcx";	# const BN_ULONG *np,
my $n0="%r8d";	# unsigned int n0);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";

# Registers that hold the broadcasted words of multiplier, currently used
my $Bi="%ymm10";
my $Yi="%ymm11";

# Helper registers
my $TEMP0=$ACC0;
my $TEMP1="%ymm12";
my $TEMP2="%ymm13";
my $ZERO="%ymm14";
my $AND_MASK="%ymm15";

# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";
my $tmp="%r15";

$bp="%r13";	# reassigned argument

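# Note (editorial): AMM_WW is the word-by-word "almost" Montgomery
# multiplication of [2]: for each 29-bit word b of the multiplier the loop
# below computes acc += b*A, derives y = (acc[0]*n0) mod 2^29, adds y*N and
# shifts the accumulator down by one digit.  The result stays in the
# redundant representation and need not be fully reduced modulo N (hence
# "almost"), which the exponentiation code that consumes it tolerates.
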
$code.=<<___;
.globl	rsaz_1024_mul_avx2
.type	rsaz_1024_mul_avx2,\@function,5
.align	64
rsaz_1024_mul_avx2:
	lea	(%rsp), %rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	vzeroupper
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lmul_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
	vzeroall
	mov	%rdx, $bp	# reassigned argument
	sub	\$64,%rsp

	# unaligned 256-bit load that crosses page boundary can
	# cause severe performance degradation here, so if $ap does
	# cross page boundary, swap it with $bp [meaning that caller
	# is advised to lay down $ap and $bp next to each other, so
	# that only one can cross page boundary].
	.byte	0x67,0x67
	mov	$ap, $tmp
	and	\$4095, $tmp
	add	\$32*10, $tmp
	shr	\$12, $tmp
	mov	$ap, $tmp
	cmovnz	$bp, $ap
	cmovnz	$tmp, $bp

	mov	$np, $tmp
	sub	\$-128,$ap	# size optimization
	sub	\$-128,$np
	sub	\$-128,$rp

	and	\$4095, $tmp	# see if $np crosses page
	add	\$32*10, $tmp
	.byte	0x67,0x67
	shr	\$12, $tmp
	jz	.Lmul_1024_no_n_copy

	# unaligned 256-bit load that crosses page boundary can
	# cause severe performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
	sub	\$32*10,%rsp
	vmovdqu	32*0-128($np), $ACC0
	and	\$-512, %rsp
	vmovdqu	32*1-128($np), $ACC1
	vmovdqu	32*2-128($np), $ACC2
	vmovdqu	32*3-128($np), $ACC3
	vmovdqu	32*4-128($np), $ACC4
	vmovdqu	32*5-128($np), $ACC5
	vmovdqu	32*6-128($np), $ACC6
	vmovdqu	32*7-128($np), $ACC7
	vmovdqu	32*8-128($np), $ACC8
	lea	64+128(%rsp),$np
	vmovdqu	$ACC0, 32*0-128($np)
	vpxor	$ACC0, $ACC0, $ACC0
	vmovdqu	$ACC1, 32*1-128($np)
	vpxor	$ACC1, $ACC1, $ACC1
	vmovdqu	$ACC2, 32*2-128($np)
	vpxor	$ACC2, $ACC2, $ACC2
	vmovdqu	$ACC3, 32*3-128($np)
	vpxor	$ACC3, $ACC3, $ACC3
	vmovdqu	$ACC4, 32*4-128($np)
	vpxor	$ACC4, $ACC4, $ACC4
	vmovdqu	$ACC5, 32*5-128($np)
	vpxor	$ACC5, $ACC5, $ACC5
	vmovdqu	$ACC6, 32*6-128($np)
	vpxor	$ACC6, $ACC6, $ACC6
	vmovdqu	$ACC7, 32*7-128($np)
	vpxor	$ACC7, $ACC7, $ACC7
	vmovdqu	$ACC8, 32*8-128($np)
	vmovdqa	$ACC0, $ACC8
	vmovdqu	$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
.Lmul_1024_no_n_copy:
	and	\$-64,%rsp

	mov	($bp), %rbx
	vpbroadcastq	($bp), $Bi
	vmovdqu	$ACC0, (%rsp)		# clear top of stack
	xor	$r0, $r0
	.byte	0x67
	xor	$r1, $r1
	xor	$r2, $r2
	xor	$r3, $r3

	vmovdqu	.Land_mask(%rip), $AND_MASK
	mov	\$9, $i
	vmovdqu	$ACC9, 32*9-128($rp)	# $ACC9 is zero after vzeroall
	jmp	.Loop_mul_1024

.align	32
.Loop_mul_1024:
	vpsrlq	\$29, $ACC3, $ACC9		# correct $ACC3(*)
	mov	%rbx, %rax
	imulq	-128($ap), %rax
	add	$r0, %rax
	mov	%rbx, $r1
	imulq	8-128($ap), $r1
	add	8(%rsp), $r1

	mov	%rax, $r0
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	mov	%rbx, $r2
	imulq	16-128($ap), $r2
	add	16(%rsp), $r2

	mov	%rbx, $r3
	imulq	24-128($ap), $r3
	add	24(%rsp), $r3
	vpmuludq	32*1-128($ap),$Bi,$TEMP0
	vmovd	%eax, $Yi
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	32*2-128($ap),$Bi,$TEMP1
	vpbroadcastq	$Yi, $Yi
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	32*3-128($ap),$Bi,$TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3		# correct $ACC3
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	32*4-128($ap),$Bi,$TEMP0
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	32*5-128($ap),$Bi,$TEMP1
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	32*6-128($ap),$Bi,$TEMP2
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	32*7-128($ap),$Bi,$TEMP0
	vpermq	\$0x93, $ACC9, $ACC9		# correct $ACC3
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	32*8-128($ap),$Bi,$TEMP1
	vpbroadcastq	8($bp), $Bi
	vpaddq	$TEMP1,$ACC8,$ACC8

	mov	%rax,%rdx
	imulq	-128($np),%rax
	add	%rax,$r0
	mov	%rdx,%rax
	imulq	8-128($np),%rax
	add	%rax,$r1
	mov	%rdx,%rax
	imulq	16-128($np),%rax
	add	%rax,$r2
	shr	\$29, $r0
	imulq	24-128($np),%rdx
	add	%rdx,$r3
	add	$r0, $r1

	vpmuludq	32*1-128($np),$Yi,$TEMP2
	vmovq	$Bi, %rbx
	vpaddq	$TEMP2,$ACC1,$ACC1
	vpmuludq	32*2-128($np),$Yi,$TEMP0
	vpaddq	$TEMP0,$ACC2,$ACC2
	vpmuludq	32*3-128($np),$Yi,$TEMP1
	vpaddq	$TEMP1,$ACC3,$ACC3
	vpmuludq	32*4-128($np),$Yi,$TEMP2
	vpaddq	$TEMP2,$ACC4,$ACC4
	vpmuludq	32*5-128($np),$Yi,$TEMP0
	vpaddq	$TEMP0,$ACC5,$ACC5
	vpmuludq	32*6-128($np),$Yi,$TEMP1
	vpaddq	$TEMP1,$ACC6,$ACC6
	vpmuludq	32*7-128($np),$Yi,$TEMP2
	vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
	vpaddq	$TEMP2,$ACC7,$ACC7
	vpmuludq	32*8-128($np),$Yi,$TEMP0
	vpaddq	$ACC9, $ACC3, $ACC3		# correct $ACC3
	vpaddq	$TEMP0,$ACC8,$ACC8

	mov	%rbx, %rax
	imulq	-128($ap),%rax
	add	%rax,$r1
	vmovdqu	-8+32*1-128($ap),$TEMP1
	mov	%rbx, %rax
	imulq	8-128($ap),%rax
	add	%rax,$r2
	vmovdqu	-8+32*2-128($ap),$TEMP2

	mov	$r1, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	imulq	16-128($ap),%rbx
	add	%rbx,$r3
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovd	%eax, $Yi
	vmovdqu	-8+32*3-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC1,$ACC1
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpbroadcastq	$Yi, $Yi
	vmovdqu	-8+32*4-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC2,$ACC2
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-8+32*5-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC3,$ACC3
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-8+32*6-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC4,$ACC4
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-8+32*7-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC5,$ACC5
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-8+32*8-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC6,$ACC6
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-8+32*9-128($ap),$ACC9
	vpaddq	$TEMP1,$ACC7,$ACC7
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpaddq	$TEMP2,$ACC8,$ACC8
	vpmuludq	$Bi,$ACC9,$ACC9
	vpbroadcastq	16($bp), $Bi

	mov	%rax,%rdx
	imulq	-128($np),%rax
	add	%rax,$r1
	vmovdqu	-8+32*1-128($np),$TEMP0
	mov	%rdx,%rax
	imulq	8-128($np),%rax
	add	%rax,$r2
	vmovdqu	-8+32*2-128($np),$TEMP1
	shr	\$29, $r1
	imulq	16-128($np),%rdx
	add	%rdx,$r3
	add	$r1, $r2

	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovq	$Bi, %rbx
	vmovdqu	-8+32*3-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-8+32*4-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-8+32*5-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-8+32*6-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-8+32*7-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-8+32*8-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-8+32*9-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vpaddq	$TEMP2,$ACC9,$ACC9

	vmovdqu	-16+32*1-128($ap),$TEMP0
	mov	%rbx,%rax
	imulq	-128($ap),%rax
	add	$r2,%rax

	vmovdqu	-16+32*2-128($ap),$TEMP1
	mov	%rax,$r2
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	imulq	8-128($ap),%rbx
	add	%rbx,$r3
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovd	%eax, $Yi
	vmovdqu	-16+32*3-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpbroadcastq	$Yi, $Yi
	vmovdqu	-16+32*4-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-16+32*5-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-16+32*6-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-16+32*7-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-16+32*8-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-16+32*9-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpbroadcastq	24($bp), $Bi
	vpaddq	$TEMP2,$ACC9,$ACC9

	vmovdqu	-16+32*1-128($np),$TEMP0
	mov	%rax,%rdx
	imulq	-128($np),%rax
	add	%rax,$r2
	vmovdqu	-16+32*2-128($np),$TEMP1
	imulq	8-128($np),%rdx
	add	%rdx,$r3
	shr	\$29, $r2

	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovq	$Bi, %rbx
	vmovdqu	-16+32*3-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-16+32*4-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-16+32*5-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-16+32*6-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-16+32*7-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-16+32*8-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-16+32*9-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-24+32*1-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-24+32*2-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC9,$ACC9

	add	$r2, $r3
	imulq	-128($ap),%rbx
	add	%rbx,$r3

	mov	$r3, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovd	%eax, $Yi
	vmovdqu	-24+32*3-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpbroadcastq	$Yi, $Yi
	vmovdqu	-24+32*4-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-24+32*5-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-24+32*6-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-24+32*7-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-24+32*8-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-24+32*9-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpbroadcastq	32($bp), $Bi
	vpaddq	$TEMP2,$ACC9,$ACC9
	add	\$32, $bp			# $bp++

	vmovdqu	-24+32*1-128($np),$TEMP0
	imulq	-128($np),%rax
	add	%rax,$r3
	shr	\$29, $r3

	vmovdqu	-24+32*2-128($np),$TEMP1
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovq	$Bi, %rbx
	vmovdqu	-24+32*3-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
	vpaddq	$TEMP1,$ACC2,$ACC1
	vmovdqu	-24+32*4-128($np),$TEMP0
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-24+32*5-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC2
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-24+32*6-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC3
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-24+32*7-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC4
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-24+32*8-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC5
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-24+32*9-128($np),$TEMP2
	mov	$r3, $r0
	vpaddq	$TEMP0,$ACC7,$ACC6
	vpmuludq	$Yi,$TEMP1,$TEMP1
	add	(%rsp), $r0
	vpaddq	$TEMP1,$ACC8,$ACC7
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovq	$r3, $TEMP1
	vpaddq	$TEMP2,$ACC9,$ACC8

	dec	$i
	jnz	.Loop_mul_1024
___

# (*)	Original implementation was correcting ACC1-ACC3 for overflow
#	after 7 loop runs, or after 28 iterations, or 56 additions.
#	But as we underutilize resources, it's possible to correct in
#	each iteration with marginal performance loss.  And then, since
#	we do it in each iteration, we can correct fewer digits, and
#	avoid performance penalties completely.  Also note that we
#	correct only three digits out of four.  This works because the
#	most significant digit is subjected to fewer additions.
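# Note (editorial): .Land_mask holds -1 in its top lane, so the in-loop
# correction above leaves the most significant digit of $ACC3 unmasked;
# the vpermq \$0 below re-broadcasts lane 0 (0x1fffffff) to all four lanes
# before the final two-pass carry propagation (the same vpsrlq/vpermq/
# vpblendd pattern as in rsaz_1024_sqr_avx2), after which the result is
# stored to $rp.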

$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
	vpermq	\$0, $AND_MASK, $AND_MASK
	vpaddq	(%rsp), $TEMP1, $ACC0

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vpaddq	$TEMP4, $ACC4, $ACC4

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vpaddq	$TEMP4, $ACC4, $ACC4

	vmovdqu	$ACC0, 0-128($rp)
	vmovdqu	$ACC1, 32-128($rp)
	vmovdqu	$ACC2, 64-128($rp)
	vmovdqu	$ACC3, 96-128($rp)
___

$TEMP5=$ACC0;
$code.=<<___;
	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vpaddq	$TEMP4, $ACC8, $ACC8

	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vpaddq	$TEMP4, $ACC8, $ACC8

	vmovdqu	$ACC4, 128-128($rp)
	vmovdqu	$ACC5, 160-128($rp)
	vmovdqu	$ACC6, 192-128($rp)
	vmovdqu	$ACC7, 224-128($rp)
	vmovdqu	$ACC8, 256-128($rp)
	vzeroupper

	mov	%rbp, %rax
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lmul_1024_epilogue:
	ret
.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
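# Note (editorial): rsaz_1024_norm2red_avx2 and rsaz_1024_red2norm_avx2,
# generated below, convert between the canonical sixteen 64-bit word form
# and the redundant 29-bit digit form: norm2red slices digit i out of bits
# [29*i, 29*i+29) of the input (36 digits plus zero padding), while
# red2norm adds the digits back at their bit offsets and propagates the
# carries between the output words.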
{
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
my @T = map("%r$_",(8..11));

$code.=<<___;
.globl	rsaz_1024_red2norm_avx2
.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
.align	32
rsaz_1024_red2norm_avx2:
	sub	\$-128,$inp	# size optimization
	xor	%rax,%rax
___

for ($j=0,$i=0; $i<16; $i++) {
    my $k=0;
    while (29*$j<64*($i+1)) {	# load data till boundary
	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
	$j++; $k++; push(@T,shift(@T));
    }
    $l=$k;
    while ($k>1) {		# shift loaded data but last value
	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
	$k--;
    }
    $code.=<<___;		# shift last value
	mov	@T[-1], @T[0]
	shl	\$`29*($j-1)`, @T[-1]
	shr	\$`-29*($j-1)`, @T[0]
___
    while ($l) {		# accumulate all values
	$code.="	add	@T[-$l], %rax\n";
	$l--;
    }
    $code.=<<___;
	adc	\$0, @T[0]	# consume eventual carry
	mov	%rax, 8*$i($out)
	mov	@T[0], %rax
___
    push(@T,shift(@T));
}
$code.=<<___;
	ret
.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2

.globl	rsaz_1024_norm2red_avx2
.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
.align	32
rsaz_1024_norm2red_avx2:
	sub	\$-128,$out	# size optimization
	mov	($inp),@T[0]
	mov	\$0x1fffffff,%eax
___
for ($j=0,$i=0; $i<16; $i++) {
    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
    my $k=1;
    while (29*($j+1)<64*($i+1)) {
	$code.=<<___;
	mov	@T[0],@T[-$k]
	shr	\$`29*$j`,@T[-$k]
	and	%rax,@T[-$k]			# &0x1fffffff
	mov	@T[-$k],`8*$j-128`($out)
___
	$j++; $k++;
    }
    $code.=<<___;
	shrd	\$`29*$j`,@T[1],@T[0]
	and	%rax,@T[0]
	mov	@T[0],`8*$j-128`($out)
___
    $j++;
    push(@T,shift(@T));
}
$code.=<<___;
	mov	@T[0],`8*$j-128`($out)		# zero
	mov	@T[0],`8*($j+1)-128`($out)
	mov	@T[0],`8*($j+2)-128`($out)
	mov	@T[0],`8*($j+3)-128`($out)
	ret
.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
___
}
{
my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");

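# Note (editorial): rsaz_1024_scatter5_avx2 stores a redundant-form value
# into slot $power of a 32-entry table (16 bytes per entry per 512-byte row);
# rsaz_1024_gather5_avx2 reads entry $power back.  To blunt cache-timing
# attacks the gather loads from all eight cache lines of every row and
# selects the wanted one with the byte masks built from .Lgather_table,
# rather than loading from a secret-dependent address.
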
$code.=<<___;
.globl	rsaz_1024_scatter5_avx2
.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_scatter5_avx2:
	vzeroupper
	vmovdqu	.Lscatter_permd(%rip),%ymm5
	shl	\$4,$power
	lea	($out,$power),$out
	mov	\$9,%eax
	jmp	.Loop_scatter_1024

.align	32
.Loop_scatter_1024:
	vmovdqu	($inp),%ymm0
	lea	32($inp),$inp
	vpermd	%ymm0,%ymm5,%ymm0
	vmovdqu	%xmm0,($out)
	lea	16*32($out),$out
	dec	%eax
	jnz	.Loop_scatter_1024

	vzeroupper
	ret
.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2

.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_gather5_avx2:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	vzeroupper
.LSEH_begin_rsaz_1024_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
___
$code.=<<___;
	lea	.Lgather_table(%rip),%r11
	mov	$power,%eax
	and	\$3,$power
	shr	\$2,%eax			# cache line number
	shl	\$4,$power			# offset within cache line

	vmovdqu	-32(%r11),%ymm7			# .Lgather_permd
	vpbroadcastb	8(%r11,%rax), %xmm8
	vpbroadcastb	7(%r11,%rax), %xmm9
	vpbroadcastb	6(%r11,%rax), %xmm10
	vpbroadcastb	5(%r11,%rax), %xmm11
	vpbroadcastb	4(%r11,%rax), %xmm12
	vpbroadcastb	3(%r11,%rax), %xmm13
	vpbroadcastb	2(%r11,%rax), %xmm14
	vpbroadcastb	1(%r11,%rax), %xmm15

	lea	64($inp,$power),$inp
	mov	\$64,%r11			# size optimization
	mov	\$9,%eax
	jmp	.Loop_gather_1024

.align	32
.Loop_gather_1024:
	vpand	-64($inp), %xmm8,%xmm0
	vpand	($inp), %xmm9,%xmm1
	vpand	64($inp), %xmm10,%xmm2
	vpand	($inp,%r11,2), %xmm11,%xmm3
	vpor	%xmm0,%xmm1,%xmm1
	vpand	64($inp,%r11,2), %xmm12,%xmm4
	vpor	%xmm2,%xmm3,%xmm3
	vpand	($inp,%r11,4), %xmm13,%xmm5
	vpor	%xmm1,%xmm3,%xmm3
	vpand	64($inp,%r11,4), %xmm14,%xmm6
	vpor	%xmm4,%xmm5,%xmm5
	vpand	-128($inp,%r11,8), %xmm15,%xmm2
	lea	($inp,%r11,8),$inp
	vpor	%xmm3,%xmm5,%xmm5
	vpor	%xmm2,%xmm6,%xmm6
	vpor	%xmm5,%xmm6,%xmm6
	vpermd	%ymm6,%ymm7,%ymm6
	vmovdqu	%ymm6,($out)
	lea	32($out),$out
	dec	%eax
	jnz	.Loop_gather_1024

	vpxor	%ymm0,%ymm0,%ymm0
	vmovdqu	%ymm0,($out)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
	ret
.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
.align	32
rsaz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___ if ($addx);
	mov	\$`1<<8|1<<19`,%ecx
	mov	\$0,%edx
	and	%eax,%ecx
	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
	cmove	%edx,%eax
___
$code.=<<___;
	and	\$`1<<5`,%eax
	shr	\$5,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.align	64
.Land_mask:
	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
.Lscatter_permd:
	.long	0,2,4,6,7,7,7,7
.Lgather_permd:
	.long	0,7,1,7,2,7,3,7
.Lgather_table:
	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
.align	64
___

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	rsaz_se_handler,\@abi-omnipotent
.align	16
rsaz_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_se_handler,.-rsaz_se_handler

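	# Note (editorial): the .pdata/.xdata emitted below register Win64
	# SEH unwind data: the sqr and mul entries point at rsaz_se_handler
	# with their body/epilogue labels as HandlerData, while gather5
	# carries a hand-coded UNWIND_INFO descriptor for its xmm save area.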
.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
	.rva	.LSEH_end_rsaz_1024_sqr_avx2
	.rva	.LSEH_info_rsaz_1024_sqr_avx2

	.rva	.LSEH_begin_rsaz_1024_mul_avx2
	.rva	.LSEH_end_rsaz_1024_mul_avx2
	.rva	.LSEH_info_rsaz_1024_mul_avx2

	.rva	.LSEH_begin_rsaz_1024_gather5
	.rva	.LSEH_end_rsaz_1024_gather5
	.rva	.LSEH_info_rsaz_1024_gather5
.section	.xdata
.align	8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue
.LSEH_info_rsaz_1024_mul_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub rsp,0xa8
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

}}} else {{{
print <<___;	# assembler is too old
.text

.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl	rsaz_1024_sqr_avx2
.globl	rsaz_1024_mul_avx2
.globl	rsaz_1024_norm2red_avx2
.globl	rsaz_1024_red2norm_avx2
.globl	rsaz_1024_scatter5_avx2
.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte	0x0f,0x0b	# ud2
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

close STDOUT;