#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
74 # 75 # x86_64 SSSE3 AVX[2] 76 # P4 9.05 - 77 # Opteron 6.26 - 78 # Core2 6.55 6.05/+8% - 79 # Westmere 6.73 5.30/+27% - 80 # Sandy Bridge 7.70 6.10/+26% 4.99/+54% 81 # Ivy Bridge 6.06 4.67/+30% 4.60/+32% 82 # Haswell 5.45 4.15/+31% 3.57/+53% 83 # Skylake 5.18 4.06/+28% 3.54/+46% 84 # Bulldozer 9.11 5.95/+53% 85 # Ryzen 4.75 3.80/+24% 1.93/+150%(**) 86 # VIA Nano 9.32 7.15/+30% 87 # Atom 10.3 9.17/+12% 88 # Silvermont 13.1(*) 9.37/+40% 89 # Knights L 13.2(*) 9.68/+36% 8.30/+59% 90 # Goldmont 8.13 6.42/+27% 1.70/+380%(**) 91 # 92 # (*) obviously suboptimal result, nothing was done about it, 93 # because SSSE3 code is compiled unconditionally; 94 # (**) SHAEXT result 95 96 $flavour = shift; 97 $output = shift; 98 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 99 100 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 101 102 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 103 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 104 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 105 die "can't locate x86_64-xlate.pl"; 106 107 # In upstream, this is controlled by shelling out to the compiler to check 108 # versions, but BoringSSL is intended to be used with pre-generated perlasm 109 # output, so this isn't useful anyway. 110 # 111 # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it 112 # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream 113 # did not tie them together until after $shaext was added. 114 $avx = 1; 115 116 # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's 117 # been tested. 
118 $shaext=0; ### set to zero if compiling for 1.0.1 119 $avx=1 if (!$shaext && $avx); 120 121 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 122 *STDOUT=*OUT; 123 124 $ctx="%rdi"; # 1st arg 125 $inp="%rsi"; # 2nd arg 126 $num="%rdx"; # 3rd arg 127 128 # reassign arguments in order to produce more compact code 129 $ctx="%r8"; 130 $inp="%r9"; 131 $num="%r10"; 132 133 $t0="%eax"; 134 $t1="%ebx"; 135 $t2="%ecx"; 136 @xi=("%edx","%ebp","%r14d"); 137 $A="%esi"; 138 $B="%edi"; 139 $C="%r11d"; 140 $D="%r12d"; 141 $E="%r13d"; 142 143 @V=($A,$B,$C,$D,$E); 144 145 sub BODY_00_19 { 146 my ($i,$a,$b,$c,$d,$e)=@_; 147 my $j=$i+1; 148 $code.=<<___ if ($i==0); 149 mov `4*$i`($inp),$xi[0] 150 bswap $xi[0] 151 ___ 152 $code.=<<___ if ($i<15); 153 mov `4*$j`($inp),$xi[1] 154 mov $d,$t0 155 mov $xi[0],`4*$i`(%rsp) 156 mov $a,$t2 157 bswap $xi[1] 158 xor $c,$t0 159 rol \$5,$t2 160 and $b,$t0 161 lea 0x5a827999($xi[0],$e),$e 162 add $t2,$e 163 xor $d,$t0 164 rol \$30,$b 165 add $t0,$e 166 ___ 167 $code.=<<___ if ($i>=15); 168 xor `4*($j%16)`(%rsp),$xi[1] 169 mov $d,$t0 170 mov $xi[0],`4*($i%16)`(%rsp) 171 mov $a,$t2 172 xor `4*(($j+2)%16)`(%rsp),$xi[1] 173 xor $c,$t0 174 rol \$5,$t2 175 xor `4*(($j+8)%16)`(%rsp),$xi[1] 176 and $b,$t0 177 lea 0x5a827999($xi[0],$e),$e 178 rol \$30,$b 179 xor $d,$t0 180 add $t2,$e 181 rol \$1,$xi[1] 182 add $t0,$e 183 ___ 184 push(@xi,shift(@xi)); 185 } 186 187 sub BODY_20_39 { 188 my ($i,$a,$b,$c,$d,$e)=@_; 189 my $j=$i+1; 190 my $K=($i<40)?0x6ed9eba1:0xca62c1d6; 191 $code.=<<___ if ($i<79); 192 xor `4*($j%16)`(%rsp),$xi[1] 193 mov $b,$t0 194 `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` 195 mov $a,$t2 196 xor `4*(($j+2)%16)`(%rsp),$xi[1] 197 xor $d,$t0 198 rol \$5,$t2 199 xor `4*(($j+8)%16)`(%rsp),$xi[1] 200 lea $K($xi[0],$e),$e 201 xor $c,$t0 202 add $t2,$e 203 rol \$30,$b 204 add $t0,$e 205 rol \$1,$xi[1] 206 ___ 207 $code.=<<___ if ($i==79); 208 mov $b,$t0 209 mov $a,$t2 210 xor $d,$t0 211 lea $K($xi[0],$e),$e 212 rol \$5,$t2 213 xor $c,$t0 
214 add $t2,$e 215 rol \$30,$b 216 add $t0,$e 217 ___ 218 push(@xi,shift(@xi)); 219 } 220 221 sub BODY_40_59 { 222 my ($i,$a,$b,$c,$d,$e)=@_; 223 my $j=$i+1; 224 $code.=<<___; 225 xor `4*($j%16)`(%rsp),$xi[1] 226 mov $d,$t0 227 mov $xi[0],`4*($i%16)`(%rsp) 228 mov $d,$t1 229 xor `4*(($j+2)%16)`(%rsp),$xi[1] 230 and $c,$t0 231 mov $a,$t2 232 xor `4*(($j+8)%16)`(%rsp),$xi[1] 233 lea 0x8f1bbcdc($xi[0],$e),$e 234 xor $c,$t1 235 rol \$5,$t2 236 add $t0,$e 237 rol \$1,$xi[1] 238 and $b,$t1 239 add $t2,$e 240 rol \$30,$b 241 add $t1,$e 242 ___ 243 push(@xi,shift(@xi)); 244 } 245 246 $code.=<<___; 247 .text 248 .extern OPENSSL_ia32cap_P 249 250 .globl sha1_block_data_order 251 .type sha1_block_data_order,\@function,3 252 .align 16 253 sha1_block_data_order: 254 leaq OPENSSL_ia32cap_P(%rip),%r10 255 mov 0(%r10),%r9d 256 mov 4(%r10),%r8d 257 mov 8(%r10),%r10d 258 test \$`1<<9`,%r8d # check SSSE3 bit 259 jz .Lialu 260 ___ 261 $code.=<<___ if ($shaext); 262 test \$`1<<29`,%r10d # check SHA bit 263 jnz _shaext_shortcut 264 ___ 265 $code.=<<___ if ($avx>1); 266 and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2 267 cmp \$`1<<3|1<<5|1<<8`,%r10d 268 je _avx2_shortcut 269 ___ 270 $code.=<<___ if ($avx); 271 and \$`1<<28`,%r8d # mask AVX bit 272 and \$`1<<30`,%r9d # mask "Intel CPU" bit 273 or %r9d,%r8d 274 cmp \$`1<<28|1<<30`,%r8d 275 je _avx_shortcut 276 ___ 277 $code.=<<___; 278 jmp _ssse3_shortcut 279 280 .align 16 281 .Lialu: 282 mov %rsp,%rax 283 push %rbx 284 push %rbp 285 push %r12 286 push %r13 287 push %r14 288 mov %rdi,$ctx # reassigned argument 289 sub \$`8+16*4`,%rsp 290 mov %rsi,$inp # reassigned argument 291 and \$-64,%rsp 292 mov %rdx,$num # reassigned argument 293 mov %rax,`16*4`(%rsp) 294 .Lprologue: 295 296 mov 0($ctx),$A 297 mov 4($ctx),$B 298 mov 8($ctx),$C 299 mov 12($ctx),$D 300 mov 16($ctx),$E 301 jmp .Lloop 302 303 .align 16 304 .Lloop: 305 ___ 306 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 307 for(;$i<40;$i++) { &BODY_20_39($i,@V); 
unshift(@V,pop(@V)); } 308 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 309 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 310 $code.=<<___; 311 add 0($ctx),$A 312 add 4($ctx),$B 313 add 8($ctx),$C 314 add 12($ctx),$D 315 add 16($ctx),$E 316 mov $A,0($ctx) 317 mov $B,4($ctx) 318 mov $C,8($ctx) 319 mov $D,12($ctx) 320 mov $E,16($ctx) 321 322 sub \$1,$num 323 lea `16*4`($inp),$inp 324 jnz .Lloop 325 326 mov `16*4`(%rsp),%rsi 327 mov -40(%rsi),%r14 328 mov -32(%rsi),%r13 329 mov -24(%rsi),%r12 330 mov -16(%rsi),%rbp 331 mov -8(%rsi),%rbx 332 lea (%rsi),%rsp 333 .Lepilogue: 334 ret 335 .size sha1_block_data_order,.-sha1_block_data_order 336 ___ 337 if ($shaext) {{{ 338 ###################################################################### 339 # Intel SHA Extensions implementation of SHA1 update function. 340 # 341 my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx"); 342 my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9)); 343 my @MSG=map("%xmm$_",(4..7)); 344 345 $code.=<<___; 346 .type sha1_block_data_order_shaext,\@function,3 347 .align 32 348 sha1_block_data_order_shaext: 349 _shaext_shortcut: 350 ___ 351 $code.=<<___ if ($win64); 352 lea `-8-4*16`(%rsp),%rsp 353 movaps %xmm6,-8-4*16(%rax) 354 movaps %xmm7,-8-3*16(%rax) 355 movaps %xmm8,-8-2*16(%rax) 356 movaps %xmm9,-8-1*16(%rax) 357 .Lprologue_shaext: 358 ___ 359 $code.=<<___; 360 movdqu ($ctx),$ABCD 361 movd 16($ctx),$E 362 movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap 363 364 movdqu ($inp),@MSG[0] 365 pshufd \$0b00011011,$ABCD,$ABCD # flip word order 366 movdqu 0x10($inp),@MSG[1] 367 pshufd \$0b00011011,$E,$E # flip word order 368 movdqu 0x20($inp),@MSG[2] 369 pshufb $BSWAP,@MSG[0] 370 movdqu 0x30($inp),@MSG[3] 371 pshufb $BSWAP,@MSG[1] 372 pshufb $BSWAP,@MSG[2] 373 movdqa $E,$E_SAVE # offload $E 374 pshufb $BSWAP,@MSG[3] 375 jmp .Loop_shaext 376 377 .align 16 378 .Loop_shaext: 379 dec $num 380 lea 0x40($inp),%r8 # next input block 381 paddd @MSG[0],$E 382 cmovne 
%r8,$inp 383 movdqa $ABCD,$ABCD_SAVE # offload $ABCD 384 ___ 385 for($i=0;$i<20-4;$i+=2) { 386 $code.=<<___; 387 sha1msg1 @MSG[1],@MSG[0] 388 movdqa $ABCD,$E_ 389 sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3... 390 sha1nexte @MSG[1],$E_ 391 pxor @MSG[2],@MSG[0] 392 sha1msg1 @MSG[2],@MSG[1] 393 sha1msg2 @MSG[3],@MSG[0] 394 395 movdqa $ABCD,$E 396 sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD 397 sha1nexte @MSG[2],$E 398 pxor @MSG[3],@MSG[1] 399 sha1msg2 @MSG[0],@MSG[1] 400 ___ 401 push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); 402 } 403 $code.=<<___; 404 movdqu ($inp),@MSG[0] 405 movdqa $ABCD,$E_ 406 sha1rnds4 \$3,$E,$ABCD # 64-67 407 sha1nexte @MSG[1],$E_ 408 movdqu 0x10($inp),@MSG[1] 409 pshufb $BSWAP,@MSG[0] 410 411 movdqa $ABCD,$E 412 sha1rnds4 \$3,$E_,$ABCD # 68-71 413 sha1nexte @MSG[2],$E 414 movdqu 0x20($inp),@MSG[2] 415 pshufb $BSWAP,@MSG[1] 416 417 movdqa $ABCD,$E_ 418 sha1rnds4 \$3,$E,$ABCD # 72-75 419 sha1nexte @MSG[3],$E_ 420 movdqu 0x30($inp),@MSG[3] 421 pshufb $BSWAP,@MSG[2] 422 423 movdqa $ABCD,$E 424 sha1rnds4 \$3,$E_,$ABCD # 76-79 425 sha1nexte $E_SAVE,$E 426 pshufb $BSWAP,@MSG[3] 427 428 paddd $ABCD_SAVE,$ABCD 429 movdqa $E,$E_SAVE # offload $E 430 431 jnz .Loop_shaext 432 433 pshufd \$0b00011011,$ABCD,$ABCD 434 pshufd \$0b00011011,$E,$E 435 movdqu $ABCD,($ctx) 436 movd $E,16($ctx) 437 ___ 438 $code.=<<___ if ($win64); 439 movaps -8-4*16(%rax),%xmm6 440 movaps -8-3*16(%rax),%xmm7 441 movaps -8-2*16(%rax),%xmm8 442 movaps -8-1*16(%rax),%xmm9 443 mov %rax,%rsp 444 .Lepilogue_shaext: 445 ___ 446 $code.=<<___; 447 ret 448 .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext 449 ___ 450 }}} 451 {{{ 452 my $Xi=4; 453 my @X=map("%xmm$_",(4..7,0..3)); 454 my @Tx=map("%xmm$_",(8..10)); 455 my $Kx="%xmm11"; 456 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization 457 my @T=("%esi","%edi"); 458 my $j=0; 459 my $rx=0; 460 my $K_XX_XX="%r14"; 461 my $fp="%r11"; 462 463 my $_rol=sub { &rol(@_) }; 464 my $_ror=sub { &ror(@_) }; 465 466 
{ my $sn; 467 sub align32() { 468 ++$sn; 469 $code.=<<___; 470 jmp .Lalign32_$sn # see "Decoded ICache" in manual 471 .align 32 472 .Lalign32_$sn: 473 ___ 474 } 475 } 476 477 $code.=<<___; 478 .type sha1_block_data_order_ssse3,\@function,3 479 .align 16 480 sha1_block_data_order_ssse3: 481 _ssse3_shortcut: 482 mov %rsp,$fp # frame pointer 483 push %rbx 484 push %rbp 485 push %r12 486 push %r13 # redundant, done to share Win64 SE handler 487 push %r14 488 lea `-64-($win64?6*16:0)`(%rsp),%rsp 489 ___ 490 $code.=<<___ if ($win64); 491 movaps %xmm6,-40-6*16($fp) 492 movaps %xmm7,-40-5*16($fp) 493 movaps %xmm8,-40-4*16($fp) 494 movaps %xmm9,-40-3*16($fp) 495 movaps %xmm10,-40-2*16($fp) 496 movaps %xmm11,-40-1*16($fp) 497 .Lprologue_ssse3: 498 ___ 499 $code.=<<___; 500 and \$-64,%rsp 501 mov %rdi,$ctx # reassigned argument 502 mov %rsi,$inp # reassigned argument 503 mov %rdx,$num # reassigned argument 504 505 shl \$6,$num 506 add $inp,$num 507 lea K_XX_XX+64(%rip),$K_XX_XX 508 509 mov 0($ctx),$A # load context 510 mov 4($ctx),$B 511 mov 8($ctx),$C 512 mov 12($ctx),$D 513 mov $B,@T[0] # magic seed 514 mov 16($ctx),$E 515 mov $C,@T[1] 516 xor $D,@T[1] 517 and @T[1],@T[0] 518 519 movdqa 64($K_XX_XX),@X[2] # pbswap mask 520 movdqa -64($K_XX_XX),@Tx[1] # K_00_19 521 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 522 movdqu 16($inp),@X[-3&7] 523 movdqu 32($inp),@X[-2&7] 524 movdqu 48($inp),@X[-1&7] 525 pshufb @X[2],@X[-4&7] # byte swap 526 pshufb @X[2],@X[-3&7] 527 pshufb @X[2],@X[-2&7] 528 add \$64,$inp 529 paddd @Tx[1],@X[-4&7] # add K_00_19 530 pshufb @X[2],@X[-1&7] 531 paddd @Tx[1],@X[-3&7] 532 paddd @Tx[1],@X[-2&7] 533 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU 534 psubd @Tx[1],@X[-4&7] # restore X[] 535 movdqa @X[-3&7],16(%rsp) 536 psubd @Tx[1],@X[-3&7] 537 movdqa @X[-2&7],32(%rsp) 538 psubd @Tx[1],@X[-2&7] 539 jmp .Loop_ssse3 540 ___ 541 542 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 543 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 544 my $arg = 
pop; 545 $arg = "\$$arg" if ($arg*1 eq $arg); 546 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 547 } 548 549 sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 550 { use integer; 551 my $body = shift; 552 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions 553 my ($a,$b,$c,$d,$e); 554 555 eval(shift(@insns)); # ror 556 &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); 557 eval(shift(@insns)); 558 &movdqa (@Tx[0],@X[-1&7]); 559 &paddd (@Tx[1],@X[-1&7]); 560 eval(shift(@insns)); 561 eval(shift(@insns)); 562 563 &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); 564 eval(shift(@insns)); 565 eval(shift(@insns)); # rol 566 eval(shift(@insns)); 567 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords 568 eval(shift(@insns)); 569 eval(shift(@insns)); 570 571 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 572 eval(shift(@insns)); 573 eval(shift(@insns)); # ror 574 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" 575 eval(shift(@insns)); 576 eval(shift(@insns)); 577 eval(shift(@insns)); 578 579 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" 580 eval(shift(@insns)); 581 eval(shift(@insns)); # rol 582 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 583 eval(shift(@insns)); 584 eval(shift(@insns)); 585 586 &movdqa (@Tx[2],@X[0]); 587 eval(shift(@insns)); 588 eval(shift(@insns)); 589 eval(shift(@insns)); # ror 590 &movdqa (@Tx[0],@X[0]); 591 eval(shift(@insns)); 592 593 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword 594 &paddd (@X[0],@X[0]); 595 eval(shift(@insns)); 596 eval(shift(@insns)); 597 598 &psrld (@Tx[0],31); 599 eval(shift(@insns)); 600 eval(shift(@insns)); # rol 601 eval(shift(@insns)); 602 &movdqa (@Tx[1],@Tx[2]); 603 eval(shift(@insns)); 604 eval(shift(@insns)); 605 606 &psrld (@Tx[2],30); 607 eval(shift(@insns)); 608 eval(shift(@insns)); # ror 609 &por (@X[0],@Tx[0]); # "X[0]"<<<=1 610 eval(shift(@insns)); 611 eval(shift(@insns)); 612 eval(shift(@insns)); 613 614 &pslld (@Tx[1],2); 
615 &pxor (@X[0],@Tx[2]); 616 eval(shift(@insns)); 617 &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX 618 eval(shift(@insns)); # rol 619 eval(shift(@insns)); 620 eval(shift(@insns)); 621 622 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 623 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 624 625 foreach (@insns) { eval; } # remaining instructions [if any] 626 627 $Xi++; push(@X,shift(@X)); # "rotate" X[] 628 push(@Tx,shift(@Tx)); 629 } 630 631 sub Xupdate_ssse3_32_79() 632 { use integer; 633 my $body = shift; 634 my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions 635 my ($a,$b,$c,$d,$e); 636 637 eval(shift(@insns)) if ($Xi==8); 638 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" 639 eval(shift(@insns)) if ($Xi==8); 640 eval(shift(@insns)); # body_20_39 641 eval(shift(@insns)); 642 eval(shift(@insns)) if (@insns[1] =~ /_ror/); 643 eval(shift(@insns)) if (@insns[0] =~ /_ror/); 644 &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); 645 eval(shift(@insns)); 646 eval(shift(@insns)); # rol 647 648 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" 649 eval(shift(@insns)); 650 eval(shift(@insns)); 651 if ($Xi%5) { 652 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... 653 } else { # ... 
or load next one 654 &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); 655 } 656 eval(shift(@insns)); # ror 657 &paddd (@Tx[1],@X[-1&7]); 658 eval(shift(@insns)); 659 660 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" 661 eval(shift(@insns)); # body_20_39 662 eval(shift(@insns)); 663 eval(shift(@insns)); 664 eval(shift(@insns)); # rol 665 eval(shift(@insns)) if (@insns[0] =~ /_ror/); 666 667 &movdqa (@Tx[0],@X[0]); 668 eval(shift(@insns)); 669 eval(shift(@insns)); 670 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 671 eval(shift(@insns)); # ror 672 eval(shift(@insns)); 673 eval(shift(@insns)); # body_20_39 674 675 &pslld (@X[0],2); 676 eval(shift(@insns)); 677 eval(shift(@insns)); 678 &psrld (@Tx[0],30); 679 eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol 680 eval(shift(@insns)); 681 eval(shift(@insns)); 682 eval(shift(@insns)); # ror 683 684 &por (@X[0],@Tx[0]); # "X[0]"<<<=2 685 eval(shift(@insns)); 686 eval(shift(@insns)); # body_20_39 687 eval(shift(@insns)) if (@insns[1] =~ /_rol/); 688 eval(shift(@insns)) if (@insns[0] =~ /_rol/); 689 &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) 690 eval(shift(@insns)); 691 eval(shift(@insns)); # rol 692 eval(shift(@insns)); 693 eval(shift(@insns)); 694 eval(shift(@insns)); # rol 695 eval(shift(@insns)); 696 697 foreach (@insns) { eval; } # remaining instructions 698 699 $Xi++; push(@X,shift(@X)); # "rotate" X[] 700 push(@Tx,shift(@Tx)); 701 } 702 703 sub Xuplast_ssse3_80() 704 { use integer; 705 my $body = shift; 706 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 707 my ($a,$b,$c,$d,$e); 708 709 eval(shift(@insns)); 710 eval(shift(@insns)); 711 eval(shift(@insns)); 712 eval(shift(@insns)); 713 &paddd (@Tx[1],@X[-1&7]); 714 eval(shift(@insns)); 715 eval(shift(@insns)); 716 717 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU 718 719 foreach (@insns) { eval; } # remaining instructions 720 721 &cmp ($inp,$num); 722 &je (".Ldone_ssse3"); 723 724 
unshift(@Tx,pop(@Tx)); 725 726 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask 727 &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19 728 &movdqu (@X[-4&7],"0($inp)"); # load input 729 &movdqu (@X[-3&7],"16($inp)"); 730 &movdqu (@X[-2&7],"32($inp)"); 731 &movdqu (@X[-1&7],"48($inp)"); 732 &pshufb (@X[-4&7],@X[2]); # byte swap 733 &add ($inp,64); 734 735 $Xi=0; 736 } 737 738 sub Xloop_ssse3() 739 { use integer; 740 my $body = shift; 741 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 742 my ($a,$b,$c,$d,$e); 743 744 eval(shift(@insns)); 745 eval(shift(@insns)); 746 eval(shift(@insns)); 747 &pshufb (@X[($Xi-3)&7],@X[2]); 748 eval(shift(@insns)); 749 eval(shift(@insns)); 750 eval(shift(@insns)); 751 eval(shift(@insns)); 752 &paddd (@X[($Xi-4)&7],@Tx[1]); 753 eval(shift(@insns)); 754 eval(shift(@insns)); 755 eval(shift(@insns)); 756 eval(shift(@insns)); 757 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU 758 eval(shift(@insns)); 759 eval(shift(@insns)); 760 eval(shift(@insns)); 761 eval(shift(@insns)); 762 &psubd (@X[($Xi-4)&7],@Tx[1]); 763 764 foreach (@insns) { eval; } 765 $Xi++; 766 } 767 768 sub Xtail_ssse3() 769 { use integer; 770 my $body = shift; 771 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 772 my ($a,$b,$c,$d,$e); 773 774 foreach (@insns) { eval; } 775 } 776 777 sub body_00_19 () { # ((c^d)&b)^d 778 # on start @T[0]=(c^d)&b 779 return &body_20_39() if ($rx==19); $rx++; 780 ( 781 '($a,$b,$c,$d,$e)=@V;'. 
782 '&$_ror ($b,$j?7:2)', # $b>>>2 783 '&xor (@T[0],$d)', 784 '&mov (@T[1],$a)', # $b for next round 785 786 '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer 787 '&xor ($b,$c)', # $c^$d for next round 788 789 '&$_rol ($a,5)', 790 '&add ($e,@T[0])', 791 '&and (@T[1],$b)', # ($b&($c^$d)) for next round 792 793 '&xor ($b,$c)', # restore $b 794 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 795 ); 796 } 797 798 sub body_20_39 () { # b^d^c 799 # on entry @T[0]=b^d 800 return &body_40_59() if ($rx==39); $rx++; 801 ( 802 '($a,$b,$c,$d,$e)=@V;'. 803 '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer 804 '&xor (@T[0],$d) if($j==19);'. 805 '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c) 806 '&mov (@T[1],$a)', # $b for next round 807 808 '&$_rol ($a,5)', 809 '&add ($e,@T[0])', 810 '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round 811 812 '&$_ror ($b,7)', # $b>>>2 813 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 814 ); 815 } 816 817 sub body_40_59 () { # ((b^c)&(c^d))^c 818 # on entry @T[0]=(b^c), (c^=d) 819 $rx++; 820 ( 821 '($a,$b,$c,$d,$e)=@V;'. 822 '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer 823 '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d) 824 '&xor ($c,$d) if ($j>=40)', # restore $c 825 826 '&$_ror ($b,7)', # $b>>>2 827 '&mov (@T[1],$a)', # $b for next round 828 '&xor (@T[0],$c)', 829 830 '&$_rol ($a,5)', 831 '&add ($e,@T[0])', 832 '&xor (@T[1],$c) if ($j==59);'. 
833 '&xor (@T[1],$b) if ($j< 59)', # b^c for next round 834 835 '&xor ($b,$c) if ($j< 59)', # c^d for next round 836 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 837 ); 838 } 839 $code.=<<___; 840 .align 16 841 .Loop_ssse3: 842 ___ 843 &Xupdate_ssse3_16_31(\&body_00_19); 844 &Xupdate_ssse3_16_31(\&body_00_19); 845 &Xupdate_ssse3_16_31(\&body_00_19); 846 &Xupdate_ssse3_16_31(\&body_00_19); 847 &Xupdate_ssse3_32_79(\&body_00_19); 848 &Xupdate_ssse3_32_79(\&body_20_39); 849 &Xupdate_ssse3_32_79(\&body_20_39); 850 &Xupdate_ssse3_32_79(\&body_20_39); 851 &Xupdate_ssse3_32_79(\&body_20_39); 852 &Xupdate_ssse3_32_79(\&body_20_39); 853 &Xupdate_ssse3_32_79(\&body_40_59); 854 &Xupdate_ssse3_32_79(\&body_40_59); 855 &Xupdate_ssse3_32_79(\&body_40_59); 856 &Xupdate_ssse3_32_79(\&body_40_59); 857 &Xupdate_ssse3_32_79(\&body_40_59); 858 &Xupdate_ssse3_32_79(\&body_20_39); 859 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" 860 861 $saved_j=$j; @saved_V=@V; 862 863 &Xloop_ssse3(\&body_20_39); 864 &Xloop_ssse3(\&body_20_39); 865 &Xloop_ssse3(\&body_20_39); 866 867 $code.=<<___; 868 add 0($ctx),$A # update context 869 add 4($ctx),@T[0] 870 add 8($ctx),$C 871 add 12($ctx),$D 872 mov $A,0($ctx) 873 add 16($ctx),$E 874 mov @T[0],4($ctx) 875 mov @T[0],$B # magic seed 876 mov $C,8($ctx) 877 mov $C,@T[1] 878 mov $D,12($ctx) 879 xor $D,@T[1] 880 mov $E,16($ctx) 881 and @T[1],@T[0] 882 jmp .Loop_ssse3 883 884 .align 16 885 .Ldone_ssse3: 886 ___ 887 $j=$saved_j; @V=@saved_V; 888 889 &Xtail_ssse3(\&body_20_39); 890 &Xtail_ssse3(\&body_20_39); 891 &Xtail_ssse3(\&body_20_39); 892 893 $code.=<<___; 894 add 0($ctx),$A # update context 895 add 4($ctx),@T[0] 896 add 8($ctx),$C 897 mov $A,0($ctx) 898 add 12($ctx),$D 899 mov @T[0],4($ctx) 900 add 16($ctx),$E 901 mov $C,8($ctx) 902 mov $D,12($ctx) 903 mov $E,16($ctx) 904 ___ 905 $code.=<<___ if ($win64); 906 movaps -40-6*16($fp),%xmm6 907 movaps -40-5*16($fp),%xmm7 908 movaps -40-4*16($fp),%xmm8 909 movaps 
-40-3*16($fp),%xmm9 910 movaps -40-2*16($fp),%xmm10 911 movaps -40-1*16($fp),%xmm11 912 ___ 913 $code.=<<___; 914 mov -40($fp),%r14 915 mov -32($fp),%r13 916 mov -24($fp),%r12 917 mov -16($fp),%rbp 918 mov -8($fp),%rbx 919 lea ($fp),%rsp 920 .Lepilogue_ssse3: 921 ret 922 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 923 ___ 924 925 if ($avx) { 926 $Xi=4; # reset variables 927 @X=map("%xmm$_",(4..7,0..3)); 928 @Tx=map("%xmm$_",(8..10)); 929 $j=0; 930 $rx=0; 931 932 my $done_avx_label=".Ldone_avx"; 933 934 my $_rol=sub { &shld(@_[0],@_) }; 935 my $_ror=sub { &shrd(@_[0],@_) }; 936 937 $code.=<<___; 938 .type sha1_block_data_order_avx,\@function,3 939 .align 16 940 sha1_block_data_order_avx: 941 _avx_shortcut: 942 mov %rsp,$fp 943 push %rbx 944 push %rbp 945 push %r12 946 push %r13 # redundant, done to share Win64 SE handler 947 push %r14 948 lea `-64-($win64?6*16:0)`(%rsp),%rsp 949 vzeroupper 950 ___ 951 $code.=<<___ if ($win64); 952 vmovaps %xmm6,-40-6*16($fp) 953 vmovaps %xmm7,-40-5*16($fp) 954 vmovaps %xmm8,-40-4*16($fp) 955 vmovaps %xmm9,-40-3*16($fp) 956 vmovaps %xmm10,-40-2*16($fp) 957 vmovaps %xmm11,-40-1*16($fp) 958 .Lprologue_avx: 959 ___ 960 $code.=<<___; 961 and \$-64,%rsp 962 mov %rdi,$ctx # reassigned argument 963 mov %rsi,$inp # reassigned argument 964 mov %rdx,$num # reassigned argument 965 966 shl \$6,$num 967 add $inp,$num 968 lea K_XX_XX+64(%rip),$K_XX_XX 969 970 mov 0($ctx),$A # load context 971 mov 4($ctx),$B 972 mov 8($ctx),$C 973 mov 12($ctx),$D 974 mov $B,@T[0] # magic seed 975 mov 16($ctx),$E 976 mov $C,@T[1] 977 xor $D,@T[1] 978 and @T[1],@T[0] 979 980 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask 981 vmovdqa -64($K_XX_XX),$Kx # K_00_19 982 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 983 vmovdqu 16($inp),@X[-3&7] 984 vmovdqu 32($inp),@X[-2&7] 985 vmovdqu 48($inp),@X[-1&7] 986 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap 987 add \$64,$inp 988 vpshufb @X[2],@X[-3&7],@X[-3&7] 989 vpshufb @X[2],@X[-2&7],@X[-2&7] 990 vpshufb 
@X[2],@X[-1&7],@X[-1&7] 991 vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 992 vpaddd $Kx,@X[-3&7],@X[1] 993 vpaddd $Kx,@X[-2&7],@X[2] 994 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU 995 vmovdqa @X[1],16(%rsp) 996 vmovdqa @X[2],32(%rsp) 997 jmp .Loop_avx 998 ___ 999 1000 sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 1001 { use integer; 1002 my $body = shift; 1003 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions 1004 my ($a,$b,$c,$d,$e); 1005 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); 1008 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" 1009 eval(shift(@insns)); 1010 eval(shift(@insns)); 1011 1012 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1013 eval(shift(@insns)); 1014 eval(shift(@insns)); 1015 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords 1016 eval(shift(@insns)); 1017 eval(shift(@insns)); 1018 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 1019 eval(shift(@insns)); 1020 eval(shift(@insns)); 1021 1022 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" 1023 eval(shift(@insns)); 1024 eval(shift(@insns)); 1025 eval(shift(@insns)); 1026 eval(shift(@insns)); 1027 1028 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" 1029 eval(shift(@insns)); 1030 eval(shift(@insns)); 1031 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 1032 eval(shift(@insns)); 1033 eval(shift(@insns)); 1034 1035 &vpsrld (@Tx[0],@X[0],31); 1036 eval(shift(@insns)); 1037 eval(shift(@insns)); 1038 eval(shift(@insns)); 1039 eval(shift(@insns)); 1040 1041 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword 1042 &vpaddd (@X[0],@X[0],@X[0]); 1043 eval(shift(@insns)); 1044 eval(shift(@insns)); 1045 eval(shift(@insns)); 1046 eval(shift(@insns)); 1047 1048 &vpsrld (@Tx[1],@Tx[2],30); 1049 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 1050 eval(shift(@insns)); 1051 eval(shift(@insns)); 1052 eval(shift(@insns)); 1053 eval(shift(@insns)); 1054 1055 &vpslld (@Tx[2],@Tx[2],2); 1056 &vpxor (@X[0],@X[0],@Tx[1]); 1057 eval(shift(@insns)); 1058 
eval(shift(@insns)); 1059 eval(shift(@insns)); 1060 eval(shift(@insns)); 1061 1062 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 1063 eval(shift(@insns)); 1064 eval(shift(@insns)); 1065 &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX 1066 eval(shift(@insns)); 1067 eval(shift(@insns)); 1068 1069 1070 foreach (@insns) { eval; } # remaining instructions [if any] 1071 1072 $Xi++; push(@X,shift(@X)); # "rotate" X[] 1073 } 1074 1075 sub Xupdate_avx_32_79() 1076 { use integer; 1077 my $body = shift; 1078 my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions 1079 my ($a,$b,$c,$d,$e); 1080 1081 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" 1082 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" 1083 eval(shift(@insns)); # body_20_39 1084 eval(shift(@insns)); 1085 eval(shift(@insns)); 1086 eval(shift(@insns)); # rol 1087 1088 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" 1089 eval(shift(@insns)); 1090 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); 1091 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1092 &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); 1093 eval(shift(@insns)); # ror 1094 eval(shift(@insns)); 1095 1096 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" 1097 eval(shift(@insns)); # body_20_39 1098 eval(shift(@insns)); 1099 eval(shift(@insns)); 1100 eval(shift(@insns)); # rol 1101 1102 &vpsrld (@Tx[0],@X[0],30); 1103 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 1104 eval(shift(@insns)); 1105 eval(shift(@insns)); 1106 eval(shift(@insns)); # ror 1107 eval(shift(@insns)); 1108 1109 &vpslld (@X[0],@X[0],2); 1110 eval(shift(@insns)); # body_20_39 1111 eval(shift(@insns)); 1112 eval(shift(@insns)); 1113 eval(shift(@insns)); # rol 1114 eval(shift(@insns)); 1115 eval(shift(@insns)); 1116 eval(shift(@insns)); # ror 1117 eval(shift(@insns)); 1118 1119 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 1120 eval(shift(@insns)); # body_20_39 1121 eval(shift(@insns)); 1122 
eval(shift(@insns)); 1123 eval(shift(@insns)); # rol 1124 eval(shift(@insns)); 1125 eval(shift(@insns)); 1126 eval(shift(@insns)); # rol 1127 eval(shift(@insns)); 1128 1129 foreach (@insns) { eval; } # remaining instructions 1130 1131 $Xi++; push(@X,shift(@X)); # "rotate" X[] 1132 } 1133 1134 sub Xuplast_avx_80() 1135 { use integer; 1136 my $body = shift; 1137 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1138 my ($a,$b,$c,$d,$e); 1139 1140 eval(shift(@insns)); 1141 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1142 eval(shift(@insns)); 1143 eval(shift(@insns)); 1144 eval(shift(@insns)); 1145 eval(shift(@insns)); 1146 1147 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU 1148 1149 foreach (@insns) { eval; } # remaining instructions 1150 1151 &cmp ($inp,$num); 1152 &je ($done_avx_label); 1153 1154 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask 1155 &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19 1156 &vmovdqu(@X[-4&7],"0($inp)"); # load input 1157 &vmovdqu(@X[-3&7],"16($inp)"); 1158 &vmovdqu(@X[-2&7],"32($inp)"); 1159 &vmovdqu(@X[-1&7],"48($inp)"); 1160 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap 1161 &add ($inp,64); 1162 1163 $Xi=0; 1164 } 1165 1166 sub Xloop_avx() 1167 { use integer; 1168 my $body = shift; 1169 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1170 my ($a,$b,$c,$d,$e); 1171 1172 eval(shift(@insns)); 1173 eval(shift(@insns)); 1174 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); 1175 eval(shift(@insns)); 1176 eval(shift(@insns)); 1177 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx); 1178 eval(shift(@insns)); 1179 eval(shift(@insns)); 1180 eval(shift(@insns)); 1181 eval(shift(@insns)); 1182 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU 1183 eval(shift(@insns)); 1184 eval(shift(@insns)); 1185 1186 foreach (@insns) { eval; } 1187 $Xi++; 1188 } 1189 1190 sub Xtail_avx() 1191 { use integer; 1192 my $body = shift; 1193 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1194 my ($a,$b,$c,$d,$e); 1195 1196 foreach 
(@insns) { eval; } 1197 } 1198 1199 $code.=<<___; 1200 .align 16 1201 .Loop_avx: 1202 ___ 1203 &Xupdate_avx_16_31(\&body_00_19); 1204 &Xupdate_avx_16_31(\&body_00_19); 1205 &Xupdate_avx_16_31(\&body_00_19); 1206 &Xupdate_avx_16_31(\&body_00_19); 1207 &Xupdate_avx_32_79(\&body_00_19); 1208 &Xupdate_avx_32_79(\&body_20_39); 1209 &Xupdate_avx_32_79(\&body_20_39); 1210 &Xupdate_avx_32_79(\&body_20_39); 1211 &Xupdate_avx_32_79(\&body_20_39); 1212 &Xupdate_avx_32_79(\&body_20_39); 1213 &Xupdate_avx_32_79(\&body_40_59); 1214 &Xupdate_avx_32_79(\&body_40_59); 1215 &Xupdate_avx_32_79(\&body_40_59); 1216 &Xupdate_avx_32_79(\&body_40_59); 1217 &Xupdate_avx_32_79(\&body_40_59); 1218 &Xupdate_avx_32_79(\&body_20_39); 1219 &Xuplast_avx_80(\&body_20_39); # can jump to "done" 1220 1221 $saved_j=$j; @saved_V=@V; 1222 1223 &Xloop_avx(\&body_20_39); 1224 &Xloop_avx(\&body_20_39); 1225 &Xloop_avx(\&body_20_39); 1226 1227 $code.=<<___; 1228 add 0($ctx),$A # update context 1229 add 4($ctx),@T[0] 1230 add 8($ctx),$C 1231 add 12($ctx),$D 1232 mov $A,0($ctx) 1233 add 16($ctx),$E 1234 mov @T[0],4($ctx) 1235 mov @T[0],$B # magic seed 1236 mov $C,8($ctx) 1237 mov $C,@T[1] 1238 mov $D,12($ctx) 1239 xor $D,@T[1] 1240 mov $E,16($ctx) 1241 and @T[1],@T[0] 1242 jmp .Loop_avx 1243 1244 .align 16 1245 $done_avx_label: 1246 ___ 1247 $j=$saved_j; @V=@saved_V; 1248 1249 &Xtail_avx(\&body_20_39); 1250 &Xtail_avx(\&body_20_39); 1251 &Xtail_avx(\&body_20_39); 1252 1253 $code.=<<___; 1254 vzeroupper 1255 1256 add 0($ctx),$A # update context 1257 add 4($ctx),@T[0] 1258 add 8($ctx),$C 1259 mov $A,0($ctx) 1260 add 12($ctx),$D 1261 mov @T[0],4($ctx) 1262 add 16($ctx),$E 1263 mov $C,8($ctx) 1264 mov $D,12($ctx) 1265 mov $E,16($ctx) 1266 ___ 1267 $code.=<<___ if ($win64); 1268 movaps -40-6*16($fp),%xmm6 1269 movaps -40-5*16($fp),%xmm7 1270 movaps -40-4*16($fp),%xmm8 1271 movaps -40-3*16($fp),%xmm9 1272 movaps -40-2*16($fp),%xmm10 1273 movaps -40-1*16($fp),%xmm11 1274 ___ 1275 $code.=<<___; 1276 mov -40($fp),%r14 
1277 mov -32($fp),%r13 1278 mov -24($fp),%r12 1279 mov -16($fp),%rbp 1280 mov -8($fp),%rbx 1281 lea ($fp),%rsp 1282 .Lepilogue_avx: 1283 ret 1284 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx 1285 ___ 1286 1287 if ($avx>1) { 1288 use integer; 1289 $Xi=4; # reset variables 1290 @X=map("%ymm$_",(4..7,0..3)); 1291 @Tx=map("%ymm$_",(8..10)); 1292 $Kx="%ymm11"; 1293 $j=0; 1294 1295 my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi"); 1296 my ($a5,$t0)=("%r12d","%edi"); 1297 1298 my ($A,$F,$B,$C,$D,$E)=@ROTX; 1299 my $rx=0; 1300 my $frame="%r13"; 1301 1302 $code.=<<___; 1303 .type sha1_block_data_order_avx2,\@function,3 1304 .align 16 1305 sha1_block_data_order_avx2: 1306 _avx2_shortcut: 1307 mov %rsp,$fp 1308 push %rbx 1309 push %rbp 1310 push %r12 1311 push %r13 1312 push %r14 1313 vzeroupper 1314 ___ 1315 $code.=<<___ if ($win64); 1316 lea -6*16(%rsp),%rsp 1317 vmovaps %xmm6,-40-6*16($fp) 1318 vmovaps %xmm7,-40-5*16($fp) 1319 vmovaps %xmm8,-40-4*16($fp) 1320 vmovaps %xmm9,-40-3*16($fp) 1321 vmovaps %xmm10,-40-2*16($fp) 1322 vmovaps %xmm11,-40-1*16($fp) 1323 .Lprologue_avx2: 1324 ___ 1325 $code.=<<___; 1326 mov %rdi,$ctx # reassigned argument 1327 mov %rsi,$inp # reassigned argument 1328 mov %rdx,$num # reassigned argument 1329 1330 lea -640(%rsp),%rsp 1331 shl \$6,$num 1332 lea 64($inp),$frame 1333 and \$-128,%rsp 1334 add $inp,$num 1335 lea K_XX_XX+64(%rip),$K_XX_XX 1336 1337 mov 0($ctx),$A # load context 1338 cmp $num,$frame 1339 cmovae $inp,$frame # next or same block 1340 mov 4($ctx),$F 1341 mov 8($ctx),$C 1342 mov 12($ctx),$D 1343 mov 16($ctx),$E 1344 vmovdqu 64($K_XX_XX),@X[2] # pbswap mask 1345 1346 vmovdqu ($inp),%xmm0 1347 vmovdqu 16($inp),%xmm1 1348 vmovdqu 32($inp),%xmm2 1349 vmovdqu 48($inp),%xmm3 1350 lea 64($inp),$inp 1351 vinserti128 \$1,($frame),@X[-4&7],@X[-4&7] 1352 vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] 1353 vpshufb @X[2],@X[-4&7],@X[-4&7] 1354 vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] 1355 vpshufb @X[2],@X[-3&7],@X[-3&7] 
1356 vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] 1357 vpshufb @X[2],@X[-2&7],@X[-2&7] 1358 vmovdqu -64($K_XX_XX),$Kx # K_00_19 1359 vpshufb @X[2],@X[-1&7],@X[-1&7] 1360 1361 vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 1362 vpaddd $Kx,@X[-3&7],@X[1] 1363 vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU 1364 vpaddd $Kx,@X[-2&7],@X[2] 1365 vmovdqu @X[1],32(%rsp) 1366 vpaddd $Kx,@X[-1&7],@X[3] 1367 vmovdqu @X[2],64(%rsp) 1368 vmovdqu @X[3],96(%rsp) 1369 ___ 1370 for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31 1371 use integer; 1372 1373 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" 1374 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords 1375 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 1376 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" 1377 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" 1378 &vpsrld (@Tx[0],@X[0],31); 1379 &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX 1380 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword 1381 &vpaddd (@X[0],@X[0],@X[0]); 1382 &vpsrld (@Tx[1],@Tx[2],30); 1383 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 1384 &vpslld (@Tx[2],@Tx[2],2); 1385 &vpxor (@X[0],@X[0],@Tx[1]); 1386 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 1387 &vpaddd (@Tx[1],@X[0],$Kx); 1388 &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU 1389 1390 push(@X,shift(@X)); # "rotate" X[] 1391 } 1392 $code.=<<___; 1393 lea 128(%rsp),$frame 1394 jmp .Loop_avx2 1395 .align 32 1396 .Loop_avx2: 1397 rorx \$2,$F,$B 1398 andn $D,$F,$t0 1399 and $C,$F 1400 xor $t0,$F 1401 ___ 1402 sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path 1403 # at start $f=(b&c)^(~b&d), $b>>>=2 1404 return &bodyx_20_39() if ($rx==19); $rx++; 1405 ( 1406 '($a,$f,$b,$c,$d,$e)=@ROTX;'. 1407 1408 '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. 
# e+=X[i]+K 1409 '&lea ($frame,"256($frame)") if ($j%32==31);', 1410 '&andn ($t0,$a,$c)', # ~b&d for next round 1411 1412 '&add ($e,$f)', # e+=(b&c)^(~b&d) 1413 '&rorx ($a5,$a,27)', # a<<<5 1414 '&rorx ($f,$a,2)', # b>>>2 for next round 1415 '&and ($a,$b)', # b&c for next round 1416 1417 '&add ($e,$a5)', # e+=a<<<5 1418 '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round 1419 1420 'unshift(@ROTX,pop(@ROTX)); $j++;' 1421 ) 1422 } 1423 1424 sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path 1425 # on entry $f=b^c^d, $b>>>=2 1426 return &bodyx_40_59() if ($rx==39); $rx++; 1427 ( 1428 '($a,$f,$b,$c,$d,$e)=@ROTX;'. 1429 1430 '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K 1431 '&lea ($frame,"256($frame)") if ($j%32==31);', 1432 1433 '&lea ($e,"($e,$f)")', # e+=b^c^d 1434 '&rorx ($a5,$a,27)', # a<<<5 1435 '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round 1436 '&xor ($a,$b) if ($j<79)', # b^c for next round 1437 1438 '&add ($e,$a5)', # e+=a<<<5 1439 '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round 1440 1441 'unshift(@ROTX,pop(@ROTX)); $j++;' 1442 ) 1443 } 1444 1445 sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path 1446 # on entry $f=((b^c)&(c^d)), $b>>>=2 1447 $rx++; 1448 ( 1449 '($a,$f,$b,$c,$d,$e)=@ROTX;'. 1450 1451 '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K 1452 '&lea ($frame,"256($frame)") if ($j%32==31);', 1453 '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c 1454 '&mov ($t0,$b) if ($j<59)', # count on zero latency 1455 '&xor ($t0,$c) if ($j<59)', # c^d for next round 1456 1457 '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c 1458 '&rorx ($a5,$a,27)', # a<<<5 1459 '&rorx ($f,$a,2)', # b>>>2 in next round 1460 '&xor ($a,$b)', # b^c for next round 1461 1462 '&add ($e,$a5)', # e+=a<<<5 1463 '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round 1464 '&xor ($a,$c) if ($j==59);'. 
# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'	# rotate register assignment, advance round counter
	)
}

# SIMD message-schedule update for rounds 16..31 of the AVX2 path.
# Computes W[t] = (W[t-3]^W[t-8]^W[t-14]^W[t-16])<<<1 for one vector's
# worth of dwords in @X[0] (%ymm registers, covering the pair of
# interleaved blocks), adds the round constant, and spills X[]+K to
# the stack for the integer rounds.  SIMD instructions are interleaved
# with the scalar round bodies supplied via the $body generator.
sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;			# ref to bodyx_00_19/20_39/40_59
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	# <<<1 is emulated: vpaddd doubles (shl 1), vpsrld 31 + vpor bring
	# the carried-out bits back in; the vpslldq/vpsrld/vpslld sequence
	# below corrects the dword whose "X[-3]" input is produced within
	# this very vector -- see sha1-586.pl for the derivation
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }		# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));
# "rotate" X[]
}

# SIMD message-schedule update for rounds 32..79 of the AVX2 path.
# Uses the equivalent recurrence W[t] = (W[t-6]^W[t-16]^W[t-28]^W[t-32])<<<2
# (all inputs lie in older vectors, so no in-vector fixup is needed),
# interleaved with the scalar round bodies from @insns.  Loads the next
# round constant every fifth call and spills X[]+K to the stack.
sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;			# ref to bodyx_00_19/20_39/40_59
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);		# <<<2 done as (shl 2) | (shr 30)
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }		# remaining instructions

	$Xi++;
	push(@X,shift(@X));			# "rotate" X[]
}

# Emits only the scalar round bodies, with no message-schedule update;
# used where no further W[t] values are required.
sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

	# main AVX2 loop: schedule updates interleaved with the per-round
	# F-function bodies (00_19, then 20_39)
	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	# schedule is complete at this point; only scalar rounds remain
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

# end of .Loop_avx2 body: advance block pointers and fold the round
# output back into the hash context
$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D			# D=b
	#xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E			# E=c
	mov	$a5,$C				# C=f
	#xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

# reload input: low xmm halves from %rdi, high halves from $frame
# (the two 64-byte blocks interleaved across the 256-bit registers)
$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
# restart the round and body counters for the tail pass
$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
&Xloop_avx2 (\&bodyx_00_19); 1682 &Xloop_avx2 (\&bodyx_00_19); 1683 &Xloop_avx2 (\&bodyx_00_19); 1684 1685 &Xloop_avx2 (\&bodyx_20_39); 1686 &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19 1687 &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap 1688 &Xloop_avx2 (\&bodyx_20_39); 1689 &vpshufb (@X[-3&7],@X[-3&7],@X[2]); 1690 &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19 1691 &Xloop_avx2 (\&bodyx_20_39); 1692 &vmovdqu ("0(%rsp)",@Tx[0]); 1693 &vpshufb (@X[-2&7],@X[-2&7],@X[2]); 1694 &vpaddd (@Tx[1],@X[-3&7],$Kx); 1695 &Xloop_avx2 (\&bodyx_20_39); 1696 &vmovdqu ("32(%rsp)",@Tx[1]); 1697 &vpshufb (@X[-1&7],@X[-1&7],@X[2]); 1698 &vpaddd (@X[2],@X[-2&7],$Kx); 1699 1700 &Xloop_avx2 (\&bodyx_40_59); 1701 &align32 (); 1702 &vmovdqu ("64(%rsp)",@X[2]); 1703 &vpaddd (@X[3],@X[-1&7],$Kx); 1704 &Xloop_avx2 (\&bodyx_40_59); 1705 &vmovdqu ("96(%rsp)",@X[3]); 1706 &Xloop_avx2 (\&bodyx_40_59); 1707 &Xupdate_avx2_16_31(\&bodyx_40_59); 1708 1709 &Xupdate_avx2_16_31(\&bodyx_20_39); 1710 &Xupdate_avx2_16_31(\&bodyx_20_39); 1711 &Xupdate_avx2_16_31(\&bodyx_20_39); 1712 &Xloop_avx2 (\&bodyx_20_39); 1713 1714 $code.=<<___; 1715 lea 128(%rsp),$frame 1716 1717 # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c 1718 add 0($ctx),@ROTX[0] # update context 1719 add 4($ctx),@ROTX[1] 1720 add 8($ctx),@ROTX[3] 1721 mov @ROTX[0],0($ctx) 1722 add 12($ctx),@ROTX[4] 1723 mov @ROTX[1],4($ctx) 1724 mov @ROTX[0],$A # A=d 1725 add 16($ctx),@ROTX[5] 1726 mov @ROTX[3],$a5 1727 mov @ROTX[3],8($ctx) 1728 mov @ROTX[4],$D # D=b 1729 #xchg @ROTX[5],$F # F=c, C=f 1730 mov @ROTX[4],12($ctx) 1731 mov @ROTX[1],$F # F=e 1732 mov @ROTX[5],16($ctx) 1733 #mov $F,16($ctx) 1734 mov @ROTX[5],$E # E=c 1735 mov $a5,$C # C=f 1736 #xchg $F,$E # E=c, F=e 1737 1738 cmp $num,$inp 1739 jbe .Loop_avx2 1740 1741 .Ldone_avx2: 1742 vzeroupper 1743 ___ 1744 $code.=<<___ if ($win64); 1745 movaps -40-6*16($fp),%xmm6 1746 movaps -40-5*16($fp),%xmm7 1747 movaps -40-4*16($fp),%xmm8 1748 movaps -40-3*16($fp),%xmm9 1749 movaps -40-2*16($fp),%xmm10 1750 
movaps -40-1*16($fp),%xmm11 1751 ___ 1752 $code.=<<___; 1753 mov -40($fp),%r14 1754 mov -32($fp),%r13 1755 mov -24($fp),%r12 1756 mov -16($fp),%rbp 1757 mov -8($fp),%rbx 1758 lea ($fp),%rsp 1759 .Lepilogue_avx2: 1760 ret 1761 .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 1762 ___ 1763 } 1764 } 1765 $code.=<<___; 1766 .align 64 1767 K_XX_XX: 1768 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1769 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1770 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1771 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1772 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 1773 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 1774 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 1775 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 1776 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask 1777 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask 1778 .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 1779 ___ 1780 }}} 1781 $code.=<<___; 1782 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1783 .align 64 1784 ___ 1785 1786 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1787 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 1788 if ($win64) { 1789 $rec="%rcx"; 1790 $frame="%rdx"; 1791 $context="%r8"; 1792 $disp="%r9"; 1793 1794 $code.=<<___; 1795 .extern __imp_RtlVirtualUnwind 1796 .type se_handler,\@abi-omnipotent 1797 .align 16 1798 se_handler: 1799 push %rsi 1800 push %rdi 1801 push %rbx 1802 push %rbp 1803 push %r12 1804 push %r13 1805 push %r14 1806 push %r15 1807 pushfq 1808 sub \$64,%rsp 1809 1810 mov 120($context),%rax # pull context->Rax 1811 mov 248($context),%rbx # pull context->Rip 1812 1813 lea .Lprologue(%rip),%r10 1814 cmp %r10,%rbx # context->Rip<.Lprologue 1815 jb .Lcommon_seh_tail 1816 1817 mov 152($context),%rax # pull 
context->Rsp 1818 1819 lea .Lepilogue(%rip),%r10 1820 cmp %r10,%rbx # context->Rip>=.Lepilogue 1821 jae .Lcommon_seh_tail 1822 1823 mov `16*4`(%rax),%rax # pull saved stack pointer 1824 1825 mov -8(%rax),%rbx 1826 mov -16(%rax),%rbp 1827 mov -24(%rax),%r12 1828 mov -32(%rax),%r13 1829 mov -40(%rax),%r14 1830 mov %rbx,144($context) # restore context->Rbx 1831 mov %rbp,160($context) # restore context->Rbp 1832 mov %r12,216($context) # restore context->R12 1833 mov %r13,224($context) # restore context->R13 1834 mov %r14,232($context) # restore context->R14 1835 1836 jmp .Lcommon_seh_tail 1837 .size se_handler,.-se_handler 1838 ___ 1839 1840 $code.=<<___ if ($shaext); 1841 .type shaext_handler,\@abi-omnipotent 1842 .align 16 1843 shaext_handler: 1844 push %rsi 1845 push %rdi 1846 push %rbx 1847 push %rbp 1848 push %r12 1849 push %r13 1850 push %r14 1851 push %r15 1852 pushfq 1853 sub \$64,%rsp 1854 1855 mov 120($context),%rax # pull context->Rax 1856 mov 248($context),%rbx # pull context->Rip 1857 1858 lea .Lprologue_shaext(%rip),%r10 1859 cmp %r10,%rbx # context->Rip<.Lprologue 1860 jb .Lcommon_seh_tail 1861 1862 lea .Lepilogue_shaext(%rip),%r10 1863 cmp %r10,%rbx # context->Rip>=.Lepilogue 1864 jae .Lcommon_seh_tail 1865 1866 lea -8-4*16(%rax),%rsi 1867 lea 512($context),%rdi # &context.Xmm6 1868 mov \$8,%ecx 1869 .long 0xa548f3fc # cld; rep movsq 1870 1871 jmp .Lcommon_seh_tail 1872 .size shaext_handler,.-shaext_handler 1873 ___ 1874 1875 $code.=<<___; 1876 .type ssse3_handler,\@abi-omnipotent 1877 .align 16 1878 ssse3_handler: 1879 push %rsi 1880 push %rdi 1881 push %rbx 1882 push %rbp 1883 push %r12 1884 push %r13 1885 push %r14 1886 push %r15 1887 pushfq 1888 sub \$64,%rsp 1889 1890 mov 120($context),%rax # pull context->Rax 1891 mov 248($context),%rbx # pull context->Rip 1892 1893 mov 8($disp),%rsi # disp->ImageBase 1894 mov 56($disp),%r11 # disp->HandlerData 1895 1896 mov 0(%r11),%r10d # HandlerData[0] 1897 lea (%rsi,%r10),%r10 # prologue label 1898 cmp 
%r10,%rbx # context->Rip<prologue label 1899 jb .Lcommon_seh_tail 1900 1901 mov 208($context),%rax # pull context->R11 1902 1903 mov 4(%r11),%r10d # HandlerData[1] 1904 lea (%rsi,%r10),%r10 # epilogue label 1905 cmp %r10,%rbx # context->Rip>=epilogue label 1906 jae .Lcommon_seh_tail 1907 1908 lea -40-6*16(%rax),%rsi 1909 lea 512($context),%rdi # &context.Xmm6 1910 mov \$12,%ecx 1911 .long 0xa548f3fc # cld; rep movsq 1912 1913 mov -8(%rax),%rbx 1914 mov -16(%rax),%rbp 1915 mov -24(%rax),%r12 1916 mov -32(%rax),%r13 1917 mov -40(%rax),%r14 1918 mov %rbx,144($context) # restore context->Rbx 1919 mov %rbp,160($context) # restore context->Rbp 1920 mov %r12,216($context) # restore cotnext->R12 1921 mov %r13,224($context) # restore cotnext->R13 1922 mov %r14,232($context) # restore cotnext->R14 1923 1924 .Lcommon_seh_tail: 1925 mov 8(%rax),%rdi 1926 mov 16(%rax),%rsi 1927 mov %rax,152($context) # restore context->Rsp 1928 mov %rsi,168($context) # restore context->Rsi 1929 mov %rdi,176($context) # restore context->Rdi 1930 1931 mov 40($disp),%rdi # disp->ContextRecord 1932 mov $context,%rsi # context 1933 mov \$154,%ecx # sizeof(CONTEXT) 1934 .long 0xa548f3fc # cld; rep movsq 1935 1936 mov $disp,%rsi 1937 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1938 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1939 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1940 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1941 mov 40(%rsi),%r10 # disp->ContextRecord 1942 lea 56(%rsi),%r11 # &disp->HandlerData 1943 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1944 mov %r10,32(%rsp) # arg5 1945 mov %r11,40(%rsp) # arg6 1946 mov %r12,48(%rsp) # arg7 1947 mov %rcx,56(%rsp) # arg8, (NULL) 1948 call *__imp_RtlVirtualUnwind(%rip) 1949 1950 mov \$1,%eax # ExceptionContinueSearch 1951 add \$64,%rsp 1952 popfq 1953 pop %r15 1954 pop %r14 1955 pop %r13 1956 pop %r12 1957 pop %rbp 1958 pop %rbx 1959 pop %rdi 1960 pop %rsi 1961 ret 1962 .size ssse3_handler,.-ssse3_handler 1963 1964 .section .pdata 1965 .align 4 1966 .rva 
.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

# Hand-assemble "sha1rnds4 $imm,%xmmN,%xmmM" for assemblers that
# predate the SHA extension mnemonics: matching operands are encoded
# as a raw ".byte" sequence (0f 3a cc /r ib); anything else is passed
# through verbatim for the assembler to handle.
# (Note: $_[0] replaces the accidental one-element slice @_[0].)
sub sha1rnds4 {
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	# immediate may be decimal or C-style octal/hex (leading 0)
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}

# Hand-assemble the two-operand SHA extension instructions
# (sha1nexte/sha1msg1/sha1msg2) as raw ".byte" sequences (0f 38 /r),
# prefixing REX when %xmm8..%xmm15 are used.  Unknown instructions or
# operand forms are passed through verbatim.
sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);	# REX.R, second (reg) operand
	$rex|=0x01			if ($1>=8);	# REX.B, first (r/m) operand
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}

# Post-process the generated code: expand `...` compile-time
# expressions and translate SHA extension mnemonics into .byte
# sequences, then print the final assembly to stdout.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
# buffered write errors only surface at close; don't lose them
close STDOUT or die "error closing STDOUT: $!";