#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate code this
# fast. The only thing which is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, in
# the latter on 64-bit ones. All I had to do was get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], approaches 4 instructions per CPU
# clock cycle and runs in 1003 cycles, then 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be
# to try to offload the X[16] updates to the SSE unit, but that would
# require "deeper" loop unrolling, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# only *if* it's actually possible to noticeably improve overall ILP,
# instruction-level parallelism, on the given CPU implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are apparently not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj formulation, resulted in >=5% improvement on most CPUs, +20% for
# SHA256 and unfortunately -2% for SHA512 on P4 [which nobody should
# care about that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and should
# be used instead.] For reference, the corresponding estimated upper
# limit for SSSE3 SHA256 improvement is 28%. The fact that higher
# coefficients are observed on VIA Nano and Bulldozer has more to do
# with specifics of their architecture [which is a topic for a separate
# discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second block in the most
# significant halves. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512, plus a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
#
# (*)	whichever is best applicable;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below a certain limit makes no difference/sense; to conserve
#	space the SHA256 XOP code path is therefore omitted;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1, 8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

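# For readers cross-checking the tables above against FIPS 180-4: the scalar
# code below never computes Sigma1(e) etc. in one go, it interleaves three
# dependent "ror"s using the *differences* of the rotation amounts, and it
# computes Maj(a,b,c) through the equivalent Ch(a^b,c,b) form mentioned in
# the May 2012 note. The following pure-Perl sub is an illustrative sketch
# of one SHA-256 round written that way; it is not called anywhere in this
# generator and exists only as documentation (SHA-512 is identical modulo
# word size and rotation amounts).
sub _sha256_round_reference {
    my ($Kt,$Wt,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
    my $S1  = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);	# Sigma1(e)
    my $S0  = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);	# Sigma0(a)
    my $ch  = ($e & $f) ^ (~$e & $g);				# Ch(e,f,g)
    my $maj = $b ^ (($a ^ $b) & ($b ^ $c));			# Maj(a,b,c)=Ch(a^b,c,b)
    my $T1  = ($h + $S1 + $ch + $Kt + $Wt) & 0xffffffff;
    my $T2  = ($S0 + $maj) & 0xffffffff;
    # return the rotated state (a,b,c,d,e,f,g,h) for the next round
    return (($T1 + $T2) & 0xffffffff, $a, $b, $c,
	    ($d + $T1) & 0xffffffff, $e, $f, $g);
}
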
$_ctx="16*$SZ+0*8(%rsp)"; 167 $_inp="16*$SZ+1*8(%rsp)"; 168 $_end="16*$SZ+2*8(%rsp)"; 169 $_rsp="16*$SZ+3*8(%rsp)"; 170 $framesz="16*$SZ+4*8"; 171 172 173 sub ROUND_00_15() 174 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 175 my $STRIDE=$SZ; 176 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 177 178 $code.=<<___; 179 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 180 mov $f,$a2 181 182 xor $e,$a0 183 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 184 xor $g,$a2 # f^g 185 186 mov $T1,`$SZ*($i&0xf)`(%rsp) 187 xor $a,$a1 188 and $e,$a2 # (f^g)&e 189 190 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 191 add $h,$T1 # T1+=h 192 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 193 194 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 195 xor $e,$a0 196 add $a2,$T1 # T1+=Ch(e,f,g) 197 198 mov $a,$a2 199 add ($Tbl),$T1 # T1+=K[round] 200 xor $a,$a1 201 202 xor $b,$a2 # a^b, b^c in next round 203 ror \$$Sigma1[0],$a0 # Sigma1(e) 204 mov $b,$h 205 206 and $a2,$a3 207 ror \$$Sigma0[0],$a1 # Sigma0(a) 208 add $a0,$T1 # T1+=Sigma1(e) 209 210 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 211 add $T1,$d # d+=T1 212 add $T1,$h # h+=T1 213 214 lea $STRIDE($Tbl),$Tbl # round++ 215 ___ 216 $code.=<<___ if ($i<15); 217 add $a1,$h # h+=Sigma0(a) 218 ___ 219 ($a2,$a3) = ($a3,$a2); 220 } 221 222 sub ROUND_16_XX() 223 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 224 225 $code.=<<___; 226 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 227 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 228 229 mov $a0,$T1 230 ror \$`$sigma0[1]-$sigma0[0]`,$a0 231 add $a1,$a # modulo-scheduled h+=Sigma0(a) 232 mov $a2,$a1 233 ror \$`$sigma1[1]-$sigma1[0]`,$a2 234 235 xor $T1,$a0 236 shr \$$sigma0[2],$T1 237 ror \$$sigma0[0],$a0 238 xor $a1,$a2 239 shr \$$sigma1[2],$a1 240 241 ror \$$sigma1[0],$a2 242 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 243 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 244 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 245 246 add `$SZ*($i&0xf)`(%rsp),$T1 247 mov $e,$a0 248 add $a2,$T1 249 mov $a,$a1 250 ___ 251 &ROUND_00_15(@_); 252 } 253 254 $code=<<___; 255 .text 256 257 .extern OPENSSL_ia32cap_P 258 .globl $func 259 .type $func,\@function,3 260 .align 16 261 $func: 262 ___ 263 $code.=<<___ if ($SZ==4 || $avx); 264 lea OPENSSL_ia32cap_P(%rip),%r11 265 mov 0(%r11),%r9d 266 mov 4(%r11),%r10d 267 mov 8(%r11),%r11d 268 ___ 269 $code.=<<___ if ($SZ==4 && $shaext); 270 test \$`1<<29`,%r11d # check for SHA 271 jnz _shaext_shortcut 272 ___ 273 $code.=<<___ if ($avx && $SZ==8); 274 test \$`1<<11`,%r10d # check for XOP 275 jnz .Lxop_shortcut 276 ___ 277 $code.=<<___ if ($avx>1); 278 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 279 cmp \$`1<<8|1<<5|1<<3`,%r11d 280 je .Lavx2_shortcut 281 ___ 282 $code.=<<___ if ($avx); 283 and \$`1<<30`,%r9d # mask "Intel CPU" bit 284 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 285 or %r9d,%r10d 286 cmp \$`1<<28|1<<9|1<<30`,%r10d 287 je .Lavx_shortcut 288 ___ 289 $code.=<<___ if ($SZ==4); 290 test \$`1<<9`,%r10d 291 jnz .Lssse3_shortcut 292 ___ 293 $code.=<<___; 294 push %rbx 295 push %rbp 296 push %r12 297 push %r13 298 push %r14 299 push %r15 300 mov %rsp,%r11 # copy %rsp 301 shl \$4,%rdx # num*16 302 sub \$$framesz,%rsp 303 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 304 and \$-64,%rsp # align stack frame 305 mov $ctx,$_ctx # save ctx, 1st arg 306 mov $inp,$_inp # save inp, 2nd arh 307 mov %rdx,$_end # save end pointer, "3rd" arg 308 mov %r11,$_rsp # save copy of %rsp 309 .Lprologue: 310 311 mov $SZ*0($ctx),$A 312 mov $SZ*1($ctx),$B 313 mov $SZ*2($ctx),$C 314 mov $SZ*3($ctx),$D 315 mov $SZ*4($ctx),$E 316 mov $SZ*5($ctx),$F 317 mov $SZ*6($ctx),$G 318 mov $SZ*7($ctx),$H 319 jmp .Lloop 320 321 .align 16 322 .Lloop: 
323 mov $B,$a3 324 lea $TABLE(%rip),$Tbl 325 xor $C,$a3 # magic 326 ___ 327 for($i=0;$i<16;$i++) { 328 $code.=" mov $SZ*$i($inp),$T1\n"; 329 $code.=" mov @ROT[4],$a0\n"; 330 $code.=" mov @ROT[0],$a1\n"; 331 $code.=" bswap $T1\n"; 332 &ROUND_00_15($i,@ROT); 333 unshift(@ROT,pop(@ROT)); 334 } 335 $code.=<<___; 336 jmp .Lrounds_16_xx 337 .align 16 338 .Lrounds_16_xx: 339 ___ 340 for(;$i<32;$i++) { 341 &ROUND_16_XX($i,@ROT); 342 unshift(@ROT,pop(@ROT)); 343 } 344 345 $code.=<<___; 346 cmpb \$0,`$SZ-1`($Tbl) 347 jnz .Lrounds_16_xx 348 349 mov $_ctx,$ctx 350 add $a1,$A # modulo-scheduled h+=Sigma0(a) 351 lea 16*$SZ($inp),$inp 352 353 add $SZ*0($ctx),$A 354 add $SZ*1($ctx),$B 355 add $SZ*2($ctx),$C 356 add $SZ*3($ctx),$D 357 add $SZ*4($ctx),$E 358 add $SZ*5($ctx),$F 359 add $SZ*6($ctx),$G 360 add $SZ*7($ctx),$H 361 362 cmp $_end,$inp 363 364 mov $A,$SZ*0($ctx) 365 mov $B,$SZ*1($ctx) 366 mov $C,$SZ*2($ctx) 367 mov $D,$SZ*3($ctx) 368 mov $E,$SZ*4($ctx) 369 mov $F,$SZ*5($ctx) 370 mov $G,$SZ*6($ctx) 371 mov $H,$SZ*7($ctx) 372 jb .Lloop 373 374 mov $_rsp,%rsi 375 mov (%rsi),%r15 376 mov 8(%rsi),%r14 377 mov 16(%rsi),%r13 378 mov 24(%rsi),%r12 379 mov 32(%rsi),%rbp 380 mov 40(%rsi),%rbx 381 lea 48(%rsi),%rsp 382 .Lepilogue: 383 ret 384 .size $func,.-$func 385 ___ 386 387 if ($SZ==4) { 388 $code.=<<___; 389 .align 64 390 .type $TABLE,\@object 391 $TABLE: 392 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 393 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 394 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 395 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 396 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 397 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 398 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 399 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 400 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 401 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 402 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 403 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 404 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 405 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 406 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 407 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 408 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 409 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 410 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 411 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 412 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 413 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 414 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 415 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 416 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 417 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 418 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 419 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 420 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 421 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 422 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 423 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 424 425 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 426 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 427 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 428 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 429 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 430 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 431 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 432 ___ 433 } else { 434 $code.=<<___; 435 .align 64 
436 .type $TABLE,\@object 437 $TABLE: 438 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 439 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 440 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 441 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 442 .quad 0x3956c25bf348b538,0x59f111f1b605d019 443 .quad 0x3956c25bf348b538,0x59f111f1b605d019 444 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 445 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 446 .quad 0xd807aa98a3030242,0x12835b0145706fbe 447 .quad 0xd807aa98a3030242,0x12835b0145706fbe 448 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 449 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 450 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 451 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 452 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 453 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 454 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 455 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 456 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 457 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 458 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 459 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 460 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 461 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 462 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 463 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 464 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 465 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 466 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 467 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 468 .quad 0x06ca6351e003826f,0x142929670a0e6e70 469 .quad 0x06ca6351e003826f,0x142929670a0e6e70 470 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 471 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 472 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 473 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 474 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 475 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 476 .quad 0x81c2c92e47edaee6,0x92722c851482353b 477 .quad 0x81c2c92e47edaee6,0x92722c851482353b 478 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 479 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 480 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 481 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 482 .quad 0xd192e819d6ef5218,0xd69906245565a910 483 .quad 0xd192e819d6ef5218,0xd69906245565a910 484 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 485 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 486 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 487 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 488 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 489 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 490 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 491 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 492 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 493 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 494 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 495 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 496 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 497 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 498 .quad 0x90befffa23631e28,0xa4506cebde82bde9 499 .quad 0x90befffa23631e28,0xa4506cebde82bde9 500 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 501 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 502 .quad 0xca273eceea26619c,0xd186b8c721c0c207 503 .quad 0xca273eceea26619c,0xd186b8c721c0c207 504 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 505 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 506 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 507 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 508 .quad 0x113f9804bef90dae,0x1b710b35131c471b 509 .quad 0x113f9804bef90dae,0x1b710b35131c471b 510 .quad 0x28db77f523047d84,0x32caab7b40c72493 511 .quad 
0x28db77f523047d84,0x32caab7b40c72493 512 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 513 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 514 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 515 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 516 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 517 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 518 519 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 520 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 521 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 522 ___ 523 } 524 525 ###################################################################### 526 # SIMD code paths 527 # 528 if ($SZ==4 && $shaext) {{{ 529 ###################################################################### 530 # Intel SHA Extensions implementation of SHA256 update function. 531 # 532 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 533 534 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 535 my @MSG=map("%xmm$_",(3..6)); 536 537 $code.=<<___; 538 .type sha256_block_data_order_shaext,\@function,3 539 .align 64 540 sha256_block_data_order_shaext: 541 _shaext_shortcut: 542 ___ 543 $code.=<<___ if ($win64); 544 lea `-8-5*16`(%rsp),%rsp 545 movaps %xmm6,-8-5*16(%rax) 546 movaps %xmm7,-8-4*16(%rax) 547 movaps %xmm8,-8-3*16(%rax) 548 movaps %xmm9,-8-2*16(%rax) 549 movaps %xmm10,-8-1*16(%rax) 550 .Lprologue_shaext: 551 ___ 552 $code.=<<___; 553 lea K256+0x80(%rip),$Tbl 554 movdqu ($ctx),$ABEF # DCBA 555 movdqu 16($ctx),$CDGH # HGFE 556 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 557 558 pshufd \$0x1b,$ABEF,$Wi # ABCD 559 pshufd \$0xb1,$ABEF,$ABEF # CDAB 560 pshufd \$0x1b,$CDGH,$CDGH # EFGH 561 movdqa $TMP,$BSWAP # offload 562 palignr \$8,$CDGH,$ABEF # ABEF 563 punpcklqdq $Wi,$CDGH # CDGH 564 jmp .Loop_shaext 565 566 .align 16 567 .Loop_shaext: 568 movdqu ($inp),@MSG[0] 569 movdqu 0x10($inp),@MSG[1] 570 movdqu 0x20($inp),@MSG[2] 571 pshufb $TMP,@MSG[0] 572 movdqu 0x30($inp),@MSG[3] 573 574 movdqa 0*32-0x80($Tbl),$Wi 575 paddd @MSG[0],$Wi 576 pshufb $TMP,@MSG[1] 577 movdqa $CDGH,$CDGH_SAVE # offload 578 sha256rnds2 $ABEF,$CDGH # 0-3 579 pshufd \$0x0e,$Wi,$Wi 580 nop 581 movdqa $ABEF,$ABEF_SAVE # offload 582 sha256rnds2 $CDGH,$ABEF 583 584 movdqa 1*32-0x80($Tbl),$Wi 585 paddd @MSG[1],$Wi 586 pshufb $TMP,@MSG[2] 587 sha256rnds2 $ABEF,$CDGH # 4-7 588 pshufd \$0x0e,$Wi,$Wi 589 lea 0x40($inp),$inp 590 sha256msg1 @MSG[1],@MSG[0] 591 sha256rnds2 $CDGH,$ABEF 592 593 movdqa 2*32-0x80($Tbl),$Wi 594 paddd @MSG[2],$Wi 595 pshufb $TMP,@MSG[3] 596 sha256rnds2 $ABEF,$CDGH # 8-11 597 pshufd \$0x0e,$Wi,$Wi 598 movdqa @MSG[3],$TMP 599 palignr \$4,@MSG[2],$TMP 600 nop 601 paddd $TMP,@MSG[0] 602 sha256msg1 @MSG[2],@MSG[1] 603 sha256rnds2 $CDGH,$ABEF 604 605 movdqa 3*32-0x80($Tbl),$Wi 606 paddd @MSG[3],$Wi 607 sha256msg2 @MSG[3],@MSG[0] 608 sha256rnds2 $ABEF,$CDGH # 12-15 609 pshufd \$0x0e,$Wi,$Wi 610 movdqa @MSG[0],$TMP 611 palignr \$4,@MSG[3],$TMP 612 nop 613 paddd $TMP,@MSG[1] 614 sha256msg1 @MSG[3],@MSG[2] 615 sha256rnds2 $CDGH,$ABEF 616 ___ 617 for($i=4;$i<16-3;$i++) { 618 $code.=<<___; 619 movdqa $i*32-0x80($Tbl),$Wi 620 paddd @MSG[0],$Wi 621 sha256msg2 @MSG[0],@MSG[1] 622 sha256rnds2 $ABEF,$CDGH # 16-19... 
623 pshufd \$0x0e,$Wi,$Wi 624 movdqa @MSG[1],$TMP 625 palignr \$4,@MSG[0],$TMP 626 nop 627 paddd $TMP,@MSG[2] 628 sha256msg1 @MSG[0],@MSG[3] 629 sha256rnds2 $CDGH,$ABEF 630 ___ 631 push(@MSG,shift(@MSG)); 632 } 633 $code.=<<___; 634 movdqa 13*32-0x80($Tbl),$Wi 635 paddd @MSG[0],$Wi 636 sha256msg2 @MSG[0],@MSG[1] 637 sha256rnds2 $ABEF,$CDGH # 52-55 638 pshufd \$0x0e,$Wi,$Wi 639 movdqa @MSG[1],$TMP 640 palignr \$4,@MSG[0],$TMP 641 sha256rnds2 $CDGH,$ABEF 642 paddd $TMP,@MSG[2] 643 644 movdqa 14*32-0x80($Tbl),$Wi 645 paddd @MSG[1],$Wi 646 sha256rnds2 $ABEF,$CDGH # 56-59 647 pshufd \$0x0e,$Wi,$Wi 648 sha256msg2 @MSG[1],@MSG[2] 649 movdqa $BSWAP,$TMP 650 sha256rnds2 $CDGH,$ABEF 651 652 movdqa 15*32-0x80($Tbl),$Wi 653 paddd @MSG[2],$Wi 654 nop 655 sha256rnds2 $ABEF,$CDGH # 60-63 656 pshufd \$0x0e,$Wi,$Wi 657 dec $num 658 nop 659 sha256rnds2 $CDGH,$ABEF 660 661 paddd $CDGH_SAVE,$CDGH 662 paddd $ABEF_SAVE,$ABEF 663 jnz .Loop_shaext 664 665 pshufd \$0xb1,$CDGH,$CDGH # DCHG 666 pshufd \$0x1b,$ABEF,$TMP # FEBA 667 pshufd \$0xb1,$ABEF,$ABEF # BAFE 668 punpckhqdq $CDGH,$ABEF # DCBA 669 palignr \$8,$TMP,$CDGH # HGFE 670 671 movdqu $ABEF,($ctx) 672 movdqu $CDGH,16($ctx) 673 ___ 674 $code.=<<___ if ($win64); 675 movaps -8-5*16(%rax),%xmm6 676 movaps -8-4*16(%rax),%xmm7 677 movaps -8-3*16(%rax),%xmm8 678 movaps -8-2*16(%rax),%xmm9 679 movaps -8-1*16(%rax),%xmm10 680 mov %rax,%rsp 681 .Lepilogue_shaext: 682 ___ 683 $code.=<<___; 684 ret 685 .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 686 ___ 687 }}} 688 {{{ 689 690 my $a4=$T1; 691 my ($a,$b,$c,$d,$e,$f,$g,$h); 692 693 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 694 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 695 my $arg = pop; 696 $arg = "\$$arg" if ($arg*1 eq $arg); 697 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 698 } 699 700 sub body_00_15 () { 701 ( 702 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 703 704 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 705 '&mov ($a,$a1)', 706 '&mov ($a4,$f)', 707 708 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 709 '&xor ($a0,$e)', 710 '&xor ($a4,$g)', # f^g 711 712 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 713 '&xor ($a1,$a)', 714 '&and ($a4,$e)', # (f^g)&e 715 716 '&xor ($a0,$e)', 717 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 718 '&mov ($a2,$a)', 719 720 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 721 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 722 '&xor ($a2,$b)', # a^b, b^c in next round 723 724 '&add ($h,$a4)', # h+=Ch(e,f,g) 725 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 726 '&and ($a3,$a2)', # (b^c)&(a^b) 727 728 '&xor ($a1,$a)', 729 '&add ($h,$a0)', # h+=Sigma1(e) 730 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 731 732 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 733 '&add ($d,$h)', # d+=h 734 '&add ($h,$a3)', # h+=Maj(a,b,c) 735 736 '&mov ($a0,$d)', 737 '&add ($a1,$h);'. 
# h+=Sigma0(a) 738 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 739 ); 740 } 741 742 ###################################################################### 743 # SSSE3 code path 744 # 745 if ($SZ==4) { # SHA256 only 746 my @X = map("%xmm$_",(0..3)); 747 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 748 749 $code.=<<___; 750 .type ${func}_ssse3,\@function,3 751 .align 64 752 ${func}_ssse3: 753 .Lssse3_shortcut: 754 push %rbx 755 push %rbp 756 push %r12 757 push %r13 758 push %r14 759 push %r15 760 mov %rsp,%r11 # copy %rsp 761 shl \$4,%rdx # num*16 762 sub \$`$framesz+$win64*16*4`,%rsp 763 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 764 and \$-64,%rsp # align stack frame 765 mov $ctx,$_ctx # save ctx, 1st arg 766 mov $inp,$_inp # save inp, 2nd arh 767 mov %rdx,$_end # save end pointer, "3rd" arg 768 mov %r11,$_rsp # save copy of %rsp 769 ___ 770 $code.=<<___ if ($win64); 771 movaps %xmm6,16*$SZ+32(%rsp) 772 movaps %xmm7,16*$SZ+48(%rsp) 773 movaps %xmm8,16*$SZ+64(%rsp) 774 movaps %xmm9,16*$SZ+80(%rsp) 775 ___ 776 $code.=<<___; 777 .Lprologue_ssse3: 778 779 mov $SZ*0($ctx),$A 780 mov $SZ*1($ctx),$B 781 mov $SZ*2($ctx),$C 782 mov $SZ*3($ctx),$D 783 mov $SZ*4($ctx),$E 784 mov $SZ*5($ctx),$F 785 mov $SZ*6($ctx),$G 786 mov $SZ*7($ctx),$H 787 ___ 788 789 $code.=<<___; 790 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 791 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 792 jmp .Lloop_ssse3 793 .align 16 794 .Lloop_ssse3: 795 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 796 movdqu 0x00($inp),@X[0] 797 movdqu 0x10($inp),@X[1] 798 movdqu 0x20($inp),@X[2] 799 pshufb $t3,@X[0] 800 movdqu 0x30($inp),@X[3] 801 lea $TABLE(%rip),$Tbl 802 pshufb $t3,@X[1] 803 movdqa 0x00($Tbl),$t0 804 movdqa 0x20($Tbl),$t1 805 pshufb $t3,@X[2] 806 paddd @X[0],$t0 807 movdqa 0x40($Tbl),$t2 808 pshufb $t3,@X[3] 809 movdqa 0x60($Tbl),$t3 810 paddd @X[1],$t1 811 paddd @X[2],$t2 812 paddd @X[3],$t3 813 movdqa $t0,0x00(%rsp) 814 mov $A,$a1 815 movdqa $t1,0x10(%rsp) 816 mov $B,$a3 817 movdqa $t2,0x20(%rsp) 818 xor $C,$a3 # magic 819 movdqa $t3,0x30(%rsp) 820 mov $E,$a0 821 jmp .Lssse3_00_47 822 823 .align 16 824 .Lssse3_00_47: 825 sub \$`-16*2*$SZ`,$Tbl # size optimization 826 ___ 827 sub Xupdate_256_SSSE3 () { 828 ( 829 '&movdqa ($t0,@X[1]);', 830 '&movdqa ($t3,@X[3])', 831 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 832 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 833 '&movdqa ($t1,$t0)', 834 '&movdqa ($t2,$t0);', 835 '&psrld ($t0,$sigma0[2])', 836 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 837 '&psrld ($t2,$sigma0[0])', 838 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 839 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 840 '&pxor ($t0,$t2)', 841 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 842 '&pxor ($t0,$t1)', 843 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
844 '&pxor ($t0,$t2);', 845 '&movdqa ($t2,$t3)', 846 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 847 '&psrld ($t3,$sigma1[2])', 848 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 849 '&psrlq ($t2,$sigma1[0])', 850 '&pxor ($t3,$t2);', 851 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 852 '&pxor ($t3,$t2)', 853 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 854 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 855 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 856 '&movdqa ($t2,$t3);', 857 '&psrld ($t3,$sigma1[2])', 858 '&psrlq ($t2,$sigma1[0])', 859 '&pxor ($t3,$t2);', 860 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 861 '&pxor ($t3,$t2);', 862 '&movdqa ($t2,16*2*$j."($Tbl)")', 863 '&pshufb ($t3,$t5)', 864 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 865 ); 866 } 867 868 sub SSSE3_256_00_47 () { 869 my $j = shift; 870 my $body = shift; 871 my @X = @_; 872 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 873 874 if (0) { 875 foreach (Xupdate_256_SSSE3()) { # 36 instructions 876 eval; 877 eval(shift(@insns)); 878 eval(shift(@insns)); 879 eval(shift(@insns)); 880 } 881 } else { # squeeze extra 4% on Westmere and 19% on Atom 882 eval(shift(@insns)); #@ 883 &movdqa ($t0,@X[1]); 884 eval(shift(@insns)); 885 eval(shift(@insns)); 886 &movdqa ($t3,@X[3]); 887 eval(shift(@insns)); #@ 888 eval(shift(@insns)); 889 eval(shift(@insns)); 890 eval(shift(@insns)); #@ 891 eval(shift(@insns)); 892 &palignr ($t0,@X[0],$SZ); # X[1..4] 893 eval(shift(@insns)); 894 eval(shift(@insns)); 895 &palignr ($t3,@X[2],$SZ); # X[9..12] 896 eval(shift(@insns)); 897 eval(shift(@insns)); 898 eval(shift(@insns)); 899 eval(shift(@insns)); #@ 900 &movdqa ($t1,$t0); 901 eval(shift(@insns)); 902 eval(shift(@insns)); 903 &movdqa ($t2,$t0); 904 eval(shift(@insns)); #@ 905 eval(shift(@insns)); 906 &psrld ($t0,$sigma0[2]); 907 eval(shift(@insns)); 908 eval(shift(@insns)); 909 eval(shift(@insns)); 910 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 911 eval(shift(@insns)); #@ 912 eval(shift(@insns)); 913 &psrld ($t2,$sigma0[0]); 914 eval(shift(@insns)); 915 eval(shift(@insns)); 916 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 917 eval(shift(@insns)); 918 eval(shift(@insns)); #@ 919 &pslld ($t1,8*$SZ-$sigma0[1]); 920 eval(shift(@insns)); 921 eval(shift(@insns)); 922 &pxor ($t0,$t2); 923 eval(shift(@insns)); #@ 924 eval(shift(@insns)); 925 eval(shift(@insns)); 926 eval(shift(@insns)); #@ 927 &psrld ($t2,$sigma0[1]-$sigma0[0]); 928 eval(shift(@insns)); 929 &pxor ($t0,$t1); 930 eval(shift(@insns)); 931 eval(shift(@insns)); 932 &pslld ($t1,$sigma0[1]-$sigma0[0]); 933 eval(shift(@insns)); 934 eval(shift(@insns)); 935 &pxor ($t0,$t2); 936 eval(shift(@insns)); 937 eval(shift(@insns)); #@ 938 &movdqa ($t2,$t3); 939 eval(shift(@insns)); 940 eval(shift(@insns)); 941 &pxor ($t0,$t1); # sigma0(X[1..4]) 942 eval(shift(@insns)); #@ 943 eval(shift(@insns)); 944 eval(shift(@insns)); 945 &psrld ($t3,$sigma1[2]); 946 eval(shift(@insns)); 947 eval(shift(@insns)); 948 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 949 eval(shift(@insns)); #@ 950 eval(shift(@insns)); 951 &psrlq ($t2,$sigma1[0]); 952 eval(shift(@insns)); 953 eval(shift(@insns)); 954 eval(shift(@insns)); 955 &pxor ($t3,$t2); 956 eval(shift(@insns)); #@ 957 eval(shift(@insns)); 958 eval(shift(@insns)); 959 eval(shift(@insns)); #@ 960 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 961 eval(shift(@insns)); 962 eval(shift(@insns)); 963 &pxor ($t3,$t2); 964 eval(shift(@insns)); #@ 965 eval(shift(@insns)); 966 eval(shift(@insns)); 967 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 968 &pshufd ($t3,$t3,0b10000000); 969 
eval(shift(@insns)); 970 eval(shift(@insns)); 971 eval(shift(@insns)); 972 &psrldq ($t3,8); 973 eval(shift(@insns)); 974 eval(shift(@insns)); #@ 975 eval(shift(@insns)); 976 eval(shift(@insns)); 977 eval(shift(@insns)); #@ 978 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 979 eval(shift(@insns)); 980 eval(shift(@insns)); 981 eval(shift(@insns)); 982 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 983 eval(shift(@insns)); 984 eval(shift(@insns)); #@ 985 eval(shift(@insns)); 986 &movdqa ($t2,$t3); 987 eval(shift(@insns)); 988 eval(shift(@insns)); 989 &psrld ($t3,$sigma1[2]); 990 eval(shift(@insns)); 991 eval(shift(@insns)); #@ 992 &psrlq ($t2,$sigma1[0]); 993 eval(shift(@insns)); 994 eval(shift(@insns)); 995 &pxor ($t3,$t2); 996 eval(shift(@insns)); #@ 997 eval(shift(@insns)); 998 eval(shift(@insns)); 999 eval(shift(@insns)); #@ 1000 eval(shift(@insns)); 1001 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1002 eval(shift(@insns)); 1003 eval(shift(@insns)); 1004 eval(shift(@insns)); 1005 &pxor ($t3,$t2); 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); 1008 eval(shift(@insns)); #@ 1009 #&pshufb ($t3,$t5); 1010 &pshufd ($t3,$t3,0b00001000); 1011 eval(shift(@insns)); 1012 eval(shift(@insns)); 1013 &movdqa ($t2,16*2*$j."($Tbl)"); 1014 eval(shift(@insns)); #@ 1015 eval(shift(@insns)); 1016 &pslldq ($t3,8); 1017 eval(shift(@insns)); 1018 eval(shift(@insns)); 1019 eval(shift(@insns)); 1020 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1021 eval(shift(@insns)); #@ 1022 eval(shift(@insns)); 1023 eval(shift(@insns)); 1024 } 1025 &paddd ($t2,@X[0]); 1026 foreach (@insns) { eval; } # remaining instructions 1027 &movdqa (16*$j."(%rsp)",$t2); 1028 } 1029 1030 for ($i=0,$j=0; $j<4; $j++) { 1031 &SSSE3_256_00_47($j,\&body_00_15,@X); 1032 push(@X,shift(@X)); # rotate(@X) 1033 } 1034 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1035 &jne (".Lssse3_00_47"); 1036 1037 for ($i=0; $i<16; ) { 1038 foreach(body_00_15()) { eval; } 1039 } 1040 $code.=<<___; 1041 mov $_ctx,$ctx 1042 mov $a1,$A 1043 1044 add $SZ*0($ctx),$A 1045 lea 16*$SZ($inp),$inp 1046 add $SZ*1($ctx),$B 1047 add $SZ*2($ctx),$C 1048 add $SZ*3($ctx),$D 1049 add $SZ*4($ctx),$E 1050 add $SZ*5($ctx),$F 1051 add $SZ*6($ctx),$G 1052 add $SZ*7($ctx),$H 1053 1054 cmp $_end,$inp 1055 1056 mov $A,$SZ*0($ctx) 1057 mov $B,$SZ*1($ctx) 1058 mov $C,$SZ*2($ctx) 1059 mov $D,$SZ*3($ctx) 1060 mov $E,$SZ*4($ctx) 1061 mov $F,$SZ*5($ctx) 1062 mov $G,$SZ*6($ctx) 1063 mov $H,$SZ*7($ctx) 1064 jb .Lloop_ssse3 1065 1066 mov $_rsp,%rsi 1067 ___ 1068 $code.=<<___ if ($win64); 1069 movaps 16*$SZ+32(%rsp),%xmm6 1070 movaps 16*$SZ+48(%rsp),%xmm7 1071 movaps 16*$SZ+64(%rsp),%xmm8 1072 movaps 16*$SZ+80(%rsp),%xmm9 1073 ___ 1074 $code.=<<___; 1075 mov (%rsi),%r15 1076 mov 8(%rsi),%r14 1077 mov 16(%rsi),%r13 1078 mov 24(%rsi),%r12 1079 mov 32(%rsi),%rbp 1080 mov 40(%rsi),%rbx 1081 lea 48(%rsi),%rsp 1082 .Lepilogue_ssse3: 1083 ret 1084 .size ${func}_ssse3,.-${func}_ssse3 1085 ___ 1086 } 1087 1088 if ($avx) {{ 1089 ###################################################################### 1090 # XOP code path 1091 # 1092 if ($SZ==8) { # SHA512 only 1093 $code.=<<___; 1094 .type ${func}_xop,\@function,3 1095 .align 64 1096 ${func}_xop: 1097 .Lxop_shortcut: 1098 push %rbx 1099 push %rbp 1100 push %r12 1101 push %r13 1102 push %r14 1103 push %r15 1104 mov %rsp,%r11 # copy %rsp 1105 shl \$4,%rdx # num*16 1106 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1107 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1108 and \$-64,%rsp # align stack frame 1109 mov $ctx,$_ctx # save ctx, 1st arg 1110 mov $inp,$_inp # save 
inp, 2nd arh 1111 mov %rdx,$_end # save end pointer, "3rd" arg 1112 mov %r11,$_rsp # save copy of %rsp 1113 ___ 1114 $code.=<<___ if ($win64); 1115 movaps %xmm6,16*$SZ+32(%rsp) 1116 movaps %xmm7,16*$SZ+48(%rsp) 1117 movaps %xmm8,16*$SZ+64(%rsp) 1118 movaps %xmm9,16*$SZ+80(%rsp) 1119 ___ 1120 $code.=<<___ if ($win64 && $SZ>4); 1121 movaps %xmm10,16*$SZ+96(%rsp) 1122 movaps %xmm11,16*$SZ+112(%rsp) 1123 ___ 1124 $code.=<<___; 1125 .Lprologue_xop: 1126 1127 vzeroupper 1128 mov $SZ*0($ctx),$A 1129 mov $SZ*1($ctx),$B 1130 mov $SZ*2($ctx),$C 1131 mov $SZ*3($ctx),$D 1132 mov $SZ*4($ctx),$E 1133 mov $SZ*5($ctx),$F 1134 mov $SZ*6($ctx),$G 1135 mov $SZ*7($ctx),$H 1136 jmp .Lloop_xop 1137 ___ 1138 if ($SZ==4) { # SHA256 1139 my @X = map("%xmm$_",(0..3)); 1140 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1141 1142 $code.=<<___; 1143 .align 16 1144 .Lloop_xop: 1145 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1146 vmovdqu 0x00($inp),@X[0] 1147 vmovdqu 0x10($inp),@X[1] 1148 vmovdqu 0x20($inp),@X[2] 1149 vmovdqu 0x30($inp),@X[3] 1150 vpshufb $t3,@X[0],@X[0] 1151 lea $TABLE(%rip),$Tbl 1152 vpshufb $t3,@X[1],@X[1] 1153 vpshufb $t3,@X[2],@X[2] 1154 vpaddd 0x00($Tbl),@X[0],$t0 1155 vpshufb $t3,@X[3],@X[3] 1156 vpaddd 0x20($Tbl),@X[1],$t1 1157 vpaddd 0x40($Tbl),@X[2],$t2 1158 vpaddd 0x60($Tbl),@X[3],$t3 1159 vmovdqa $t0,0x00(%rsp) 1160 mov $A,$a1 1161 vmovdqa $t1,0x10(%rsp) 1162 mov $B,$a3 1163 vmovdqa $t2,0x20(%rsp) 1164 xor $C,$a3 # magic 1165 vmovdqa $t3,0x30(%rsp) 1166 mov $E,$a0 1167 jmp .Lxop_00_47 1168 1169 .align 16 1170 .Lxop_00_47: 1171 sub \$`-16*2*$SZ`,$Tbl # size optimization 1172 ___ 1173 sub XOP_256_00_47 () { 1174 my $j = shift; 1175 my $body = shift; 1176 my @X = @_; 1177 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1178 1179 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1180 eval(shift(@insns)); 1181 eval(shift(@insns)); 1182 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1183 eval(shift(@insns)); 1184 eval(shift(@insns)); 1185 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1186 eval(shift(@insns)); 1187 eval(shift(@insns)); 1188 &vpsrld ($t0,$t0,$sigma0[2]); 1189 eval(shift(@insns)); 1190 eval(shift(@insns)); 1191 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1192 eval(shift(@insns)); 1193 eval(shift(@insns)); 1194 eval(shift(@insns)); 1195 eval(shift(@insns)); 1196 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1197 eval(shift(@insns)); 1198 eval(shift(@insns)); 1199 &vpxor ($t0,$t0,$t1); 1200 eval(shift(@insns)); 1201 eval(shift(@insns)); 1202 eval(shift(@insns)); 1203 eval(shift(@insns)); 1204 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1205 eval(shift(@insns)); 1206 eval(shift(@insns)); 1207 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1208 eval(shift(@insns)); 1209 eval(shift(@insns)); 1210 &vpsrld ($t2,@X[3],$sigma1[2]); 1211 eval(shift(@insns)); 1212 eval(shift(@insns)); 1213 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1214 eval(shift(@insns)); 1215 eval(shift(@insns)); 1216 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1217 eval(shift(@insns)); 1218 eval(shift(@insns)); 1219 &vpxor ($t3,$t3,$t2); 1220 eval(shift(@insns)); 1221 eval(shift(@insns)); 1222 eval(shift(@insns)); 1223 eval(shift(@insns)); 1224 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1225 eval(shift(@insns)); 1226 eval(shift(@insns)); 1227 eval(shift(@insns)); 1228 eval(shift(@insns)); 1229 &vpsrldq ($t3,$t3,8); 1230 eval(shift(@insns)); 1231 eval(shift(@insns)); 1232 eval(shift(@insns)); 1233 eval(shift(@insns)); 1234 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1235 eval(shift(@insns)); 1236 eval(shift(@insns)); 1237 
eval(shift(@insns)); 1238 eval(shift(@insns)); 1239 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1240 eval(shift(@insns)); 1241 eval(shift(@insns)); 1242 &vpsrld ($t2,@X[0],$sigma1[2]); 1243 eval(shift(@insns)); 1244 eval(shift(@insns)); 1245 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1246 eval(shift(@insns)); 1247 eval(shift(@insns)); 1248 &vpxor ($t3,$t3,$t2); 1249 eval(shift(@insns)); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 eval(shift(@insns)); 1257 eval(shift(@insns)); 1258 &vpslldq ($t3,$t3,8); # 22 instructions 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 eval(shift(@insns)); 1262 eval(shift(@insns)); 1263 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1264 eval(shift(@insns)); 1265 eval(shift(@insns)); 1266 eval(shift(@insns)); 1267 eval(shift(@insns)); 1268 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1269 foreach (@insns) { eval; } # remaining instructions 1270 &vmovdqa (16*$j."(%rsp)",$t2); 1271 } 1272 1273 for ($i=0,$j=0; $j<4; $j++) { 1274 &XOP_256_00_47($j,\&body_00_15,@X); 1275 push(@X,shift(@X)); # rotate(@X) 1276 } 1277 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1278 &jne (".Lxop_00_47"); 1279 1280 for ($i=0; $i<16; ) { 1281 foreach(body_00_15()) { eval; } 1282 } 1283 1284 } else { # SHA512 1285 my @X = map("%xmm$_",(0..7)); 1286 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1287 1288 $code.=<<___; 1289 .align 16 1290 .Lloop_xop: 1291 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1292 vmovdqu 0x00($inp),@X[0] 1293 lea $TABLE+0x80(%rip),$Tbl # size optimization 1294 vmovdqu 0x10($inp),@X[1] 1295 vmovdqu 0x20($inp),@X[2] 1296 vpshufb $t3,@X[0],@X[0] 1297 vmovdqu 0x30($inp),@X[3] 1298 vpshufb $t3,@X[1],@X[1] 1299 vmovdqu 0x40($inp),@X[4] 1300 vpshufb $t3,@X[2],@X[2] 1301 vmovdqu 0x50($inp),@X[5] 1302 vpshufb $t3,@X[3],@X[3] 1303 vmovdqu 0x60($inp),@X[6] 1304 vpshufb $t3,@X[4],@X[4] 1305 vmovdqu 0x70($inp),@X[7] 1306 vpshufb $t3,@X[5],@X[5] 1307 vpaddq -0x80($Tbl),@X[0],$t0 1308 vpshufb $t3,@X[6],@X[6] 1309 vpaddq -0x60($Tbl),@X[1],$t1 1310 vpshufb $t3,@X[7],@X[7] 1311 vpaddq -0x40($Tbl),@X[2],$t2 1312 vpaddq -0x20($Tbl),@X[3],$t3 1313 vmovdqa $t0,0x00(%rsp) 1314 vpaddq 0x00($Tbl),@X[4],$t0 1315 vmovdqa $t1,0x10(%rsp) 1316 vpaddq 0x20($Tbl),@X[5],$t1 1317 vmovdqa $t2,0x20(%rsp) 1318 vpaddq 0x40($Tbl),@X[6],$t2 1319 vmovdqa $t3,0x30(%rsp) 1320 vpaddq 0x60($Tbl),@X[7],$t3 1321 vmovdqa $t0,0x40(%rsp) 1322 mov $A,$a1 1323 vmovdqa $t1,0x50(%rsp) 1324 mov $B,$a3 1325 vmovdqa $t2,0x60(%rsp) 1326 xor $C,$a3 # magic 1327 vmovdqa $t3,0x70(%rsp) 1328 mov $E,$a0 1329 jmp .Lxop_00_47 1330 1331 .align 16 1332 .Lxop_00_47: 1333 add \$`16*2*$SZ`,$Tbl 1334 ___ 1335 sub XOP_512_00_47 () { 1336 my $j = shift; 1337 my $body = shift; 1338 my @X = @_; 1339 my @insns = (&$body,&$body); # 52 instructions 1340 1341 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1342 eval(shift(@insns)); 1343 eval(shift(@insns)); 1344 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1345 eval(shift(@insns)); 1346 eval(shift(@insns)); 1347 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1348 eval(shift(@insns)); 1349 eval(shift(@insns)); 1350 &vpsrlq ($t0,$t0,$sigma0[2]); 1351 eval(shift(@insns)); 1352 eval(shift(@insns)); 1353 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1354 eval(shift(@insns)); 1355 eval(shift(@insns)); 1356 eval(shift(@insns)); 1357 eval(shift(@insns)); 1358 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1359 eval(shift(@insns)); 1360 eval(shift(@insns)); 1361 &vpxor ($t0,$t0,$t1); 1362 
eval(shift(@insns)); 1363 eval(shift(@insns)); 1364 eval(shift(@insns)); 1365 eval(shift(@insns)); 1366 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1367 eval(shift(@insns)); 1368 eval(shift(@insns)); 1369 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1370 eval(shift(@insns)); 1371 eval(shift(@insns)); 1372 &vpsrlq ($t2,@X[7],$sigma1[2]); 1373 eval(shift(@insns)); 1374 eval(shift(@insns)); 1375 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1376 eval(shift(@insns)); 1377 eval(shift(@insns)); 1378 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1379 eval(shift(@insns)); 1380 eval(shift(@insns)); 1381 &vpxor ($t3,$t3,$t2); 1382 eval(shift(@insns)); 1383 eval(shift(@insns)); 1384 eval(shift(@insns)); 1385 eval(shift(@insns)); 1386 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1387 eval(shift(@insns)); 1388 eval(shift(@insns)); 1389 eval(shift(@insns)); 1390 eval(shift(@insns)); 1391 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1392 eval(shift(@insns)); 1393 eval(shift(@insns)); 1394 eval(shift(@insns)); 1395 eval(shift(@insns)); 1396 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1397 foreach (@insns) { eval; } # remaining instructions 1398 &vmovdqa (16*$j."(%rsp)",$t2); 1399 } 1400 1401 for ($i=0,$j=0; $j<8; $j++) { 1402 &XOP_512_00_47($j,\&body_00_15,@X); 1403 push(@X,shift(@X)); # rotate(@X) 1404 } 1405 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1406 &jne (".Lxop_00_47"); 1407 1408 for ($i=0; $i<16; ) { 1409 foreach(body_00_15()) { eval; } 1410 } 1411 } 1412 $code.=<<___; 1413 mov $_ctx,$ctx 1414 mov $a1,$A 1415 1416 add $SZ*0($ctx),$A 1417 lea 16*$SZ($inp),$inp 1418 add $SZ*1($ctx),$B 1419 add $SZ*2($ctx),$C 1420 add $SZ*3($ctx),$D 1421 add $SZ*4($ctx),$E 1422 add $SZ*5($ctx),$F 1423 add $SZ*6($ctx),$G 1424 add $SZ*7($ctx),$H 1425 1426 cmp $_end,$inp 1427 1428 mov $A,$SZ*0($ctx) 1429 mov $B,$SZ*1($ctx) 1430 mov $C,$SZ*2($ctx) 1431 mov $D,$SZ*3($ctx) 1432 mov $E,$SZ*4($ctx) 1433 mov $F,$SZ*5($ctx) 1434 mov $G,$SZ*6($ctx) 1435 mov $H,$SZ*7($ctx) 1436 jb .Lloop_xop 1437 1438 mov $_rsp,%rsi 1439 vzeroupper 1440 ___ 1441 $code.=<<___ if ($win64); 1442 movaps 16*$SZ+32(%rsp),%xmm6 1443 movaps 16*$SZ+48(%rsp),%xmm7 1444 movaps 16*$SZ+64(%rsp),%xmm8 1445 movaps 16*$SZ+80(%rsp),%xmm9 1446 ___ 1447 $code.=<<___ if ($win64 && $SZ>4); 1448 movaps 16*$SZ+96(%rsp),%xmm10 1449 movaps 16*$SZ+112(%rsp),%xmm11 1450 ___ 1451 $code.=<<___; 1452 mov (%rsi),%r15 1453 mov 8(%rsi),%r14 1454 mov 16(%rsi),%r13 1455 mov 24(%rsi),%r12 1456 mov 32(%rsi),%rbp 1457 mov 40(%rsi),%rbx 1458 lea 48(%rsi),%rsp 1459 .Lepilogue_xop: 1460 ret 1461 .size ${func}_xop,.-${func}_xop 1462 ___ 1463 } 1464 ###################################################################### 1465 # AVX+shrd code path 1466 # 1467 local *ror = sub { &shrd(@_[0],@_) }; 1468 1469 $code.=<<___; 1470 .type ${func}_avx,\@function,3 1471 .align 64 1472 ${func}_avx: 1473 .Lavx_shortcut: 1474 push %rbx 1475 push %rbp 1476 push %r12 1477 push %r13 1478 push %r14 1479 push %r15 1480 mov %rsp,%r11 # copy %rsp 1481 shl \$4,%rdx # num*16 1482 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1483 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1484 and \$-64,%rsp # align stack frame 1485 mov $ctx,$_ctx # save ctx, 1st arg 1486 mov $inp,$_inp # save inp, 2nd arh 1487 mov %rdx,$_end # save end pointer, "3rd" arg 1488 mov %r11,$_rsp # save copy of %rsp 1489 ___ 1490 $code.=<<___ if ($win64); 1491 movaps %xmm6,16*$SZ+32(%rsp) 1492 movaps %xmm7,16*$SZ+48(%rsp) 1493 movaps %xmm8,16*$SZ+64(%rsp) 1494 movaps %xmm9,16*$SZ+80(%rsp) 1495 ___ 1496 $code.=<<___ if ($win64 && $SZ>4); 1497 movaps 
%xmm10,16*$SZ+96(%rsp) 1498 movaps %xmm11,16*$SZ+112(%rsp) 1499 ___ 1500 $code.=<<___; 1501 .Lprologue_avx: 1502 1503 vzeroupper 1504 mov $SZ*0($ctx),$A 1505 mov $SZ*1($ctx),$B 1506 mov $SZ*2($ctx),$C 1507 mov $SZ*3($ctx),$D 1508 mov $SZ*4($ctx),$E 1509 mov $SZ*5($ctx),$F 1510 mov $SZ*6($ctx),$G 1511 mov $SZ*7($ctx),$H 1512 ___ 1513 if ($SZ==4) { # SHA256 1514 my @X = map("%xmm$_",(0..3)); 1515 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1516 1517 $code.=<<___; 1518 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1519 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1520 jmp .Lloop_avx 1521 .align 16 1522 .Lloop_avx: 1523 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1524 vmovdqu 0x00($inp),@X[0] 1525 vmovdqu 0x10($inp),@X[1] 1526 vmovdqu 0x20($inp),@X[2] 1527 vmovdqu 0x30($inp),@X[3] 1528 vpshufb $t3,@X[0],@X[0] 1529 lea $TABLE(%rip),$Tbl 1530 vpshufb $t3,@X[1],@X[1] 1531 vpshufb $t3,@X[2],@X[2] 1532 vpaddd 0x00($Tbl),@X[0],$t0 1533 vpshufb $t3,@X[3],@X[3] 1534 vpaddd 0x20($Tbl),@X[1],$t1 1535 vpaddd 0x40($Tbl),@X[2],$t2 1536 vpaddd 0x60($Tbl),@X[3],$t3 1537 vmovdqa $t0,0x00(%rsp) 1538 mov $A,$a1 1539 vmovdqa $t1,0x10(%rsp) 1540 mov $B,$a3 1541 vmovdqa $t2,0x20(%rsp) 1542 xor $C,$a3 # magic 1543 vmovdqa $t3,0x30(%rsp) 1544 mov $E,$a0 1545 jmp .Lavx_00_47 1546 1547 .align 16 1548 .Lavx_00_47: 1549 sub \$`-16*2*$SZ`,$Tbl # size optimization 1550 ___ 1551 sub Xupdate_256_AVX () { 1552 ( 1553 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1554 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1555 '&vpsrld ($t2,$t0,$sigma0[0]);', 1556 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1557 '&vpsrld ($t3,$t0,$sigma0[2])', 1558 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1559 '&vpxor ($t0,$t3,$t2)', 1560 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1561 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1562 '&vpxor ($t0,$t0,$t1)', 1563 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1564 '&vpxor ($t0,$t0,$t2)', 1565 '&vpsrld ($t2,$t3,$sigma1[2]);', 1566 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1567 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1568 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1569 '&vpxor ($t2,$t2,$t3);', 1570 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1571 '&vpxor ($t2,$t2,$t3)', 1572 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1573 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1574 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1575 '&vpsrld ($t2,$t3,$sigma1[2])', 1576 '&vpsrlq ($t3,$t3,$sigma1[0])', 1577 '&vpxor ($t2,$t2,$t3);', 1578 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1579 '&vpxor ($t2,$t2,$t3)', 1580 '&vpshufb ($t2,$t2,$t5)', 1581 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1582 ); 1583 } 1584 1585 sub AVX_256_00_47 () { 1586 my $j = shift; 1587 my $body = shift; 1588 my @X = @_; 1589 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1590 1591 foreach (Xupdate_256_AVX()) { # 29 instructions 1592 eval; 1593 eval(shift(@insns)); 1594 eval(shift(@insns)); 1595 eval(shift(@insns)); 1596 } 1597 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1598 foreach (@insns) { eval; } # remaining instructions 1599 &vmovdqa (16*$j."(%rsp)",$t2); 1600 } 1601 1602 for ($i=0,$j=0; $j<4; $j++) { 1603 &AVX_256_00_47($j,\&body_00_15,@X); 1604 push(@X,shift(@X)); # rotate(@X) 1605 } 1606 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1607 &jne (".Lavx_00_47"); 1608 1609 for ($i=0; $i<16; ) { 1610 foreach(body_00_15()) { eval; } 1611 } 1612 1613 } else { # SHA512 1614 my @X = map("%xmm$_",(0..7)); 1615 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1616 1617 $code.=<<___; 1618 jmp .Lloop_avx 
1619 .align 16 1620 .Lloop_avx: 1621 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1622 vmovdqu 0x00($inp),@X[0] 1623 lea $TABLE+0x80(%rip),$Tbl # size optimization 1624 vmovdqu 0x10($inp),@X[1] 1625 vmovdqu 0x20($inp),@X[2] 1626 vpshufb $t3,@X[0],@X[0] 1627 vmovdqu 0x30($inp),@X[3] 1628 vpshufb $t3,@X[1],@X[1] 1629 vmovdqu 0x40($inp),@X[4] 1630 vpshufb $t3,@X[2],@X[2] 1631 vmovdqu 0x50($inp),@X[5] 1632 vpshufb $t3,@X[3],@X[3] 1633 vmovdqu 0x60($inp),@X[6] 1634 vpshufb $t3,@X[4],@X[4] 1635 vmovdqu 0x70($inp),@X[7] 1636 vpshufb $t3,@X[5],@X[5] 1637 vpaddq -0x80($Tbl),@X[0],$t0 1638 vpshufb $t3,@X[6],@X[6] 1639 vpaddq -0x60($Tbl),@X[1],$t1 1640 vpshufb $t3,@X[7],@X[7] 1641 vpaddq -0x40($Tbl),@X[2],$t2 1642 vpaddq -0x20($Tbl),@X[3],$t3 1643 vmovdqa $t0,0x00(%rsp) 1644 vpaddq 0x00($Tbl),@X[4],$t0 1645 vmovdqa $t1,0x10(%rsp) 1646 vpaddq 0x20($Tbl),@X[5],$t1 1647 vmovdqa $t2,0x20(%rsp) 1648 vpaddq 0x40($Tbl),@X[6],$t2 1649 vmovdqa $t3,0x30(%rsp) 1650 vpaddq 0x60($Tbl),@X[7],$t3 1651 vmovdqa $t0,0x40(%rsp) 1652 mov $A,$a1 1653 vmovdqa $t1,0x50(%rsp) 1654 mov $B,$a3 1655 vmovdqa $t2,0x60(%rsp) 1656 xor $C,$a3 # magic 1657 vmovdqa $t3,0x70(%rsp) 1658 mov $E,$a0 1659 jmp .Lavx_00_47 1660 1661 .align 16 1662 .Lavx_00_47: 1663 add \$`16*2*$SZ`,$Tbl 1664 ___ 1665 sub Xupdate_512_AVX () { 1666 ( 1667 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1668 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1669 '&vpsrlq ($t2,$t0,$sigma0[0])', 1670 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1671 '&vpsrlq ($t3,$t0,$sigma0[2])', 1672 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1673 '&vpxor ($t0,$t3,$t2)', 1674 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1675 '&vpxor ($t0,$t0,$t1)', 1676 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1677 '&vpxor ($t0,$t0,$t2)', 1678 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1679 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1680 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1681 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1682 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1683 '&vpxor ($t3,$t3,$t2)', 1684 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1685 '&vpxor ($t3,$t3,$t1)', 1686 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1687 '&vpxor ($t3,$t3,$t2)', 1688 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1689 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1690 ); 1691 } 1692 1693 sub AVX_512_00_47 () { 1694 my $j = shift; 1695 my $body = shift; 1696 my @X = @_; 1697 my @insns = (&$body,&$body); # 52 instructions 1698 1699 foreach (Xupdate_512_AVX()) { # 23 instructions 1700 eval; 1701 eval(shift(@insns)); 1702 eval(shift(@insns)); 1703 } 1704 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1705 foreach (@insns) { eval; } # remaining instructions 1706 &vmovdqa (16*$j."(%rsp)",$t2); 1707 } 1708 1709 for ($i=0,$j=0; $j<8; $j++) { 1710 &AVX_512_00_47($j,\&body_00_15,@X); 1711 push(@X,shift(@X)); # rotate(@X) 1712 } 1713 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1714 &jne (".Lavx_00_47"); 1715 1716 for ($i=0; $i<16; ) { 1717 foreach(body_00_15()) { eval; } 1718 } 1719 } 1720 $code.=<<___; 1721 mov $_ctx,$ctx 1722 mov $a1,$A 1723 1724 add $SZ*0($ctx),$A 1725 lea 16*$SZ($inp),$inp 1726 add $SZ*1($ctx),$B 1727 add $SZ*2($ctx),$C 1728 add $SZ*3($ctx),$D 1729 add $SZ*4($ctx),$E 1730 add $SZ*5($ctx),$F 1731 add $SZ*6($ctx),$G 1732 add $SZ*7($ctx),$H 1733 1734 cmp $_end,$inp 1735 1736 mov $A,$SZ*0($ctx) 1737 mov $B,$SZ*1($ctx) 1738 mov $C,$SZ*2($ctx) 1739 mov $D,$SZ*3($ctx) 1740 mov $E,$SZ*4($ctx) 1741 mov $F,$SZ*5($ctx) 1742 mov $G,$SZ*6($ctx) 1743 mov $H,$SZ*7($ctx) 1744 jb .Lloop_avx 1745 1746 mov 
$_rsp,%rsi 1747 vzeroupper 1748 ___ 1749 $code.=<<___ if ($win64); 1750 movaps 16*$SZ+32(%rsp),%xmm6 1751 movaps 16*$SZ+48(%rsp),%xmm7 1752 movaps 16*$SZ+64(%rsp),%xmm8 1753 movaps 16*$SZ+80(%rsp),%xmm9 1754 ___ 1755 $code.=<<___ if ($win64 && $SZ>4); 1756 movaps 16*$SZ+96(%rsp),%xmm10 1757 movaps 16*$SZ+112(%rsp),%xmm11 1758 ___ 1759 $code.=<<___; 1760 mov (%rsi),%r15 1761 mov 8(%rsi),%r14 1762 mov 16(%rsi),%r13 1763 mov 24(%rsi),%r12 1764 mov 32(%rsi),%rbp 1765 mov 40(%rsi),%rbx 1766 lea 48(%rsi),%rsp 1767 .Lepilogue_avx: 1768 ret 1769 .size ${func}_avx,.-${func}_avx 1770 ___ 1771 1772 if ($avx>1) {{ 1773 ###################################################################### 1774 # AVX2+BMI code path 1775 # 1776 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1777 my $PUSH8=8*2*$SZ; 1778 use integer; 1779 1780 sub bodyx_00_15 () { 1781 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1782 ( 1783 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1784 1785 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1786 '&and ($a4,$e)', # f&e 1787 '&rorx ($a0,$e,$Sigma1[2])', 1788 '&rorx ($a2,$e,$Sigma1[1])', 1789 1790 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1791 '&lea ($h,"($h,$a4)")', 1792 '&andn ($a4,$e,$g)', # ~e&g 1793 '&xor ($a0,$a2)', 1794 1795 '&rorx ($a1,$e,$Sigma1[0])', 1796 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1797 '&xor ($a0,$a1)', # Sigma1(e) 1798 '&mov ($a2,$a)', 1799 1800 '&rorx ($a4,$a,$Sigma0[2])', 1801 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1802 '&xor ($a2,$b)', # a^b, b^c in next round 1803 '&rorx ($a1,$a,$Sigma0[1])', 1804 1805 '&rorx ($a0,$a,$Sigma0[0])', 1806 '&lea ($d,"($d,$h)")', # d+=h 1807 '&and ($a3,$a2)', # (b^c)&(a^b) 1808 '&xor ($a1,$a4)', 1809 1810 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1811 '&xor ($a1,$a0)', # Sigma0(a) 1812 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 1813 '&mov ($a4,$e)', # copy of f in future 1814 1815 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1816 ); 1817 # and at the finish one has to $a+=$a1 1818 } 1819 1820 $code.=<<___; 1821 .type ${func}_avx2,\@function,3 1822 .align 64 1823 ${func}_avx2: 1824 .Lavx2_shortcut: 1825 push %rbx 1826 push %rbp 1827 push %r12 1828 push %r13 1829 push %r14 1830 push %r15 1831 mov %rsp,%r11 # copy %rsp 1832 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1833 shl \$4,%rdx # num*16 1834 and \$-256*$SZ,%rsp # align stack frame 1835 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1836 add \$`2*$SZ*($rounds-8)`,%rsp 1837 mov $ctx,$_ctx # save ctx, 1st arg 1838 mov $inp,$_inp # save inp, 2nd arh 1839 mov %rdx,$_end # save end pointer, "3rd" arg 1840 mov %r11,$_rsp # save copy of %rsp 1841 ___ 1842 $code.=<<___ if ($win64); 1843 movaps %xmm6,16*$SZ+32(%rsp) 1844 movaps %xmm7,16*$SZ+48(%rsp) 1845 movaps %xmm8,16*$SZ+64(%rsp) 1846 movaps %xmm9,16*$SZ+80(%rsp) 1847 ___ 1848 $code.=<<___ if ($win64 && $SZ>4); 1849 movaps %xmm10,16*$SZ+96(%rsp) 1850 movaps %xmm11,16*$SZ+112(%rsp) 1851 ___ 1852 $code.=<<___; 1853 .Lprologue_avx2: 1854 1855 vzeroupper 1856 sub \$-16*$SZ,$inp # inp++, size optimization 1857 mov $SZ*0($ctx),$A 1858 mov $inp,%r12 # borrow $T1 1859 mov $SZ*1($ctx),$B 1860 cmp %rdx,$inp # $_end 1861 mov $SZ*2($ctx),$C 1862 cmove %rsp,%r12 # next block or random data 1863 mov $SZ*3($ctx),$D 1864 mov $SZ*4($ctx),$E 1865 mov $SZ*5($ctx),$F 1866 mov $SZ*6($ctx),$G 1867 mov $SZ*7($ctx),$H 1868 ___ 1869 if ($SZ==4) { # SHA256 1870 my @X = map("%ymm$_",(0..3)); 1871 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1872 1873 $code.=<<___; 1874 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1875 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1876 jmp .Loop_avx2 1877 .align 16 1878 .Loop_avx2: 1879 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1880 vmovdqu -16*$SZ+0($inp),%xmm0 1881 vmovdqu -16*$SZ+16($inp),%xmm1 1882 vmovdqu -16*$SZ+32($inp),%xmm2 1883 vmovdqu -16*$SZ+48($inp),%xmm3 1884 #mov $inp,$_inp # offload $inp 1885 vinserti128 \$1,(%r12),@X[0],@X[0] 1886 vinserti128 \$1,16(%r12),@X[1],@X[1] 1887 vpshufb $t3,@X[0],@X[0] 1888 vinserti128 \$1,32(%r12),@X[2],@X[2] 1889 vpshufb $t3,@X[1],@X[1] 1890 vinserti128 \$1,48(%r12),@X[3],@X[3] 1891 1892 lea $TABLE(%rip),$Tbl 1893 vpshufb $t3,@X[2],@X[2] 1894 vpaddd 0x00($Tbl),@X[0],$t0 1895 vpshufb $t3,@X[3],@X[3] 1896 vpaddd 0x20($Tbl),@X[1],$t1 1897 vpaddd 0x40($Tbl),@X[2],$t2 1898 vpaddd 0x60($Tbl),@X[3],$t3 1899 vmovdqa $t0,0x00(%rsp) 1900 xor $a1,$a1 1901 vmovdqa $t1,0x20(%rsp) 1902 lea -$PUSH8(%rsp),%rsp 1903 mov $B,$a3 1904 vmovdqa $t2,0x00(%rsp) 1905 xor $C,$a3 # magic 1906 vmovdqa $t3,0x20(%rsp) 1907 mov $F,$a4 1908 sub \$-16*2*$SZ,$Tbl # size optimization 1909 jmp .Lavx2_00_47 1910 1911 .align 16 1912 .Lavx2_00_47: 1913 ___ 1914 1915 sub AVX2_256_00_47 () { 1916 my $j = shift; 1917 my $body = shift; 1918 my @X = @_; 1919 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1920 my $base = "+2*$PUSH8(%rsp)"; 1921 1922 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1923 foreach (Xupdate_256_AVX()) { # 29 instructions 1924 eval; 1925 eval(shift(@insns)); 1926 eval(shift(@insns)); 1927 eval(shift(@insns)); 1928 } 1929 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1930 foreach (@insns) { eval; } # remaining instructions 1931 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1932 } 1933 1934 for ($i=0,$j=0; $j<4; $j++) { 1935 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1936 push(@X,shift(@X)); # rotate(@X) 1937 } 1938 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1939 
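# Note on the loop-termination cmpb that follows (the same trick is used by
# the scalar, SSSE3, AVX and XOP paths): the most significant byte of every
# K[] round constant happens to be non-zero, while the byte-swap masks stored
# right after the K table start with 0x00010203..., whose top byte is zero.
# Once $Tbl has been advanced past the last round constant the probed byte
# therefore reads as zero and the .Lavx2_00_47 loop falls through.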
&cmpb (($SZ-1)."($Tbl)",0); 1940 &jne (".Lavx2_00_47"); 1941 1942 for ($i=0; $i<16; ) { 1943 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1944 foreach(bodyx_00_15()) { eval; } 1945 } 1946 } else { # SHA512 1947 my @X = map("%ymm$_",(0..7)); 1948 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1949 1950 $code.=<<___; 1951 jmp .Loop_avx2 1952 .align 16 1953 .Loop_avx2: 1954 vmovdqu -16*$SZ($inp),%xmm0 1955 vmovdqu -16*$SZ+16($inp),%xmm1 1956 vmovdqu -16*$SZ+32($inp),%xmm2 1957 lea $TABLE+0x80(%rip),$Tbl # size optimization 1958 vmovdqu -16*$SZ+48($inp),%xmm3 1959 vmovdqu -16*$SZ+64($inp),%xmm4 1960 vmovdqu -16*$SZ+80($inp),%xmm5 1961 vmovdqu -16*$SZ+96($inp),%xmm6 1962 vmovdqu -16*$SZ+112($inp),%xmm7 1963 #mov $inp,$_inp # offload $inp 1964 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1965 vinserti128 \$1,(%r12),@X[0],@X[0] 1966 vinserti128 \$1,16(%r12),@X[1],@X[1] 1967 vpshufb $t2,@X[0],@X[0] 1968 vinserti128 \$1,32(%r12),@X[2],@X[2] 1969 vpshufb $t2,@X[1],@X[1] 1970 vinserti128 \$1,48(%r12),@X[3],@X[3] 1971 vpshufb $t2,@X[2],@X[2] 1972 vinserti128 \$1,64(%r12),@X[4],@X[4] 1973 vpshufb $t2,@X[3],@X[3] 1974 vinserti128 \$1,80(%r12),@X[5],@X[5] 1975 vpshufb $t2,@X[4],@X[4] 1976 vinserti128 \$1,96(%r12),@X[6],@X[6] 1977 vpshufb $t2,@X[5],@X[5] 1978 vinserti128 \$1,112(%r12),@X[7],@X[7] 1979 1980 vpaddq -0x80($Tbl),@X[0],$t0 1981 vpshufb $t2,@X[6],@X[6] 1982 vpaddq -0x60($Tbl),@X[1],$t1 1983 vpshufb $t2,@X[7],@X[7] 1984 vpaddq -0x40($Tbl),@X[2],$t2 1985 vpaddq -0x20($Tbl),@X[3],$t3 1986 vmovdqa $t0,0x00(%rsp) 1987 vpaddq 0x00($Tbl),@X[4],$t0 1988 vmovdqa $t1,0x20(%rsp) 1989 vpaddq 0x20($Tbl),@X[5],$t1 1990 vmovdqa $t2,0x40(%rsp) 1991 vpaddq 0x40($Tbl),@X[6],$t2 1992 vmovdqa $t3,0x60(%rsp) 1993 lea -$PUSH8(%rsp),%rsp 1994 vpaddq 0x60($Tbl),@X[7],$t3 1995 vmovdqa $t0,0x00(%rsp) 1996 xor $a1,$a1 1997 vmovdqa $t1,0x20(%rsp) 1998 mov $B,$a3 1999 vmovdqa $t2,0x40(%rsp) 2000 xor $C,$a3 # magic 2001 vmovdqa $t3,0x60(%rsp) 2002 mov $F,$a4 2003 add \$16*2*$SZ,$Tbl 2004 jmp .Lavx2_00_47 2005 2006 .align 16 2007 .Lavx2_00_47: 2008 ___ 2009 2010 sub AVX2_512_00_47 () { 2011 my $j = shift; 2012 my $body = shift; 2013 my @X = @_; 2014 my @insns = (&$body,&$body); # 48 instructions 2015 my $base = "+2*$PUSH8(%rsp)"; 2016 2017 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 2018 foreach (Xupdate_512_AVX()) { # 23 instructions 2019 eval; 2020 if ($_ !~ /\;$/) { 2021 eval(shift(@insns)); 2022 eval(shift(@insns)); 2023 eval(shift(@insns)); 2024 } 2025 } 2026 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 2027 foreach (@insns) { eval; } # remaining instructions 2028 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2029 } 2030 2031 for ($i=0,$j=0; $j<8; $j++) { 2032 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 2033 push(@X,shift(@X)); # rotate(@X) 2034 } 2035 &lea ($Tbl,16*2*$SZ."($Tbl)"); 2036 &cmpb (($SZ-1-0x80)."($Tbl)",0); 2037 &jne (".Lavx2_00_47"); 2038 2039 for ($i=0; $i<16; ) { 2040 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2041 foreach(bodyx_00_15()) { eval; } 2042 } 2043 } 2044 $code.=<<___; 2045 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2046 add $a1,$A 2047 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2048 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 2049 2050 add $SZ*0($ctx),$A 2051 add $SZ*1($ctx),$B 2052 add $SZ*2($ctx),$C 2053 add $SZ*3($ctx),$D 2054 add $SZ*4($ctx),$E 2055 add $SZ*5($ctx),$F 2056 add $SZ*6($ctx),$G 2057 add $SZ*7($ctx),$H 2058 2059 mov $A,$SZ*0($ctx) 2060 mov $B,$SZ*1($ctx) 2061 mov $C,$SZ*2($ctx) 2062 mov $D,$SZ*3($ctx) 2063 mov $E,$SZ*4($ctx) 2064 mov $F,$SZ*5($ctx) 2065 mov $G,$SZ*6($ctx) 2066 mov $H,$SZ*7($ctx) 2067 2068 cmp 
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
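# If the fault hit the AVX2 path, context->Rsp is the re-aligned AVX2 frame,
# so the and/add above redo the same arithmetic as the ${func}_avx2 prologue
# in order to land on the frame base before the saved stack pointer is pulled
# from 16*$SZ+3*8(%rax) below.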
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($SZ == 4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
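# The .LSEH_info records emitted below follow the Win64 UNWIND_INFO layout:
# ".byte 9,0,0,0" is presumably version 1 with UNW_FLAG_EHANDLER and no
# unwind codes, followed by the handler's RVA; the label pair after it is
# the HandlerData[] that se_handler reads to bracket prologue and epilogue.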
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
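# For reference, a quick sanity check of sha256op38() (nothing below is
# executed; the operands are a hypothetical example):
#
#	sha256op38("sha256rnds2","%xmm1,%xmm2")
#
# returns ".byte\t15,56,203,209", i.e. 0f 38 cb with ModR/M 0xd1, which is
# how the SHA-NI mnemonics are fed to assemblers that do not recognize them.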